# Source code for swamp.mr.targetdata
import gemmi
from Bio import SeqIO
from swamp.parsers import MtzParser
from Bio.Alphabet import generic_protein
from Bio.SeqUtils import molecular_weight
class TargetData(object):
    """Class to store relevant information about the target structure to be solved through MR

    Apart from the file names and the logger, every attribute starts out as ``None`` (``use_f`` as ``False``) and is
    only populated once :py:func:`~swamp.mr.targetdata.TargetData.get_info` has been called.

    :param str fasta_fname: target's fasta filename
    :param str mtz_fname: target's mtz filename
    :param str phased_mtz_fname: target's mtz filename containing phases (default: None)
    :param `~swamp.logger.swamplogger.SwampLogger` logger: logging interface for the MR pipeline (default None)
    :ivar float mw: combined molecular weight of the unique sequences found in the fasta file
    :ivar int seq_length: combined length of the unique sequences found in the fasta file
    :ivar bool use_f: set to True when the parsed mtz reports ``f`` but no ``i``
    :ivar int ncopies: estimated number of copies, as returned by \
    :py:func:`~swamp.mr.targetdata.TargetData.estimate_contents`
    :ivar float solvent: estimated solvent content, as returned by \
    :py:func:`~swamp.mr.targetdata.TargetData.estimate_contents`
    """

    # Largest number of copies that estimate_contents() will try
    _MAX_COPIES = 5
    # The copy count stops growing once the (3 d.p. rounded) Matthews coefficient is at or below this value
    _MAX_MATTHEWS = 3.59
    # A (1 d.p. rounded) solvent content at or below this value makes estimate_contents() drop one copy
    _MIN_SOLVENT = 0.4
    # 6.02214e23 is Avogadro's number.
    # NOTE(review): 1e-24 and 1.35 look like the A^3 -> cm^3 conversion and a protein density in g/cm^3 -- confirm
    _PROTEIN_FRACTION_FACTOR = 6.02214e23 * 1e-24 * 1.35

    def __init__(self, fasta_fname, mtz_fname, phased_mtz_fname=None, logger=None):
        # Input files and logging interface
        self.fasta_fname = fasta_fname
        self.mtz_fname = mtz_fname
        self.phased_mtz_fname = phased_mtz_fname
        self.logger = logger

        # Sequence information, filled in from the fasta file by get_info()
        self.mw = None
        self.seq_length = None

        # Crystal information, filled in from the mtz file by get_info()
        self.resolution = None
        self.nreflections = None
        self.spacegroup_symbol = None
        self.spacegroup_name = None
        self.ncopies = None
        self.solvent = None

        # Entries of MtzParser.summary, filled in by get_info()
        # NOTE(review): presumably these are the mtz column labels -- confirm against MtzParser
        self.use_f = False
        self.f = None
        self.sigf = None
        self.i = None
        self.sigi = None
        self.dp = None
        self.sigdp = None
        self.free = None
        self.f_plus = None
        self.sigf_plus = None
        self.i_plus = None
        self.sigi_plus = None
        self.f_minus = None
        self.sigf_minus = None
        self.i_minus = None
        self.sigi_minus = None

    def get_info(self):
        """Get all the information required to perform MR on the given target and store it into corresponding
        attributes of this :py:obj:`~swamp.mr.targetdata.TargetData` instance

        The fasta file provides the molecular weight and sequence length; the mtz file provides the number of
        reflections, space group, resolution and the entries of the parser's summary. The number of copies and the
        solvent content are then estimated from the mtz cell and the molecular weight.
        """

        self.mw, self.seq_length = self.read_fasta(self.fasta_fname)

        mtz_parser = MtzParser(self.mtz_fname, logger=self.logger)
        # These attributes are read before parse() is invoked, exactly as the parser is expected to allow
        self.nreflections = mtz_parser.nreflections
        self.spacegroup_symbol = mtz_parser.spacegroup_symbol
        # Same symbol with the blanks removed
        self.spacegroup_name = mtz_parser.spacegroup_symbol.replace(' ', '')
        self.resolution = mtz_parser.resolution

        mtz_parser.parse()
        # Only flag the use of `f` when the parser found no `i` at all
        if mtz_parser.i is None and mtz_parser.f is not None:
            self.use_f = True

        # A summary that does not hold exactly these 15 entries makes the unpacking raise ValueError
        self.f, self.sigf, self.i, self.sigi, self.free, self.dp, self.sigdp, self.f_plus, self.sigf_plus, \
            self.i_plus, self.sigi_plus, self.f_minus, self.sigf_minus, self.i_minus, self.sigi_minus = \
            mtz_parser.summary

        # NOTE(review): volume_per_image() is presumably the cell volume divided by the number of symmetry images
        # (i.e. the volume of the asymmetric unit) -- confirm against the gemmi documentation
        volume = mtz_parser.reflection_file.cell.volume_per_image()
        self.ncopies, self.solvent = self.estimate_contents(volume, self.mw)

    @staticmethod
    def read_fasta(fname):
        """Extract information about the target's sequence from a fasta file

        Identical sequences are only counted once. Each weight is rounded to 2 decimal places before being added up.

        :param str fname: the file name of the fasta file of interest
        :returns: the combined molecular weight and length of the unique sequences in the fasta file (tuple)
        """

        # Keep first-seen order while dropping duplicates so the floating point sum below is reproducible.
        # The `alphabet` keyword is deliberately not passed to SeqIO.parse: it has no effect on the parsed strings
        # and recent Biopython releases reject it.
        unique_chains = []
        seen = set()
        for record in SeqIO.parse(fname, "fasta"):
            chain = str(record.seq)
            if chain not in seen:
                seen.add(chain)
                unique_chains.append(chain)

        mw = 0.0
        for chain in unique_chains:
            # 'X' residues are weighed as 'A'
            # NOTE(review): presumably because molecular_weight() does not accept 'X' -- confirm
            mw += round(molecular_weight(chain.replace("X", "A"), "protein"), 2)

        seq_length = sum(len(chain) for chain in unique_chains)
        return mw, seq_length

    @staticmethod
    def _solvent_content(cell_volume, mw, ncopies):
        """Compute the Matthews coefficient and the solvent content (rounded to 1 d.p.) for a given no. of copies"""

        matthews = cell_volume / (mw * ncopies)
        protein_fraction = 1. / (TargetData._PROTEIN_FRACTION_FACTOR * matthews)
        return matthews, round((1 - protein_fraction), 1)

    @staticmethod
    def estimate_contents(cell_volume, mw):
        """Estimate the number of copies and the solvent content of the crystal

        The number of copies is increased from 1 up to 5 until the Matthews coefficient (rounded to 3 d.p.) is not
        higher than 3.59. If the solvent content obtained that way is 0.4 or lower and there is more than one copy,
        one copy is removed and the solvent content is computed again.

        :param float cell_volume: the volume of the crystal's cell
        :param float mw: molecular weight of each copy of the structure
        :returns: the no. of copies in the asu and the solvent content (tuple)
        :raises ZeroDivisionError: if `cell_volume` or `mw` is zero
        """

        for ncopies in range(1, TargetData._MAX_COPIES + 1):
            matthews, solvent = TargetData._solvent_content(cell_volume, mw, ncopies)
            if round(matthews, 3) <= TargetData._MAX_MATTHEWS:
                break

        # Too tightly packed: fall back to one copy less
        if solvent <= TargetData._MIN_SOLVENT and ncopies != 1:
            ncopies -= 1
            _, solvent = TargetData._solvent_content(cell_volume, mw, ncopies)

        return ncopies, solvent