Source code for swamp.parsers.topconsparser
import pandas as pd
from swamp.parsers.parser import Parser
from itertools import groupby
from operator import itemgetter
class TopconsParser(Parser):
    """Topcons file parser

    Reads a TOPCONS prediction file, locates the line following the
    ``TOPCONS predicted topology:`` header and converts that per-residue
    topology string into two dataframes.

    :param str fname: the file name to be parsed (default None)
    :param `~swamp.logger.swamplogger.SwampLogger` logger: logging interface for the parser (default None)
    :ivar `~pandas.DataFrame` tmhelices: Dataframe with the mappings for the TM helices predicted by topcons
     (columns ``id``, ``start``, ``stop``; stays None until TM helices are parsed)
    :ivar `~pandas.DataFrame` residue_topology: Dataframe with the residue topology as predicted by topcons
     (columns ``idx``, ``out``, ``membr``, ``in``; stays None until :py:func:`parse` succeeds)

    :example:

    >>> from swamp.parsers import TopconsParser
    >>> my_parser = TopconsParser('<fname>')
    >>> my_parser.parse()
    """

    def __init__(self, fname, logger=None):
        # Result attributes are created before delegating to the parent initialiser so that
        # they exist regardless of what the parent does with the instance.
        self.tmhelices = None
        self.residue_topology = None
        super(TopconsParser, self).__init__(fname, logger=logger)

    @property
    def summary(self):
        """Summary of the parsed figures of merit; this parser defines none, so it is always None"""

        return None

    def parse(self):
        """Parse the :py:attr:`~swamp.parsers.parser.fname` prediction file and retrieve the TM topology

        On success :py:attr:`residue_topology` holds one row per residue (1-based ``idx``) and, if at
        least one membrane residue was found, :py:attr:`tmhelices` holds one row per TM helix. Nothing
        is parsed (only a warning is logged) if the parser error flag is already set.

        :raises ValueError: if the topology header, or the prediction line after it, cannot be found
        """

        if self.error:
            self.logger.warning("Previous errors prevent parsing TOPCONS file!")
            return

        with open(self.fname, "r") as fhandle:
            self.inputfile_contents = fhandle.readlines()

        topcons_prediction = self._extract_prediction(self.inputfile_contents)

        # One row per residue: 'o' is outside, 'M' is membrane and every other symbol is counted as inside
        residues = [
            [seq_idx, state == "o", state == "M", state not in ("o", "M")]
            for seq_idx, state in enumerate(topcons_prediction, 1)
        ]
        # Column names are given to the constructor so that an empty prediction line produces an empty
        # (but well formed) dataframe rather than a column length mismatch error
        self.residue_topology = pd.DataFrame(residues, columns=["idx", "out", "membr", "in"])

        if any(self.residue_topology.membr.tolist()):
            self._get_tmhelices_map()
        else:
            self.logger.warning('No TM helices were parsed from TOPCONS file!')

    @staticmethod
    def _extract_prediction(lines):
        """Return the topology string found on the line after the TOPCONS topology header

        :param list lines: the lines of the prediction file, as returned by ``readlines()``
        :returns: the predicted topology with trailing whitespace removed
        :rtype: str
        :raises ValueError: if the header is missing or it is the last line of the file
        """

        header = 'TOPCONS predicted topology:'
        for position, line in enumerate(lines):
            # Trailing whitespace is ignored so a header without a final newline is still recognised
            if line.rstrip() == header:
                if position + 1 < len(lines):
                    return lines[position + 1].rstrip()
                # Only the first header is considered; without a following line there is no prediction
                break

        raise ValueError('TOPCONS file cannot be parsed. Please check it is TOPCONS format!')

    def _get_tmhelices_map(self):
        """Create a dataframe with the start/stop of the TM helices contained in the input file

        Membrane residues with consecutive indices are merged into a single helix. The result is stored
        in :py:attr:`tmhelices`, with helices numbered from 1 in order of appearance.
        """

        membrane_idx = self.residue_topology[self.residue_topology.membr].idx.tolist()

        # Within a run of consecutive indices (position - residue index) is constant, so grouping
        # on that difference splits the membrane residues into contiguous segments
        segments = groupby(enumerate(membrane_idx), lambda pair: pair[0] - pair[1])

        helices = []
        for helix_id, (_, segment) in enumerate(segments, 1):
            helix_residues = list(map(itemgetter(1), segment))
            # groupby never yields an empty group, so first and last element always exist
            helices.append([helix_id, helix_residues[0], helix_residues[-1]])

        self.tmhelices = pd.DataFrame(helices, columns=["id", "start", "stop"])