Source code for swamp.parsers.topconsparser

import pandas as pd
from swamp.parsers.parser import Parser
from itertools import groupby
from operator import itemgetter


[docs]class TopconsParser(Parser):
    """Topcons file parser

    :param str fname: the file name to be parsed (default None)
    :param `~swamp.logger.swamplogger.SwampLogger` logger: logging interface for the parser (default None)
    :ivar `~pandas.Dataframe` tmhelices: Dataframe with the mappings for the TM helices predicted by topcons
    :ivar `~pandas.Dataframe` residue_topology: Dataframe with the residue topology as predicted by topcons

    :example:

    >>> from swamp.parsers import TopconsParser
    >>> my_parser = TopconsParser('<fname>')
    >>> my_parser.parse()
    """

    def __init__(self, fname, logger=None):
        self.tmhelices = None
        self.residue_topology = None
        super(TopconsParser, self).__init__(fname, logger=logger)

    @property
    def summary(self):
        """Abstract property to store a summary of the parsed figures of merit"""
        return None

[docs]    def parse(self):
        """Parse the :py:attr:`~swamp.parsers.parser.fname` prediction file and retrieve the TM topology"""

        if self.error:
            self.logger.warning("Previous errors prevent parsing TOPCONS file!")
            return

        with open(self.fname, "r") as fhandle:
            self.inputfile_contents = fhandle.readlines()
            try:
                topcons_prediction = self.inputfile_contents[
                    self.inputfile_contents.index('TOPCONS predicted topology:\n') + 1].rstrip()
            except ValueError as e:
                raise ValueError('TOPCONS file cannot be parsed. Please check it is TOPCONS format!')
            residues = []
            for index, ss_residue in enumerate(topcons_prediction):
                if ss_residue == "o":
                    residues.append([index + 1, True, False, False])
                elif ss_residue == "M":
                    residues.append([index + 1, False, True, False])
                else:
                    residues.append([index + 1, False, False, True])

        self.residue_topology = pd.DataFrame(residues)
        self.residue_topology.columns = ["idx", "out", "membr", "in"]
        if any(self.residue_topology.membr.tolist()):
            self._get_tmhelices_map()
        else:
            self.logger.warning('No TM helices were parsed from TOPCONS file!')

    def _get_tmhelices_map(self):
        """Create a datframe with the start/stop of the TM helices contained in the input file"""

        helices = []
        id = 1
        for k, g in groupby(enumerate([x for x in self.residue_topology[self.residue_topology.membr].idx]),
                            lambda ix: ix[0] - ix[1]):
            residues = [x for x in map(itemgetter(1), g)]
            start = residues[0]
            try:
                stop = residues[-1]
            except IndexError:
                stop = start
            helices.append([id, start, stop])
            id += 1
        self.tmhelices = pd.DataFrame(helices)
        self.tmhelices.columns = ["id", "start", "stop"]