Source code for modlamp.datasets

# -*- coding: utf-8 -*-
"""
.. currentmodule:: modlamp.datasets

.. moduleauthor:: ETH Zurich Alex Mueller <alex.mueller@pharma.ethz.ch>

This module incorporates functions to load different peptide datasets used for classification.

=============================        ============================================================================
Function                             Data
=============================        ============================================================================
:py:func:`load_AMPvsTM`              Antimicrobial peptides versus transmembrane sequences
:py:func:`load_AMPvsUniProt`         AMPs from the *APD3* versus other peptides from the *UniProt* database
:py:func:`load_ACPvsTM`              Anticancer peptides (*CancerPPD*) versus helical transmembrane sequences
:py:func:`load_ACPvsRandom`          Anticancer peptides (*CancerPPD*) versus random scrambled AMP sequences
:py:func:`load_custom`               A custom data set provided in ``modlamp/data`` as a ``.csv`` file
=============================        ============================================================================
"""

from os.path import dirname, join

import numpy as np

__author__ = "Alex Müller, Gisela Gabernet"
__docformat__ = "restructuredtext en"


class Bunch(dict):
    """Container object for datasets

    Dictionary-like object that exposes its keys as attributes. Taken from the `sklearn <http://scikit-learn.org>`_
    package. Keys can be read, assigned and deleted either with the usual item syntax or as attributes; accessing or
    deleting a missing key through the attribute syntax raises an ``AttributeError``.

    :Example:

    >>> b = Bunch(a=1, b=2)
    >>> b['b']
    2
    >>> b.b  # key can also be called as attribute
    2
    >>> b.a = 3
    >>> b['a']
    3
    >>> b.c = 6
    >>> b['c']
    6
    >>> del b.c  # keys can be removed through the attribute syntax as well
    >>> 'c' in b
    False
    """
    def __init__(self, **kwargs):
        """
        :param kwargs: key / value pairs that are stored as items of the dictionary.
        """
        super(Bunch, self).__init__(kwargs)

    def __setattr__(self, key, value):
        # attribute assignment is redirected to item assignment, nothing is stored in the instance ``__dict__``
        self[key] = value

    def __getattr__(self, key):
        # only called when regular attribute lookup fails; translate the missing key into the exception type that
        # the attribute protocol (``getattr``, ``hasattr``) expects
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __delattr__(self, key):
        # mirror of ``__setattr__``: attributes live in the dictionary, so they have to be deleted from there
        try:
            del self[key]
        except KeyError:
            raise AttributeError(key)

    def __dir__(self):
        # list the stored (string) keys next to the regular dict attributes, e.g. for tab completion
        names = set(dir(type(self)))
        names.update(key for key in self if isinstance(key, str))
        return sorted(names)

    def __setstate__(self, state):
        # Deliberate no-op: any state handed over during unpickling is discarded, the items themselves are restored
        # through the normal dict mechanism.
        # NOTE(review): presumably kept from the scikit-learn original to ignore a pickled instance ``__dict__``.
        pass


def load_AMPvsTM():
    """Function to load a dataset consisting of **AMP sequences** and **transmembrane regions of proteins** for
    classification.

    The AMP class consists of an intersection of all activity annotations of the `APD2 <http://aps.unmc.edu/AP/>`_ and
    `CAMP <http://camp.bicnirrh.res.in/>`_ databases, where for gram positive, gram negative and antifungal exact
    matches were observed. A detailed description of how the dataset was compiled can be found in the following
    publication: Schneider, P., Müller, A. T., Gabernet, G., Button, A. L., Posselt, G., Wessler, S., Hiss, J. A. and
    Schneider, G. (2016), Hybrid Network Model for “Deep Learning” of Chemical Data: Application to Antimicrobial
    Peptides. Mol. Inf.. `doi:10.1002/minf.201600011 <http://onlinelibrary.wiley.com/doi/10.1002/minf.201600011/full>`_

    =================    ===
    Classes                2
    Samples per class    206
    Samples total        412
    Dimensionality         1
    =================    ===

    The data is read from ``data/AMPvsTMset.csv`` located next to this module. Blank lines in the file are skipped
    and whitespace around the comma-separated fields is ignored.

    :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences (numpy
        array of strings), ``target``, the classification labels (numpy array of integers), ``target_names``, the
        meaning of the labels and ``feature_names``, the meaning of the features.
    :raises ValueError: if the header line of the data file does not consist of exactly four fields.
    :Example:

    >>> from modlamp.datasets import load_AMPvsTM
    >>> data = load_AMPvsTM()
    >>> data.sequences
    ['AAGAATVLLVIVLLAGSYLAVLA','LWIVIACLACVGSAAALTLRA','FYRFYMLREGTAVPAVWFSIELIFGLFA','GTLELGVDYGRAN',...]
    >>> list(data.target_names)
    ['TM', 'AMP']
    >>> len(data.sequences)
    412
    >>> data.target[:10]
    array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    """

    data_file = join(dirname(__file__), 'data', 'AMPvsTMset.csv')
    sequences, target = [], []
    with open(data_file) as f:
        # header: the first two fields (sample and feature counts) are not needed, the last two name the classes
        _n_samples, _n_features, target_name1, target_name2 = [field.strip() for field in next(f).split(',')]
        for line in f:
            if not line.strip():  # tolerate blank (e.g. trailing) lines
                continue
            fields = [field.strip() for field in line.split(',')]
            sequences.append(fields[0])
            target.append(fields[1])

    return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'),
                 target_names=[target_name1, target_name2],
                 feature_names=['Sequence'])


def load_AMPvsUniProt():
    """Function to load a dataset consisting of the whole **APD3** versus the same number of sequences randomly
    extracted from the **UniProt** database, to be used for classification.

    The AMP class consists of 2600 AMP sequences from the `APD3 <http://aps.unmc.edu/AP/>`_ (extracted Jan. 2016).
    The UniProt class consists of 2600 randomly extracted protein sequences from the `UniProt Database
    <http://uniprot.org/>`_ with the search query *length 10 TO 50* filtered for unnatural amino acids.

    =================    =====
    Classes                 2
    AMP Samples          2600
    UniProt Samples      2600
    Samples total        5200
    Dimensionality          1
    =================    =====

    The data is read from ``data/AMPvsUniProt.csv`` located next to this module. Blank lines in the file are skipped
    and whitespace around the comma-separated fields is ignored.

    :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences (numpy
        array of strings), ``target``, the classification labels (numpy array of integers), ``target_names``, the
        meaning of the labels and ``feature_names``, the meaning of the features.
    :raises ValueError: if the header line of the data file does not consist of exactly four fields.
    :Example:

    >>> from modlamp.datasets import load_AMPvsUniProt
    >>> data = load_AMPvsUniProt()
    >>> data.sequences[:10]
    ['GLWSKIKEVGKEAAKAAAKAAGKAALGAVSEAV', 'YVPLPNVPQPGRRPFPTFPGQGPFNPKIKWPQGY', ... ]
    >>> list(data.target_names)
    ['AMP', 'UniProt']
    >>> len(data.sequences)
    5200
    >>> data.target[:10]
    array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    """

    data_file = join(dirname(__file__), 'data', 'AMPvsUniProt.csv')
    sequences, target = [], []
    with open(data_file) as f:
        # header: the first two fields (sample and feature counts) are not needed, the last two name the classes
        _n_samples, _n_features, target_name1, target_name2 = [field.strip() for field in next(f).split(',')]
        for line in f:
            if not line.strip():  # tolerate blank (e.g. trailing) lines
                continue
            fields = [field.strip() for field in line.split(',')]
            sequences.append(fields[0])
            target.append(fields[1])

    return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'),
                 target_names=[target_name1, target_name2],
                 feature_names=['Sequence'])


def load_ACPvsTM():
    """Function to load a dataset consisting of ACP sequences from the CancerPPD database and negative peptides
    extracted from alpha-helical transmembrane regions of proteins for classification.

    The ACP class consists of a collection of 413 ACPs from the `CancerPPD
    <http://crdd.osdd.net/raghava/cancerppd/index.php>`_ database with length between 7 and 30 aa and without cysteines
    to facilitate peptide synthesis.

    The Negative peptide set contains a random selection of 413 transmembrane alpha-helices (extracted from
    the `PDBTM <http://pdbtm.enzim.hu/>`_ ) isolated directly from the proteins crystal structure.

    =================    ===
    Classes                2
    ACP peptides         413
    Negative peptides    413
    Total peptides       826
    Dimensionality         1
    =================    ===

    The data is read from ``data/ACP_CancPPD_vs_TM.csv`` located next to this module. Blank lines in the file are
    skipped and whitespace around the comma-separated fields is ignored.

    :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences (numpy
        array of strings), ``target``, the classification labels (numpy array of integers), ``target_names``, the
        meaning of the labels and ``feature_names``, the meaning of the features.
    :raises ValueError: if the header line of the data file does not consist of exactly four fields.
    :Example:

    >>> from modlamp.datasets import load_ACPvsTM
    >>> data = load_ACPvsTM()
    >>> data.sequences[:4]
    ['AAKKWAKAKWAKAKKWAKAA', 'AAVPIVNLKDELLFPSWEALFSGSE', 'AAWKWAWAKKWAKAKKWAKAA', 'AFGMALKLLKKVL']
    >>> list(data.target_names)
    ['TM', 'ACP']
    >>> len(data.sequences)
    826
    """

    data_file = join(dirname(__file__), 'data', 'ACP_CancPPD_vs_TM.csv')
    sequences, target = [], []
    with open(data_file) as f:
        # header: the first two fields (sample and feature counts) are not needed, the last two name the classes
        _n_samples, _n_features, target_name1, target_name2 = [field.strip() for field in next(f).split(',')]
        for line in f:
            if not line.strip():  # tolerate blank (e.g. trailing) lines
                continue
            fields = [field.strip() for field in line.split(',')]
            sequences.append(fields[0])
            target.append(fields[1])

    return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'),
                 target_names=[target_name1, target_name2],
                 feature_names=['Sequence'])


def load_ACPvsRandom():
    """Function to load a dataset consisting of ACP sequences from the CancerPPD database and negative peptides generated
    randomly with the amino acid composition of AMPs.

    The ACP class consists of a collection of 413 ACPs from the `CancerPPD
    <http://crdd.osdd.net/raghava/cancerppd/index.php>`_ database with length between 7 and 30 aa and without cysteines
    to facilitate peptide synthesis.

    The Negative peptide set contains a random selection of 413 randomly generated peptides with the amino acid
    composition of AMPs in the APD2 database.

    =================    ===
    Classes                2
    ACP peptides         413
    Negative peptides    413
    Total peptides       826
    Dimensionality         1
    =================    ===

    The data is read from ``data/ACP_CancPPD_vs_Random.csv`` located next to this module. Blank lines in the file are
    skipped and whitespace around the comma-separated fields is ignored.

    :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences (numpy
        array of strings), ``target``, the classification labels (numpy array of integers), ``target_names``, the
        meaning of the labels and ``feature_names``, the meaning of the features.
    :raises ValueError: if the header line of the data file does not consist of exactly four fields.
    :Example:

    >>> from modlamp.datasets import load_ACPvsRandom
    >>> data = load_ACPvsRandom()
    >>> data.sequences[:3]
    ['AAKKWAKAKWAKAKKWAKAA', 'AAVPIVNLKDELLFPSWEALFSGSE', 'AAWKWAWAKKWAKAKKWAKAA']
    >>> list(data.target_names)
    ['Random', 'ACP']
    >>> len(data.sequences)
    826
    """

    data_file = join(dirname(__file__), 'data', 'ACP_CancPPD_vs_Random.csv')
    sequences, target = [], []
    with open(data_file) as f:
        # header: the first two fields (sample and feature counts) are not needed, the last two name the classes
        _n_samples, _n_features, target_name1, target_name2 = [field.strip() for field in next(f).split(',')]
        for line in f:
            if not line.strip():  # tolerate blank (e.g. trailing) lines
                continue
            fields = [field.strip() for field in line.split(',')]
            sequences.append(fields[0])
            target.append(fields[1])

    return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'),
                 target_names=[target_name1, target_name2],
                 feature_names=['Sequence'])


def load_custom(filename):
    """Function to load a custom dataset saved in ``modlamp/data/`` as a ``.csv`` file.

    The following header needs to be included: *Nr. of sequences*, *Nr. of columns - 1*, *Class name for 0*,
    *Class name for 1*

    Example ``.csv`` file structure::

        4, 1, TM, AMP
        GTLEFDVTIGRAN, 0
        GSNVHLASNLLA, 0
        GLFDIVKKVVGALGSL, 0
        GLFDIIKKIAESF, 0

    Whitespace around the comma-separated fields (as in the example above) is ignored, so the class names and
    sequences are returned without leading or trailing blanks. Blank lines are skipped. Only the first two columns
    of every data row (sequence and class label) are read.

    :param filename: {str} filename of the data file to be loaded; the file must be located in ``modlamp/data/``
    :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences (numpy
        array of strings), ``target``, the classification labels (numpy array of integers), ``target_names``, the
        meaning of the labels and ``feature_names``, the meaning of the features.
    :raises ValueError: if the header line does not consist of exactly four fields or a class label is not an integer.
    :raises IndexError: if a non-blank data row contains fewer than two fields.
    :Example:

    >>> from modlamp.datasets import load_custom
    >>> data = load_custom('custom_data.csv')
    """

    data_file = join(dirname(__file__), 'data', filename)
    sequences, target = [], []
    with open(data_file) as f:
        # header: the first two fields (sample and column counts) are not needed, the last two name the classes;
        # stripping every field keeps the documented "4, 1, TM, AMP" layout from yielding names like ' TM'
        _n_samples, _n_features, target_name1, target_name2 = [field.strip() for field in next(f).split(',')]
        for line in f:
            if not line.strip():  # tolerate blank (e.g. trailing) lines
                continue
            fields = [field.strip() for field in line.split(',')]
            sequences.append(fields[0])
            target.append(fields[1])

    return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'),
                 target_names=[target_name1, target_name2],
                 feature_names=['Sequence'])