Source code for modlamp.datasets

# -*- coding: utf-8 -*-
"""
.. currentmodule:: modlamp.datasets

.. moduleauthor:: ETH Zurich Alex Mueller <alex.mueller@pharma.ethz.ch>

This module incorporates functions to load different peptide datasets used for classification.

=============================        ============================================================================
Function                             Data
=============================        ============================================================================
:py:func:`load_AMPvsTM`              Antimicrobial peptides versus trans-membrane sequences
:py:func:`load_AMPvsUniProt`         AMPs from the *APD3* versus other peptides from the *UniProt* database
:py:func:`load_ACPvsTM`              Anticancer peptides (*CancerPPD*) versus helical transmembrane sequences
:py:func:`load_ACPvsRandom`          Anticancer peptides (*CancerPPD*) versus random scrambled AMP sequences
:py:func:`load_custom`               A custom data set provided in ``modlamp/data`` as a ``.csv`` file
=============================        ============================================================================
"""

from os.path import dirname, join

import numpy as np

__author__ = "Alex Müller, Gisela Gabernet"
__docformat__ = "restructuredtext en"


[docs]class Bunch(dict): """Container object for datasets Dictionary-like object that exposes its keys as attributes. Taken from the `sklearn <http://scikit-learn.org>`_ package. :Example: >>> b = Bunch(a=1, b=2) >>> b['b'] 2 >>> b.b # key can also be called as attribute 2 >>> b.a = 3 >>> b['a'] 3 >>> b.c = 6 >>> b['c'] 6 """
[docs] def __init__(self, **kwargs): dict.__init__(self, kwargs)
def __setattr__(self, key, value): self[key] = value def __getattr__(self, key): try: return self[key] except KeyError: raise AttributeError(key) def __setstate__(self, state): pass
[docs]def load_AMPvsTM(): """Function to load a dataset consisting of **AMP sequences** and **transmembrane regions of proteins** for classification. The AMP class consists of an intersection of all activity annotations of the `APD2 <http://aps.unmc.edu/AP/>`_ and `CAMP <http://camp.bicnirrh.res.in/>`_ databases, where for gram positive, gram negative and antifungal exact matches were observed. A detailed description of how the dataset was compiled can be found in the following publication: Schneider, P., Müller, A. T., Gabernet, G., Button, A. L., Posselt, G., Wessler, S., Hiss, J. A. and Schneider, G. (2016), Hybrid Network Model for “Deep Learning” of Chemical Data: Application to Antimicrobial Peptides. Mol. Inf.. `doi:10.1002/minf.201600011 <http://onlinelibrary.wiley.com/doi/10.1002/minf.201600011/full>`_ ================= === Classes 2 Samples per class 206 Samples total 412 Dimensionality 1 ================= === :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences, ``target``, the classification labels, ``target_names``, the meaning of the labels and ``feature_names``, the meaning of the features. :Example: >>> from modlamp.datasets import load_AMPvsTM >>> data = load_AMPvsTM() >>> data.sequences ['AAGAATVLLVIVLLAGSYLAVLA','LWIVIACLACVGSAAALTLRA','FYRFYMLREGTAVPAVWFSIELIFGLFA','GTLELGVDYGRAN',...] >>> list(data.target_names) ['TM', 'AMP'] >>> len(data.sequences) 412 >>> data.target[:10] array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) """ module_path = dirname(__file__) with open(join(module_path, 'data', 'AMPvsTMset.csv')) as f: n_samples, n_features, target_name1, target_name2 = next(f).strip().split(',') sequences, target = list(), list() for line in f: tmp = line.strip().split(',') sequences.append(tmp[0]) target.append(tmp[1]) return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'), target_names=[target_name1, target_name2], feature_names=['Sequence'])
[docs]def load_AMPvsUniProt(): """Function to load a dataset consisting of the whole **APD3** versus the same number of sequences randomly extracted from the **UniProt** database, to be used for classification. The AMP class consists of 2600 AMP sequences from the `APD3 <http://aps.unmc.edu/AP/>`_ (extracted Jan. 2016). The UniProt class consists of 2600 randomly extracted protein sequences from the `UniProt Database <http://uniprot.org/>`_ with the search query *length 10 TO 50* filtered for unnatural amino acids. ================= ===== Classes 2 AMP Samples 2600 UniProt Samples 2600 Samples total 5200 Dimensionality 1 ================= ===== :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences, ``target``, the classification labels, ``target_names``, the meaning of the labels and ``feature_names``, the meaning of the features. :Example: >>> from modlamp.datasets import load_AMPvsUniProt >>> data = load_AMPvsUniProt() >>> data.sequences[:10] ['GLWSKIKEVGKEAAKAAAKAAGKAALGAVSEAV', 'YVPLPNVPQPGRRPFPTFPGQGPFNPKIKWPQGY', ... ] >>> list(data.target_names) ['AMP', 'UniProt'] >>> len(data.sequences) 5200 >>> data.target[:10] array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) """ module_path = dirname(__file__) with open(join(module_path, 'data', 'AMPvsUniProt.csv')) as f: n_samples, n_features, target_name1, target_name2 = next(f).strip().split(',') sequences, target = list(), list() for line in f: tmp = line.strip().split(',') sequences.append(tmp[0]) target.append(tmp[1]) return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'), target_names=[target_name1, target_name2], feature_names=['Sequence'])
[docs]def load_ACPvsTM(): """Function to load a dataset consisting of ACP sequences from the CancerPPD database and negative peptides extracted from alpha-helical transmembrane regions of proteins for classification. The ACP class consists of a collection of 413 ACPs from the `CancerPPD <http://crdd.osdd.net/raghava/cancerppd/index.php>`_ database with length between 7 and 30 aa and without cysteines to facilitate peptide synthesis. The Negative peptide set contains a random selection of 413 transmembrane alpha-helices (extracted from the `PDBTM <http://pdbtm.enzim.hu/>`_ ) isolated directly from the proteins crystal structure. ================= === Classes 2 ACP peptides 413 Negative peptides 413 Total peptides 826 Dimensionality 1 ================= === :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences, ``target``, the classification labels, ``target_names``, the meaning of the labels and ``feature_names``, the meaning of the features. :Example: >>> from modlamp.datasets import load_ACPvsTM >>> data = load_ACPvsTM() >>> data.sequences[:4] ['AAKKWAKAKWAKAKKWAKAA', 'AAVPIVNLKDELLFPSWEALFSGSE', 'AAWKWAWAKKWAKAKKWAKAA', 'AFGMALKLLKKVL'] >>> list(data.target_names) ['TM', 'ACP'] >>> len(data.sequences) 826 """ module_path = dirname(__file__) with open(join(module_path, 'data', 'ACP_CancPPD_vs_TM.csv')) as f: n_samples, n_features, target_name1, target_name2 = next(f).strip().split(',') sequences, target = list(), list() for line in f: tmp = line.strip().split(',') sequences.append(tmp[0]) target.append(tmp[1]) return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'), target_names=[target_name1, target_name2], feature_names=['Sequence'])
[docs]def load_ACPvsRandom(): """Function to load a dataset consisting of ACP sequences from the CancerPPD database and negative peptides generated randomly with the amino acid composition of AMPs. The ACP class consists of a collection of 413 ACPs from the `CancerPPD <http://crdd.osdd.net/raghava/cancerppd/index.php>`_ database with length between 7 and 30 aa and without cysteines to facilitate peptide synthesis. The Negative peptide set contains a random selection of 413 randomly generated peptides with the amino acid composition of AMPs in the APD2 database. ================= === Classes 2 ACP peptides 413 Negative peptides 413 Total peptides 826 Dimensionality 1 ================= === :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences, ``target``, the classification labels, ``target_names``, the meaning of the labels and ``feature_names``, the meaning of the features. :Example: >>> from modlamp.datasets import load_ACPvsRandom >>> data = load_ACPvsRandom() >>> data.sequences[:3] ['AAKKWAKAKWAKAKKWAKAA', 'AAVPIVNLKDELLFPSWEALFSGSE', 'AAWKWAWAKKWAKAKKWAKAA'] >>> list(data.target_names) ['Random', 'ACP'] >>> len(data.sequences) 826 """ module_path = dirname(__file__) with open(join(module_path, 'data', 'ACP_CancPPD_vs_Random.csv')) as f: n_samples, n_features, target_name1, target_name2 = next(f).strip().split(',') sequences, target = list(), list() for line in f: tmp = line.strip().split(',') sequences.append(tmp[0]) target.append(tmp[1]) return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'), target_names=[target_name1, target_name2], feature_names=['Sequence'])
[docs]def load_custom(filename): """Function to load a custom dataset saved in ``modlamp/data/`` as a ``.csv`` file. The following header needs to be included: *Nr. of sequences*, *Nr. of columns - 1*, *Class name for 0*, *Class name for 1* Example ``.csv`` file structure:: 4, 1, TM, AMP GTLEFDVTIGRAN, 0 GSNVHLASNLLA, 0 GLFDIVKKVVGALGSL, 0 GLFDIIKKIAESF, 0 :param filename: {str} filename of the data file to be loaded; the file must be located in ``modlamp/data/`` :return: Bunch, a dictionary-like object, the interesting attributes are: ``sequences``, the sequences, ``target``, the classification labels, ``target_names``, the meaning of the labels and ``feature_names``, the meaning of the features. :Example: >>> from modlamp.datasets import load_custom >>> data = load_custom('custom_data.csv') """ module_path = dirname(__file__) with open(join(module_path, 'data', filename)) as f: n_samples, n_features, target_name1, target_name2 = next(f).strip().split(',') sequences, target = list(), list() for line in f: tmp = line.strip().split(',') sequences.append(tmp[0]) target.append(tmp[1]) return Bunch(sequences=np.array(sequences, dtype='str'), target=np.array(target, dtype='int'), target_names=[target_name1, target_name2], feature_names=['Sequence'])