Source code for modlamp.database

# -*- coding: utf-8 -*-
"""
.. currentmodule:: modlamp.database

.. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch>

This module incorporates functions to connect to several peptide databases. It also allows to connect to a custom
SQL database for which the configuration is given in a specified config file.
"""

from os.path import exists
import json
from getpass import getpass

import mysql.connector
import pandas as pd
import requests
from lxml import html
from mysql.connector import Error

__author__ = "Alex Müller, Gisela Gabernet"
__docformat__ = "restructuredtext en"


def _read_db_config(configfile):
    """
    Read database configuration and return a dictionary object.
    This function generally does not need to be used as it is called by the function :func:`query_sequences` directly.

    :param configfile: {str} path to the configuration file containing the database information with hostname,
        database name, username and password.
    :return: a dictionary of read database parameters
    """
    if exists(configfile):
        with open(configfile, 'r') as cfg:
            db = json.load(cfg)

        if not db['password']:
            db['password'] = getpass()
    
        return db
    else:
        raise IOError('Path to config file is wrong or file does not exist!\n%s' % configfile)


def _connect(configfile):
    """
    Connect to a given MySQL database in conf. This function is called by the function :func:`query_sequences`.

    :param configfile: path to the MySQL config file containing the hostname, database name, username and password.
        This file is passed to :py:func:`_read_db_config()`.
    :return: a ``mysql.connector`` connection object
    """
    config = _read_db_config(configfile)

    try:
        print('Connecting to MySQL database...')
        conn = mysql.connector.connect(**config)
        print('connection established!')
        return conn

    except mysql.connector.Error as err:
        print(err)


[docs]def query_database(table, columns=None, configfile='./modlamp/data/db_config.json'): """ This function extracts experimental results from the modlab peptide database. All data from the given table and column names is extracted and returned. :param table: the mysql database table to be queried :param columns: a list of the column names {str} to be extracted from the table *default*: ``*`` (all columns) :param configfile: location of the database configuration file containing the hostname etc. for the database to be queried. :return: {numpy.array} queried data :Example: >>> data = query_database(table='modlab_experiments', columns=['sequence', 'MCF7_activity', 'Saureus_activity']) Password: ********* Connecting to MySQL database... connection established! >>> data[:5] array([ ['ILGTILGILKGL', None, 1.0], ['ILGTILGFLKGL', None, 1.0], ['ILGNILGFLKGL', None, 1.0], ['ILGQILGILKGL', None, 1.0], ['ILGHILGYLKGL', None, 1.0]], dtype=object) .. note:: If ``None`` or ``NULL`` appears as a value, this means no data was measured for this peptide and not that activity is none (inactive). """ if not columns: columns = ['*'] try: conn = _connect(configfile) df = pd.read_sql("SELECT " + ', '.join(columns) + " FROM " + table, con=conn) return df except Error as e: print(e)
[docs]def query_apd(ids): """ A function to query sequences from the antimicrobial peptide database `APD <http://aps.unmc.edu/AP/>`_. If the whole database should be scraped, simply look up the latest entry ID and take a ``range(1, 'latestID')`` as function input. :param ids: {list of int} list of APD IDs to be queried from the database :return: list of peptide sequences corresponding to entered ids. :Example: >>> query_apd([15, 16, 18, 19, 20]) ['GLFDIVKKVVGALGSL', 'GLFDIVKKVVGAIGSL', 'GLFDIVKKVVGAFGSL', 'GLFDIAKKVIGVIGSL', 'GLFDIVKKIAGHIAGSI'] """ seqs = [] for i in ids: page = requests.get('http://aps.unmc.edu/AP/database/query_output.php?ID=%0.5d' % i) tree = html.fromstring(page.content) seqs.extend(tree.xpath('//font[@color="#ff3300"]/text()')) return seqs
[docs]def query_camp(ids): """ A function to query sequences from the antimicrobial peptide database `CAMP <http://camp.bicnirrh.res.in/>`_. If the whole database should be scraped, simply look up the latest entry ID and take a ``range(1, 'latestID')`` as function input. :param ids: {list of int} list of CAMP IDs to be queried from the database :return: list of peptide sequences corresponding to entered ids. :Example: >>> query_camp([2705, 2706]) ['GLFDIVKKVVGALGSL', 'GLFDIVKKVVGTLAGL'] """ seqs = [] for i in ids: page = requests.get('http://camp.bicnirrh.res.in/seqDisp.php?id=CAMPSQ%i' % i) tree = html.fromstring(page.content) seqs.extend(tree.xpath('//td[@class="fasta"]/text()')) return seqs