Source code for modlamp.database
# -*- coding: utf-8 -*-
"""
.. currentmodule:: modlamp.database
.. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch>
This module incorporates functions to connect to several peptide databases. It also allows to connect to a custom
SQL database for which the configuration is given in a specified config file.
"""
from os.path import exists
import json
from getpass import getpass
import mysql.connector
import pandas as pd
import requests
from lxml import html
from mysql.connector import Error
__author__ = "Alex Müller, Gisela Gabernet"
__docformat__ = "restructuredtext en"
def _read_db_config(configfile):
"""
Read database configuration and return a dictionary object.
This function generally does not need to be used as it is called by the function :func:`query_sequences` directly.
:param configfile: {str} path to the configuration file containing the database information with hostname,
database name, username and password.
:return: a dictionary of read database parameters
"""
if exists(configfile):
with open(configfile, 'r') as cfg:
db = json.load(cfg)
if not db['password']:
db['password'] = getpass()
return db
else:
raise IOError('Path to config file is wrong or file does not exist!\n%s' % configfile)
def _connect(configfile):
"""
Connect to a given MySQL database in conf. This function is called by the function :func:`query_sequences`.
:param configfile: path to the MySQL config file containing the hostname, database name, username and password.
This file is passed to :py:func:`_read_db_config()`.
:return: a ``mysql.connector`` connection object
"""
config = _read_db_config(configfile)
try:
print('Connecting to MySQL database...')
conn = mysql.connector.connect(**config)
print('connection established!')
return conn
except mysql.connector.Error as err:
print(err)
[docs]def query_database(table, columns=None, configfile='./modlamp/data/db_config.json'):
"""
This function extracts experimental results from the modlab peptide database. All data from the given table and
column names is extracted and returned.
:param table: the mysql database table to be queried
:param columns: a list of the column names {str} to be extracted from the table *default*: ``*`` (all columns)
:param configfile: location of the database configuration file containing the hostname etc. for the database to
be queried.
:return: {numpy.array} queried data
:Example:
>>> data = query_database(table='modlab_experiments', columns=['sequence', 'MCF7_activity', 'Saureus_activity'])
Password: *********
Connecting to MySQL database...
connection established!
>>> data[:5]
array([ ['ILGTILGILKGL', None, 1.0],
['ILGTILGFLKGL', None, 1.0],
['ILGNILGFLKGL', None, 1.0],
['ILGQILGILKGL', None, 1.0],
['ILGHILGYLKGL', None, 1.0]], dtype=object)
.. note::
If ``None`` or ``NULL`` appears as a value, this means no data was measured for this peptide and not that
activity is none (inactive).
"""
if not columns:
columns = ['*']
try:
conn = _connect(configfile)
df = pd.read_sql("SELECT " + ', '.join(columns) + " FROM " + table, con=conn)
return df
except Error as e:
print(e)
[docs]def query_apd(ids):
"""
A function to query sequences from the antimicrobial peptide database `APD <http://aps.unmc.edu/AP/>`_.
If the whole database should be scraped, simply look up the latest entry ID and take a ``range(1, 'latestID')``
as function input.
:param ids: {list of int} list of APD IDs to be queried from the database
:return: list of peptide sequences corresponding to entered ids.
:Example:
>>> query_apd([15, 16, 18, 19, 20])
['GLFDIVKKVVGALGSL', 'GLFDIVKKVVGAIGSL', 'GLFDIVKKVVGAFGSL', 'GLFDIAKKVIGVIGSL', 'GLFDIVKKIAGHIAGSI']
"""
seqs = []
for i in ids:
page = requests.get('http://aps.unmc.edu/AP/database/query_output.php?ID=%0.5d' % i)
tree = html.fromstring(page.content)
seqs.extend(tree.xpath('//font[@color="#ff3300"]/text()'))
return seqs
[docs]def query_camp(ids):
"""
A function to query sequences from the antimicrobial peptide database `CAMP <http://camp.bicnirrh.res.in/>`_.
If the whole database should be scraped, simply look up the latest entry ID and take a ``range(1, 'latestID')``
as function input.
:param ids: {list of int} list of CAMP IDs to be queried from the database
:return: list of peptide sequences corresponding to entered ids.
:Example:
>>> query_camp([2705, 2706])
['GLFDIVKKVVGALGSL', 'GLFDIVKKVVGTLAGL']
"""
seqs = []
for i in ids:
page = requests.get('http://camp.bicnirrh.res.in/seqDisp.php?id=CAMPSQ%i' % i)
tree = html.fromstring(page.content)
seqs.extend(tree.xpath('//td[@class="fasta"]/text()'))
return seqs