# -*- coding: utf-8 -*-
"""
.. currentmodule:: modlamp.analysis
.. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch>
This module can be used for diverse analysis of given peptide libraries.
"""
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
from modlamp.core import count_aas
from modlamp.descriptors import GlobalDescriptor, PeptideDescriptor
__author__ = "Alex Müller, Gisela Gabernet"
__docformat__ = "restructuredtext en"
Axes3D = Axes3D # hack for evading pycharm auto import
[docs]class GlobalAnalysis(object):
"""
Base class for amino acid sequence library analysis
.. versionadded:: 2.6.0
"""
[docs] def __init__(self, library, names=None):
"""
:param library: {list, numpy.ndarray, pandas.DataFrame} sequence library, if 2D, the rows are considered as
sub-libraries.
:param names: {list} list of library names to plot as labels and legend
:Example:
>>> g = GlobalAnalysis(['GLFDIVKKVVGALG', 'KLLKLLKKLLKLLK', ...], names=['Library1'])
"""
self.libnames = None
self.library = None
self.shapes = list() # find out about library structure and check if libraries are of different sizes
self.H = list()
self.uH = list()
self.charge = list()
self.len = list()
self.AAs = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
if names:
self.libnames = names
if type(library) == np.ndarray:
self.library = library
elif type(library) == pd.core.frame.DataFrame:
if library.shape[0] > library.shape[1]: # if each library is a column
self.library = library.values.T
if not self.libnames:
self.libnames = library.columns.values.tolist() # take library names from column headers
else: # if each library is a row
self.library = library.values
if not self.libnames:
self.libnames = library.index.values.tolist() # take library names from row headers
elif type(library) == pd.core.series.Series:
self.library = library.values
self.libnames = [library.name]
elif isinstance(library[0], list):
for i, l in enumerate(library):
self.shapes.append(len(l))
self.library = np.array(library, dtype='object')
else:
self.library = np.array(library)
# reshape library to 2D array if without sub-libraries
if len(self.library.shape) == 1 and isinstance(self.library[0], str):
self.library = self.library.reshape((1, -1))
if not self.libnames:
self.libnames = ['Lib ' + str(x + 1) for x in range(self.library.shape[0])]
self.aafreq = np.zeros((self.library.shape[0], 20), dtype='float64') # template for AA counts
[docs] def calc_aa_freq(self, plot=True, color='#83AF9B', filename=None):
"""Method to get the frequency of every amino acid in the library. If the library consists of sub-libraries,
the frequencies of these are calculated independently.
:param plot: {bool} whether the amino acid frequencies should be plotted in a histogram.
:param color: {str} color of the plot
:param filename: {str} filename to save the plot to, if None, the plot is shown
:return: {numpy.ndarray} amino acid frequencies in the attribute :py:attr:`aafreq`. The values are oredered
alphabetically.
:Example:
>>> g = GlobalAnalysis(sequences) # sequences being a list / array of amino acid sequences
>>> g.calc_aa_freq()
>>> g.aafreq
array([[ 0.08250071, 0. , 0.02083928, 0.0159863 , 0.1464459 ,
0.04795889, 0.06622895, 0.0262632 , 0.12988867, 0. ,
0.09192121, 0.03111619, 0.01712818, 0.04852983, 0.05937768,
0.07079646, 0.04396232, 0.0225521 , 0.05994862, 0.01855552]])
.. image:: ../docs/static/AA_dist.png
:height: 300px
"""
for l in range(self.library.shape[0]):
concatseq = ''.join(self.library[l])
d_aa = count_aas(concatseq)
self.aafreq[l] = [d_aa[a] for a in self.AAs]
if plot:
fig, ax = plt.subplots()
for a in range(20):
plt.bar(a, self.aafreq[l, a], 0.9, color=color)
plt.xlim([-0.75, 19.75])
plt.ylim([0, max(self.aafreq[l, :]) + 0.05])
plt.xticks(range(20), d_aa.keys(), fontweight='bold')
plt.ylabel('Amino Acid Frequency', fontweight='bold')
plt.title('Amino Acid Distribution', fontsize=16, fontweight='bold')
# only left and bottom axes, no box
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
if filename:
plt.savefig(filename)
else:
plt.show()
[docs] def calc_H(self, scale='eisenberg'):
"""Method for calculating global hydrophobicity (Eisenberg scale) of all sequences in the library.
:param scale: {str} hydrophobicity scale to use. For available scales,
see :class:`modlamp.descriptors.PeptideDescriptor`.
:return: {numpy.ndarray} Eisenberg hydrophobicities in the attribute :py:attr:`H`.
.. seealso:: :func:`modlamp.descriptors.PeptideDescriptor.calculate_global()`
"""
for l in range(self.library.shape[0]):
d = PeptideDescriptor(self.library[l], scale)
d.calculate_global()
self.H.append(d.descriptor[:, 0])
[docs] def calc_uH(self, window=1000, angle=100, modality='max'):
"""Method for calculating hydrophobic moments (Eisenberg scale) for all sequences in the library.
:param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the
window, the length of the sequence is taken. So if the default window of 1000 is chosen, for all sequences
shorter than 1000, the **global** hydrophobic moment will be calculated. Otherwise, the maximal
hydrophiobic moment for the chosen window size found in the sequence will be returned.
:param angle: {int} angle in which to calculate the moment. **100** for alpha helices, **180** for beta sheets.
:param modality: {'max' or 'mean'} calculate respectively maximum or mean hydrophobic moment.
:return: {numpy.ndarray} calculated hydrophobic moments in the attribute :py:attr:`uH`.
.. seealso:: :func:`modlamp.descriptors.PeptideDescriptor.calculate_moment()`
"""
for l in range(self.library.shape[0]):
d = PeptideDescriptor(self.library[l], 'eisenberg')
d.calculate_moment(window=window, angle=angle, modality=modality)
self.uH.append(d.descriptor[:, 0])
[docs] def calc_charge(self, ph=7.0, amide=True):
"""Method to calculate the total molecular charge at a given pH for all sequences in the library.
:param ph: {float} ph at which to calculate the peptide charge.
:param amide: {boolean} whether the sequences have an amidated C-terminus (-> charge += 1).
:return: {numpy.ndarray} calculated charges in the attribute :py:attr:`charge`.
"""
for l in range(self.library.shape[0]):
d = GlobalDescriptor(self.library[l])
d.calculate_charge(ph=ph, amide=amide)
self.charge.append(d.descriptor[:, 0])
[docs] def calc_len(self):
"""Method to get the sequence length of all sequences in the library.
:return: {numpy.ndarray} sequence lengths in the attribute :py:attr:`len`.
"""
for l in range(self.library.shape[0]):
d = GlobalDescriptor(self.library[l])
d.length()
self.len.append(d.descriptor[:, 0])
[docs] def plot_summary(self, filename=None, colors=None, plot=True):
"""Method to generate a visual summary of different characteristics of the given library. The class methods
are used with their standard options.
:param filename: {str} path to save the generated plot to.
:param colors: {str / list} color or list of colors to use for plotting. e.g. '#4E395D', 'red', 'k'
:param plot: {boolean} whether the plot should be created or just the features are calculated
:return: visual summary (plot) of the library characteristics (if ``plot=True``).
:Example:
>>> g = GlobalAnalysis([seqs1, seqs2, seqs3]) # seqs being lists / arrays of sequences
>>> g.plot_summary()
.. image:: ../docs/static/summary.png
:height: 600px
"""
# calculate all global properties
self.calc_len()
self.calc_aa_freq(plot=False)
self.calc_charge(ph=7.4, amide=True)
self.calc_H()
self.calc_uH()
if plot:
# plot settings
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(25, 15))
((ax2, ax5, ax1), (ax3, ax4, ax6)) = axes
plt.suptitle('Summary', fontweight='bold', fontsize=16.)
labels = self.libnames
if not colors:
colors = ['#FA6900', '#69D2E7', '#542437', '#53777A', '#CCFC8E', '#9CC4E4']
num = len(labels)
for a in [ax1, ax2, ax3, ax4, ax5, ax6]:
# only left and bottom axes, no box
a.spines['right'].set_visible(False)
a.spines['top'].set_visible(False)
a.xaxis.set_ticks_position('bottom')
a.yaxis.set_ticks_position('left')
# 1 length box plot
box = ax1.boxplot(self.len, notch=1, vert=1, patch_artist=True)
plt.setp(box['whiskers'], color='black')
plt.setp(box['medians'], linestyle='-', linewidth=1.5, color='black')
for p, patch in enumerate(box['boxes']):
patch.set(facecolor=colors[p], edgecolor='black', alpha=0.8)
ax1.set_ylabel('Sequence Length', fontweight='bold', fontsize=14.)
ax1.set_xticks([x + 1 for x in range(len(labels))])
ax1.set_xticklabels(labels, fontweight='bold')
# 2 AA bar plot
d_aa = count_aas('')
hands = [mpatches.Patch(label=labels[i], facecolor=colors[i], alpha=0.8) for i in range(len(labels))]
w = .9 / num # bar width
offsets = np.arange(start=-w, step=w, stop=num * w) # bar offsets if many libs
if self.shapes: # if the library consists of different sized sub libraries
for i, l in enumerate(self.aafreq):
for a in range(20):
ax2.bar(a - offsets[i], l[a], w, color=colors[i], alpha=0.8)
else:
for a in range(20):
ax2.bar(a, self.aafreq[0][a], w, color=colors[0], alpha=0.8)
ax2.set_xlim([-1., 20.])
ax2.set_ylim([0, 1.05 * np.max(self.aafreq)])
ax2.set_xticks(range(20))
ax2.set_xticklabels(d_aa.keys(), fontweight='bold')
ax2.set_ylabel('Fraction', fontweight='bold', fontsize=14.)
ax2.set_xlabel('Amino Acids', fontweight='bold', fontsize=14.)
ax2.legend(handles=hands, labels=labels)
# 3 hydophobicity violin plot
for i, l in enumerate(self.H):
vplot = ax3.violinplot(l, positions=[i + 1], widths=0.5, showmeans=True, showmedians=False)
# crappy adaptions of violin dictionary elements
vplot['cbars'].set_edgecolor('black')
vplot['cmins'].set_edgecolor('black')
vplot['cmeans'].set_edgecolor('black')
vplot['cmaxes'].set_edgecolor('black')
vplot['cmeans'].set_linestyle('--')
for pc in vplot['bodies']:
pc.set_facecolor(colors[i])
pc.set_alpha(0.8)
pc.set_edgecolor('black')
pc.set_linewidth(1.5)
pc.set_alpha(0.7)
pc.set_label(labels[i])
ax3.set_xticks([x + 1 for x in range(len(labels))])
ax3.set_xticklabels(labels, fontweight='bold')
ax3.set_ylabel('Global Hydrophobicity', fontweight='bold', fontsize=14.)
# 4 hydrophobic moment violin plot
for i, l in enumerate(self.uH):
vplot = ax4.violinplot(l, positions=[i + 1], widths=0.5, showmeans=True, showmedians=False)
# crappy adaptions of violin dictionary elements
vplot['cbars'].set_edgecolor('black')
vplot['cmins'].set_edgecolor('black')
vplot['cmeans'].set_edgecolor('black')
vplot['cmaxes'].set_edgecolor('black')
vplot['cmeans'].set_linestyle('--')
for pc in vplot['bodies']:
pc.set_facecolor(colors[i])
pc.set_alpha(0.8)
pc.set_edgecolor('black')
pc.set_linewidth(1.5)
pc.set_alpha(0.7)
pc.set_label(labels[i])
ax4.set_xticks([x + 1 for x in range(len(labels))])
ax4.set_xticklabels(labels, fontweight='bold')
ax4.set_ylabel('Global Hydrophobic Moment', fontweight='bold', fontsize=14.)
# 5 charge histogram
if self.shapes: # if the library consists of different sized sub libraries
bwidth = 1. / len(self.shapes)
for i, c in enumerate(self.charge):
counts, bins = np.histogram(c, range=[-5, 20], bins=25)
ax5.bar(bins[1:] + i * bwidth, counts / np.max(counts), bwidth, color=colors[i], label=labels[i], alpha=0.8)
else:
ax5.hist(self.charge / np.max(self.charge), 25, alpha=0.8, align='left', rwidth=0.95, histtype='bar', label=labels,
color=colors[:num])
ax5.set_xlabel('Global Charge', fontweight='bold', fontsize=14.)
ax5.set_ylabel('Fraction', fontweight='bold', fontsize=14.)
ax5.title.set_text('pH: 7.4 , amide: true')
ax5.legend(loc='best')
# 6 3D plot
ax6.spines['left'].set_visible(False)
ax6.spines['bottom'].set_visible(False)
ax6.set_xticks([])
ax6.set_yticks([])
ax6 = fig.add_subplot(2, 3, 6, projection='3d')
for i, l in enumerate(range(num)):
xt = self.H[l] # find all values in x for the given target
yt = self.charge[l] # find all values in y for the given target
zt = self.uH[l] # find all values in y for the given target
ax6.scatter(xt, yt, zt, c=colors[l], alpha=.8, s=25, label=labels[i])
ax6.set_xlabel('H', fontweight='bold', fontsize=14.)
ax6.set_ylabel('Charge', fontweight='bold', fontsize=14.)
ax6.set_zlabel('uH', fontweight='bold', fontsize=14.)
data_c = [item for sublist in self.charge for item in sublist] # flatten charge data into one list
data_H = [item for sublist in self.H for item in sublist] # flatten H data into one list
data_uH = [item for sublist in self.uH for item in sublist] # flatten uH data into one list
ax6.set_xlim([np.min(data_H), np.max(data_H)])
ax6.set_ylim([np.min(data_c), np.max(data_c)])
ax6.set_zlim([np.min(data_uH), np.max(data_uH)])
ax6.legend(loc='best')
if filename:
plt.savefig(filename, dpi=200)
else:
plt.show()