Source code for modlamp.analysis

# -*- coding: utf-8 -*-
"""
.. currentmodule:: modlamp.analysis

.. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch>

This module can be used for diverse analysis of given peptide libraries.
"""
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd

from modlamp.core import count_aas
from modlamp.descriptors import GlobalDescriptor, PeptideDescriptor

# module metadata
__author__ = "Alex Müller, Gisela Gabernet"
__docformat__ = "restructuredtext en"

# Self-assignment that references the imported Axes3D name so the import is not
# removed as unused (presumably the import is what makes projection='3d'
# available to plot_summary on older matplotlib versions -- confirm).
Axes3D = Axes3D  # hack for evading pycharm auto import


class GlobalAnalysis(object):
    """
    Base class for amino acid sequence library analysis

    .. versionadded:: 2.6.0
    """

    def __init__(self, library, names=None):
        """
        :param library: {list, numpy.ndarray, pandas.DataFrame, pandas.Series} sequence library, if 2D, the rows are
            considered as sub-libraries. For a DataFrame with more rows than columns, every column is taken as one
            sub-library; otherwise every row is.
        :param names: {list} list of library names to plot as labels and legend. If not given, names are taken from
            the DataFrame headers / the Series name where available, else ``'Lib 1', 'Lib 2', ...`` is used.
        :Example:

        >>> g = GlobalAnalysis(['GLFDIVKKVVGALG', 'KLLKLLKKLLKLLK', ...], names=['Library1'])
        """
        self.libnames = None
        self.library = None
        self.shapes = list()  # sizes of the sub-libraries, only filled for nested-list input
        self.H = list()
        self.uH = list()
        self.charge = list()
        self.len = list()
        self.AAs = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

        if names:
            self.libnames = names

        if isinstance(library, np.ndarray):
            self.library = library
        elif isinstance(library, pd.DataFrame):
            if library.shape[0] > library.shape[1]:  # if each library is a column
                self.library = library.values.T
                if not self.libnames:
                    self.libnames = library.columns.values.tolist()  # take library names from column headers
            else:  # if each library is a row
                self.library = library.values
                if not self.libnames:
                    self.libnames = library.index.values.tolist()  # take library names from row headers
        elif isinstance(library, pd.Series):
            self.library = library.values
            # explicitly given names win (same as for DataFrames); an unnamed Series falls back to 'Lib 1'
            if not self.libnames and library.name is not None:
                self.libnames = [library.name]
        elif isinstance(library[0], list):
            # nested lists may have different lengths, therefore keep them in an object array
            self.shapes = [len(sub) for sub in library]
            self.library = np.array(library, dtype='object')
        else:
            self.library = np.array(library)

        # reshape library to 2D array if without sub-libraries
        if len(self.library.shape) == 1 and isinstance(self.library[0], str):
            self.library = self.library.reshape((1, -1))

        if not self.libnames:
            self.libnames = ['Lib ' + str(x + 1) for x in range(self.library.shape[0])]

        self.aafreq = np.zeros((self.library.shape[0], 20), dtype='float64')  # template for AA counts

    @staticmethod
    def _resolve_colors(colors, num):
        """Return a list of exactly ``num`` colors: defaults if none given, a single string is wrapped into a list and
        too short lists are cycled."""
        if not colors:
            colors = ['#FA6900', '#69D2E7', '#542437', '#53777A', '#CCFC8E', '#9CC4E4']
        elif isinstance(colors, str):
            colors = [colors]
        else:
            colors = list(colors)
        return [colors[i % len(colors)] for i in range(num)]

    @staticmethod
    def _strip_box(ax):
        """Show only the left and bottom axes of ``ax``, no box."""
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')

    @staticmethod
    def _violins(ax, data, colors, labels, ylabel):
        """Draw one violin per entry of ``data`` at the x positions 1, 2, ... into ``ax``."""
        for i, values in enumerate(data):
            vplot = ax.violinplot(values, positions=[i + 1], widths=0.5, showmeans=True, showmedians=False)
            # adaptions of the violin dictionary elements
            for part in ('cbars', 'cmins', 'cmeans', 'cmaxes'):
                vplot[part].set_edgecolor('black')
            vplot['cmeans'].set_linestyle('--')
            for pc in vplot['bodies']:
                pc.set_facecolor(colors[i])
                pc.set_edgecolor('black')
                pc.set_linewidth(1.5)
                pc.set_alpha(0.7)
                pc.set_label(labels[i])
        ax.set_xticks([x + 1 for x in range(len(labels))])
        ax.set_xticklabels(labels, fontweight='bold')
        ax.set_ylabel(ylabel, fontweight='bold', fontsize=14.)

    def calc_aa_freq(self, plot=True, color='#83AF9B', filename=None):
        """Method to get the frequency of every amino acid in the library. If the library consists of sub-libraries,
        the frequencies of these are calculated independently.

        :param plot: {bool} whether the amino acid frequencies should be plotted in a histogram (one figure per
            sub-library).
        :param color: {str} color of the plot
        :param filename: {str} filename to save the plot to, if None, the plot is shown. NOTE: with several
            sub-libraries every figure is saved to this same filename, so only the last one remains.
        :return: None; the amino acid frequencies are stored in the attribute :py:attr:`aafreq` (one row per
            sub-library). The values are ordered alphabetically.
        :Example:

        >>> g = GlobalAnalysis(sequences)  # sequences being a list / array of amino acid sequences
        >>> g.calc_aa_freq()
        >>> g.aafreq
        array([[ 0.08250071,  0.        ,  0.02083928,  0.0159863 ,  0.1464459 ,
                 0.04795889,  0.06622895,  0.0262632 ,  0.12988867,  0.        ,
                 0.09192121,  0.03111619,  0.01712818,  0.04852983,  0.05937768,
                 0.07079646,  0.04396232,  0.0225521 ,  0.05994862,  0.01855552]])

        .. image:: ../docs/static/AA_dist.png
            :height: 300px
        """
        for l in range(self.library.shape[0]):
            d_aa = count_aas(''.join(self.library[l]))
            self.aafreq[l] = [d_aa[a] for a in self.AAs]

            if plot:
                fig, ax = plt.subplots()
                ax.bar(np.arange(20), self.aafreq[l], 0.9, color=color)
                ax.set_xlim([-0.75, 19.75])
                ax.set_ylim([0, max(self.aafreq[l, :]) + 0.05])
                # the bar heights are ordered like self.AAs, so label the ticks from the same list
                ax.set_xticks(range(20))
                ax.set_xticklabels(self.AAs, fontweight='bold')
                ax.set_ylabel('Amino Acid Frequency', fontweight='bold')
                ax.set_title('Amino Acid Distribution', fontsize=16, fontweight='bold')
                self._strip_box(ax)

                if filename:
                    fig.savefig(filename)
                else:
                    plt.show()

    def calc_H(self, scale='eisenberg'):
        """Method for calculating global hydrophobicity (Eisenberg scale by default) of all sequences in the library.

        :param scale: {str} hydrophobicity scale to use. For available scales,
            see :class:`modlamp.descriptors.PeptideDescriptor`.
        :return: None; the hydrophobicities are stored in the attribute :py:attr:`H` (a list with one array per
            sub-library). Values of a previous call are replaced.

        .. seealso:: :func:`modlamp.descriptors.PeptideDescriptor.calculate_global()`
        """
        self.H = list()  # start fresh, otherwise repeated calls would pile up duplicated results
        for l in range(self.library.shape[0]):
            d = PeptideDescriptor(self.library[l], scale)
            d.calculate_global()
            self.H.append(d.descriptor[:, 0])

    def calc_uH(self, window=1000, angle=100, modality='max'):
        """Method for calculating hydrophobic moments (Eisenberg scale) for all sequences in the library.

        :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the
            window, the length of the sequence is taken. So if the default window of 1000 is chosen, for all sequences
            shorter than 1000, the **global** hydrophobic moment will be calculated. Otherwise, the maximal
            hydrophobic moment for the chosen window size found in the sequence will be returned.
        :param angle: {int} angle in which to calculate the moment. **100** for alpha helices, **180** for beta sheets.
        :param modality: {'max' or 'mean'} calculate respectively maximum or mean hydrophobic moment.
        :return: None; the hydrophobic moments are stored in the attribute :py:attr:`uH` (a list with one array per
            sub-library). Values of a previous call are replaced.

        .. seealso:: :func:`modlamp.descriptors.PeptideDescriptor.calculate_moment()`
        """
        self.uH = list()  # start fresh, otherwise repeated calls would pile up duplicated results
        for l in range(self.library.shape[0]):
            d = PeptideDescriptor(self.library[l], 'eisenberg')
            d.calculate_moment(window=window, angle=angle, modality=modality)
            self.uH.append(d.descriptor[:, 0])

    def calc_charge(self, ph=7.0, amide=True):
        """Method to calculate the total molecular charge at a given pH for all sequences in the library.

        :param ph: {float} ph at which to calculate the peptide charge.
        :param amide: {boolean} whether the sequences have an amidated C-terminus (-> charge += 1).
        :return: None; the charges are stored in the attribute :py:attr:`charge` (a list with one array per
            sub-library). Values of a previous call are replaced.
        """
        self.charge = list()  # start fresh, otherwise repeated calls would pile up duplicated results
        for l in range(self.library.shape[0]):
            d = GlobalDescriptor(self.library[l])
            d.calculate_charge(ph=ph, amide=amide)
            self.charge.append(d.descriptor[:, 0])

    def calc_len(self):
        """Method to get the sequence length of all sequences in the library.

        :return: None; the sequence lengths are stored in the attribute :py:attr:`len` (a list with one array per
            sub-library). Values of a previous call are replaced.
        """
        self.len = list()  # start fresh, otherwise repeated calls would pile up duplicated results
        for l in range(self.library.shape[0]):
            d = GlobalDescriptor(self.library[l])
            d.length()
            self.len.append(d.descriptor[:, 0])

    def plot_summary(self, filename=None, colors=None, plot=True):
        """Method to generate a visual summary of different characteristics of the given library. The class methods
        are used with their standard options.

        :param filename: {str} path to save the generated plot to. If None, the plot is shown.
        :param colors: {str / list} color or list of colors to use for plotting. e.g. '#4E395D', 'red', 'k'. A single
            color is used for all libraries, a list shorter than the number of libraries is cycled.
        :param plot: {boolean} whether the plot should be created or just the features are calculated
        :return: visual summary (plot) of the library characteristics (if ``plot=True``).
        :Example:

        >>> g = GlobalAnalysis([seqs1, seqs2, seqs3])  # seqs being lists / arrays of sequences
        >>> g.plot_summary()

        .. image:: ../docs/static/summary.png
            :height: 600px
        """
        # calculate all global properties
        self.calc_len()
        self.calc_aa_freq(plot=False)
        self.calc_charge(ph=7.4, amide=True)
        self.calc_H()
        self.calc_uH()

        if not plot:
            return

        # plot settings
        labels = self.libnames
        num = len(labels)
        colors = self._resolve_colors(colors, num)
        fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(25, 15))
        ((ax2, ax5, ax1), (ax3, ax4, ax6)) = axes
        plt.suptitle('Summary', fontweight='bold', fontsize=16.)
        for a in (ax1, ax2, ax3, ax4, ax5, ax6):
            self._strip_box(a)

        # 1 length box plot
        box = ax1.boxplot(self.len, notch=1, vert=1, patch_artist=True)
        plt.setp(box['whiskers'], color='black')
        plt.setp(box['medians'], linestyle='-', linewidth=1.5, color='black')
        for p, patch in enumerate(box['boxes']):
            patch.set(facecolor=colors[p], edgecolor='black', alpha=0.8)
        ax1.set_ylabel('Sequence Length', fontweight='bold', fontsize=14.)
        ax1.set_xticks([x + 1 for x in range(num)])
        ax1.set_xticklabels(labels, fontweight='bold')

        # 2 AA bar plot
        hands = [mpatches.Patch(label=labels[i], facecolor=colors[i], alpha=0.8) for i in range(num)]
        w = .9 / num  # bar width
        if num > 1:  # several libraries (of whatever input type): grouped bars around every tick
            for i, freqs in enumerate(self.aafreq):
                ax2.bar(np.arange(20) + (1 - i) * w, freqs, w, color=colors[i], alpha=0.8)
        else:  # a single library: bars sit directly on the ticks
            ax2.bar(np.arange(20), self.aafreq[0], w, color=colors[0], alpha=0.8)
        ax2.set_xlim([-1., 20.])
        ax2.set_ylim([0, 1.05 * np.max(self.aafreq)])
        ax2.set_xticks(range(20))
        ax2.set_xticklabels(self.AAs, fontweight='bold')  # aafreq columns are ordered like self.AAs
        ax2.set_ylabel('Fraction', fontweight='bold', fontsize=14.)
        ax2.set_xlabel('Amino Acids', fontweight='bold', fontsize=14.)
        ax2.legend(handles=hands, labels=labels)

        # 3 hydrophobicity violin plot
        self._violins(ax3, self.H, colors, labels, 'Global Hydrophobicity')

        # 4 hydrophobic moment violin plot
        self._violins(ax4, self.uH, colors, labels, 'Global Hydrophobic Moment')

        # 5 charge histogram: one set of bars per library, each scaled to its own maximal bin count. This is done
        # for every input type, as handing the (libraries x sequences) array to ``hist`` directly would be read
        # column-wise, i.e. as one dataset per sequence.
        bwidth = 1. / num
        for i, c in enumerate(self.charge):
            counts, bins = np.histogram(c, range=[-5, 20], bins=25)
            peak = np.max(counts)
            heights = counts / float(peak) if peak > 0 else counts  # no division by 0 if nothing is in range
            ax5.bar(bins[1:] + i * bwidth, heights, bwidth, color=colors[i], label=labels[i], alpha=0.8)
        ax5.set_xlabel('Global Charge', fontweight='bold', fontsize=14.)
        ax5.set_ylabel('Fraction', fontweight='bold', fontsize=14.)
        ax5.title.set_text('pH: 7.4 , amide: true')
        ax5.legend(loc='best')

        # 6 3D plot: blank the 2D axes at this grid position and put a 3D axes at the same place
        ax6.spines['left'].set_visible(False)
        ax6.spines['bottom'].set_visible(False)
        ax6.set_xticks([])
        ax6.set_yticks([])
        ax6 = fig.add_subplot(2, 3, 6, projection='3d')
        for l in range(num):
            ax6.scatter(self.H[l], self.charge[l], self.uH[l], c=colors[l], alpha=.8, s=25, label=labels[l])
        ax6.set_xlabel('H', fontweight='bold', fontsize=14.)
        ax6.set_ylabel('Charge', fontweight='bold', fontsize=14.)
        ax6.set_zlabel('uH', fontweight='bold', fontsize=14.)
        # axis limits over the values of all libraries together
        data_c = [item for sublist in self.charge for item in sublist]
        data_H = [item for sublist in self.H for item in sublist]
        data_uH = [item for sublist in self.uH for item in sublist]
        ax6.set_xlim([np.min(data_H), np.max(data_H)])
        ax6.set_ylim([np.min(data_c), np.max(data_c)])
        ax6.set_zlim([np.min(data_uH), np.max(data_uH)])
        ax6.legend(loc='best')

        if filename:
            plt.savefig(filename, dpi=200)
        else:
            plt.show()