Source code for modlamp.wetlab

# -*- coding: utf-8 -*-
"""
.. currentmodule:: modlamp.wetlab

.. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch>

This module incorporates functions to load raw data files from wetlab experiments, calculate different
characteristics and plot.

=============================        ============================================================================
Class                                Data
=============================        ============================================================================
:py:class:`CD`                       Class for handling Circular Dichroism data.
=============================        ============================================================================
"""

from os import listdir, makedirs
from os.path import join, exists, splitext

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from modlamp.descriptors import GlobalDescriptor

__author__ = "Alex Müller, Gisela Gabernet"
__docformat__ = "restructuredtext en"


[docs]class CD: """ Class to handle circular dichroism data files and calculate several ellipticity and helicity values. The class can handle data files of the *Applied Photophysics* type. For explanations of different units used in CD spectroscopy, visit https://www.photophysics.com/resources/7-cd-units-conversions and read the following publication: N. J. Greenfield, *Nat. Protoc.* **2006**, 1, 2876–2890. .. note:: All files which should be read must have **4 header lines** as shown in the image below. CD data to be read must start in line 5 (separated in 2 columns: *Wavelength* and *Signal*). .. image:: ../docs/static/fileheader.png First line: *Molecule Name* Second line: *Sequence* Third line: *concentration in µM* Fourth line: *solvent* Recognized solvents are **W** for water and **T** for TFE. """
[docs] def __init__(self, directory, wmin, wmax, amide=True, pathlen=1): """Init method for class CD. :param directory: {str} directory containing all data files to be read. Files need a **.csv** ending :param wmin: {int} smalles wavelength measured :param wmax: {int} highest wavelength measured :param amide: {bool} specifies whether the sequences have amidated C-termini :param pathlen: {float} cuvette path length in mm :Example: >>> cd = CD('/path/to/your/folder', 185, 260) >>> cd.filenames ['160819_Pep1_T_smooth.csv', '160819_Pep1_W_smooth.csv', '160819_Pep5_T_smooth.csv', ...] >>> cd.names ['Pep 10', 'Pep 10', 'Pep 11', 'Pep 11', ... ] >>> cd.conc_umol [33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, ... ] >>> cd.meanres_mw [114.29920769230768, 114.29920769230768, 111.68257692307689, 111.68257692307689, ... ] """ # read filenames from directory files = listdir(directory) self.filenames = [filename for filename in files if filename.endswith('.csv')] # get all .csv files in dir # initialize attributes self.names = list() self.sequences = list() self.conc_umol = list() self.conc_mgml = list() self.mw = list() self.meanres_mw = list() self.solvent = list() self.circular_dichroism = list() self.molar_ellipticity = list() self.meanres_ellipticity = list() self.helicity_values = pd.DataFrame() self.directory = directory self.wmin = wmin self.wmax = wmax self.amide = amide self.pathlen = pathlen self._read_header() # call the _read_header function to fill up all attributes
def _read_header(self): """Priveat method called by ``__init__`` to read all file headers into the class attributes and calculate sequence dependant values. :return: headers in class attributes. """ d = GlobalDescriptor('X') # template # loop through all files in the directory for i, file in enumerate(self.filenames): with open(join(self.directory, file)) as f: # read first 4 lines as header, rest as data head = [next(f) for _ in range(4)] data = [next(f) for _ in range(4, (self.wmax - self.wmin) + 5)] # read headers into class attributes name = head[0].split('\r\n')[0] self.names.append(name) sequence = head[1].split('\r\n')[0].strip() self.sequences.append(sequence) umol = float(head[2].split('\r\n')[0]) self.conc_umol.append(umol) self.solvent.append(head[3].split('\r\n')[0]) # read CD data wlengths = [int(line.split(',')[0]) for line in data] # get rid of s***** line ends ellipts = [float(line.split(',')[1].split('\r\n')[0]) for line in data] self.circular_dichroism.append(np.array(list(zip(wlengths, ellipts)))) # calculate MW and transform concentration to mg/ml d.sequences = [sequence] d.calculate_MW(amide=self.amide) self.mw.append(d.descriptor[0][0]) self.conc_mgml.append(self.mw[i] * umol / 10 ** 6) self.meanres_mw.append(self.mw[i] / (len(sequence) - 1)) # mean residue molecular weight (MW / n-1)
[docs] def calc_molar_ellipticity(self): """Method to calculate the molar ellipticity for all loaded data in :py:attr:`circular_dichroism` according to the following formula: .. math:: [\Theta] = (\Theta * MW) / (c * l) :return: {numpy array} molar ellipticity in :py:attr:`molar_ellipticity` :Example: >>> cd.calc_molar_ellipticity() >>> cd.molar_ellipticity array([[ 260., -1.40893636e+04], [ 259., -1.00558182e+04], [ 258., -1.25173636e+04], ... """ for i, data in enumerate(self.circular_dichroism): # calculate molar ellipticity: (theta * MW) / (conc * pathlength); and concat. with wavelengths mol_ellipt = (data[:, 1] * self.mw[i]) / (self.conc_mgml[i] * self.pathlen) self.molar_ellipticity.append(np.array(list(zip(data[:, 0], mol_ellipt))))
[docs] def calc_meanres_ellipticity(self): """Method to calculate the mean residue ellipticity for all loaded data in :py:attr:`circular_dichroism` according to the following formula: .. math:: (\Theta * MRW) / (c * l) = [\Theta] MRW = MW / (n - 1) With *MRW* = mean residue weight (g/mol), *n* = number of residues in the peptide, *c* = concentration (mg/mL) and *l* = cuvette path length (mm). :return: {numpy array} molar ellipticity in :py:attr:`meanres_ellipticity` :Example: >>> cd.calc_meanres_ellipticity() >>> cd.meanres_ellipticity array([[ 260. , -2669.5804196], [ 259. , -3381.3286713], [ 258. , -3872.5174825], ... """ for i, data in enumerate(self.circular_dichroism): # calculate molar ellipticity: (theta * mrw) / (conc * pathlength); and concat. with wavelengths mol_ellipt = (data[:, 1] * self.meanres_mw[i]) / (self.conc_mgml[i] * self.pathlen) self.meanres_ellipticity.append(np.array(list(zip(data[:, 0], mol_ellipt))))
def _plot_single(self, w, d, col, y_label, title, filename, y_min, y_max): """Private plot function used by :py:func:`modlamp.wetlab.CD.plot()` for plotting single CD plots""" fig, ax = plt.subplots() line = ax.plot(w, d / 1000.) # used legend is 10^3 so divide by 1000 plt.setp(line, color=col, linewidth=2.0) plt.title(title, fontsize=18, fontweight='bold') ax.set_xlabel('Wavelength [nm]', fontsize=16) ax.set_ylabel(y_label, fontsize=16) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.xaxis.set_ticks_position('bottom') ax.yaxis.set_ticks_position('left') plt.xlim(np.min(w), np.max(w)) plt.ylim((y_min / 1000., y_max / 1000.)) img_name = splitext(filename)[0] + '.pdf' plt.savefig(join(self.directory, 'PDF', img_name), dpi=150) def _plot_double(self, w, dt, dw, y_label, title, filename, y_min, y_max): """Private plot function used by :py:func:`modlamp.wetlab.CD.plot()` for plotting combined CD plots""" fig, ax = plt.subplots() line1 = ax.plot(w, dt / 1000.) line2 = ax.plot(w, dw / 1000.) plt.setp(line1, color='r', linewidth=2.0, label='TFE', linestyle='--') plt.setp(line2, color='b', linewidth=2.0, label='Water') plt.title(title, fontsize=18, fontweight='bold') ax.set_xlabel('Wavelength [nm]', fontsize=16) ax.set_ylabel(y_label, fontsize=16) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.xaxis.set_ticks_position('bottom') ax.yaxis.set_ticks_position('left') plt.xlim(np.min(w), np.max(w)) plt.ylim((y_min / 1000., y_max / 1000.)) plt.legend(loc=1) img_name = splitext(filename)[0] + '_M.pdf' plt.savefig(join(self.directory, 'PDF', img_name), dpi=150) def _plot_all(self, data, w, y_lim): """Private plot function used by :py:func:`modlamp.wetlab.CD.plot()` for plotting combined CD plots""" colors = ['#53777A', '#542437', '#C02942', '#D95B43', '#ECD078', '#CFF09E', '#A8DBA8', '#79BD9A', '#3B8686', '#0B486B', '#2790B0', '#94BA65', '#353432', '#4E4D4A', '##808080', '#CCCCCC'] fig, ax = plt.subplots() y_label = '' # assign empty y_min, y_max = (0, 1) # assign empty for i, f in enumerate(self.filenames): d, _, y_label, y_min, y_max = self._check_datatype(data, i, 'all') if y_lim: y_min = 1000 * y_lim[0] # * 1000 because axis are usually shown as 10^3 y_max = 1000 * y_lim[1] vars()['line' + str(i)] = ax.plot(w, d / 1000.) # mark the line plots with the iterator, for labelling try: plt.setp(vars()['line' + str(i)], color=colors[i], linewidth=1.5, label='%s' % f.split('.')[0], linestyle='-') except IndexError: # if more data than colors: start with dashed lines plt.setp(vars()['line' + str(i)], color=colors[i - len(colors)], linewidth=1.5, label='%s' % f.split( '.')[0], linestyle='--') plt.title("Combined Plot", fontsize=18, fontweight='bold') ax.set_xlabel('Wavelength [nm]', fontsize=16) ax.set_ylabel(y_label, fontsize=16) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.xaxis.set_ticks_position('bottom') ax.yaxis.set_ticks_position('left') plt.xlim(np.min(w), np.max(w)) plt.ylim((y_min / 1000., y_max / 1000.)) plt.legend(loc=1) plt.savefig(join(self.directory, 'PDF', 'all.pdf'), dpi=150) def _check_datatype(self, data, i, comb_flag): """Private function to check data type; used by :py:func`modlamp.wetlab.CD.plot` and :py:func`modlamp.wetlab.CD.dichroweb`""" d2 = [] if data == 'mean residue ellipticity': d = self.meanres_ellipticity[i][:, 1] if comb_flag == 'solvent' and i % 2 == 0: d2 = self.meanres_ellipticity[i + 1][:, 1] y_label = r"$[\Theta] \ast 10^{-3} (deg \ast cm^2 \ast dmol^{-1})$" y_min = np.min(d) * 1.1 y_max = np.max(d) * 1.1 elif data == 'molar ellipticity': d = self.molar_ellipticity[i][:, 1] if comb_flag == 'solvent' and i % 2 == 0: d2 = self.molar_ellipticity[i + 1][:, 1] y_label = r"$[\Theta] \ast 10^{-3} (deg \ast cm^2 \ast dmol^{-1})$" y_min = np.min(d) * 1.1 y_max = np.max(d) * 1.1 else: d = self.circular_dichroism[i][:, 1] if comb_flag == 'solvent' and i % 2 == 0: d2 = self.molar_ellipticity[i + 1][:, 1] y_label = r"$\Delta A \ast 32.986 \ast 10^{-3}$" y_min = np.min(d) * 1.1 y_max = np.max(d) * 1.1 return d, d2, y_label, y_min, y_max
[docs] def plot(self, data='mean residue ellipticity', combine='solvent', ylim=None): """Method to generate CD plots for all read data in the initial directory. :param data: {str} which data should be plotted (``mean residue ellipticity``, ``molar ellipticity`` or ``circular dichroism``) :param combine: {str} if ``solvent``, overlays of different solvents will be created for the same molecule. The amino acid sequence in the header is used to find corresponding data. if ``all``, all data is combined in one single plot. To ignore combination, pass an empty string. :param ylim: {tuple} If not none, this tuple of values is taken as the minimum and maximum of the y axis :return: .pdf plots saved to the directory containing the read files. :Example: >>> cd = CD('/path/to/your/folder', 185, 260) >>> cd.calc_meanres_ellipticity() >>> cd.plot(data='mean residue ellipticity', combine='solvent') .. image:: ../docs/static/cd1.png :height: 300px .. image:: ../docs/static/cd2.png :height: 300px .. image:: ../docs/static/cd3.png :height: 300px """ try: # prepare combination of solvent plots if combine == 'solvent': d = {s: self.sequences.count(s) for s in set(self.sequences)} # create dict with seq counts for combine if sum(d.values()) != 2 * len(d.values()): raise ValueError # check if output folder exists already, else create one if not exists(join(self.directory, 'PDF')): makedirs(join(self.directory, 'PDF')) w = range(self.wmax, self.wmin - 1, -1) # wavelengths # check input data option if data in ['mean residue ellipticity', 'molar ellipticity', 'circular dichroism']: # loop through all data for single plots for i, f in enumerate(self.filenames): # get data type to be plotted d, d2, y_label, y_min, y_max = self._check_datatype(data, i, combine) if self.solvent[i] == 'T': # color col = 'r' else: col = 'b' if ylim: y_min = 1000 * ylim[0] # * 1000 because axis are usually shown as 10^3 y_max = 1000 * ylim[1] # plot single plots self._plot_single(w, d, col, y_label, self.names[i] + ' ' + self.solvent[i], f, y_min, y_max) # plot mixed plots if combine == 'solvent' and i % 2 == 0: self._plot_double(w, d, d2, y_label, self.names[i], f, y_min, y_max) if combine == 'all': self._plot_all(data, w, ylim) else: print("ERROR\nWrong data option given!\nAvailable:") print("['mean residue ellipticity', 'molar ellipticity', 'circular dichroism']") except IndexError: # if data arrays are empty, no data was calculated print("ERROR\nSpecified data array empty, call the calculate functions first!") print("e.g. self.calc_molar_ellipticity()") except ValueError: print("ERROR\nSolvent pairs not even / missing.") print("Check if all measurements were performed in both TFE and water")
[docs] def dichroweb(self, data='mean residue ellipticity'): """Method to save the calculated CD data into DichroWeb readable format (semi-colon separated). The produced files can then directly be uploaded to the `DichroWeb <http://dichroweb.cryst.bbk.ac.uk>`_ analysis tool. :param data: {str} which data should be plotted (``mean residue ellipticity``, ``molar ellipticity`` or ``circular dichroism``) :return: .csv data files saved to the directory containing the read files. """ # check if output folder exists already, else create one if not exists(join(self.directory, 'Dichro')): makedirs(join(self.directory, 'Dichro')) if data in ['mean residue ellipticity', 'molar ellipticity', 'circular dichroism']: # loop through all data for single plots for i, f in enumerate(self.filenames): # get data type to be plotted d, _, _, _, _ = self._check_datatype(data, i, False) w = range(self.wmax, self.wmin - 1, -1) # wavelengths dichro = pd.DataFrame(data=zip(w, d), columns=["V1", "V2"], dtype='float') fname = splitext(f)[0] + '.csv' dichro.to_csv(join(self.directory, 'Dichro', fname), sep=';', index=False)
[docs] def helicity(self, temperature=24., k=3.5, induction=True, filename=None): """Method to calculate the percentage of helicity out of the mean residue ellipticity data. The calculation is based on the fromula by Fairlie and co-workers: .. math:: [\Theta]_{222\infty} = (-44000 * 250 * T) * (1 - k / N) The helicity is then calculated as the ratio of .. math:: ([\Theta]_{222} / [\Theta]_{222\infty}) * 100 \% :Reference: `Shepherd, N. E., Hoang, H. N., Abbenante, G. & Fairlie, D. P. J. Am. Chem. Soc. 127, 2974–2983 (2005). <https://dx.doi.org/10.1021/ja0456003>`_ :param temperature: {float} experiment temperature in °C :param k: {float, 2.4 - 4.5} finite length correction factor. Can be adapted to the helicity of a known peptide. :param induction: {bool} wether the helical induction upon changing from one solvent to another should be calculated. :param filename: {str} if given, helicity data is saved to the file "filename".csv :return: approximate helicity for every sequence in the attribute :py:attr:`helicity_values`. :Example: >>> cd.calc_meanres_ellipticity() >>> cd.helicity(temperature=24., k=3.492185008, induction=True) >>> cd.helicity_values Name Solvent Helicity Induction 0 Aurein2.2d2 T 100.0 3.823 1 Aurein2.2d2 W 26.16 0.000 2 Klak14 T 76.38 3.048 3 Klak14 W 25.06 0.000 """ values = self.meanres_ellipticity if values: hel = [] for i, v in enumerate(values): indx = np.where(v[:, 0] == 222.)[0][0] # get index of wavelength 222 nm hel_100 = (-44000. + 250. * temperature) * (1. - (float(k) / len(self.sequences[i]))) # inf hel 222 hel.append(round((v[indx, 1] / hel_100) * 100., 2)) self.helicity_values = pd.DataFrame(np.array([self.names, self.solvent, hel]).T, columns=['Name', 'Solvent', 'Helicity']) if induction: induct = [] try: for i in self.helicity_values.index: if self.helicity_values.iloc[i]['Name'] == self.helicity_values.iloc[i + 1]['Name'] and \ self.helicity_values.iloc[i]['Solvent'] != self.helicity_values.iloc[i + 1][ 'Solvent']: induct.append(round(float(self.helicity_values.iloc[i]['Helicity']) / float( self.helicity_values.iloc[i + 1]['Helicity']), 3)) # if following entry is same molecule # but not same solvent, calculate the helical induction and round to .3f else: # else just append 0 induct.append(0.) except IndexError: # at the end of the DataFrame, an index error will be raised because of i+1 induct.append(0.) self.helicity_values['Induction'] = induct if filename: self.helicity_values.to_csv(filename, index=False) else: print("ERROR\nmeanres_ellipticity data array empty, call the calculate function first:") print("calc_meanres_ellipticity()")