# -*- coding: utf-8 -*-
"""
.. currentmodule:: modlamp.wetlab

.. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch>

This module incorporates functions to load raw data files from wetlab experiments, calculate different
characteristics and plot.

=============================        ============================================================================
Class                                Data
=============================        ============================================================================
:py:class:`CD`                       Class for handling Circular Dichroism data.
=============================        ============================================================================
"""
from os import listdir, makedirs
from os.path import join, exists, splitext
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from modlamp.descriptors import GlobalDescriptor
# Module metadata: contributing authors and the declared docstring markup format.
__author__ = "Alex Müller, Gisela Gabernet"
__docformat__ = "restructuredtext en"
class CD:
    """
    Class to handle circular dichroism data files and calculate several ellipticity and helicity values.
    The class can handle data files of the *Applied Photophysics* type.

    For explanations of different units used in CD spectroscopy,
    visit https://www.photophysics.com/resources/7-cd-units-conversions and read the following publication:

    N. J. Greenfield, *Nat. Protoc.* **2006**, 1, 2876–2890.

    .. note::
        All files which should be read must have **4 header lines** as shown in the image below. CD data to be read
        must start in line 5 (separated in 2 columns: *Wavelength* and *Signal*).

    .. image:: ../docs/static/fileheader.png

    First line: *Molecule Name*

    Second line: *Sequence*

    Third line: *concentration in µM*

    Fourth line: *solvent*

    Recognized solvents are **W** for water and **T** for TFE.
    """

    # values accepted for the ``data`` argument of plot() and dichroweb()
    _DATA_OPTIONS = ('mean residue ellipticity', 'molar ellipticity', 'circular dichroism')

    def __init__(self, directory, wmin, wmax, amide=True, pathlen=1):
        """Init method for class CD.

        :param directory: {str} directory containing all data files to be read. Files need a **.csv** ending
        :param wmin: {int} smallest wavelength measured
        :param wmax: {int} highest wavelength measured
        :param amide: {bool} specifies whether the sequences have amidated C-termini
        :param pathlen: {float} cuvette path length in mm
        :raises ValueError: if a data file has fewer lines than 4 header lines plus ``wmax - wmin + 1`` data lines
        :Example:

        >>> cd = CD('/path/to/your/folder', 185, 260)
        >>> cd.filenames
        ['160819_Pep1_T_smooth.csv', '160819_Pep1_W_smooth.csv', '160819_Pep5_T_smooth.csv', ...]
        >>> cd.names
        ['Pep 10', 'Pep 10', 'Pep 11', 'Pep 11', ... ]
        >>> cd.conc_umol
        [33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, 33.0, ... ]
        >>> cd.meanres_mw
        [114.29920769230768, 114.29920769230768, 111.68257692307689, 111.68257692307689, ... ]
        """
        # get all .csv files in dir; sorted because os.listdir returns them in arbitrary order while
        # plot() and helicity() treat entries i and i + 1 as the two solvents of one molecule
        self.filenames = sorted(filename for filename in listdir(directory) if filename.endswith('.csv'))

        # initialize attributes
        self.names = list()
        self.sequences = list()
        self.conc_umol = list()
        self.conc_mgml = list()
        self.mw = list()
        self.meanres_mw = list()
        self.solvent = list()
        self.circular_dichroism = list()
        self.molar_ellipticity = list()
        self.meanres_ellipticity = list()
        self.helicity_values = pd.DataFrame()

        self.directory = directory
        self.wmin = wmin
        self.wmax = wmax
        self.amide = amide
        self.pathlen = pathlen

        self._read_header()  # call the _read_header function to fill up all attributes

    def _read_header(self):
        """Private method called by ``__init__`` to read all file headers into the class attributes and calculate
        sequence dependent values.

        :return: headers in class attributes.
        :raises ValueError: if a file is shorter than 4 header lines plus ``wmax - wmin + 1`` data lines
        """
        d = GlobalDescriptor('X')  # template, the sequence is replaced for every file
        n_points = (self.wmax - self.wmin) + 1  # one data line per nm between wmin and wmax

        # loop through all files in the directory
        for file in self.filenames:
            with open(join(self.directory, file)) as f:
                lines = f.read().splitlines()  # splitlines drops any kind of line ending
            head, data = lines[:4], lines[4:4 + n_points]  # first 4 lines as header, rest as data
            if len(head) < 4 or len(data) < n_points:
                raise ValueError("%s: expected 4 header lines and %d data lines, found %d lines in total"
                                 % (file, n_points, len(lines)))

            # read headers into class attributes
            sequence = head[1].strip()
            umol = float(head[2])
            self.names.append(head[0].strip())
            self.sequences.append(sequence)
            self.conc_umol.append(umol)
            self.solvent.append(head[3].strip())

            # read CD data as two float columns: wavelength, signal
            wlengths = [float(line.split(',')[0]) for line in data]
            ellipts = [float(line.split(',')[1]) for line in data]
            self.circular_dichroism.append(np.column_stack((wlengths, ellipts)))

            # calculate MW and transform concentration to mg/ml
            d.sequences = [sequence]
            d.calculate_MW(amide=self.amide)
            mw = d.descriptor[0][0]
            self.mw.append(mw)
            self.conc_mgml.append(mw * umol / 10 ** 6)
            self.meanres_mw.append(mw / (len(sequence) - 1))  # mean residue molecular weight (MW / n-1)

    def _scaled_ellipticity(self, weights):
        """Private helper returning ``(signal * weight) / (conc * pathlength)`` next to the wavelengths for every
        loaded spectrum.

        :param weights: {list} one weight (MW or mean residue weight) per loaded spectrum
        :return: {list} one two-column numpy array (wavelength, scaled signal) per spectrum
        """
        return [np.column_stack((data[:, 0], (data[:, 1] * weight) / (conc * self.pathlen)))
                for data, weight, conc in zip(self.circular_dichroism, weights, self.conc_mgml)]

    def calc_molar_ellipticity(self):
        r"""Method to calculate the molar ellipticity for all loaded data in :py:attr:`circular_dichroism` according
        to the following formula:

        .. math::

            [\Theta] = (\Theta * MW) / (c * l)

        :return: {numpy array} molar ellipticity in :py:attr:`molar_ellipticity`
        :Example:

        >>> cd.calc_molar_ellipticity()
        >>> cd.molar_ellipticity
        array([[  260.,  -1.40893636e+04],
               [  259.,  -1.00558182e+04],
               [  258.,  -1.25173636e+04], ...
        """
        # rebuilt on every call, so calling the method twice does not duplicate the entries
        self.molar_ellipticity = self._scaled_ellipticity(self.mw)

    def calc_meanres_ellipticity(self):
        r"""Method to calculate the mean residue ellipticity for all loaded data in :py:attr:`circular_dichroism`
        according to the following formula:

        .. math::

            (\Theta * MRW) / (c * l) = [\Theta]

            MRW = MW / (n - 1)

        With *MRW* = mean residue weight (g/mol), *n* = number of residues in the peptide, *c* = concentration (mg/mL)
        and *l* = cuvette path length (mm).

        :return: {numpy array} mean residue ellipticity in :py:attr:`meanres_ellipticity`
        :Example:

        >>> cd.calc_meanres_ellipticity()
        >>> cd.meanres_ellipticity
        array([[  260.       , -2669.5804196],
               [  259.       , -3381.3286713],
               [  258.       , -3872.5174825], ...
        """
        # rebuilt on every call, so calling the method twice does not duplicate the entries
        self.meanres_ellipticity = self._scaled_ellipticity(self.meanres_mw)

    def _plot_single(self, w, d, col, y_label, title, filename, y_min, y_max):
        """Private plot function used by :py:func:`modlamp.wetlab.CD.plot()` for plotting single CD plots"""
        fig, ax = plt.subplots()
        line = ax.plot(w, d / 1000.)  # used legend is 10^3 so divide by 1000
        plt.setp(line, color=col, linewidth=2.0)
        plt.title(title, fontsize=18, fontweight='bold')
        ax.set_xlabel('Wavelength [nm]', fontsize=16)
        ax.set_ylabel(y_label, fontsize=16)
        plt.xlim(np.min(w), np.max(w))
        plt.ylim((y_min / 1000., y_max / 1000.))
        img_name = splitext(filename)[0] + '.pdf'
        plt.savefig(join(self.directory, 'PDF', img_name), dpi=150)
        plt.close(fig)  # one figure is created per file; release it once saved

    def _plot_double(self, w, dt, dw, y_label, title, filename, y_min, y_max):
        """Private plot function used by :py:func:`modlamp.wetlab.CD.plot()` for plotting combined CD plots"""
        fig, ax = plt.subplots()
        line1 = ax.plot(w, dt / 1000.)
        line2 = ax.plot(w, dw / 1000.)
        plt.setp(line1, color='r', linewidth=2.0, label='TFE', linestyle='--')
        plt.setp(line2, color='b', linewidth=2.0, label='Water')
        plt.title(title, fontsize=18, fontweight='bold')
        ax.set_xlabel('Wavelength [nm]', fontsize=16)
        ax.set_ylabel(y_label, fontsize=16)
        plt.xlim(np.min(w), np.max(w))
        plt.ylim((y_min / 1000., y_max / 1000.))
        plt.legend(loc=1)  # show the TFE / Water labels assigned above
        img_name = splitext(filename)[0] + '_M.pdf'
        plt.savefig(join(self.directory, 'PDF', img_name), dpi=150)
        plt.close(fig)

    def _plot_all(self, data, w, y_lim):
        """Private plot function used by :py:func:`modlamp.wetlab.CD.plot()` for plotting all spectra in one plot"""
        colors = ['#53777A', '#542437', '#C02942', '#D95B43', '#ECD078', '#CFF09E', '#A8DBA8', '#79BD9A', '#3B8686',
                  '#0B486B', '#2790B0', '#94BA65', '#353432', '#4E4D4A', '#808080', '#CCCCCC']
        fig, ax = plt.subplots()
        y_label = ''  # assign empty
        lows, highs = [], []
        for i, f in enumerate(self.filenames):
            d, _, y_label, low, high = self._check_datatype(data, i, 'all')
            lows.append(low)
            highs.append(high)
            line = ax.plot(w, d / 1000.)
            # if more data than colors: start over in the palette with dashed lines
            style = '-' if i < len(colors) else '--'
            plt.setp(line, color=colors[i % len(colors)], linewidth=1.5, label='%s' % f.split('.')[0],
                     linestyle=style)

        if y_lim:
            y_min = 1000 * y_lim[0]  # * 1000 because axis are usually shown as 10^3
            y_max = 1000 * y_lim[1]
        elif lows:
            y_min, y_max = min(lows), max(highs)  # cover every plotted spectrum, not only the last one
        else:
            y_min, y_max = (0, 1)  # nothing plotted

        plt.title("Combined Plot", fontsize=18, fontweight='bold')
        ax.set_xlabel('Wavelength [nm]', fontsize=16)
        ax.set_ylabel(y_label, fontsize=16)
        plt.xlim(np.min(w), np.max(w))
        plt.ylim((y_min / 1000., y_max / 1000.))
        plt.legend(loc=1)  # show the file name labels assigned above
        plt.savefig(join(self.directory, 'PDF', 'all.pdf'), dpi=150)
        plt.close(fig)

    def _check_datatype(self, data, i, comb_flag):
        """Private function to check data type; used by :py:func:`modlamp.wetlab.CD.plot` and
        :py:func:`modlamp.wetlab.CD.dichroweb`

        :param data: {str} data option; anything other than ``mean residue ellipticity`` or ``molar ellipticity``
            selects the raw circular dichroism data
        :param i: {int} index of the spectrum
        :param comb_flag: if ``solvent`` and ``i`` is even, the signal of spectrum ``i + 1`` is returned as well
        :return: signal ``i``, signal ``i + 1`` (or an empty list), y axis label, y minimum, y maximum
        :raises IndexError: if the selected data has no entry ``i`` (or ``i + 1`` when combining)
        """
        ellipticity_label = r"$[\Theta] \ast 10^{-3} (deg \ast cm^2 \ast dmol^{-1})$"
        if data == 'mean residue ellipticity':
            values, y_label = self.meanres_ellipticity, ellipticity_label
        elif data == 'molar ellipticity':
            values, y_label = self.molar_ellipticity, ellipticity_label
        else:
            values, y_label = self.circular_dichroism, r"$\Delta A \ast 32.986 \ast 10^{-3}$"

        d = values[i][:, 1]
        d2 = []
        if comb_flag == 'solvent' and i % 2 == 0:
            d2 = values[i + 1][:, 1]  # partner spectrum always comes from the same kind of data as d
        y_min = np.min(d) * 1.1
        y_max = np.max(d) * 1.1
        return d, d2, y_label, y_min, y_max

    def plot(self, data='mean residue ellipticity', combine='solvent', ylim=None):
        """Method to generate CD plots for all read data in the initial directory.

        :param data: {str} which data should be plotted (``mean residue ellipticity``, ``molar ellipticity`` or
            ``circular dichroism``)
        :param combine: {str} if ``solvent``, overlays of different solvents will be created for the same molecule.
            The amino acid sequence in the header is used to find corresponding data.
            if ``all``, all data is combined in one single plot. To ignore combination, pass an empty string.
        :param ylim: {tuple} If not none, this tuple of values is taken as the minimum and maximum of the y axis
        :return: .pdf plots saved to the directory containing the read files.
        :Example:

        >>> cd = CD('/path/to/your/folder', 185, 260)
        >>> cd.calc_meanres_ellipticity()
        >>> cd.plot(data='mean residue ellipticity', combine='solvent')

        .. image:: ../docs/static/cd1.png
            :height: 300px
        .. image:: ../docs/static/cd2.png
            :height: 300px
        .. image:: ../docs/static/cd3.png
            :height: 300px
        """
        try:
            # prepare combination of solvent plots
            if combine == 'solvent':
                d = {s: self.sequences.count(s) for s in set(self.sequences)}  # dict with seq counts for combine
                if any(count != 2 for count in d.values()):  # every sequence needs exactly two measurements
                    raise ValueError

            # check if output folder exists already, else create one
            if not exists(join(self.directory, 'PDF')):
                makedirs(join(self.directory, 'PDF'))

            w = range(self.wmax, self.wmin - 1, -1)  # wavelengths

            # check input data option
            if data in self._DATA_OPTIONS:
                # loop through all data for single plots
                for i, f in enumerate(self.filenames):
                    # get data type to be plotted
                    d, d2, y_label, y_min, y_max = self._check_datatype(data, i, combine)

                    col = 'r' if self.solvent[i] == 'T' else 'b'  # color

                    if ylim:
                        y_min = 1000 * ylim[0]  # * 1000 because axis are usually shown as 10^3
                        y_max = 1000 * ylim[1]

                    # plot single plots
                    self._plot_single(w, d, col, y_label, self.names[i] + ' ' + self.solvent[i], f, y_min, y_max)

                    # plot mixed plots
                    if combine == 'solvent' and i % 2 == 0:
                        self._plot_double(w, d, d2, y_label, self.names[i], f, y_min, y_max)

                if combine == 'all':
                    self._plot_all(data, w, ylim)

            else:
                print("ERROR\nWrong data option given!\nAvailable:")
                print("['mean residue ellipticity', 'molar ellipticity', 'circular dichroism']")

        except IndexError:  # if data arrays are empty, no data was calculated
            print("ERROR\nSpecified data array empty, call the calculate functions first!")
            print("e.g. self.calc_molar_ellipticity()")

        except ValueError:
            print("ERROR\nSolvent pairs not even / missing.")
            print("Check if all measurements were performed in both TFE and water")

    def dichroweb(self, data='mean residue ellipticity'):
        """Method to save the calculated CD data into DichroWeb readable format (semi-colon separated). The produced
        files can then directly be uploaded to the `DichroWeb <http://dichroweb.cryst.bbk.ac.uk>`_ analysis tool.

        :param data: {str} which data should be saved (``mean residue ellipticity``, ``molar ellipticity`` or
            ``circular dichroism``)
        :return: .csv data files saved to the directory containing the read files.
        """
        # check if output folder exists already, else create one
        if not exists(join(self.directory, 'Dichro')):
            makedirs(join(self.directory, 'Dichro'))

        if data in self._DATA_OPTIONS:
            # loop through all data and write one file per spectrum
            for i, f in enumerate(self.filenames):
                # get data type to be saved
                d, _, _, _, _ = self._check_datatype(data, i, False)
                w = self.circular_dichroism[i][:, 0]  # wavelengths as measured, in the same order as the signal
                dichro = pd.DataFrame({"V1": w, "V2": d}, columns=["V1", "V2"], dtype='float')
                fname = splitext(f)[0] + '.csv'
                dichro.to_csv(join(self.directory, 'Dichro', fname), sep=';', index=False)
        else:
            print("ERROR\nWrong data option given!\nAvailable:")
            print("['mean residue ellipticity', 'molar ellipticity', 'circular dichroism']")

    def helicity(self, temperature=24., k=3.5, induction=True, filename=None):
        r"""Method to calculate the percentage of helicity out of the mean residue ellipticity data.
        The calculation is based on the formula by Fairlie and co-workers:

        .. math::

            [\Theta]_{222\infty} = (-44000 + 250 * T) * (1 - k / N)

        The helicity is then calculated as the ratio of

        .. math::

            ([\Theta]_{222} / [\Theta]_{222\infty}) * 100 \%

        :Reference: `Shepherd, N. E., Hoang, H. N., Abbenante, G. & Fairlie, D. P. J. Am. Chem. Soc. 127, 2974–2983
            (2005). <https://dx.doi.org/10.1021/ja0456003>`_

        :param temperature: {float} experiment temperature in °C
        :param k: {float, 2.4 - 4.5} finite length correction factor. Can be adapted to the helicity of a known peptide.
        :param induction: {bool} whether the helical induction upon changing from one solvent to another should be
            calculated.
        :param filename: {str} if given, helicity data is saved to the file "filename".csv
        :return: approximate helicity for every sequence in the attribute :py:attr:`helicity_values`.
        :raises IndexError: if a spectrum has no data point at 222 nm
        :Example:

        >>> cd.calc_meanres_ellipticity()
        >>> cd.helicity(temperature=24., k=3.492185008, induction=True)
        >>> cd.helicity_values
                    Name     Solvent  Helicity  Induction
            0  Aurein2.2d2       T     100.0      3.823
            1  Aurein2.2d2       W     26.16      0.000
            2       Klak14       T     76.38      3.048
            3       Klak14       W     25.06      0.000
        """
        values = self.meanres_ellipticity
        if not values:
            print("ERROR\nmeanres_ellipticity data array empty, call the calculate function first:")
            return

        hel = []
        for i, v in enumerate(values):
            hits = np.flatnonzero(v[:, 0] == 222.)  # get index of wavelength 222 nm
            if hits.size == 0:
                raise IndexError("no data point at 222 nm for %s" % self.names[i])
            hel_100 = (-44000. + 250. * temperature) * (1. - (float(k) / len(self.sequences[i])))  # inf hel 222
            hel.append(round(float((v[hits[0], 1] / hel_100) * 100.), 2))

        # built from a dict so that the Helicity column keeps its float dtype
        self.helicity_values = pd.DataFrame({'Name': self.names, 'Solvent': self.solvent, 'Helicity': hel},
                                            columns=['Name', 'Solvent', 'Helicity'])

        if induction:
            induct = []
            last = len(hel) - 1
            for i, value in enumerate(hel):
                # if following entry is same molecule but not same solvent, calculate the helical induction
                if i < last and self.names[i] == self.names[i + 1] and self.solvent[i] != self.solvent[i + 1]:
                    reference = hel[i + 1]
                    # ratio is undefined for a reference helicity of 0; round to .3f otherwise
                    induct.append(round(value / reference, 3) if reference else float('nan'))
                else:  # else (also for the last entry) just append 0
                    induct.append(0.)
            self.helicity_values['Induction'] = induct

        if filename:
            self.helicity_values.to_csv(filename, index=False)