3. Example scripts

For documentation of the used modules see the the Documentation section.

3.1. A modlamp example script for peptide classification with a Random Forest classifier.

import pandas as pd
from modlamp.datasets import load_AMPvsUniProt
from modlamp.descriptors import PeptideDescriptor
from modlamp.ml import train_best_model, score_cv
from modlamp.descriptors import PeptideDescriptor
from modlamp.sequences import MixedLibrary

# define the size of the peptide library to screen
libsize = 1000
print("Chosen library size is %i" % libsize)

# load training sequences
data = load_AMPvsUniProt()

# describe sequences with PepCATS descriptor
descr = PeptideDescriptor(data.sequences, 'pepcats')
descr.calculate_crosscorr(7)

# find best Random Forest classifier based on the PEPCATS data
best_RF = train_best_model('RF', descr.descriptor, data.target)  # might take a while

# evaluate performance of best model in 10-fold cross validation
score_cv(best_RF, descr.descriptor, data.target, cv=10)

# generate a virtual peptide library of `libsize` sequences to screen
lib = MixedLibrary(libsize)
lib.generate_sequences()
print("Actual lirutal library size (without duplicates): %i" % len(lib.sequences))

# describe library with PEPCATS descriptor
lib_desc = PeptideDescriptor(lib.sequences, 'pepcats')
lib_desc.calculate_crosscorr(7)

# predict class probabilities for sequences in Library
proba = best_RF.predict_proba(lib_desc.descriptor)

# create ordered dictionary with sequences and prediction values and order it according to AMP predictions
d = pd.DataFrame({'sequence': lib.sequences, 'prediction': proba[:, 1]})
d50 = d.sort_values('prediction', ascending=False)[:50]  # 50 top AMP predictions

# print the 50 top ranked predictions with their predicted probabilities
print(d50)

3.2. An alternative example for single peptide classification with a RF classifier.

import pandas as pd
from modlamp.datasets import load_AMPvsUniProt
from modlamp.descriptors import PeptideDescriptor
from modlamp.ml import train_best_model, score_cv
from modlamp.descriptors import PeptideDescriptor

### IN THE FOLLOWING LIST, ADD YOUR SEQUENCES TO BE PREDICTED ###
to_predict = ['GLLDSLLALLFEWASQ', 'KLLKLLKLLKLLKLLKKKLKLKL', 'GLFDDSKALLKKDFWWW']

# load training sequences
data = load_AMPvsUniProt()

# describe sequences with PepCATS descriptor
descr = PeptideDescriptor(data.sequences, 'pepcats')
descr.calculate_crosscorr(7)

# train a Random Forest classifier with given parameters based on the PEPCATS data
best_RF = train_best_model('RF', descr.descriptor, data.target, cv=2,
                            param_grid={'clf__bootstrap': [True], 'clf__criterion': ['gini'], 'clf__max_features':
                            ['sqrt'], 'clf__n_estimators': [500]})

# evaluate performance of the model in 5-fold cross validation
score_cv(best_RF, descr.descriptor, data.target, cv=5)

# describe sequences to be predicted with PEPCATS descriptor
lib_desc = PeptideDescriptor(to_predict, 'pepcats')
lib_desc.calculate_crosscorr(7)

# predict class probabilities for the desired sequences
proba = best_RF.predict_proba(lib_desc.descriptor)

# create ordered dictionary with sequences and prediction values and order it according to AMP predictions
d = pd.DataFrame({'sequence': to_predict, 'prediction': proba[:, 1]})
d = d.sort_values('prediction', ascending=False)
print(d)  # print the final predictions (sorted according to decreasing probabilities)

3.3. Loading sequences from a FASTA file

A further example of how to load a list of own amino acid sequences from a .FASTA formatted file, calculate descriptors and save the values back to a .csv file.

from modlamp.descriptors import PeptideDescriptor

# load sequences from FASTA file and calculate the pepcats cross-correlated descriptor
x = PeptideDescriptor('location/of/your/file.fasta', 'pepcats')
x.calculate_crosscorr(window=7)
# save calculated descriptor to a .csv file
x.save_descriptor('location/of/your/outputfile.csv', delimiter=',')

3.4. Combining different descriptors & saving to csv

Many more descriptors are available for calculations. Here is another example of reading a sequence file and calculating two sets of descriptors followed by saving them to .csv files.

from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor

# Load sequence file into descriptor object
pepdesc = PeptideDescriptor('/path/to/sequences.fasta', 'eisenberg')  # use Eisenberg consensus scale
globdesc = GlobalDescriptor('/path/to/sequences.fasta')

# --------------- Peptide Descriptor (AA scales) Calculations ---------------
pepdesc.calculate_global()  # calculate global Eisenberg hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate Eisenberg hydrophobic moment

# load other AA scales
pepdesc.load_scale('gravy')  # load GRAVY scale
pepdesc.calculate_global(append=True)  # calculate global GRAVY hydrophobicity
pepdesc.calculate_moment(append=True)  # calculate GRAVY hydrophobic moment
pepdesc.load_scale('z3')  # load old Z scale
pepdesc.calculate_autocorr(1, append=True)  # calculate global Z scale (=window1 autocorrelation)

# save descriptor data to .csv file
col_names1 = 'ID,Sequence,H_Eisenberg,uH_Eisenberg,H_GRAVY,uH_GRAVY,Z3_1,Z3_2,Z3_3'
pepdesc.save_descriptor('/path/to/descriptors1.csv', header=col_names1)

# --------------- Global Descriptor Calculations ---------------
globdesc.length()  # sequence length
globdesc.boman_index(append=True)  # Boman index
globdesc.aromaticity(append=True)  # global aromaticity
globdesc.aliphatic_index(append=True)  # aliphatic index
globdesc.instability_index(append=True)  # instability index
globdesc.calculate_charge(ph=7.4, amide=False, append=True)  # net charge
globdesc.calculate_MW(amide=False, append=True)  # molecular weight

# save descriptor data to .csv file
col_names2 = 'ID,Sequence,Length,BomanIndex,Aromaticity,AliphaticIndex,InstabilityIndex,Charge,MW'
globdesc.save_descriptor('/path/to/descriptors2.csv', header=col_names2)