forked from compomics/ms2pip
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfasta2speclib.py
400 lines (327 loc) · 15.7 KB
/
fasta2speclib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
"""
Create a spectral library starting from a proteome in fasta format.
The script runs through the following steps:
- In silico cleavage of proteins from the fasta file
- Remove peptide redundancy
- Add all variations of variable modifications (max 7 PTMs/peptide)
- Add variations on charge state
- Run peptides through MS2PIP
- Write to MSP, MGF or HDF file
"""
__author__ = "Ralf Gabriels"
__copyright__ = "CompOmics 2018"
__credits__ = ["Ralf Gabriels", "Sven Degroeve", "Lennart Martens"]
__license__ = "Apache License, Version 2.0"
__email__ = "[email protected]"
# Native libraries
import json
import logging
import argparse
from math import ceil
from multiprocessing import Pool
from itertools import combinations
# Third party libraries
import numpy as np
import pandas as pd
from pyteomics import mass
from pyteomics.parser import cleave, expasy_rules
from Bio import SeqIO
# MS2PIP
from ms2pipC import run
from ms2pip_tools.spectrum_output import write_mgf, write_msp
from ms2pip_tools.get_elude_predictions import get_elude_predictions
def ArgParse():
parser = argparse.ArgumentParser(description='Create an MS2PIP-predicted spectral library, starting from a fasta file.')
parser.add_argument('fasta_filename', action='store',
help='Path to the fasta file containing protein sequences')
parser.add_argument('-o', dest='output_filename', action='store',
help='Name for output file(s) (if not given, derived from input file)')
parser.add_argument('-c', dest='config_filename', action='store',
help='Name of configuration json file (default: fasta2speclib_config.json)')
args = parser.parse_args()
return args
def get_params():
args = ArgParse()
if not args.config_filename:
config_filename = 'fasta2speclib_config.json'
else:
config_filename = args.config_filename
with open(config_filename, 'rt') as config_file:
params = json.load(config_file)
params.update({
'fasta_filename': args.fasta_filename,
'log_level': logging.INFO,
})
if args.output_filename:
params['output_filename'] = args.output_filename
else:
params['output_filename'] = '_'.join(params['fasta_filename'].split('\\')[-1].split('.')[:-1])
return params
def prot_to_peprec(protein):
params = get_params()
tmp = pd.DataFrame(columns=['spec_id', 'peptide', 'modifications', 'charge'])
pep_count = 0
for peptide in cleave(str(protein.seq), expasy_rules['trypsin'], params['missed_cleavages']):
if False not in [aa not in peptide for aa in ['B', 'J', 'O', 'U', 'X', 'Z']]:
if params['min_peplen'] <= len(peptide) < int(params['max_pepmass'] / 186 + 2):
if not mass.calculate_mass(sequence=peptide) > params['max_pepmass']:
pep_count += 1
row = {'spec_id': '{}_{:03d}'.format(protein.id, pep_count),
'peptide': peptide, 'modifications': '-', 'charge': np.nan}
tmp = tmp.append(row, ignore_index=True)
return tmp
def get_protein_list(df):
peptide_to_prot = {}
for pi, pep in zip(df['spec_id'], df['peptide']):
pi = '_'.join(pi.split('_')[0:2])
if pep in peptide_to_prot.keys():
peptide_to_prot[pep].append(pi)
else:
peptide_to_prot[pep] = [pi]
df['protein_list'] = [list(set(peptide_to_prot[pep])) for pep in df['peptide']]
df = df[~df.duplicated(['peptide', 'charge', 'modifications'])]
return df
def add_mods(tup):
"""
See fasta2speclib_config.md for more information.
"""
_, row = tup
params = get_params()
mod_versions = [dict()]
# First add all fixed modifications
for mod in params['modifications']:
if mod['fixed']:
if not mod['n_term'] and mod['amino_acid']:
mod_versions[0].update({i:mod['name'] for i, aa in enumerate(row['peptide']) if aa == mod['amino_acid']})
elif mod['n_term']:
if mod['amino_acid']:
if row['peptide'][0] == mod['amino_acid']:
mod_versions[0]['N'] = mod['name']
else:
mod_versions[0]['N'] = mod['name']
# Continue with variable modifications
for mod in params['modifications']:
if mod['fixed']:
continue
# List all positions with specific amino acid, to avoid combinatorial explotion, limit to 4 positions
all_pos = [i for i, aa in enumerate(row['peptide']) if aa == mod['amino_acid']]
if len(all_pos) > 4:
all_pos = all_pos[:4]
for version in mod_versions:
# For non-position-specific mods:
if not mod['n_term']:
pos = [p for p in all_pos if p not in version.keys()]
combos = [x for l in range(1, len(pos) + 1) for x in combinations(pos, l)]
for combo in combos:
new_version = version.copy()
for pos in combo:
new_version[pos] = mod['name']
mod_versions.append(new_version)
# For N-term mods and N-term is not yet modified:
elif mod['n_term'] and 'N' not in version.keys():
# N-term with specific first AA:
if mod['amino_acid']:
if row['peptide'][0] == mod['amino_acid']:
new_version = version.copy()
new_version['N'] = mod['name']
mod_versions.append(new_version)
# N-term without specific first AA:
else:
new_version = version.copy()
new_version['N'] = mod['name']
mod_versions.append(new_version)
df_out = pd.DataFrame(columns=row.index)
df_out['modifications'] = ['|'.join('{}|{}'.format(0, value) if key == 'N'
else '{}|{}'.format(key + 1, value) for key, value
in version.items()) for version in mod_versions]
df_out['modifications'] = ['-' if not mods else mods for mods in df_out['modifications']]
df_out['spec_id'] = ['{}_{:03d}'.format(row['spec_id'], i) for i in range(len(mod_versions))]
df_out['charge'] = row['charge']
df_out['peptide'] = row['peptide']
if 'protein_list' in row.index:
df_out['protein_list'] = str(row['protein_list'])
return df_out
def add_charges(df_in):
params = get_params()
df_out = pd.DataFrame(columns=df_in.columns)
for charge in params['charges']:
tmp = df_in.copy()
tmp['spec_id'] = tmp['spec_id'] + '_{}'.format(charge)
tmp['charge'] = charge
df_out = df_out.append(tmp, ignore_index=True)
df_out.sort_values(['spec_id', 'charge'], inplace=True)
df_out.reset_index(drop=True, inplace=True)
return df_out
def create_decoy_peprec(peprec, spec_id_prefix='decoy_', keep_cterm_aa=True, remove_redundancy=True, move_mods=True):
"""
Create decoy peptides by reversing the sequences in a PEPREC DataFrame.
Keyword arguments:
spec_id_prefix -- string to prefix the decoy spec_ids (default: 'decoy_')
keep_cterm_aa -- True if the last amino acid should stay in place (for example to keep tryptic properties) (default: True)
remove_redundancy -- True if reversed peptides that are also found in the set of normal peptide should be removed (default: True)
move_mods -- True to move modifications according to reversed sequence (default: True)
Known issues:
- C-terminal modifications (with position `-1`) are sorted to the front (eg: `-1|Cterm|0|Nterm|2|NormalPTM`).
"""
def move_mods(row):
mods = row['modifications']
if type(mods) == str:
if not mods == '-':
mods = mods.split('|')
mods = sorted(zip([int(p) if (p == '-1' or p == '0')
else len(row['peptide']) - int(p)
for p in mods[::2]
], mods[1::2]))
mods = '|'.join(['|'.join([str(x) for x in mod]) for mod in mods])
row['modifications'] = mods
return row
peprec_decoy = peprec.copy()
peprec_decoy['spec_id'] = spec_id_prefix + peprec_decoy['spec_id'].astype(str)
if keep_cterm_aa:
peprec_decoy['peptide'] = peprec_decoy['peptide'].apply(lambda pep: pep[-2::-1] + pep[-1])
else:
peprec_decoy['peptide'] = peprec_decoy['peptide'].apply(lambda pep: pep[-1::-1])
if remove_redundancy:
peprec_decoy = peprec_decoy[~peprec_decoy['peptide'].isin(peprec['peptide'])]
if 'protein_list' in peprec_decoy.columns:
peprec_decoy['protein_list'] = 'decoy'
if move_mods:
peprec_decoy = peprec_decoy.apply(move_mods, axis=1)
return peprec_decoy
def remove_from_peprec_filter(peprec_pred, peprec_filter):
peprec_pred_comb = peprec_pred['modifications'] + peprec_pred['peptide'] + peprec_pred['charge'].astype(str)
peprec_filter_comb = peprec_filter['modifications'] + peprec_filter['peptide'] + peprec_filter['charge'].astype(str)
return peprec_pred[~peprec_pred_comb.isin(peprec_filter_comb)].copy()
def run_batches(peprec, decoy=False):
params = get_params()
if decoy:
params['output_filename'] += '_decoy'
ms2pip_params = {
'model': params['ms2pip_model'],
'frag_error': 0.02,
# Modify fasta2speclib modifications dict to MS2PIP params PTMs entry
'ptm': ['{},{},opt,{}'.format(mods['name'], mods['mass_shift'], mods['amino_acid'])
if not mods['n_term']
else '{},{},opt,N-term'.format(mods['name'], mods['mass_shift'])
for mods in params['modifications']],
'sptm': [],
'gptm': [],
}
# Split up into batches to save memory:
b_size = params['batch_size']
b_count = 0
num_b_counts = ceil(len(peprec) / b_size)
for i in range(0, len(peprec), b_size):
if i + b_size < len(peprec):
peprec_batch = peprec[i:i + b_size]
else:
peprec_batch = peprec[i:]
b_count += 1
logging.info("Predicting batch %d of %d, containing %d unmodified peptides", b_count, num_b_counts, len(peprec_batch))
logging.debug("Adding all modification combinations")
peprec_mods = pd.DataFrame(columns=peprec_batch.columns)
with Pool(params['num_cpu']) as p:
peprec_mods = peprec_mods.append(p.map(add_mods, peprec_batch.iterrows()), ignore_index=True)
peprec_batch = peprec_mods
if type(params['elude_model_file']) == str:
logging.debug("Adding ELUDE predicted retention times")
peprec_batch['rt'] = get_elude_predictions(
peprec_batch,
params['elude_model_file'],
unimod_mapping={mod['name']: mod['unimod_accession'] for mod in params['modifications']}
)
logging.debug("Adding charge states %s", str(params['charges']))
peprec_batch = add_charges(peprec_batch)
if type(params['peprec_filter']) == str:
logging.debug("Removing peptides present in peprec filter")
peprec_filter = pd.read_csv(params['peprec_filter'], sep=' ')
peprec_batch = remove_from_peprec_filter(peprec_batch, peprec_filter)
# Write ptm/charge-extended peprec from this batch to H5 file:
# peprec_batch.astype(str).to_hdf(
# '{}_expanded_{}.peprec.hdf'.format(params['output_filename'], b_count), key='table',
# format='table', complevel=3, complib='zlib', mode='w'
# )
logging.info("Running MS2PIPc for %d peptides", len(peprec_batch))
all_preds = run(peprec_batch, num_cpu=params['num_cpu'], output_filename=params['output_filename'],
params=ms2pip_params, return_results=True)
if b_count == 1:
write_mode = 'w'
append = False
else:
write_mode = 'a'
append = True
if 'hdf' in params['output_filetype']:
logging.info("Writing predictions to %s_predictions.hdf", params['output_filename'])
all_preds.astype(str).to_hdf(
'{}_predictions.hdf'.format(params['output_filename']),
key='table', format='table', complevel=3, complib='zlib',
mode=write_mode, append=append, min_itemsize=50
)
if 'msp' in params['output_filetype']:
logging.info("Writing MSP file with unmodified peptides")
write_msp(
all_preds,
peprec_batch[peprec_batch['modifications'] == '-'],
output_filename="{}_unmodified".format(params['output_filename']),
write_mode=write_mode,
)
logging.info("Writing MSP file with all peptides")
write_msp(
all_preds,
peprec_batch,
output_filename="{}_withmods".format(params['output_filename']),
write_mode=write_mode,
)
if 'mgf' in params['output_filetype']:
logging.info("Writing MGF file with unmodified peptides")
write_mgf(
all_preds,
peprec=peprec_batch[peprec_batch['modifications'] == '-'],
output_filename="{}_unmodified".format(params['output_filename']),
write_mode=write_mode
)
logging.info("Writing MGF file with all peptides")
write_mgf(
all_preds,
peprec=peprec_batch,
output_filename="{}_withmods".format(params['output_filename']),
write_mode=write_mode
)
del all_preds
del peprec_batch
def main():
params = get_params()
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=params['log_level']
)
peprec = pd.DataFrame(columns=['spec_id', 'peptide', 'modifications', 'charge'])
logging.info("Cleaving proteins, adding peptides to peprec")
with Pool(params['num_cpu']) as p:
peprec = peprec.append(p.map(prot_to_peprec, SeqIO.parse(params['fasta_filename'], "fasta")), ignore_index=True)
logging.info("Removing peptide redundancy, adding protein list to peptides")
peprec = get_protein_list(peprec)
peprec_nonmod = peprec.copy()
save_peprec = False
if save_peprec:
logging.info("Saving non-expanded PEPREC to %s.peprec.hdf", params['output_filename'])
peprec_nonmod['protein_list'] = ['/'.join(prot) for prot in peprec_nonmod['protein_list']]
peprec_nonmod.astype(str).to_hdf(
'{}_nonexpanded.peprec.hdf'.format(params['output_filename']), key='table',
format='table', complevel=3, complib='zlib', mode='w'
)
if not params['decoy']:
del peprec_nonmod
run_batches(peprec, decoy=False)
# For testing
# peprec_nonmod = pd.read_hdf('data/uniprot_proteome_yeast_head_nonexpanded.peprec.hdf', key='table')
if params['decoy']:
logging.info("Reversing sequences for decoy peptides")
peprec_decoy = create_decoy_peprec(peprec_nonmod, move_mods=False)
del peprec_nonmod
logging.info("Predicting spectra for decoy peptides")
run_batches(peprec_decoy, decoy=True)
logging.info("Fasta2SpecLib is ready!")
if __name__ == "__main__":
main()