-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKaiko_2.py
74 lines (55 loc) · 2.58 KB
/
Kaiko_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import re
import pandas as pd
from math import floor
from pathlib import Path
## This is the same in function as the R script which combines the denovo output into a single fasta file.
## The only difference is that Python and R handle ties in ordering differently. When a table is ordered
## and the top 25% of the rows are taken, there can be slight differences between R and Python.
# Module-level defaults. combine_denovo_output() takes its own `directory`
# and `selection` arguments and builds its own local `files` / `samples`,
# so these globals are not read by that function.
directory = 'Kaiko_volume/Kaiko_intermediate/denovo_output/'
# Evaluated at import time: os.listdir raises if `directory` does not exist.
files = [name for name in os.listdir(directory) if re.search(r'_out.txt', name) is not None]
selection = 0.25
samples = []
# @profile
def combine_denovo_output(directory, prefix, selection = 0.25):
    """Combine per-sample de novo peptide tables into a single FASTA file.

    Every file in ``directory`` whose name contains ``_out.txt`` is read as a
    tab-separated table with a header row and the columns ``scan``,
    ``output_seq`` and ``output_score``. Each table is reduced to FASTA
    records by ``_denovo_fasta_lines`` and the records of all tables are
    appended to ``Kaiko_volume/Kaiko_intermediate/<prefix>_combined_denovo.fasta``.

    Args:
        directory: Folder holding the ``*_out.txt`` tables (``str`` or
            ``pathlib.Path``).
        prefix: Prefix of the output FASTA file name.
        selection: Fraction (0-1) of the best-scoring rows kept per table.

    Returns:
        None. The output file is opened in append mode, so calling this twice
        with the same prefix adds the records a second time. Nothing is
        written (and no file is created) when there are no records.
    """
    # Accept plain strings as well as Path objects; `str / str` is a TypeError.
    directory = Path(directory)
    # Escape the dot so only a literal "_out.txt" matches, and sort so the
    # record order does not depend on the arbitrary order of os.listdir.
    files = sorted(f for f in os.listdir(directory) if re.search(r'_out\.txt', f))

    to_write = []
    for file in files:
        to_write.extend(_denovo_fasta_lines(directory / file, selection))

    if not to_write:
        return

    combined_fasta = Path('Kaiko_volume/Kaiko_intermediate/' + prefix + '_combined_denovo.fasta')
    combined_fasta.parent.mkdir(parents=True, exist_ok=True)
    # One open for the whole batch instead of re-opening the file per sample.
    with combined_fasta.open('a') as fasta_file:
        fasta_file.writelines(f"{line}\n" for line in to_write)


def _denovo_fasta_lines(table_path, selection):
    """Return alternating FASTA header/sequence lines for one de novo table."""
    xx = pd.read_csv(table_path, sep="\t", header=0)
    # Strip the comma separators and the literal "mod" markers from peptides.
    xx['output_seq'] = [str(peptide).replace(",", "").replace("mod", "")
                        for peptide in xx['output_seq']]
    # Drop rows scored +infinity, then keep the top `selection` fraction.
    xx = xx.loc[xx['output_score'] != float("Inf")]
    xx = xx.sort_values('output_score', ascending=False)
    xx = xx.head(floor(selection * len(xx.index)))

    # Collapse identical peptides of length 10-17, remembering their scans in
    # score order. str() keeps the join below working for numeric scan ids.
    scans_by_peptide = {}
    for peptide, scan in zip(xx['output_seq'], xx['scan']):
        if 10 <= len(peptide) <= 17:
            scans_by_peptide.setdefault(peptide, []).append(str(scan))

    lines = []
    # Sorted peptides reproduce the key order of a sorted pandas groupby.
    for peptide in sorted(scans_by_peptide):
        scans = scans_by_peptide[peptide]
        # Header format: ">S<scan>_<scan>..._<number of scans>"
        lines.append(">S" + "_".join(scans) + "_" + str(len(scans)))
        lines.append(peptide)
    return lines
def summary_times(group):
    """Return how many entries the ``scan`` column of ``group`` holds."""
    return len(group['scan'])
def summary_scans(group):
    """Return the ``scan`` values of ``group`` joined with underscores.

    Each value is converted with ``str`` first, so a numeric ``scan`` column
    (e.g. integers parsed by ``pd.read_csv``) is joined instead of raising
    ``TypeError``. String scans are joined unchanged.
    """
    return "_".join(str(scan) for scan in group['scan'])
def summary_rank(group):
    """Return the smallest value in the ``rank`` column of ``group``."""
    return min(group['rank'])