Skip to content

Commit

Permalink
plots are working
Browse files Browse the repository at this point in the history
  • Loading branch information
mostafakalhor committed Apr 23, 2024
1 parent 32978b1 commit ae7f423
Show file tree
Hide file tree
Showing 3 changed files with 189 additions and 30 deletions.
2 changes: 1 addition & 1 deletion oktoberfest/rescore/rescore.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import numpy as np
import pandas as pd
from spectrum_fundamentals.metrics.percolator import Percolator
from spectrum_io.search_result.xisearch import xisearch

from ..data import FragmentType, Spectra

logger = logging.getLogger(__name__)
Expand Down
213 changes: 186 additions & 27 deletions oktoberfest/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,26 +564,137 @@ def _rescore(fdr_dir: Path, config: Config):
'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"'
)

def xl_fdr(df: pd.DataFrame, score: str) -> pd.DataFrame:
"""
"calculate and add fdr_xl to the DataFrame : (TD-DD)/(TT)".
:param df: DataFrame containing the data.
:param score: Column name containing the scores used for calculating FDR
:return: DataFrame with the new column 'fdr'
"""
df = df.sort_values(by=score, ascending=False)
df['TD_sum'] = (df['label'] == 'TD').cumsum()
df['DD_sum'] = (df['label'] == 'DD').cumsum()
df['TT_sum'] = (df['label'] == 'TT').cumsum()
df['mokapot q-value'] = (df['TD_sum'] - df['DD_sum']) / df['TT_sum']
df.loc[df['TT_sum'] == 0, 'mokapot q-value'] = 0 # Handling division by zero
df = df.drop(['TD_sum', 'DD_sum', 'TT_sum'], axis=1)
return df


def xl_preprocessing_plot_unique_xl(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str):
columns_to_keep_mokapot = ["SpecId_a", "label", "ScanNr_a", "filename_a", "Peptide_a", "mokapot score", "mokapot q-value", "mokapot q-value_a", "mokapot q-value_b", "Proteins_b", "mod_pep_a_b", "mod_pep_b_b"]
mokapot_csms = mokapot_csms[columns_to_keep_mokapot]
mokapot_csms = mokapot_csms.rename(columns={'SpecId_a': 'SpecId', 'ScanNr_a': 'ScanNr', 'filename_a': 'filename', 'Peptide_a': 'Peptide', 'Proteins_b': 'Proteins', 'mod_pep_a_b': 'mod_pep_a', 'mod_pep_b_b': 'mod_pep_b'})
mokapot_csms['label'] = mokapot_csms['label'].replace({'TT': True, 'TD': False, 'DD': False})
mokapot_csms['base_pep_a'] = mokapot_csms['mod_pep_a'].str.replace(r'\[.*?\]', '', regex=True)
mokapot_csms['base_pep_b'] = mokapot_csms['mod_pep_b'].str.replace(r'\[.*?\]', '', regex=True)
mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_10.csv")
mokapot_csms = mokapot_csms.sort_values(by='mokapot score', ascending = False)
swap_mask = mokapot_csms['base_pep_a'] > mokapot_csms['base_pep_b']
mokapot_csms.loc[swap_mask, 'min_seq'] = mokapot_csms['base_pep_b']
mokapot_csms.loc[swap_mask, 'max_seq'] = mokapot_csms['base_pep_a']
mokapot_csms.loc[~swap_mask, 'min_seq'] = mokapot_csms['base_pep_a']
mokapot_csms.loc[~swap_mask, 'max_seq'] = mokapot_csms['base_pep_b']
mokapot_uniq_xls = mokapot_csms.drop_duplicates(subset=['min_seq', 'max_seq'])
xl_fdr(mokapot_uniq_xls, "mokapot score")
mokapot_uniq_xls_true = mokapot_uniq_xls[mokapot_uniq_xls['label'] == True]
mokapot_uniq_xls_decoy = mokapot_uniq_xls[mokapot_uniq_xls['label'] == False]
if original_or_rescore == "original":
mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.mokapot.peptides.txt", sep='\t', index=False)
mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.peptides.txt", sep='\t', index=False)
else:
mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.mokapot.peptides.txt", sep='\t', index=False)
mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.peptides.txt", sep='\t', index=False)

def prepare_rescore_xl_csm_level(
featrures_dir: Path
):
original_tab_file = pd.read_csv(featrures_dir + "/original.tab", sep="\t")
rescore_tab_file = pd.read_csv(featrures_dir + "/rescore.tab", sep="\t")
columns_to_remove = ["run_name",

def xl_preprocessing_plot_csm(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str):
columns_to_keep_mokapot = ["SpecId_a", "label", "ScanNr_a", "filename_a", "Peptide_a", "mokapot score", "mokapot q-value", "mokapot q-value_a", "mokapot q-value_b", "Proteins_b"]
mokapot_csms = mokapot_csms[columns_to_keep_mokapot]
mokapot_csms = mokapot_csms.rename(columns={'SpecId_a': 'SpecId', 'ScanNr_a': 'ScanNr', 'filename_a': 'filename', 'Peptide_a': 'Peptide', 'Proteins_b': 'Proteins'})
mokapot_csms['label'] = mokapot_csms['label'].replace({'TT': True, 'TD': False, 'DD': False})
mokapot_csms_true = mokapot_csms[mokapot_csms['label'] == True]
mokapot_csms_decoy = mokapot_csms[mokapot_csms['label'] == False]
if original_or_rescore == "original":
mokapot_csms_true.to_csv(featrures_dir + "/original.mokapot.psms.txt", sep='\t', index=False)
mokapot_csms_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", sep='\t', index=False)
else:
mokapot_csms_true.to_csv(featrures_dir + "/rescore.mokapot.psms.txt", sep='\t', index=False)
mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", sep='\t', index=False)


def xl_psm_to_csm(featrures_dir: Path, original_or_rescore: str ):
def get_label(row):
if row['is_decoy_p1_a'] == 'False' and row['is_decoy_p2_a'] == 'False':
return 'TT'
elif row['is_decoy_p1_a'] == 'True' and row['is_decoy_p2_a'] == 'True':
return 'DD'
else:
return 'TD'

def min_score(row):
return min(row['mokapot score_a'], row['mokapot score_b'])

if original_or_rescore == "original":
decoy_psms = pd.read_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", delimiter='\t')
target_psms = pd.read_csv(featrures_dir + "/original.mokapot.psms.txt", delimiter='\t')
else:
decoy_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", delimiter='\t')
target_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.psms.txt", delimiter='\t')

split_data = target_psms['SpecId'].str.rsplit('-', n=10, expand=True)
new_columns = ["raw_file","scan_number", "mod_pep_a", "mod_pep_b", "charge",
"decoy_p1", "is_decoy_p1", "decoy_p2", "is_decoy_p2", "index"]
split_data.columns = new_columns
df_psm_target = pd.concat([target_psms, split_data], axis=1)
split_data = decoy_psms['SpecId'].str.rsplit('-', n=10, expand=True)
new_columns = ["raw_file","scan_number", "mod_pep_a", "mod_pep_b", "charge",
"decoy_p1", "is_decoy_p1", "decoy_p2", "is_decoy_p2", "index"]
split_data.columns = new_columns
df_psm_decoy = pd.concat([decoy_psms, split_data], axis=1)
df_mokapot = pd.concat([df_psm_decoy , df_psm_target], axis = 0)
df_mokapot[['index_csm', '_', 'which_pep',]] = df_mokapot['index'].str.split('_', expand=True)
df_mokapot.drop(columns=['index', '_', 'decoy_p1', 'decoy_p2'], inplace=True)
df_pep_1 = df_mokapot[df_mokapot['which_pep'] == "1"].copy()
df_pep_2 = df_mokapot[df_mokapot['which_pep'] == "2"].copy()
df_pep_1.drop(columns=['which_pep'], inplace=True)
df_pep_2.drop(columns=['which_pep'], inplace=True)
df_pep_1.columns = [col + '_a' if col != 'index_csm' else col for col in df_pep_1.columns]
df_pep_2.columns = [col + '_b' if col != 'index_csm' else col for col in df_pep_2.columns]
mokapot_csms = pd.merge(df_pep_1, df_pep_2, on='index_csm')
mokapot_csms['mokapot score'] = mokapot_csms.apply(min_score, axis=1)
mokapot_csms['label'] = mokapot_csms.apply(get_label, axis=1)
mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms.csv")
return mokapot_csms


def prepare_rescore_xl_psm_level(featrures_dir: Path, original_or_rescore: str ):
def extract_label_pep_a(specid):
if "-decoy_p1-True-" in specid:
return -1
elif "-decoy_p1-False-" in specid:
return 1
else:
return None
def extract_label_pep_b(specid):
if "-decoy_p2-True-" in specid:
return -1
elif "-decoy_p2-False-" in specid:
return 1
else:
return None

columns_to_remove_psm_rescore= ["run_name",
"scan_number",
"precursor_mass",
"precursor_charge",
"crosslinker_name",
"decoy_p1",
"base_sequence_p1",
"aa_len_p1",
"link_pos_p1",
"linked_aa_p1",
"mods_p1",
"mod_pos_p1",
"decoy_p2",
"base_sequence_p2",
"aa_len_p2",
"link_pos_p2",
Expand All @@ -606,28 +717,65 @@ def prepare_rescore_xl_csm_level(
"CROSSLINKER_POSITION_A",
"CROSSLINKER_POSITION_B",
"ModificationPositions1",
"MZ_RANGE",
"ModificationPositions2",
"MODIFIED_SEQUENCE_A",
"MODIFIED_SEQUENCE_B",
"filename",
"ExpMass",
"FRAGMENTATION",
"INSTRUMENT_TYPES",
"MASS_ANALYZER",
"Proteins",
"spectral_angle",
"SCORE",
"Unnamed: 0"
]
original_tab_file.drop(columns=columns_to_remove, inplace=True)
rescore_tab_file.drop(columns=columns_to_remove, inplace=True)
original_tab_file = original_tab_file.fillna(0)
columns_to_remove_psm_original = list(set(columns_to_remove_psm_rescore + ["match_score"]) - set(["spectral_angle"]))

if original_or_rescore == "original":
columns_to_remove_psm = columns_to_remove_psm_original
rescore_tab_file = pd.read_csv(featrures_dir + "/original.tab", sep="\t")
else:
columns_to_remove_psm = columns_to_remove_psm_rescore
rescore_tab_file = pd.read_csv(featrures_dir + "/rescore.tab", sep="\t")


rescore_tab_file.drop(columns=columns_to_remove_psm, inplace=True)
rescore_tab_file = rescore_tab_file.fillna(0)
string_columns = ['SpecId', 'Peptide', 'Proteins']
original_tab_file_numeric_columns = [col for col in original_tab_file.columns if col not in string_columns]
string_columns = ['SpecId', 'Peptide', 'Proteins', 'protein_p1', 'protein_p2', 'filename']
rescore_tab_file_numeric_columns = [col for col in rescore_tab_file.columns if col not in string_columns]
original_tab_file[original_tab_file_numeric_columns] = original_tab_file[original_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce')
rescore_tab_file[rescore_tab_file_numeric_columns] = rescore_tab_file[rescore_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce')

return original_tab_file, rescore_tab_file

rescore_tab_file = rescore_tab_file.reset_index(drop=True)
rescore_tab_file['Proteins'] = 'p1_' + rescore_tab_file['protein_p1'].astype(str) + '_p2_' + rescore_tab_file['protein_p2'].astype(str)
rescore_tab_file['SpecId'] = (
rescore_tab_file['SpecId'] + '-decoy_p1-' +
rescore_tab_file['decoy_p1'].astype(str) + '-decoy_p2-' +
rescore_tab_file['decoy_p2'].astype(str) + '-' +
rescore_tab_file.index.astype(str) # Adding index to SpecId
)
rescore_tab_file.drop(columns=[ 'protein_p1', 'protein_p2', 'decoy_p1', 'decoy_p2'], inplace=True)
columns_feature_pep_a = [col for col in rescore_tab_file.columns if col.endswith('_a') or col.endswith('_A') or not (col.endswith('_b') or col.endswith('_B'))]
columns_feature_pep_b = [col for col in rescore_tab_file.columns if col.endswith('_b') or col.endswith('_B') or not (col.endswith('_a') or col.endswith('_A'))]
rescore_tab_file_a = rescore_tab_file.loc[:, columns_feature_pep_a]
rescore_tab_file_b = rescore_tab_file.loc[:, columns_feature_pep_b]
rescore_tab_file_a['SpecId'] = rescore_tab_file_a['SpecId'] + '_' + rescore_tab_file_a.index.astype(str)
rescore_tab_file_b['SpecId'] = rescore_tab_file_b['SpecId'] + '_' + rescore_tab_file_b.index.astype(str)
#1 means pep a and 2 means pep b
rescore_tab_file_a['SpecId'] = rescore_tab_file_a['SpecId'] + '_1'
rescore_tab_file_b['SpecId'] = rescore_tab_file_b['SpecId'] + '_2'
rescore_tab_file_a.columns = [col[:-2] if col.endswith(('_a', '_A')) else col for col in rescore_tab_file_a.columns]
rescore_tab_file_b.columns = [col[:-2] if col.endswith(('_b', '_B')) else col for col in rescore_tab_file_b.columns]
rescore_tab_file_a['Label'] = rescore_tab_file_a['SpecId'].apply(extract_label_pep_a)
rescore_tab_file_b['Label'] = rescore_tab_file_b['SpecId'].apply(extract_label_pep_b)
string_columns = ['SpecId', 'Peptide', 'Proteins', 'filename']
rescore_tab_file_numeric_columns = [col for col in rescore_tab_file_a.columns if col not in string_columns]
rescore_tab_file_a[rescore_tab_file_numeric_columns] = rescore_tab_file_a[rescore_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce')
rescore_tab_file_b[rescore_tab_file_numeric_columns] = rescore_tab_file_b[rescore_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce')
# change ExpMass of rescore_tab_file_a
max_ExpMass = rescore_tab_file_a['ExpMass'].max()
rescore_tab_file_b['ExpMass'] += max_ExpMass
input_mokapot_psm = pd.concat([rescore_tab_file_a, rescore_tab_file_b], axis=0, ignore_index=True)
input_mokapot_psm['Proteins'].fillna("unknown", inplace=True)
return input_mokapot_psm


def run_rescoring(config_path: Union[str, Path]):
Expand All @@ -652,7 +800,7 @@ def run_rescoring(config_path: Union[str, Path]):
processing_pool = JobPool(processes=config.num_threads)
for spectra_file in spectra_files:
if "xl" in config.models["intensity"].lower():
processing_pool.apply_async(_calculate_features(spectra_file, config, xl=True), [spectra_file, config])
processing_pool.apply_async(_calculate_features, [spectra_file, config], xl=True)
else:
processing_pool.apply_async(_calculate_features, [spectra_file, config])
processing_pool.check_pool()
Expand Down Expand Up @@ -682,14 +830,25 @@ def run_rescoring(config_path: Union[str, Path]):
logger.info("Merging input tab files for rescoring with peptide property prediction")
re.merge_input(tab_files=rescore_tab_files, output_file=fdr_dir / "rescore.tab")
prepare_tab_rescore_step.mark_done()

if "xl" in config.models["intensity"].lower():
original_tab_file, rescore_tab_file = prepare_rescore_xl_csm_level(str(fdr_dir))
original_tab_file.to_csv(str(fdr_dir) + "/original.tab", sep="\t", index=False)
rescore_tab_file.to_csv(str(fdr_dir) + "/rescore.tab", sep="\t", index=False)


# rescoring
_rescore(fdr_dir, config)
if "xl" in config.models["intensity"].lower(): #xl-psm-level
input_mokapot_psm_rescore = prepare_rescore_xl_psm_level(str(fdr_dir), "rescore")
input_mokapot_psm_rescore.to_csv(str(fdr_dir) + "/rescore.tab", sep="\t", index=None)
input_mokapot_psm_original = prepare_rescore_xl_psm_level(str(fdr_dir), "original")
input_mokapot_psm_original.to_csv(str(fdr_dir) + "/original.tab", sep="\t", index=None)
_rescore(fdr_dir, config)
mokapot_csms_rescore = xl_psm_to_csm(str(fdr_dir), "rescore")
mokapot_csms_original = xl_psm_to_csm(str(fdr_dir), "original")
mokapot_csms_rescore = xl_fdr(mokapot_csms_rescore, score = "mokapot score")
mokapot_csms_original = xl_fdr(mokapot_csms_original, score = "mokapot score")
xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_rescore, "rescore")
xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_original, "original")
xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_rescore, "rescore")
xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_original, "original")

else:
_rescore(fdr_dir, config)

# plotting
logger.info("Generating summary plots...")
Expand Down
4 changes: 2 additions & 2 deletions oktoberfest/utils/multiprocessing_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def __init__(self, processes: int = 1, warning_filter: str = "default"):
self.pool = Pool(processes, self.init_worker)
self.results = []

def apply_async(self, f, args):
def apply_async(self, f, args, **kwargs):
"""Apply async."""
r = self.pool.apply_async(f, args)
r = self.pool.apply_async(f, args=args, kwds=kwargs)
self.results.append(r)

def init_worker(self):
Expand Down

0 comments on commit ae7f423

Please sign in to comment.