From 4d8e00ba5e22944947369c156e1146ce4736da21 Mon Sep 17 00:00:00 2001 From: Mostafa Kalhor Date: Fri, 26 Apr 2024 15:10:44 +0000 Subject: [PATCH] xl_fdr of uxls fixed --- oktoberfest/data/spectra.py | 24 +- oktoberfest/predict/koina.py | 12 +- oktoberfest/predict/predict.py | 27 +- oktoberfest/preprocessing/preprocessing.py | 23 +- oktoberfest/rescore/rescore.py | 55 ++- oktoberfest/runner.py | 523 +++++++++++++-------- 6 files changed, 416 insertions(+), 248 deletions(-) diff --git a/oktoberfest/data/spectra.py b/oktoberfest/data/spectra.py index 43dd002c..ca4723a5 100644 --- a/oktoberfest/data/spectra.py +++ b/oktoberfest/data/spectra.py @@ -19,15 +19,15 @@ class FragmentType(Enum): """FragmentType class to enumerate pred, raw, and mz.""" - PRED = 1 + PRED = 1 PRED_A = 2 PRED_B = 3 - RAW = 4 - RAW_A = 5 - RAW_B = 6 - MZ = 7 - MZ_A = 8 - MZ_B = 9 + RAW = 4 + RAW_A = 5 + RAW_B = 6 + MZ = 7 + MZ_A = 8 + MZ_B = 9 class Spectra(anndata.AnnData): @@ -65,7 +65,7 @@ def _gen_vars_df(xl: bool = False) -> pd.DataFrame: else: max_range = 30 ion_nums = np.repeat(np.arange(1, max_range), 6) - ion_charge = np.tile([1, 2, 3], (max_range-1) * 2) + ion_charge = np.tile([1, 2, 3], (max_range - 1) * 2) temp_cols = [] for size in range(1, max_range): for typ in ["Y", "B"]: @@ -89,7 +89,7 @@ def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> List[str if xl: max_range = 59 else: - max_range = 30 + max_range = 30 for i in range(1, max_range): for column in Spectra.COLUMNS_FRAGMENT_ION: columns.append(prefix + "_" + column.replace("1", str(i))) @@ -107,11 +107,11 @@ def _resolve_prefix(fragment_type: FragmentType) -> str: if fragment_type.value == 1: prefix = Spectra.INTENSITY_PRED_PREFIX elif fragment_type.value == 2: - prefix = Spectra.INTENSITY_PRED_PREFIX_A + prefix = Spectra.INTENSITY_PRED_PREFIX_A elif fragment_type.value == 3: prefix = Spectra.INTENSITY_PRED_PREFIX_B elif fragment_type.value == 4: - prefix = Spectra.INTENSITY_COLUMN_PREFIX + prefix = Spectra.INTENSITY_COLUMN_PREFIX elif fragment_type.value == 5: prefix = Spectra.INTENSITY_COLUMN_PREFIX_A elif fragment_type.value == 6: @@ -119,7 +119,7 @@ def _resolve_prefix(fragment_type: FragmentType) -> str: elif fragment_type.value == 7: prefix = Spectra.MZ_COLUMN_PREFIX elif fragment_type.value == 8: - prefix = Spectra.MZ_COLUMN_PREFIX_A + prefix = Spectra.MZ_COLUMN_PREFIX_A else: prefix = Spectra.MZ_COLUMN_PREFIX_B return prefix diff --git a/oktoberfest/predict/koina.py b/oktoberfest/predict/koina.py index 3cb4dcac..01e99309 100644 --- a/oktoberfest/predict/koina.py +++ b/oktoberfest/predict/koina.py @@ -1,7 +1,7 @@ import time import warnings from functools import partial -from typing import Dict, Generator, KeysView, List, Optional, Union +from typing import Dict, Generator, KeysView, List, Optional, Union, Tuple import numpy as np import pandas as pd @@ -35,9 +35,14 @@ alternative_column_map_xl_switched = { "peptide_sequences_1": "MODIFIED_SEQUENCE_B", "peptide_sequences_2": "MODIFIED_SEQUENCE_A", - **{key: value for key, value in alternative_column_map_xl.items() if key not in ["peptide_sequences_1", "peptide_sequences_2"]} + **{ + key: value + for key, value in alternative_column_map_xl.items() + if key not in ["peptide_sequences_1", "peptide_sequences_2"] + }, } + class Koina: """A class for interacting with Koina models for inference.""" @@ -495,7 +500,7 @@ def predict_xl( data: Union[Dict[str, np.ndarray], pd.DataFrame], _async: bool = True, debug=False, - ) -> Dict[str, np.ndarray]: + ) -> Tuple[Dict[str, np.ndarray], 
Dict[str, np.ndarray]]: """ Perform inference on the xl data using the Koina model. @@ -539,7 +544,6 @@ def predict_xl( else: return self.__predict_sequential(data_1), self.__predict_sequential(data_2) - def __predict_async(self, data: Dict[str, np.ndarray], debug=False) -> Dict[str, np.ndarray]: """ Perform asynchronous inference on the given data using the Koina model. diff --git a/oktoberfest/predict/predict.py b/oktoberfest/predict/predict.py index 1688e64e..f83d7fa6 100644 --- a/oktoberfest/predict/predict.py +++ b/oktoberfest/predict/predict.py @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) -def predict(data: pd.DataFrame, xl : bool = False, **kwargs) -> Dict[str, np.ndarray]: +def predict(data: pd.DataFrame, xl: bool = False, **kwargs) -> Dict[str, np.ndarray]: """ Retrieve predictions from koina. @@ -29,10 +29,10 @@ def predict(data: pd.DataFrame, xl : bool = False, **kwargs) -> Dict[str, np.nda predictor = Koina(**kwargs) if xl: results_a, results_b = predictor.predict_xl(data) - return results_a, results_b + return results_a, results_b else: results = predictor.predict(data) - return results + return results def parse_fragment_labels( @@ -71,7 +71,9 @@ def parse_fragment_labels( return {"type": fragment_type_array, "number": fragment_number_array, "charge": fragment_charge_array} -def _prepare_alignment_df(library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool = False, xl : bool = False) -> Spectra: +def _prepare_alignment_df( + library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool = False, xl: bool = False +) -> Spectra: """ Prepare an alignment DataFrame from the given Spectra library. @@ -102,7 +104,9 @@ def _prepare_alignment_df(library: Spectra, ce_range: Tuple[int, int], group_by_ return alignment_library -def ce_calibration(library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool, xl : bool = False, **server_kwargs ) -> Spectra: +def ce_calibration( + library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool, xl: bool = False, **server_kwargs +) -> Spectra: """ Calculate best collision energy for peptide property predictions. 
@@ -119,13 +123,13 @@ def ce_calibration(library: Spectra, ce_range: Tuple[int, int], group_by_charge: """ alignment_library = _prepare_alignment_df(library, ce_range=ce_range, group_by_charge=group_by_charge, xl=xl) if xl: - intensities_a, intensities_b = predict(alignment_library.obs, xl = True, **server_kwargs) + intensities_a, intensities_b = predict(alignment_library.obs, xl=True, **server_kwargs) alignment_library.add_matrix(intensities_a["intensities"], FragmentType.PRED_A) alignment_library.add_matrix(intensities_b["intensities"], FragmentType.PRED_B) else: intensities = predict(alignment_library.obs, **server_kwargs) alignment_library.add_matrix(intensities["intensities"], FragmentType.PRED) - + _alignment(alignment_library, xl=xl) return alignment_library @@ -148,13 +152,12 @@ def _alignment(alignment_library: Spectra, xl: bool = False): sm_b = SimilarityMetrics(pred_intensity_b, raw_intensity_b) alignment_library.add_column(sm_a.spectral_angle(raw_intensity_a, pred_intensity_a, 0), "SPECTRAL_ANGLE_A") alignment_library.add_column(sm_b.spectral_angle(raw_intensity_b, pred_intensity_b, 0), "SPECTRAL_ANGLE_B") - alignment_library.add_column((alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2, "SPECTRAL_ANGLE") + alignment_library.add_column( + (alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2, + "SPECTRAL_ANGLE", + ) else: pred_intensity = alignment_library.get_matrix(FragmentType.PRED)[0] raw_intensity = alignment_library.get_matrix(FragmentType.RAW)[0] sm = SimilarityMetrics(pred_intensity, raw_intensity) alignment_library.add_column(sm.spectral_angle(raw_intensity, pred_intensity, 0), "SPECTRAL_ANGLE") - - - - diff --git a/oktoberfest/preprocessing/preprocessing.py b/oktoberfest/preprocessing/preprocessing.py index 510801ce..1798f43a 100644 --- a/oktoberfest/preprocessing/preprocessing.py +++ b/oktoberfest/preprocessing/preprocessing.py @@ -14,7 +14,7 @@ from spectrum_io.d import convert_d_hdf, read_and_aggregate_timstof from spectrum_io.file import csv from spectrum_io.raw import ThermoRaw -from spectrum_io.search_result import Mascot, MaxQuant, MSFragger, Sage, Xisearch +from spectrum_io.search_result import Mascot, MaxQuant, MSFragger, Sage, Scout, Xisearch from spectrum_io.spectral_library.digest import get_peptide_to_protein_map from ..data.spectra import FragmentType, Spectra @@ -198,6 +198,7 @@ def filter_peptides( return peptides[peptide_filter] + def filter_xl_peptides(peptides: pd.DataFrame, min_length: int, max_length: int, max_charge: int) -> pd.DataFrame: """ Filter xl search results using given constraints. 
@@ -216,19 +217,20 @@ def filter_xl_peptides(peptides: pd.DataFrame, min_length: int, max_length: int, df = peptides.obs else: df = peptides - + peptide_filter = ( - (df["PEPTIDE_LENGTH_A"] <= max_length) - & (df["PEPTIDE_LENGTH_B"] <= max_length) - & (df["PEPTIDE_LENGTH_A"] >= min_length) - & (df["PEPTIDE_LENGTH_B"] >= min_length) - & (df["PRECURSOR_CHARGE"] <= max_charge) - & (df["MODIFIED_SEQUENCE_A"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]")) - & (df["MODIFIED_SEQUENCE_B"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]")) + (df["PEPTIDE_LENGTH_A"] <= max_length) + & (df["PEPTIDE_LENGTH_B"] <= max_length) + & (df["PEPTIDE_LENGTH_A"] >= min_length) + & (df["PEPTIDE_LENGTH_B"] >= min_length) + & (df["PRECURSOR_CHARGE"] <= max_charge) + & (df["MODIFIED_SEQUENCE_A"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]")) + & (df["MODIFIED_SEQUENCE_B"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]")) ) return peptides[peptide_filter] + def process_and_filter_spectra_data(library: Spectra, model: str, tmt_label: Optional[str] = None) -> Spectra: """ Process and filter the spectra data in the given SpectralLibrary object. @@ -318,6 +320,8 @@ def convert_search( search_result = Sage elif search_engine == "xisearch": search_result = Xisearch + elif search_engine == "scout": + search_result = Scout else: raise ValueError(f"Unknown search engine provided: {search_engine}") @@ -581,6 +585,7 @@ def annotate_spectral_library( return aspec + def annotate_spectral_library_xl(psms: Spectra, mass_tol: Optional[float] = None, unit_mass_tol: Optional[str] = None): """ Annotate spectral library with peaks and mass for cross-linked peptides. diff --git a/oktoberfest/rescore/rescore.py b/oktoberfest/rescore/rescore.py index aae1c237..a82a08df 100644 --- a/oktoberfest/rescore/rescore.py +++ b/oktoberfest/rescore/rescore.py @@ -2,10 +2,11 @@ import subprocess from pathlib import Path from typing import List, Optional, Union -import scipy.sparse as sp + import mokapot import numpy as np import pandas as pd +import scipy.sparse as sp from spectrum_fundamentals.metrics.percolator import Percolator from ..data import FragmentType, Spectra @@ -34,21 +35,21 @@ def generate_features( :param regression_method: The regression method to use for iRT alignment """ if xl: - pred_a= library.get_matrix(FragmentType.PRED_A)[0] - pred_b= library.get_matrix(FragmentType.PRED_B)[0] - raw_a= library.get_matrix(FragmentType.RAW_A)[0] - raw_b= library.get_matrix(FragmentType.RAW_B)[0] - mz_a= library.get_matrix(FragmentType.MZ_A)[0] - mz_b= library.get_matrix(FragmentType.MZ_B)[0] + pred_a = library.get_matrix(FragmentType.PRED_A)[0] + pred_b = library.get_matrix(FragmentType.PRED_B)[0] + raw_a = library.get_matrix(FragmentType.RAW_A)[0] + raw_b = library.get_matrix(FragmentType.RAW_B)[0] + mz_a = library.get_matrix(FragmentType.MZ_A)[0] + mz_b = library.get_matrix(FragmentType.MZ_B)[0] perc_features = Percolator( - metadata=library.get_meta_data().reset_index(drop=True), - pred_intensities=sp.hstack([pred_a, pred_b]), - true_intensities=sp.hstack([raw_a, raw_b]), - mz=sp.hstack([mz_a, mz_b]), - input_type=search_type, - all_features_flag=all_features, - regression_method=regression_method, - ) + metadata=library.get_meta_data().reset_index(drop=True), + pred_intensities=sp.hstack([pred_a, pred_b]), + true_intensities=sp.hstack([raw_a, raw_b]), + mz=sp.hstack([mz_a, mz_b]), + input_type=search_type, + all_features_flag=all_features, + regression_method=regression_method, + ) else: perc_features = Percolator( 
metadata=library.get_meta_data().reset_index(drop=True), @@ -108,6 +109,7 @@ def rescore_with_percolator( num_threads: int = 3, test_fdr: float = 0.01, train_fdr: float = 0.01, + xl: bool = False, ): """ Rescore using percolator. @@ -140,18 +142,23 @@ def rescore_with_percolator( target_peptides = output_folder / f"{file_prefix}.percolator.peptides.txt" decoy_peptides = output_folder / f"{file_prefix}.percolator.decoy.peptides.txt" log_file = output_folder / f"{file_prefix}.log" - - cmd = f"percolator --weights {weights_file} \ - --num-threads {num_threads} \ - --subset-max-train 500000 \ - --post-processing-tdc \ - --testFDR {test_fdr} \ - --trainFDR {train_fdr} \ + if xl: + cmd = f"percolator --weights {weights_file} \ --results-psms {target_psms} \ --decoy-results-psms {decoy_psms} \ - --results-peptides {target_peptides} \ - --decoy-results-peptides {decoy_peptides} \ {input_file} 2> {log_file}" + else: + cmd = f"percolator --weights {weights_file} \ + --num-threads {num_threads} \ + --subset-max-train 500000 \ + --post-processing-tdc \ + --testFDR {test_fdr} \ + --trainFDR {train_fdr} \ + --results-psms {target_psms} \ + --decoy-results-psms {decoy_psms} \ + --results-peptides {target_peptides} \ + --decoy-results-peptides {decoy_peptides} \ + {input_file} 2> {log_file}" logger.info(f"Starting percolator with command {cmd}") subprocess.run(cmd, shell=True, check=True) diff --git a/oktoberfest/runner.py b/oktoberfest/runner.py index 8f5bd880..f72b463d 100644 --- a/oktoberfest/runner.py +++ b/oktoberfest/runner.py @@ -9,8 +9,9 @@ from multiprocessing import Manager, Process, pool from pathlib import Path from typing import Dict, List, Optional, Tuple, Type, Union -import pandas as pd + import numpy as np +import pandas as pd from sklearn.linear_model import LinearRegression, RANSACRegressor from spectrum_io.spectral_library import MSP, DLib, SpectralLibrary, Spectronaut from tqdm.auto import tqdm @@ -126,9 +127,13 @@ def _annotate_and_get_library(spectra_file: Path, config: Config, tims_meta_file search = pp.load_search(config.output / "msms" / spectra_file.with_suffix(".rescore").name) library = pp.merge_spectra_and_peptides(spectra, search) if "xl" in config.models["intensity"].lower(): - aspec = pp.annotate_spectral_library_xl(library, mass_tol=config.mass_tolerance, unit_mass_tol=config.unit_mass_tolerance) + aspec = pp.annotate_spectral_library_xl( + library, mass_tol=config.mass_tolerance, unit_mass_tol=config.unit_mass_tolerance + ) else: - paspecp.annotate_spectral_library(library, mass_tol=config.mass_tolerance, unit_mass_tol=config.unit_mass_tolerance) + paspecp.annotate_spectral_library( + library, mass_tol=config.mass_tolerance, unit_mass_tol=config.unit_mass_tolerance + ) aspec.write_as_hdf5(hdf5_path) # write_metadata_annotation return aspec @@ -145,7 +150,7 @@ def _get_best_ce(library: Spectra, spectra_file: Path, config: Config): } use_ransac_model = config.use_ransac_model if "xl" in config.models["intensity"].lower(): - alignment_library = pr.ce_calibration(library, config.ce_range, use_ransac_model, xl =True, **server_kwargs ) + alignment_library = pr.ce_calibration(library, config.ce_range, use_ransac_model, xl=True, **server_kwargs) else: alignment_library = pr.ce_calibration(library, config.ce_range, use_ransac_model, **server_kwargs) @@ -488,22 +493,22 @@ def _calculate_features(spectra_file: Path, config: Config, xl: bool = False): "ssl": config.ssl, } if xl: - pred_intensities_a, pred_intensities_b = pr.predict( - data=library.obs, - 
model_name=config.models["intensity"], - xl=True, - **predict_kwargs, - ) + pred_intensities_a, pred_intensities_b = pr.predict( + data=library.obs, + model_name=config.models["intensity"], + xl=True, + **predict_kwargs, + ) library.add_matrix(pred_intensities_a["intensities"], FragmentType.PRED_A) library.add_matrix(pred_intensities_b["intensities"], FragmentType.PRED_B) library.write_as_hdf5(config.output / "data" / spectra_file.with_suffix(".mzml.pred.hdf5").name) predict_step.mark_done() else: pred_intensities = pr.predict( - data=library.obs, - model_name=config.models["intensity"], - **predict_kwargs, - ) + data=library.obs, + model_name=config.models["intensity"], + **predict_kwargs, + ) pred_irts = pr.predict(data=library.obs, model_name=config.models["irt"], **predict_kwargs) library.add_matrix(pred_intensities["intensities"], FragmentType.PRED) library.add_column(pred_irts["irt"].squeeze(), name="PREDICTED_IRT") @@ -534,7 +539,7 @@ def _calculate_features(spectra_file: Path, config: Config, xl: bool = False): calc_feature_step.mark_done() -def _rescore(fdr_dir: Path, config: Config): +def _rescore(fdr_dir: Path, config: Config, xl : bool = False): """ High level rescore function for original and rescore. @@ -547,128 +552,240 @@ def _rescore(fdr_dir: Path, config: Config): if config.fdr_estimation_method == "percolator": if not rescore_original_step.is_done(): - re.rescore_with_percolator(input_file=fdr_dir / "original.tab", output_folder=fdr_dir) + re.rescore_with_percolator(input_file=fdr_dir / "original.tab", output_folder=fdr_dir, xl = xl) rescore_original_step.mark_done() if not rescore_prosit_step.is_done(): - re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir) + re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir, xl = xl) rescore_prosit_step.mark_done() elif config.fdr_estimation_method == "mokapot": if not rescore_original_step.is_done(): - re.rescore_with_mokapot(input_file=fdr_dir / "original.tab", output_folder=fdr_dir) + re.rescore_with_mokapot(input_file=fdr_dir / "original.tab", output_folder=fdr_dir, xl = xl) rescore_original_step.mark_done() if not rescore_prosit_step.is_done(): - re.rescore_with_mokapot(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir) + re.rescore_with_mokapot(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir, xl = xl) rescore_prosit_step.mark_done() else: raise ValueError( 'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"' ) -def xl_fdr(df: pd.DataFrame, score: str) -> pd.DataFrame: + +def xl_fdr(df: pd.DataFrame, score: str) -> pd.DataFrame: """ - "calculate and add fdr_xl to the DataFrame : (TD-DD)/(TT)". + "calculate and add fdr_xl to the DataFrame : (TD-DD)/(TT)". :param df: DataFrame containing the data. 
- :param score: Column name containing the scores used for calculating FDR + :param score: Column name containing the scores used for calculating FDR :return: DataFrame with the new column 'fdr' """ df = df.sort_values(by=score, ascending=False) - df['TD_sum'] = (df['label'] == 'TD').cumsum() - df['DD_sum'] = (df['label'] == 'DD').cumsum() - df['TT_sum'] = (df['label'] == 'TT').cumsum() - df['mokapot q-value'] = (df['TD_sum'] - df['DD_sum']) / df['TT_sum'] - df.loc[df['TT_sum'] == 0, 'mokapot q-value'] = 0 # Handling division by zero - df = df.drop(['TD_sum', 'DD_sum', 'TT_sum'], axis=1) + df["TD_sum"] = (df["label"] == "TD").cumsum() + df["DD_sum"] = (df["label"] == "DD").cumsum() + df["TT_sum"] = (df["label"] == "TT").cumsum() + df["q-value"] = (df["TD_sum"] - df["DD_sum"]) / df["TT_sum"] + df.loc[df["TT_sum"] == 0, "q-value"] = 0 # Handling division by zero + df = df.drop(["TD_sum", "DD_sum", "TT_sum"], axis=1) return df -def xl_preprocessing_plot_unique_xl(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str): - columns_to_keep_mokapot = ["SpecId_a", "label", "ScanNr_a", "filename_a", "Peptide_a", "mokapot score", "mokapot q-value", "mokapot q-value_a", "mokapot q-value_b", "Proteins_b", "mod_pep_a_b", "mod_pep_b_b"] +def xl_between_or_self(df: pd.DataFrame, score: str) -> pd.DataFrame: + mokapot_csms_rescore_between =df[df["fdr_group"]=="between"] + mokapot_csms_rescore_self =df[df["fdr_group"]=="self"] + mokapot_csms_rescore_between= xl_fdr(mokapot_csms_rescore_between, score=score) + mokapot_csms_rescore_self= xl_fdr(mokapot_csms_rescore_self, score=score) + mokapot_csms_rescore = pd.concat([mokapot_csms_rescore_between, mokapot_csms_rescore_self], axis=0) + return mokapot_csms_rescore + + +def xl_preprocessing_plot_unique_xl(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str, percolator_or_mokapot: str): + mokapot_csms["is_target"] = mokapot_csms["label"] + mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms.csv") + columns_to_keep_mokapot = [ + "PSMId_a", + "label", + "is_target", + "scan_number_a", + "filename_a", + "peptide_a", + "mokapot score", + "q-value", + "q-value_a", + "q-value_b", + "proteinIds_b", + "mod_pep_a_b", + "fdr_group_a", + "mod_pep_b_b", + ] mokapot_csms = mokapot_csms[columns_to_keep_mokapot] - mokapot_csms = mokapot_csms.rename(columns={'SpecId_a': 'SpecId', 'ScanNr_a': 'ScanNr', 'filename_a': 'filename', 'Peptide_a': 'Peptide', 'Proteins_b': 'Proteins', 'mod_pep_a_b': 'mod_pep_a', 'mod_pep_b_b': 'mod_pep_b'}) - mokapot_csms['label'] = mokapot_csms['label'].replace({'TT': True, 'TD': False, 'DD': False}) - mokapot_csms['base_pep_a'] = mokapot_csms['mod_pep_a'].str.replace(r'\[.*?\]', '', regex=True) - mokapot_csms['base_pep_b'] = mokapot_csms['mod_pep_b'].str.replace(r'\[.*?\]', '', regex=True) - mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_10.csv") - mokapot_csms = mokapot_csms.sort_values(by='mokapot score', ascending = False) - swap_mask = mokapot_csms['base_pep_a'] > mokapot_csms['base_pep_b'] - mokapot_csms.loc[swap_mask, 'min_seq'] = mokapot_csms['base_pep_b'] - mokapot_csms.loc[swap_mask, 'max_seq'] = mokapot_csms['base_pep_a'] - mokapot_csms.loc[~swap_mask, 'min_seq'] = mokapot_csms['base_pep_a'] - mokapot_csms.loc[~swap_mask, 'max_seq'] = mokapot_csms['base_pep_b'] - mokapot_uniq_xls = mokapot_csms.drop_duplicates(subset=['min_seq', 'max_seq']) - xl_fdr(mokapot_uniq_xls, "mokapot score") - mokapot_uniq_xls_true = mokapot_uniq_xls[mokapot_uniq_xls['label'] == True] - 
mokapot_uniq_xls_decoy = mokapot_uniq_xls[mokapot_uniq_xls['label'] == False] + + mokapot_csms = mokapot_csms.rename( + columns={ + "PSMId_a": "SpecId", + "scan_number_a": "ScanNr", + "filename_a": "filename", + "peptide_a": "Peptide", + "proteinIds_b": "Proteins", + "mod_pep_a_b": "mod_pep_a", + "fdr_group_a": "fdr_group", + "mod_pep_b_b": "mod_pep_b", + } + ) + mokapot_csms["label"] = mokapot_csms["label"].replace({"TT": True, "TD": False, "DD": False}) + mokapot_csms["base_pep_a"] = mokapot_csms["mod_pep_a"].str.replace(r"\[.*?\]", "", regex=True) + mokapot_csms["base_pep_b"] = mokapot_csms["mod_pep_b"].str.replace(r"\[.*?\]", "", regex=True) + mokapot_csms = mokapot_csms.sort_values(by="mokapot score", ascending=False) + swap_mask = mokapot_csms["base_pep_a"] > mokapot_csms["base_pep_b"] + mokapot_csms.loc[swap_mask, "min_seq"] = mokapot_csms["base_pep_b"] + mokapot_csms.loc[swap_mask, "max_seq"] = mokapot_csms["base_pep_a"] + mokapot_csms.loc[~swap_mask, "min_seq"] = mokapot_csms["base_pep_a"] + mokapot_csms.loc[~swap_mask, "max_seq"] = mokapot_csms["base_pep_b"] + mokapot_uniq_xls = mokapot_csms.drop_duplicates(subset=["min_seq", "max_seq"]) + xl_between_or_self(mokapot_uniq_xls, "mokapot score") + mokapot_uniq_xls_true = mokapot_uniq_xls[mokapot_uniq_xls["label"] == True] + mokapot_uniq_xls_decoy = mokapot_uniq_xls[mokapot_uniq_xls["label"] == False] if original_or_rescore == "original": - mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.mokapot.peptides.txt", sep='\t', index=False) - mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.peptides.txt", sep='\t', index=False) + if percolator_or_mokapot == "percolator": + mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.percolator.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.percolator.decoy.peptides.txt", sep="\t", index=False) + else: + mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.mokapot.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.peptides.txt", sep="\t", index=False) else: - mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.mokapot.peptides.txt", sep='\t', index=False) - mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.peptides.txt", sep='\t', index=False) - - -def xl_preprocessing_plot_csm(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str): - columns_to_keep_mokapot = ["SpecId_a", "label", "ScanNr_a", "filename_a", "Peptide_a", "mokapot score", "mokapot q-value", "mokapot q-value_a", "mokapot q-value_b", "Proteins_b"] + if percolator_or_mokapot == "percolator": + mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.percolator.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.percolator.decoy.peptides.txt", sep="\t", index=False) + else: + mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.mokapot.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.peptides.txt", sep="\t", index=False) + + +def xl_preprocessing_plot_csm(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str, percolator_or_mokapot: str ): + columns_to_keep_mokapot = [ + "PSMId_a", + "label", + "scan_number_a", + "filename_a", + "peptide_a", + "mokapot score", + "q-value", + "q-value_a", + "q-value_b", + "fdr_group_a", + "proteinIds_b", + ] + mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_2.csv") mokapot_csms = 
mokapot_csms[columns_to_keep_mokapot] - mokapot_csms = mokapot_csms.rename(columns={'SpecId_a': 'SpecId', 'ScanNr_a': 'ScanNr', 'filename_a': 'filename', 'Peptide_a': 'Peptide', 'Proteins_b': 'Proteins'}) - mokapot_csms['label'] = mokapot_csms['label'].replace({'TT': True, 'TD': False, 'DD': False}) - mokapot_csms_true = mokapot_csms[mokapot_csms['label'] == True] - mokapot_csms_decoy = mokapot_csms[mokapot_csms['label'] == False] + mokapot_csms = mokapot_csms.rename( + columns={ + "PSMId_a": "SpecId", + "scan_number_a": "ScanNr", + "filename_a": "filename", + "peptide_a": "Peptide", + "fdr_group_a": "fdr_group", + "proteinIds_b": "Proteins", + } + ) + mokapot_csms["is_target"] = mokapot_csms["label"] + mokapot_csms["label"] = mokapot_csms["label"].replace({"TT": True, "TD": False, "DD": False}) + mokapot_csms_true = mokapot_csms[mokapot_csms["label"] == True] + mokapot_csms_true.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_true.csv") + + mokapot_csms_decoy = mokapot_csms[mokapot_csms["label"] == False] if original_or_rescore == "original": - mokapot_csms_true.to_csv(featrures_dir + "/original.mokapot.psms.txt", sep='\t', index=False) - mokapot_csms_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", sep='\t', index=False) + if percolator_or_mokapot == "percolator": + mokapot_csms_true.to_csv(featrures_dir + "/original.percolator.psms.txt", sep="\t", index=False) + mokapot_csms_decoy.to_csv(featrures_dir + "/original.percolator.decoy.psms.txt", sep="\t", index=False) + else: + mokapot_csms_true.to_csv(featrures_dir + "/original.mokapot.psms.txt", sep="\t", index=False) + mokapot_csms_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", sep="\t", index=False) else: - mokapot_csms_true.to_csv(featrures_dir + "/rescore.mokapot.psms.txt", sep='\t', index=False) - mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", sep='\t', index=False) - + if percolator_or_mokapot == "percolator": + mokapot_csms_true.to_csv(featrures_dir + "/rescore.percolator.psms.txt", sep="\t", index=False) + mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.percolator.decoy.psms.txt", sep="\t", index=False) + else: + mokapot_csms_true.to_csv(featrures_dir + "/rescore.mokapot.psms.txt", sep="\t", index=False) + mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", sep="\t", index=False) + -def xl_psm_to_csm(featrures_dir: Path, original_or_rescore: str ): +def xl_psm_to_csm(featrures_dir: Path, original_or_rescore: str, percolator_or_mokapot: str): def get_label(row): - if row['is_decoy_p1_a'] == 'False' and row['is_decoy_p2_a'] == 'False': - return 'TT' - elif row['is_decoy_p1_a'] == 'True' and row['is_decoy_p2_a'] == 'True': - return 'DD' + if row["is_decoy_p1_a"] == "False" and row["is_decoy_p2_a"] == "False": + return "TT" + elif row["is_decoy_p1_a"] == "True" and row["is_decoy_p2_a"] == "True": + return "DD" else: - return 'TD' - + return "TD" + def min_score(row): - return min(row['mokapot score_a'], row['mokapot score_b']) - + return min(row["score_a"], row["score_b"]) + if original_or_rescore == "original": - decoy_psms = pd.read_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", delimiter='\t') - target_psms = pd.read_csv(featrures_dir + "/original.mokapot.psms.txt", delimiter='\t') - else: - decoy_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", delimiter='\t') - target_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.psms.txt", delimiter='\t') + if percolator_or_mokapot == "percolator": + 
decoy_psms = pd.read_csv(featrures_dir + "/original.percolator.decoy.psms.txt", delimiter="\t") + target_psms = pd.read_csv(featrures_dir + "/original.percolator.psms.txt", delimiter="\t") + psm_id = "PSMId" + else: + decoy_psms = pd.read_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", delimiter="\t") + target_psms = pd.read_csv(featrures_dir + "/original.mokapot.psms.txt", delimiter="\t") + psm_id = "SpecId" - split_data = target_psms['SpecId'].str.rsplit('-', n=10, expand=True) - new_columns = ["raw_file","scan_number", "mod_pep_a", "mod_pep_b", "charge", - "decoy_p1", "is_decoy_p1", "decoy_p2", "is_decoy_p2", "index"] + else: + if percolator_or_mokapot == "percolator": + decoy_psms = pd.read_csv(featrures_dir + "/rescore.percolator.decoy.psms.txt", delimiter="\t") + target_psms = pd.read_csv(featrures_dir + "/rescore.percolator.psms.txt", delimiter="\t") + psm_id = "PSMId" + else: + decoy_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", delimiter="\t") + target_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.psms.txt", delimiter="\t") + psm_id = "SpecId" + + split_data = target_psms[psm_id].str.rsplit("-", n=13, expand=True) + new_columns = [ + "raw_file", + "scan_number", + "mod_pep_a", + "mod_pep_b", + "charge", + "decoy_p1", + "is_decoy_p1", + "decoy_p2", + "is_decoy_p2", + "fdr_group", + "base_sequence_p1", + "base_sequence_p2", + "index", + ] split_data.columns = new_columns df_psm_target = pd.concat([target_psms, split_data], axis=1) - split_data = decoy_psms['SpecId'].str.rsplit('-', n=10, expand=True) - new_columns = ["raw_file","scan_number", "mod_pep_a", "mod_pep_b", "charge", - "decoy_p1", "is_decoy_p1", "decoy_p2", "is_decoy_p2", "index"] + split_data = decoy_psms[psm_id].str.rsplit("-", n=13, expand=True) split_data.columns = new_columns df_psm_decoy = pd.concat([decoy_psms, split_data], axis=1) - df_mokapot = pd.concat([df_psm_decoy , df_psm_target], axis = 0) - df_mokapot[['index_csm', '_', 'which_pep',]] = df_mokapot['index'].str.split('_', expand=True) - df_mokapot.drop(columns=['index', '_', 'decoy_p1', 'decoy_p2'], inplace=True) - df_pep_1 = df_mokapot[df_mokapot['which_pep'] == "1"].copy() - df_pep_2 = df_mokapot[df_mokapot['which_pep'] == "2"].copy() - df_pep_1.drop(columns=['which_pep'], inplace=True) - df_pep_2.drop(columns=['which_pep'], inplace=True) - df_pep_1.columns = [col + '_a' if col != 'index_csm' else col for col in df_pep_1.columns] - df_pep_2.columns = [col + '_b' if col != 'index_csm' else col for col in df_pep_2.columns] - mokapot_csms = pd.merge(df_pep_1, df_pep_2, on='index_csm') - mokapot_csms['mokapot score'] = mokapot_csms.apply(min_score, axis=1) - mokapot_csms['label'] = mokapot_csms.apply(get_label, axis=1) - mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms.csv") + df_mokapot = pd.concat([df_psm_decoy, df_psm_target], axis=0) + df_mokapot[ + [ + "index_csm", + "_", + "which_pep", + ] + ] = df_mokapot[ + "index" + ].str.split("_", expand=True) + df_mokapot.drop(columns=["index", "_", "decoy_p1", "decoy_p2"], inplace=True) + df_pep_1 = df_mokapot[df_mokapot["which_pep"] == "1"].copy() + df_pep_2 = df_mokapot[df_mokapot["which_pep"] == "2"].copy() + df_pep_1.drop(columns=["which_pep"], inplace=True) + df_pep_2.drop(columns=["which_pep"], inplace=True) + df_pep_1.columns = [col + "_a" if col != "index_csm" else col for col in df_pep_1.columns] + df_pep_2.columns = [col + "_b" if col != "index_csm" else col for col in df_pep_2.columns] + mokapot_csms = pd.merge(df_pep_1, df_pep_2, 
on="index_csm") + mokapot_csms["mokapot score"] = mokapot_csms.apply(min_score, axis=1) + mokapot_csms["label"] = mokapot_csms.apply(get_label, axis=1) + mokapot_csms.rename(columns={'fdr_group_b': 'fdr_group'}, inplace=True) + mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_5.csv") return mokapot_csms -def prepare_rescore_xl_psm_level(featrures_dir: Path, original_or_rescore: str ): +def prepare_rescore_xl_psm_level(featrures_dir: Path, original_or_rescore: str): def extract_label_pep_a(specid): if "-decoy_p1-True-" in specid: return -1 @@ -676,6 +793,7 @@ def extract_label_pep_a(specid): return 1 else: return None + def extract_label_pep_b(specid): if "-decoy_p2-True-" in specid: return -1 @@ -683,98 +801,121 @@ def extract_label_pep_b(specid): return 1 else: return None - - columns_to_remove_psm_rescore= ["run_name", - "scan_number", - "precursor_mass", - "precursor_charge", - "crosslinker_name", - "base_sequence_p1", - "aa_len_p1", - "link_pos_p1", - "linked_aa_p1", - "mods_p1", - "mod_pos_p1", - "base_sequence_p2", - "aa_len_p2", - "link_pos_p2", - "linked_aa_p2", - "mods_p2", - "mod_pos_p2", - "linear", - "match_score", - "decoy", - "RAW_FILE", - "MASS", - "PRECURSOR_CHARGE", - "CROSSLINKER_TYPE", - "REVERSE", - "SCAN_NUMBER", - "SEQUENCE_A", - "SEQUENCE_B", - "Modifications_A", - "Modifications_B", - "CROSSLINKER_POSITION_A", - "CROSSLINKER_POSITION_B", - "ModificationPositions1", - "MZ_RANGE", - "ModificationPositions2", - "MODIFIED_SEQUENCE_A", - "MODIFIED_SEQUENCE_B", - "FRAGMENTATION", - "INSTRUMENT_TYPES", - "MASS_ANALYZER", - "Proteins", - "spectral_angle", - "SCORE", - "Unnamed: 0" + + columns_to_remove_psm_rescore = [ + "run_name", + "scan_number", + "precursor_mass", + "precursor_charge", + "crosslinker_name", + "aa_len_p1", + "link_pos_p1", + "linked_aa_p1", + "mods_p1", + "mod_pos_p1", + "aa_len_p2", + "link_pos_p2", + "linked_aa_p2", + "mods_p2", + "mod_pos_p2", + "linear", + "match_score", + "decoy", + "RAW_FILE", + "MASS", + "PRECURSOR_CHARGE", + "CROSSLINKER_TYPE", + "REVERSE", + "SCAN_NUMBER", + "SEQUENCE_A", + "SEQUENCE_B", + "Modifications_A", + "Modifications_B", + "CROSSLINKER_POSITION_A", + "CROSSLINKER_POSITION_B", + "ModificationPositions1", + "MZ_RANGE", + "ModificationPositions2", + "MODIFIED_SEQUENCE_A", + "MODIFIED_SEQUENCE_B", + "FRAGMENTATION", + "INSTRUMENT_TYPES", + "MASS_ANALYZER", + "Proteins", + "spectral_angle", + "SCORE", + "Unnamed: 0", ] - columns_to_remove_psm_original = list(set(columns_to_remove_psm_rescore + ["match_score"]) - set(["spectral_angle"])) + columns_to_remove_psm_original = list(set(columns_to_remove_psm_rescore + ["match_score"]) - {"spectral_angle"}) if original_or_rescore == "original": columns_to_remove_psm = columns_to_remove_psm_original - rescore_tab_file = pd.read_csv(featrures_dir + "/original.tab", sep="\t") + rescore_tab_file = pd.read_csv(featrures_dir + "/original.tab", sep="\t") else: columns_to_remove_psm = columns_to_remove_psm_rescore - rescore_tab_file = pd.read_csv(featrures_dir + "/rescore.tab", sep="\t") - + rescore_tab_file = pd.read_csv(featrures_dir + "/rescore.tab", sep="\t") rescore_tab_file.drop(columns=columns_to_remove_psm, inplace=True) rescore_tab_file = rescore_tab_file.fillna(0) - string_columns = ['SpecId', 'Peptide', 'Proteins', 'protein_p1', 'protein_p2', 'filename'] - rescore_tab_file_numeric_columns = [col for col in rescore_tab_file.columns if col not in string_columns] - rescore_tab_file[rescore_tab_file_numeric_columns] = 
rescore_tab_file[rescore_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce') + string_columns = ["SpecId", "Peptide", "Proteins", "protein_p1", "protein_p2", "filename", "fdr_group", "base_sequence_p1", "base_sequence_p2"] + rescore_tab_file_numeric_columns = [col for col in rescore_tab_file.columns if col not in string_columns] + rescore_tab_file[rescore_tab_file_numeric_columns] = rescore_tab_file[rescore_tab_file_numeric_columns].apply( + pd.to_numeric, errors="coerce" + ) rescore_tab_file = rescore_tab_file.reset_index(drop=True) - rescore_tab_file['Proteins'] = 'p1_' + rescore_tab_file['protein_p1'].astype(str) + '_p2_' + rescore_tab_file['protein_p2'].astype(str) - rescore_tab_file['SpecId'] = ( - rescore_tab_file['SpecId'] + '-decoy_p1-' + - rescore_tab_file['decoy_p1'].astype(str) + '-decoy_p2-' + - rescore_tab_file['decoy_p2'].astype(str) + '-' + - rescore_tab_file.index.astype(str) # Adding index to SpecId -) - rescore_tab_file.drop(columns=[ 'protein_p1', 'protein_p2', 'decoy_p1', 'decoy_p2'], inplace=True) - columns_feature_pep_a = [col for col in rescore_tab_file.columns if col.endswith('_a') or col.endswith('_A') or not (col.endswith('_b') or col.endswith('_B'))] - columns_feature_pep_b = [col for col in rescore_tab_file.columns if col.endswith('_b') or col.endswith('_B') or not (col.endswith('_a') or col.endswith('_A'))] + rescore_tab_file["Proteins"] = ( + "p1_" + rescore_tab_file["protein_p1"].astype(str) + "_p2_" + rescore_tab_file["protein_p2"].astype(str) + ) + rescore_tab_file["SpecId"] = ( + rescore_tab_file["SpecId"] + + "-decoy_p1-" + + rescore_tab_file["decoy_p1"].astype(str) + + "-decoy_p2-" + + rescore_tab_file["decoy_p2"].astype(str) + + "-" + + rescore_tab_file["fdr_group"].astype(str) + + "-" + + rescore_tab_file["base_sequence_p1"].astype(str) + + "-" + + rescore_tab_file["base_sequence_p2"].astype(str) + + "-" + + rescore_tab_file.index.astype(str) # Adding index to SpecId + ) + rescore_tab_file.drop(columns=["protein_p1", "protein_p2", "decoy_p1", "decoy_p2", "fdr_group","base_sequence_p1", "base_sequence_p2"], inplace=True) + columns_feature_pep_a = [ + col + for col in rescore_tab_file.columns + if col.endswith("_a") or col.endswith("_A") or not (col.endswith("_b") or col.endswith("_B")) + ] + columns_feature_pep_b = [ + col + for col in rescore_tab_file.columns + if col.endswith("_b") or col.endswith("_B") or not (col.endswith("_a") or col.endswith("_A")) + ] rescore_tab_file_a = rescore_tab_file.loc[:, columns_feature_pep_a] rescore_tab_file_b = rescore_tab_file.loc[:, columns_feature_pep_b] - rescore_tab_file_a['SpecId'] = rescore_tab_file_a['SpecId'] + '_' + rescore_tab_file_a.index.astype(str) - rescore_tab_file_b['SpecId'] = rescore_tab_file_b['SpecId'] + '_' + rescore_tab_file_b.index.astype(str) - #1 means pep a and 2 means pep b - rescore_tab_file_a['SpecId'] = rescore_tab_file_a['SpecId'] + '_1' - rescore_tab_file_b['SpecId'] = rescore_tab_file_b['SpecId'] + '_2' - rescore_tab_file_a.columns = [col[:-2] if col.endswith(('_a', '_A')) else col for col in rescore_tab_file_a.columns] - rescore_tab_file_b.columns = [col[:-2] if col.endswith(('_b', '_B')) else col for col in rescore_tab_file_b.columns] - rescore_tab_file_a['Label'] = rescore_tab_file_a['SpecId'].apply(extract_label_pep_a) - rescore_tab_file_b['Label'] = rescore_tab_file_b['SpecId'].apply(extract_label_pep_b) - string_columns = ['SpecId', 'Peptide', 'Proteins', 'filename'] - rescore_tab_file_numeric_columns = [col for col in rescore_tab_file_a.columns if col not in 
string_columns] - rescore_tab_file_a[rescore_tab_file_numeric_columns] = rescore_tab_file_a[rescore_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce') - rescore_tab_file_b[rescore_tab_file_numeric_columns] = rescore_tab_file_b[rescore_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce') + rescore_tab_file_a["SpecId"] = rescore_tab_file_a["SpecId"] + "_" + rescore_tab_file_a.index.astype(str) + rescore_tab_file_b["SpecId"] = rescore_tab_file_b["SpecId"] + "_" + rescore_tab_file_b.index.astype(str) + # 1 means pep a and 2 means pep b + rescore_tab_file_a["SpecId"] = rescore_tab_file_a["SpecId"] + "_1" + rescore_tab_file_b["SpecId"] = rescore_tab_file_b["SpecId"] + "_2" + rescore_tab_file_a.columns = [col[:-2] if col.endswith(("_a", "_A")) else col for col in rescore_tab_file_a.columns] + rescore_tab_file_b.columns = [col[:-2] if col.endswith(("_b", "_B")) else col for col in rescore_tab_file_b.columns] + rescore_tab_file_a["Label"] = rescore_tab_file_a["SpecId"].apply(extract_label_pep_a) + rescore_tab_file_b["Label"] = rescore_tab_file_b["SpecId"].apply(extract_label_pep_b) + string_columns = ["SpecId", "Peptide", "Proteins", "filename"] + rescore_tab_file_numeric_columns = [col for col in rescore_tab_file_a.columns if col not in string_columns] + rescore_tab_file_a[rescore_tab_file_numeric_columns] = rescore_tab_file_a[rescore_tab_file_numeric_columns].apply( + pd.to_numeric, errors="coerce" + ) + rescore_tab_file_b[rescore_tab_file_numeric_columns] = rescore_tab_file_b[rescore_tab_file_numeric_columns].apply( + pd.to_numeric, errors="coerce" + ) # change ExpMass of rescore_tab_file_a - max_ExpMass = rescore_tab_file_a['ExpMass'].max() - rescore_tab_file_b['ExpMass'] += max_ExpMass + max_ExpMass = rescore_tab_file_a["ExpMass"].max() + rescore_tab_file_b["ExpMass"] += max_ExpMass input_mokapot_psm = pd.concat([rescore_tab_file_a, rescore_tab_file_b], axis=0, ignore_index=True) - input_mokapot_psm['Proteins'].fillna("unknown", inplace=True) + input_mokapot_psm["Proteins"].fillna("unknown", inplace=True) return input_mokapot_psm @@ -816,7 +957,6 @@ def run_rescoring(config_path: Union[str, Path]): fdr_dir = config.output / "results" / config.fdr_estimation_method original_tab_files = [fdr_dir / spectra_file.with_suffix(".original.tab").name for spectra_file in spectra_files] rescore_tab_files = [fdr_dir / spectra_file.with_suffix(".rescore.tab").name for spectra_file in spectra_files] - prepare_tab_original_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_original") prepare_tab_rescore_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_prosit") @@ -830,23 +970,32 @@ def run_rescoring(config_path: Union[str, Path]): logger.info("Merging input tab files for rescoring with peptide property prediction") re.merge_input(tab_files=rescore_tab_files, output_file=fdr_dir / "rescore.tab") prepare_tab_rescore_step.mark_done() - + # rescoring - if "xl" in config.models["intensity"].lower(): #xl-psm-level + if "xl" in config.models["intensity"].lower(): # xl-psm-level input_mokapot_psm_rescore = prepare_rescore_xl_psm_level(str(fdr_dir), "rescore") input_mokapot_psm_rescore.to_csv(str(fdr_dir) + "/rescore.tab", sep="\t", index=None) input_mokapot_psm_original = prepare_rescore_xl_psm_level(str(fdr_dir), "original") input_mokapot_psm_original.to_csv(str(fdr_dir) + "/original.tab", sep="\t", index=None) - _rescore(fdr_dir, config) - mokapot_csms_rescore = xl_psm_to_csm(str(fdr_dir), "rescore") - mokapot_csms_original = 
xl_psm_to_csm(str(fdr_dir), "original") - mokapot_csms_rescore = xl_fdr(mokapot_csms_rescore, score = "mokapot score") - mokapot_csms_original = xl_fdr(mokapot_csms_original, score = "mokapot score") - xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_rescore, "rescore") - xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_original, "original") - xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_rescore, "rescore") - xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_original, "original") - + _rescore(fdr_dir, config, xl=True) + if config.fdr_estimation_method == "percolator": + mokapot_csms_rescore = xl_psm_to_csm(str(fdr_dir), "rescore", "percolator") + mokapot_csms_original = xl_psm_to_csm(str(fdr_dir), "original", "percolator") + mokapot_csms_rescore = xl_between_or_self(mokapot_csms_rescore, score="mokapot score") + mokapot_csms_original = xl_between_or_self(mokapot_csms_original, score="mokapot score") + xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_rescore, "rescore", "percolator") + xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_original, "original", "percolator") + xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_rescore, "rescore", "percolator") + xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_original, "original", "percolator") + else: + mokapot_csms_rescore = xl_psm_to_csm(str(fdr_dir), "rescore", "mokapot") + mokapot_csms_original = xl_psm_to_csm(str(fdr_dir), "original", "mokapot") + mokapot_csms_rescore = xl_between_or_self(mokapot_csms_rescore, score="mokapot score") + mokapot_csms_original = xl_between_or_self(mokapot_csms_original, score="mokapot score") + xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_rescore, "rescore", "mokapot") + xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_original, "original", "mokapot") + xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_rescore, "rescore", "mokapot") + xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_original, "original", "mokapot") else: _rescore(fdr_dir, config)
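
Notes on the cross-link rescoring flow introduced above, with small self-contained Python sketches.

With the xl flag, predict() (backed by Koina.predict_xl) now returns one intensity dictionary per cross-linked peptide instead of a single dictionary. The stub below only illustrates that calling convention and the array shape downstream code consumes; it does not talk to a Koina server, and the 348-column width is an assumption derived from the 58 fragment positions * 6 ion columns laid out for xl models in spectra.py.

    from typing import Dict, Tuple, Union

    import numpy as np


    def predict_stub(
        n_psms: int, xl: bool = False
    ) -> Union[Dict[str, np.ndarray], Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]]:
        def fake_result() -> Dict[str, np.ndarray]:
            # one row per PSM/CSM, one column per annotated fragment ion
            return {"intensities": np.random.rand(n_psms, 348)}

        if xl:
            return fake_result(), fake_result()  # peptide A, peptide B
        return fake_result()


    intensities_a, intensities_b = predict_stub(10, xl=True)  # cross-linked: two matrices
    intensities = predict_stub(10)  # linear: one matrix, as before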
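
During CE calibration for cross-linked spectra, _alignment computes a spectral angle per peptide (SPECTRAL_ANGLE_A, SPECTRAL_ANGLE_B) and averages the two into SPECTRAL_ANGLE. The toy version below assumes the standard normalised spectral contrast angle; the real metric comes from spectrum_fundamentals' SimilarityMetrics and additionally handles ion masking, so this is only a sketch of how the per-peptide angles are combined.

    import numpy as np


    def spectral_angle(pred: np.ndarray, true: np.ndarray) -> np.ndarray:
        # normalised spectral contrast angle per spectrum (rows = spectra)
        pred_n = pred / np.linalg.norm(pred, axis=1, keepdims=True)
        true_n = true / np.linalg.norm(true, axis=1, keepdims=True)
        cos = np.clip(np.sum(pred_n * true_n, axis=1), -1.0, 1.0)
        return 1 - 2 * np.arccos(cos) / np.pi


    rng = np.random.default_rng(0)
    pred_a, raw_a = rng.random((5, 348)), rng.random((5, 348))
    pred_b, raw_b = rng.random((5, 348)), rng.random((5, 348))
    csm_sa = (spectral_angle(pred_a, raw_a) + spectral_angle(pred_b, raw_b)) / 2  # per-CSM SPECTRAL_ANGLE
    print(csm_sa)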
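
rescore_with_percolator now assembles a reduced command line when xl is set: the peptide-level result files and the threading/TDC/FDR tuning options are only passed for linear PSMs. Below is a sketch of that switch as an argument list; the flags are the ones appearing in the patch, while the weights/input file names are placeholders following the prefix pattern used there, and the option order is not significant.

    from pathlib import Path
    from typing import List


    def build_percolator_cmd(
        fdr_dir: Path, prefix: str, xl: bool, num_threads: int = 3, test_fdr: float = 0.01, train_fdr: float = 0.01
    ) -> List[str]:
        cmd = [
            "percolator",
            "--weights", str(fdr_dir / f"{prefix}.percolator.weights.csv"),
            "--results-psms", str(fdr_dir / f"{prefix}.percolator.psms.txt"),
            "--decoy-results-psms", str(fdr_dir / f"{prefix}.percolator.decoy.psms.txt"),
        ]
        if not xl:
            # peptide-level outputs and tuning options are only used for linear PSMs
            cmd += [
                "--num-threads", str(num_threads),
                "--subset-max-train", "500000",
                "--post-processing-tdc",
                "--testFDR", str(test_fdr),
                "--trainFDR", str(train_fdr),
                "--results-peptides", str(fdr_dir / f"{prefix}.percolator.peptides.txt"),
                "--decoy-results-peptides", str(fdr_dir / f"{prefix}.percolator.decoy.peptides.txt"),
            ]
        return cmd + [str(fdr_dir / f"{prefix}.tab")]


    print(" ".join(build_percolator_cmd(Path("results/percolator"), "rescore", xl=True)))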
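
prepare_rescore_xl_psm_level turns every CSM row of original.tab / rescore.tab into two PSM rows so that percolator or mokapot scores each peptide independently: peptide-specific columns (suffixes _a/_A and _b/_B) are separated, shared columns are copied to both rows, and the SpecId is extended with _1 / _2 so the rows can be re-paired later (the patch additionally offsets ExpMass for the second peptide's rows). A condensed illustration with made-up feature names:

    import pandas as pd


    def split_csm_features(csms: pd.DataFrame) -> pd.DataFrame:
        def take(side: str, tag: str) -> pd.DataFrame:
            other = "_b" if side == "_a" else "_a"
            keep = [c for c in csms.columns if not c.lower().endswith(other)]
            part = csms[keep].copy()
            part.columns = [c[:-2] if c.lower().endswith(side) else c for c in part.columns]
            part["SpecId"] = part["SpecId"] + tag  # _1 = peptide a, _2 = peptide b
            return part

        return pd.concat([take("_a", "_1"), take("_b", "_2")], ignore_index=True)


    csms = pd.DataFrame(
        {"SpecId": ["run1-1234"], "spectral_angle_a": [0.81], "spectral_angle_b": [0.64], "ExpMass": [2350.1]}
    )
    print(split_csm_features(csms))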
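
xl_psm_to_csm reverses that split after rescoring: the two peptide-level result rows belonging to one CSM are merged on a shared CSM index, the CSM score is the worse (minimum) of the two peptide scores, and the label becomes TT, TD or DD depending on how many of the two peptides are decoys. The essential logic, with simplified column names standing in for the suffixed ones produced in the patch:

    import pandas as pd


    def pair_psms_to_csms(psms: pd.DataFrame) -> pd.DataFrame:
        # psms: one row per peptide with columns csm_id, which_pep (1 or 2), score, is_decoy
        pep_a = psms[psms["which_pep"] == 1].set_index("csm_id").add_suffix("_a")
        pep_b = psms[psms["which_pep"] == 2].set_index("csm_id").add_suffix("_b")
        csms = pep_a.join(pep_b, how="inner").reset_index()
        csms["score"] = csms[["score_a", "score_b"]].min(axis=1)  # a CSM is only as good as its weaker peptide
        csms["label"] = (csms["is_decoy_a"].astype(int) + csms["is_decoy_b"]).map({0: "TT", 1: "TD", 2: "DD"})
        return csms


    psms = pd.DataFrame(
        {
            "csm_id": [0, 0, 1, 1],
            "which_pep": [1, 2, 1, 2],
            "score": [3.2, 1.7, 2.5, 2.9],
            "is_decoy": [False, False, False, True],
        }
    )
    print(pair_psms_to_csms(psms)[["csm_id", "score", "label"]])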
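
Finally, the cross-link FDR itself: xl_fdr sorts the paired CSMs by score and reports the running estimate (TD - DD) / TT as a q-value column, and xl_between_or_self applies it separately to between- and self-links before concatenating the results. A minimal standalone sketch using the same "label", "q-value" and "fdr_group" column names and an illustrative score column:

    import pandas as pd


    def xl_fdr_sketch(df: pd.DataFrame, score: str = "score") -> pd.DataFrame:
        # best-scoring CSMs first, then running counts of the three label classes
        df = df.sort_values(by=score, ascending=False).copy()
        td = (df["label"] == "TD").cumsum()
        dd = (df["label"] == "DD").cumsum()
        tt = (df["label"] == "TT").cumsum()
        df["q-value"] = (td - dd) / tt
        df.loc[tt == 0, "q-value"] = 0  # guard against division by zero before the first TT
        return df


    def grouped_xl_fdr_sketch(df: pd.DataFrame, score: str = "score") -> pd.DataFrame:
        # between- and self-links are treated as separate populations, as in xl_between_or_self
        return pd.concat([xl_fdr_sketch(part, score=score) for _, part in df.groupby("fdr_group")], axis=0)


    toy = pd.DataFrame(
        {
            "label": ["TT", "TT", "TD", "DD", "TT"],
            "score": [9.3, 8.1, 7.4, 6.0, 5.2],
            "fdr_group": ["between", "self", "between", "between", "self"],
        }
    )
    print(grouped_xl_fdr_sketch(toy))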