diff --git a/oktoberfest/preprocessing/preprocessing.py b/oktoberfest/preprocessing/preprocessing.py old mode 100644 new mode 100755 index 13198c8..0aba595 --- a/oktoberfest/preprocessing/preprocessing.py +++ b/oktoberfest/preprocessing/preprocessing.py @@ -396,6 +396,8 @@ def convert_search( tmt_label: str = "", custom_mods: Optional[dict[str, int]] = None, output_file: Optional[Union[str, Path]] = None, + ptm_unimod_id: Optional[int] = 0, + ptm_sites: Optional[list] = None, ) -> pd.DataFrame: r""" Convert search results to Oktoberfest format. @@ -414,7 +416,8 @@ def convert_search( static and variable mods as keys. The values are the integer values of the respective unimod identifier :param output_file: Optional path to the location where the converted search results should be written to. If this is omitted, the results are not stored. - + :param ptm_unimod_id: unimod id used for site localization + :param ptm_sites: possible sites that the ptm can exist on :raises ValueError: if an unsupported search engine was given :return: A dataframe containing the converted results. @@ -503,7 +506,11 @@ def convert_search( raise ValueError(f"Unknown search engine provided: {search_engine}") return search_result(input_path).generate_internal( - tmt_label=tmt_label, out_path=output_file, custom_mods=custom_mods + tmt_label=tmt_label, + out_path=output_file, + custom_mods=custom_mods, + ptm_unimod_id=ptm_unimod_id, + ptm_sites=ptm_sites, ) @@ -850,6 +857,7 @@ def annotate_spectral_library( mass_tol: Optional[float] = None, unit_mass_tol: Optional[str] = None, custom_mods: Optional[dict[str, float]] = None, + annotate_neutral_loss: Optional[bool] = False, ) -> Spectra: """ Annotate all specified ion peaks of given PSMs (Default b and y ions). @@ -865,6 +873,7 @@ def annotate_spectral_library( :param unit_mass_tol: The unit in which the mass tolerance is given :param fragmentation_method: fragmentation method that was used :param custom_mods: mapping of custom UNIMOD string identifiers ('[UNIMOD:xyz]') to their mass + :param annotate_neutral_loss: flag to indicate whether to annotate neutral loss peaks or not :return: Spectra object containing the annotated b and y ion peaks including metadata @@ -892,6 +901,7 @@ def annotate_spectral_library( unit_mass_tolerance=unit_mass_tol, fragmentation_method=fragmentation_method, custom_mods=custom_mods, + annotate_neutral_loss=annotate_neutral_loss, ) ion_types = retrieve_ion_types(fragmentation_method) @@ -903,6 +913,8 @@ def annotate_spectral_library( ) aspec.add_mzs(np.stack(df_annotated_spectra["MZ"]), FragmentType.MZ) aspec.add_column(df_annotated_spectra["CALCULATED_MASS"].values, "CALCULATED_MASS") + aspec.add_column(df_annotated_spectra["EXPECTED_NL_COUNT"].values, "EXPECTED_NL_COUNT") + aspec.add_column(df_annotated_spectra["ANNOTATED_NL_COUNT"].values, "ANNOTATED_NL_COUNT") aspec.strings_to_categoricals() logger.info("Finished annotating.") diff --git a/oktoberfest/rescore/rescore.py b/oktoberfest/rescore/rescore.py index 390b433..eaf2f24 100644 --- a/oktoberfest/rescore/rescore.py +++ b/oktoberfest/rescore/rescore.py @@ -25,6 +25,8 @@ def generate_features( additional_columns: str | list | None = None, all_features: bool = False, regression_method: str = "spline", + add_neutral_loss_features: bool = False, + remove_miss_cleavage_features: bool = False, ): """ Generate features to be used for percolator or mokapot target decoy separation. @@ -38,6 +40,8 @@ def generate_features( :param additional_columns: additional columns supplied in the search results to be used as features (either a list or "all") :param all_features: whether to use all features or only the standard set TODO :param regression_method: The regression method to use for iRT alignment + :param add_neutral_loss_features: Flag to indicate whether to add neutral loss features to percolator or not + :param remove_miss_cleavage_features: Flag to indicate whether to remove miss cleavage features from percolator or not :Example: @@ -87,6 +91,8 @@ def generate_features( additional_columns=additional_columns, all_features_flag=all_features, regression_method=regression_method, + neutral_loss_flag=add_neutral_loss_features, + drop_miss_cleavage_flag=remove_miss_cleavage_features, ) perc_features.calc() perc_features.write_to_file(str(output_file)) diff --git a/oktoberfest/runner.py b/oktoberfest/runner.py old mode 100644 new mode 100755 index a434d61..74ed752 --- a/oktoberfest/runner.py +++ b/oktoberfest/runner.py @@ -61,13 +61,16 @@ def _preprocess(spectra_files: list[Path], config: Config) -> list[Path]: msms_output.mkdir(exist_ok=True) internal_search_file = msms_output / "msms.prosit" tmt_label = config.tag - + ptm_unimods = config.ptm_unimod_id + ptm_sites = config.ptm_possible_sites search_results = pp.convert_search( input_path=config.search_results, search_engine=config.search_results_type, tmt_label=tmt_label, custom_mods=config.custom_to_unimod(), output_file=internal_search_file, + ptm_unimod_id=ptm_unimods, + ptm_sites=ptm_sites, ) if config.spectra_type.lower() in ["d", "hdf"]: timstof_metadata = pp.convert_timstof_metadata( @@ -149,12 +152,14 @@ def _annotate_and_get_library(spectra_file: Path, config: Config, tims_meta_file spectra["INSTRUMENT_TYPES"] = config_instrument_type search = pp.load_search(config.output / "msms" / spectra_file.with_suffix(".rescore").name) library = pp.merge_spectra_and_peptides(spectra, search) + annotate_neutral_loss = config.ptm_use_neutral_loss aspec = pp.annotate_spectral_library( psms=library, mass_tol=config.mass_tolerance, unit_mass_tol=config.unit_mass_tolerance, fragmentation_method=config.fragmentation_method, custom_mods=config.unimod_to_mass(), + annotate_neutral_loss=annotate_neutral_loss, ) aspec.write_as_hdf5(hdf5_path) # write_metadata_annotation @@ -562,7 +567,6 @@ def _refinement_learn(spectra_files: list[Path], config: Config): def _calculate_features(spectra_file: Path, config: Config): library = _ce_calib(spectra_file, config) - calc_feature_step = ProcessStep(config.output, "calculate_features." + spectra_file.stem) if calc_feature_step.is_done(): return @@ -597,7 +601,8 @@ def _calculate_features(spectra_file: Path, config: Config): # produce percolator tab files fdr_dir = config.output / "results" / config.fdr_estimation_method fdr_dir.mkdir(exist_ok=True) - + add_neutral_loss_features = config.ptm_use_neutral_loss + remove_miss_cleavage_features = ("R" in config.ptm_possible_sites) or ("K" in config.ptm_possible_sites) re.generate_features( library=library, search_type="original", @@ -613,6 +618,8 @@ def _calculate_features(spectra_file: Path, config: Config): additional_columns=config.use_feature_cols, all_features=config.all_features, regression_method=config.curve_fitting_method, + add_neutral_loss_features=add_neutral_loss_features, + remove_miss_cleavage_features=remove_miss_cleavage_features, ) calc_feature_step.mark_done() @@ -634,8 +641,13 @@ def _rescore(fdr_dir: Path, config: Config): re.rescore_with_percolator(input_file=fdr_dir / "original.tab", output_folder=fdr_dir) rescore_original_step.mark_done() if not rescore_prosit_step.is_done(): - re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir) - rescore_prosit_step.mark_done() + logger.info("Start percolator rescoring") + logger.info(config.ptm_localization) + if config.ptm_localization: + _ptm_localization_rescore(fdr_dir, config) + else: + re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir) + rescore_prosit_step.mark_done() elif config.fdr_estimation_method == "mokapot": if not rescore_original_step.is_done(): re.rescore_with_mokapot(input_file=fdr_dir / "original.tab", output_folder=fdr_dir) @@ -649,6 +661,43 @@ def _rescore(fdr_dir: Path, config: Config): ) +def _ptm_localization_rescore(fdr_dir: Path, config: Config): + """ + Helper function for running percolator to do PTM localization. + + :param fdr_dir: the output directory + :param config: the configuration object + """ + df_rescore = pd.read_csv(fdr_dir / "rescore.tab", sep="\t") + unimod_id = config.ptm_unimod_id + df_rescore["id"] = df_rescore["filename"] + df_rescore["ScanNr"].astype(str) + df_rescore_fil = df_rescore[df_rescore["Peptide"].str.contains("UNIMOD:" + str(unimod_id))] + df_rescore = df_rescore[df_rescore["id"].isin(df_rescore_fil["id"].tolist())] + df_rescore.drop(columns=["id"], inplace=True) + df_rescore.to_csv(fdr_dir / "rescore.tab", sep="\t", index=False) + new_rescore_dir = fdr_dir / "localize_mod" + new_rescore_dir.mkdir(parents=True, exist_ok=True) + + if unimod_id == 7: + re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir) + df_rescore_psms_targets = pd.read_csv(fdr_dir / "rescore.percolator.psms.txt", sep="\t") + df_rescore_psms_decoys = pd.read_csv(fdr_dir / "rescore.percolator.decoy.psms.txt", sep="\t") + df_rescore_psms_targets = df_rescore_psms_targets[ + df_rescore_psms_targets["peptide"].apply(lambda x: "UNIMOD:" + str(unimod_id) in x) + ] + df_rescore_psms_decoys = df_rescore_psms_decoys[ + df_rescore_psms_decoys["peptide"].apply(lambda x: "UNIMOD:" + str(unimod_id) in x) + ] + df_rescore_psms = pd.concat([df_rescore_psms_targets, df_rescore_psms_decoys]) + df_rescore_psms = df_rescore_psms[["PSMId"]] + df_rescore_psms.rename(columns={"PSMId": "SpecId"}, inplace=True) + df_rescore = df_rescore.merge(df_rescore_psms, on="SpecId", how="inner") + df_rescore.to_csv(new_rescore_dir / "rescore.tab", sep="\t", index=False) + re.rescore_with_percolator(input_file=new_rescore_dir / "rescore.tab", output_folder=new_rescore_dir) + else: + re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=new_rescore_dir) + + def run_rescoring(config_path: Union[str, Path]): """ Create a ReScore object and run the rescoring. @@ -717,7 +766,8 @@ def run_rescoring(config_path: Union[str, Path]): # plotting logger.info("Generating summary plots...") - pl.plot_all(fdr_dir) + if not config.ptm_localization: + pl.plot_all(fdr_dir) logger.info("Finished rescoring.") if config.quantification: diff --git a/oktoberfest/utils/config.py b/oktoberfest/utils/config.py index b700d36..8165040 100644 --- a/oktoberfest/utils/config.py +++ b/oktoberfest/utils/config.py @@ -354,6 +354,36 @@ def nr_ox(self) -> int: """Get the maximum number of oxidations allowed on M residues in peptides during spectral library generation.""" return self.spec_lib_options.get("nrOx", 1) + ########################### + # these are PTM localization options # + ########################### + + @property + def ptm_localization(self) -> bool: + """Get ptm localization flag from the config file.""" + return self.data.get("ptm_localization", False) + + @property + def ptm_localization_options(self) -> dict: + """Get ptm localization dictionary from the config file.""" + return self.data.get("ptmLocalizationOptions", {}) + + @property + def ptm_unimod_id(self) -> int: + """Get the unimod id required for localization.""" + unimod_id = self.ptm_localization_options.get("unimod_id", 0) + return unimod_id + + @property + def ptm_possible_sites(self) -> list: + """Get which sites ptm can exist on.""" + return self.ptm_localization_options.get("possible_sites", []) + + @property + def ptm_use_neutral_loss(self) -> bool: + """Get neutral loss flag to indicate whether to add a score for this or not.""" + return self.ptm_localization_options.get("neutral_loss", False) + ##################################################################### # these are local prediction / transfer&refinement learning options # ##################################################################### diff --git a/pyproject.toml b/pyproject.toml index 3b56e8c..d6ced96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,8 +29,8 @@ matplotlib = "^3.6.3" mokapot = ">=0.9.1,<0.11.0" numpy = ">=1.20,<1.25" seaborn = ">=0.12.2,<0.14.0" -spectrum_fundamentals = ">=0.7.3,<0.8.0" -spectrum-io = ">=0.6.2,<0.7.0" +spectrum_fundamentals = ">=0.7.4,<0.8.0" +spectrum-io = ">=0.6.3,<0.7.0" picked_group_fdr = ">=0.7.1" dlomix = {extras = ["rltl-report", "wandb"], git = "git@github.com:wilhelm-lab/dlomix.git", branch = "feature/bmpc", optional = true}