diff --git a/oktoberfest/data/spectra.py b/oktoberfest/data/spectra.py index ca4723a5..62095b0b 100644 --- a/oktoberfest/data/spectra.py +++ b/oktoberfest/data/spectra.py @@ -247,6 +247,7 @@ def from_hdf5(cls: Type[SpectraT], input_file: Union[str, Path]): :param input_file: path to input file :return: a spectra instance """ + return cls(anndata.read_h5ad(str(input_file))) def convert_to_df(self) -> pd.DataFrame: diff --git a/oktoberfest/predict/koina.py b/oktoberfest/predict/koina.py index 5efa1648..17ac4447 100644 --- a/oktoberfest/predict/koina.py +++ b/oktoberfest/predict/koina.py @@ -375,9 +375,17 @@ def __merge_list_dict_array(dict_list: List[Dict[str, np.ndarray]]) -> Dict[str, merged_dict = model.__merge_list_dict_array(dict_list) print(merged_dict) """ + if not dict_list: + return {} # Return an empty dictionary if dict_list is empty + tmp = [x.keys() for x in dict_list] + + if not tmp or not tmp[0]: + return {} # Return an empty dictionary if tmp is empty or its first element is empty + if not np.all([tmp[0] == x for x in tmp]): raise NotImplementedError(f"Keys of all dictionaries in the list need to be equal {tmp}") + out = {} for k in tmp[0]: out[k] = np.concatenate([x[k] for x in dict_list]) @@ -609,13 +617,14 @@ def __handle_results( infer_results_to_return = [infer_results[i] for i in range(len(infer_results))] return self.__merge_list_dict_array(infer_results_to_return) except AttributeError: - for res in infer_results.values(): - if isinstance(res, InferenceServerException): + exceptions = [res for res in infer_results.values() if isinstance(res, InferenceServerException)] + if exceptions: + for res in exceptions: warnings.warn(res.message(), stacklevel=1) - else: raise InferenceServerException( """ At least one request failed. Check the error message above and try again. 
To get a list of responses run koina.predict(..., debug = True), then call koina.response_dict """ ) from None + diff --git a/oktoberfest/runner.py b/oktoberfest/runner.py index 746f2b64..904460ab 100644 --- a/oktoberfest/runner.py +++ b/oktoberfest/runner.py @@ -109,6 +109,7 @@ def _annotate_and_get_library(spectra_file: Path, config: Config, tims_meta_file data_dir = config.output / "data" data_dir.mkdir(exist_ok=True) hdf5_path = data_dir / spectra_file.with_suffix(".mzml.hdf5").name + print(hdf5_path) if hdf5_path.is_file(): aspec = Spectra.from_hdf5(hdf5_path) else: @@ -143,6 +144,13 @@ def _get_best_ce(library: Spectra, spectra_file: Path, config: Config): results_dir = config.output / "results" results_dir.mkdir(exist_ok=True) if ((library.obs["FRAGMENTATION"] == "HCD") & (library.obs["REVERSE"] == False)).any(): + best_ce = 27 + library.obs["COLLISION_ENERGY"] = best_ce + + with open(results_dir / f"{spectra_file.stem}_ce.txt", "w") as f: + f.write(str(best_ce)) + f.close() + """ server_kwargs = { "server_url": config.prediction_server, "ssl": config.ssl, @@ -184,7 +192,7 @@ def _get_best_ce(library: Spectra, spectra_file: Path, config: Config): delta_ce = ransac.predict(library.obs[["MASS", "PRECURSOR_CHARGE"]]) library.obs["COLLISION_ENERGY"] = np.maximum(0, library.obs["COLLISION_ENERGY"] + delta_ce) - + else: ce_alignment = alignment_library.obs.groupby(by=["COLLISION_ENERGY"])["SPECTRAL_ANGLE"].mean() @@ -201,6 +209,7 @@ def _get_best_ce(library: Spectra, spectra_file: Path, config: Config): with open(results_dir / f"{spectra_file.stem}_ce.txt", "w") as f: f.write(str(best_ce)) f.close() + """ else: best_ce = 35 library.obs["COLLISION_ENERGY"] = best_ce @@ -641,21 +650,21 @@ def xl_preprocessing_plot_unique_xl(featrures_dir: Path, mokapot_csms: pd.DataFr mokapot_csms.loc[~swap_mask, "max_seq"] = mokapot_csms["base_pep_b"] mokapot_uniq_xls = mokapot_csms.drop_duplicates(subset=["min_seq", "max_seq"]) xl_between_or_self(mokapot_uniq_xls, "mokapot score") 
- mokapot_uniq_xls_true = mokapot_uniq_xls[mokapot_uniq_xls["label"] == True] + mokapot_uniq_xls_target = mokapot_uniq_xls[mokapot_uniq_xls["label"] == True] mokapot_uniq_xls_decoy = mokapot_uniq_xls[mokapot_uniq_xls["label"] == False] if original_or_rescore == "original": if percolator_or_mokapot == "percolator": - mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.percolator.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_target.to_csv(featrures_dir + "/original.percolator.peptides.txt", sep="\t", index=False) mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.percolator.decoy.peptides.txt", sep="\t", index=False) else: - mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.mokapot.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_target.to_csv(featrures_dir + "/original.mokapot.peptides.txt", sep="\t", index=False) mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.peptides.txt", sep="\t", index=False) else: if percolator_or_mokapot == "percolator": - mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.percolator.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_target.to_csv(featrures_dir + "/rescore.percolator.peptides.txt", sep="\t", index=False) mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.percolator.decoy.peptides.txt", sep="\t", index=False) else: - mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.mokapot.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_target.to_csv(featrures_dir + "/rescore.mokapot.peptides.txt", sep="\t", index=False) mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.peptides.txt", sep="\t", index=False) @@ -670,10 +679,13 @@ def xl_preprocessing_plot_csm(featrures_dir: Path, mokapot_csms: pd.DataFrame, o "q-value", "q-value_a", "q-value_b", + "score_a", + "score_b", "fdr_group_a", "proteinIds_b", ] - mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_2.csv") + print("salaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaam") + 
print(mokapot_csms.columns.tolist()) mokapot_csms = mokapot_csms[columns_to_keep_mokapot] mokapot_csms = mokapot_csms.rename( columns={ @@ -687,23 +699,25 @@ def xl_preprocessing_plot_csm(featrures_dir: Path, mokapot_csms: pd.DataFrame, o ) mokapot_csms["is_target"] = mokapot_csms["label"] mokapot_csms["label"] = mokapot_csms["label"].replace({"TT": True, "TD": False, "DD": False}) - mokapot_csms_true = mokapot_csms[mokapot_csms["label"] == True] - mokapot_csms_true.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_true.csv") + mokapot_csms.to_csv("/cmnfs/proj/prosit/xl/mokapot_csms.csv", index=False) + print(mokapot_csms.columns.tolist()) + mokapot_csms_target = mokapot_csms[mokapot_csms["label"] == True] + mokapot_csms_decoy = mokapot_csms[mokapot_csms["label"] == False] if original_or_rescore == "original": if percolator_or_mokapot == "percolator": - mokapot_csms_true.to_csv(featrures_dir + "/original.percolator.psms.txt", sep="\t", index=False) + mokapot_csms_target.to_csv(featrures_dir + "/original.percolator.psms.txt", sep="\t", index=False) mokapot_csms_decoy.to_csv(featrures_dir + "/original.percolator.decoy.psms.txt", sep="\t", index=False) else: - mokapot_csms_true.to_csv(featrures_dir + "/original.mokapot.psms.txt", sep="\t", index=False) + mokapot_csms_target.to_csv(featrures_dir + "/original.mokapot.psms.txt", sep="\t", index=False) mokapot_csms_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", sep="\t", index=False) else: if percolator_or_mokapot == "percolator": - mokapot_csms_true.to_csv(featrures_dir + "/rescore.percolator.psms.txt", sep="\t", index=False) + mokapot_csms_target.to_csv(featrures_dir + "/rescore.percolator.psms.txt", sep="\t", index=False) mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.percolator.decoy.psms.txt", sep="\t", index=False) else: - mokapot_csms_true.to_csv(featrures_dir + "/rescore.mokapot.psms.txt", sep="\t", index=False) + mokapot_csms_target.to_csv(featrures_dir + "/rescore.mokapot.psms.txt", 
sep="\t", index=False) mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", sep="\t", index=False) @@ -740,6 +754,8 @@ def min_score(row): psm_id = "SpecId" split_data = target_psms[psm_id].str.rsplit("-", n=13, expand=True) + target_psms.to_csv("/cmnfs/data/proteomics/XL/juri_lab/MycoEcoliRaw/test/target_psms.csv", sep="\t", index=False) + split_data.to_csv("/cmnfs/data/proteomics/XL/juri_lab/MycoEcoliRaw/test/split_data.csv", sep="\t", index=False) new_columns = [ "raw_file", "scan_number", @@ -781,7 +797,6 @@ def min_score(row): mokapot_csms["mokapot score"] = mokapot_csms.apply(min_score, axis=1) mokapot_csms["label"] = mokapot_csms.apply(get_label, axis=1) mokapot_csms.rename(columns={'fdr_group_b': 'fdr_group'}, inplace=True) - mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_5.csv") return mokapot_csms @@ -973,6 +988,10 @@ def run_rescoring(config_path: Union[str, Path]): # rescoring if "xl" in config.models["intensity"].lower(): # xl-psm-level + rescore_tab_file = pd.read_csv(str(fdr_dir) + "/rescore.tab", sep="\t") + rescore_tab_file.to_csv(str(fdr_dir) + "/rescore_csm.tab", sep="\t") + original_tab_file = pd.read_csv(str(fdr_dir) + "/original.tab", sep="\t") + original_tab_file.to_csv(str(fdr_dir) + "/original_csm.tab", sep="\t") input_mokapot_psm_rescore = prepare_rescore_xl_psm_level(str(fdr_dir), "rescore") input_mokapot_psm_rescore.to_csv(str(fdr_dir) + "/rescore.tab", sep="\t", index=None) input_mokapot_psm_original = prepare_rescore_xl_psm_level(str(fdr_dir), "original") diff --git a/oktoberfest/utils/multiprocessing_pool.py b/oktoberfest/utils/multiprocessing_pool.py index c3f706d0..271a8571 100644 --- a/oktoberfest/utils/multiprocessing_pool.py +++ b/oktoberfest/utils/multiprocessing_pool.py @@ -40,7 +40,7 @@ def check_pool(self): res_len = len(self.results) with tqdm(total=res_len, desc="Waiting for tasks to complete", leave=True) as progress: for res in self.results: - 
outputs.append(res.get(timeout=10000))  # 10000 seconds = ~3 hours + outputs.append(res.get(timeout=10000000))  # 10000000 seconds ≈ 116 days — effectively unbounded; TODO confirm an explicit timeout is still wanted progress.update(1) self.pool.close()