paper results is based on this version

wilhelm-lab · Oct 15, 2024 · 1c6ec29 · 1c6ec29
1 parent 5f64471
commit 1c6ec29
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 18 deletions.
diff --git a/oktoberfest/data/spectra.py b/oktoberfest/data/spectra.py
@@ -247,6 +247,7 @@ def from_hdf5(cls: Type[SpectraT], input_file: Union[str, Path]):
         :param input_file: path to input file
         :return: a spectra instance
         """
+
         return cls(anndata.read_h5ad(str(input_file)))
 
     def convert_to_df(self) -> pd.DataFrame:

diff --git a/oktoberfest/predict/koina.py b/oktoberfest/predict/koina.py
@@ -375,9 +375,17 @@ def __merge_list_dict_array(dict_list: List[Dict[str, np.ndarray]]) -> Dict[str,
             merged_dict = model.__merge_list_dict_array(dict_list)
             print(merged_dict)
         """
+        if not dict_list:
+            return {}  # Return an empty dictionary if dict_list is empty
+
         tmp = [x.keys() for x in dict_list]
+
+        if not tmp or not tmp[0]:
+            return {}  # Return an empty dictionary if tmp is empty or its first element is empty
+
         if not np.all([tmp[0] == x for x in tmp]):
             raise NotImplementedError(f"Keys of all dictionaries in the list need to be equal {tmp}")
+
         out = {}
         for k in tmp[0]:
             out[k] = np.concatenate([x[k] for x in dict_list])
@@ -609,13 +617,14 @@ def __handle_results(
             infer_results_to_return = [infer_results[i] for i in range(len(infer_results))]
             return self.__merge_list_dict_array(infer_results_to_return)
         except AttributeError:
-            for res in infer_results.values():
-                if isinstance(res, InferenceServerException):
+            exceptions = [res for res in infer_results.values() if isinstance(res, InferenceServerException)]
+            if exceptions:
+                for res in exceptions:
                     warnings.warn(res.message(), stacklevel=1)
-            else:
                 raise InferenceServerException(
                     """
                     At least one request failed. Check the error message above and try again.
                     To get a list of responses run koina.predict(..., debug = True), then call koina.response_dict
                     """
                 ) from None
+
diff --git a/oktoberfest/runner.py b/oktoberfest/runner.py
@@ -109,6 +109,7 @@ def _annotate_and_get_library(spectra_file: Path, config: Config, tims_meta_file
     data_dir = config.output / "data"
     data_dir.mkdir(exist_ok=True)
     hdf5_path = data_dir / spectra_file.with_suffix(".mzml.hdf5").name
+    print(hdf5_path)
     if hdf5_path.is_file():
         aspec = Spectra.from_hdf5(hdf5_path)
     else:
@@ -143,6 +144,13 @@ def _get_best_ce(library: Spectra, spectra_file: Path, config: Config):
     results_dir = config.output / "results"
     results_dir.mkdir(exist_ok=True)
     if ((library.obs["FRAGMENTATION"] == "HCD") & (library.obs["REVERSE"] == False)).any():
+        best_ce = 27
+        library.obs["COLLISION_ENERGY"] = best_ce
+
+        with open(results_dir / f"{spectra_file.stem}_ce.txt", "w") as f:
+            f.write(str(best_ce))
+            f.close()
+        """
         server_kwargs = {
             "server_url": config.prediction_server,
             "ssl": config.ssl,
@@ -184,7 +192,7 @@ def _get_best_ce(library: Spectra, spectra_file: Path, config: Config):
 
             delta_ce = ransac.predict(library.obs[["MASS", "PRECURSOR_CHARGE"]])
             library.obs["COLLISION_ENERGY"] = np.maximum(0, library.obs["COLLISION_ENERGY"] + delta_ce)
-
+            
         else:
             ce_alignment = alignment_library.obs.groupby(by=["COLLISION_ENERGY"])["SPECTRAL_ANGLE"].mean()
 
@@ -201,6 +209,7 @@ def _get_best_ce(library: Spectra, spectra_file: Path, config: Config):
             with open(results_dir / f"{spectra_file.stem}_ce.txt", "w") as f:
                 f.write(str(best_ce))
                 f.close()
+        """
     else:
         best_ce = 35
         library.obs["COLLISION_ENERGY"] = best_ce
@@ -641,21 +650,21 @@ def xl_preprocessing_plot_unique_xl(featrures_dir: Path, mokapot_csms: pd.DataFr
     mokapot_csms.loc[~swap_mask, "max_seq"] = mokapot_csms["base_pep_b"]
     mokapot_uniq_xls = mokapot_csms.drop_duplicates(subset=["min_seq", "max_seq"])
     xl_between_or_self(mokapot_uniq_xls, "mokapot score")
-    mokapot_uniq_xls_true = mokapot_uniq_xls[mokapot_uniq_xls["label"] == True]
+    mokapot_uniq_xls_target = mokapot_uniq_xls[mokapot_uniq_xls["label"] == True]
     mokapot_uniq_xls_decoy = mokapot_uniq_xls[mokapot_uniq_xls["label"] == False]
     if original_or_rescore == "original":
         if percolator_or_mokapot == "percolator":
-            mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.percolator.peptides.txt", sep="\t", index=False)
+            mokapot_uniq_xls_target.to_csv(featrures_dir + "/original.percolator.peptides.txt", sep="\t", index=False)
             mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.percolator.decoy.peptides.txt", sep="\t", index=False)
         else:
-            mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.mokapot.peptides.txt", sep="\t", index=False)
+            mokapot_uniq_xls_target.to_csv(featrures_dir + "/original.mokapot.peptides.txt", sep="\t", index=False)
             mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.peptides.txt", sep="\t", index=False)
     else:
         if percolator_or_mokapot == "percolator":
-            mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.percolator.peptides.txt", sep="\t", index=False)
+            mokapot_uniq_xls_target.to_csv(featrures_dir + "/rescore.percolator.peptides.txt", sep="\t", index=False)
             mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.percolator.decoy.peptides.txt", sep="\t", index=False)
         else:
-            mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.mokapot.peptides.txt", sep="\t", index=False)
+            mokapot_uniq_xls_target.to_csv(featrures_dir + "/rescore.mokapot.peptides.txt", sep="\t", index=False)
             mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.peptides.txt", sep="\t", index=False)
 
 
@@ -670,10 +679,13 @@ def xl_preprocessing_plot_csm(featrures_dir: Path, mokapot_csms: pd.DataFrame, o
         "q-value",
         "q-value_a",
         "q-value_b",
+        "score_a",
+        "score_b",
         "fdr_group_a",
         "proteinIds_b",
     ]
-    mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_2.csv")
+    print("salaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaam")
+    print(mokapot_csms.columns.tolist())
     mokapot_csms = mokapot_csms[columns_to_keep_mokapot]
     mokapot_csms = mokapot_csms.rename(
         columns={
@@ -687,23 +699,25 @@ def xl_preprocessing_plot_csm(featrures_dir: Path, mokapot_csms: pd.DataFrame, o
     )
     mokapot_csms["is_target"] = mokapot_csms["label"]
     mokapot_csms["label"] = mokapot_csms["label"].replace({"TT": True, "TD": False, "DD": False})
-    mokapot_csms_true = mokapot_csms[mokapot_csms["label"] == True]
-    mokapot_csms_true.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_true.csv")
+    mokapot_csms.to_csv("/cmnfs/proj/prosit/xl/mokapot_csms.csv", index=False)
+    print(mokapot_csms.columns.tolist())
+    mokapot_csms_target = mokapot_csms[mokapot_csms["label"] == True]
+
 
     mokapot_csms_decoy = mokapot_csms[mokapot_csms["label"] == False]
     if original_or_rescore == "original":
         if percolator_or_mokapot == "percolator":
-            mokapot_csms_true.to_csv(featrures_dir + "/original.percolator.psms.txt", sep="\t", index=False)
+            mokapot_csms_target.to_csv(featrures_dir + "/original.percolator.psms.txt", sep="\t", index=False)
             mokapot_csms_decoy.to_csv(featrures_dir + "/original.percolator.decoy.psms.txt", sep="\t", index=False)
         else:
-            mokapot_csms_true.to_csv(featrures_dir + "/original.mokapot.psms.txt", sep="\t", index=False)
+            mokapot_csms_target.to_csv(featrures_dir + "/original.mokapot.psms.txt", sep="\t", index=False)
             mokapot_csms_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", sep="\t", index=False)
     else:
         if percolator_or_mokapot == "percolator":
-            mokapot_csms_true.to_csv(featrures_dir + "/rescore.percolator.psms.txt", sep="\t", index=False)
+            mokapot_csms_target.to_csv(featrures_dir + "/rescore.percolator.psms.txt", sep="\t", index=False)
             mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.percolator.decoy.psms.txt", sep="\t", index=False)
         else:
-            mokapot_csms_true.to_csv(featrures_dir + "/rescore.mokapot.psms.txt", sep="\t", index=False)
+            mokapot_csms_target.to_csv(featrures_dir + "/rescore.mokapot.psms.txt", sep="\t", index=False)
             mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", sep="\t", index=False)
 
 
@@ -740,6 +754,8 @@ def min_score(row):
             psm_id = "SpecId"
 
     split_data = target_psms[psm_id].str.rsplit("-", n=13, expand=True)
+    target_psms.to_csv("/cmnfs/data/proteomics/XL/juri_lab/MycoEcoliRaw/test/target_psms.csv", sep="\t", index=False)
+    split_data.to_csv("/cmnfs/data/proteomics/XL/juri_lab/MycoEcoliRaw/test/split_data.csv", sep="\t", index=False)
     new_columns = [
         "raw_file",
         "scan_number",
@@ -781,7 +797,6 @@ def min_score(row):
     mokapot_csms["mokapot score"] = mokapot_csms.apply(min_score, axis=1)
     mokapot_csms["label"] = mokapot_csms.apply(get_label, axis=1)
     mokapot_csms.rename(columns={'fdr_group_b': 'fdr_group'}, inplace=True)
-    mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_5.csv")
     return mokapot_csms
 
 
@@ -973,6 +988,10 @@ def run_rescoring(config_path: Union[str, Path]):
 
     # rescoring
     if "xl" in config.models["intensity"].lower():  # xl-psm-level
+        rescore_tab_file = pd.read_csv(str(fdr_dir) + "/rescore.tab", sep="\t")
+        rescore_tab_file.to_csv(str(fdr_dir) + "/rescore_csm.tab", sep="\t")
+        original_tab_file = pd.read_csv(str(fdr_dir) + "/original.tab", sep="\t")
+        original_tab_file.to_csv(str(fdr_dir) + "/original_csm.tab", sep="\t")
         input_mokapot_psm_rescore = prepare_rescore_xl_psm_level(str(fdr_dir), "rescore")
         input_mokapot_psm_rescore.to_csv(str(fdr_dir) + "/rescore.tab", sep="\t", index=None)
         input_mokapot_psm_original = prepare_rescore_xl_psm_level(str(fdr_dir), "original")

diff --git a/oktoberfest/utils/multiprocessing_pool.py b/oktoberfest/utils/multiprocessing_pool.py
@@ -40,7 +40,7 @@ def check_pool(self):
             res_len = len(self.results)
             with tqdm(total=res_len, desc="Waiting for tasks to complete", leave=True) as progress:
                 for res in self.results:
-                    outputs.append(res.get(timeout=10000))  # 10000 seconds = ~3 hours
+                    outputs.append(res.get(timeout=10000000))  # 10000 seconds = ~3 hours
                     progress.update(1)
 
             self.pool.close()