xl_fdr of uxls fixed

wilhelm-lab · Apr 26, 2024 · 4d8e00b · 4d8e00b
1 parent ae7f423
commit 4d8e00b
Show file tree

Hide file tree

Showing 6 changed files with 416 additions and 248 deletions.
diff --git a/oktoberfest/data/spectra.py b/oktoberfest/data/spectra.py
@@ -19,15 +19,15 @@
 class FragmentType(Enum):
     """FragmentType class to enumerate pred, raw, and mz."""
 
-    PRED   = 1
+    PRED = 1
     PRED_A = 2
     PRED_B = 3
-    RAW    = 4
-    RAW_A  = 5
-    RAW_B  = 6
-    MZ     = 7
-    MZ_A   = 8
-    MZ_B   = 9
+    RAW = 4
+    RAW_A = 5
+    RAW_B = 6
+    MZ = 7
+    MZ_A = 8
+    MZ_B = 9
 
 
 class Spectra(anndata.AnnData):
@@ -65,7 +65,7 @@ def _gen_vars_df(xl: bool = False) -> pd.DataFrame:
         else:
             max_range = 30
         ion_nums = np.repeat(np.arange(1, max_range), 6)
-        ion_charge = np.tile([1, 2, 3], (max_range-1) * 2)
+        ion_charge = np.tile([1, 2, 3], (max_range - 1) * 2)
         temp_cols = []
         for size in range(1, max_range):
             for typ in ["Y", "B"]:
@@ -89,7 +89,7 @@ def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> List[str
         if xl:
             max_range = 59
         else:
-            max_range = 30 
+            max_range = 30
         for i in range(1, max_range):
             for column in Spectra.COLUMNS_FRAGMENT_ION:
                 columns.append(prefix + "_" + column.replace("1", str(i)))
@@ -107,19 +107,19 @@ def _resolve_prefix(fragment_type: FragmentType) -> str:
         if fragment_type.value == 1:
             prefix = Spectra.INTENSITY_PRED_PREFIX
         elif fragment_type.value == 2:
-            prefix = Spectra.INTENSITY_PRED_PREFIX_A    
+            prefix = Spectra.INTENSITY_PRED_PREFIX_A
         elif fragment_type.value == 3:
             prefix = Spectra.INTENSITY_PRED_PREFIX_B
         elif fragment_type.value == 4:
-            prefix = Spectra.INTENSITY_COLUMN_PREFIX   
+            prefix = Spectra.INTENSITY_COLUMN_PREFIX
         elif fragment_type.value == 5:
             prefix = Spectra.INTENSITY_COLUMN_PREFIX_A
         elif fragment_type.value == 6:
             prefix = Spectra.INTENSITY_COLUMN_PREFIX_B
         elif fragment_type.value == 7:
             prefix = Spectra.MZ_COLUMN_PREFIX
         elif fragment_type.value == 8:
-            prefix = Spectra.MZ_COLUMN_PREFIX_A       
+            prefix = Spectra.MZ_COLUMN_PREFIX_A
         else:
             prefix = Spectra.MZ_COLUMN_PREFIX_B
         return prefix

diff --git a/oktoberfest/predict/koina.py b/oktoberfest/predict/koina.py
@@ -1,7 +1,7 @@
 import time
 import warnings
 from functools import partial
-from typing import Dict, Generator, KeysView, List, Optional, Union
+from typing import Dict, Generator, KeysView, List, Optional, Union, Tuple
 
 import numpy as np
 import pandas as pd
@@ -35,9 +35,14 @@
 alternative_column_map_xl_switched = {
     "peptide_sequences_1": "MODIFIED_SEQUENCE_B",
     "peptide_sequences_2": "MODIFIED_SEQUENCE_A",
-    **{key: value for key, value in alternative_column_map_xl.items() if key not in ["peptide_sequences_1", "peptide_sequences_2"]}
+    **{
+        key: value
+        for key, value in alternative_column_map_xl.items()
+        if key not in ["peptide_sequences_1", "peptide_sequences_2"]
+    },
 }
 
+
 class Koina:
     """A class for interacting with Koina models for inference."""
 
@@ -495,7 +500,7 @@ def predict_xl(
         data: Union[Dict[str, np.ndarray], pd.DataFrame],
         _async: bool = True,
         debug=False,
-    ) -> Dict[str, np.ndarray]:
+    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
         """
         Perform inference on the xl data using the Koina model.
 
@@ -539,7 +544,6 @@ def predict_xl(
         else:
             return self.__predict_sequential(data_1), self.__predict_sequential(data_2)
 
-
     def __predict_async(self, data: Dict[str, np.ndarray], debug=False) -> Dict[str, np.ndarray]:
         """
         Perform asynchronous inference on the given data using the Koina model.

diff --git a/oktoberfest/predict/predict.py b/oktoberfest/predict/predict.py
@@ -13,7 +13,7 @@
 logger = logging.getLogger(__name__)
 
 
-def predict(data: pd.DataFrame, xl : bool = False, **kwargs) -> Dict[str, np.ndarray]:
+def predict(data: pd.DataFrame, xl: bool = False, **kwargs) -> Dict[str, np.ndarray]:
     """
     Retrieve predictions from koina.
 
@@ -29,10 +29,10 @@ def predict(data: pd.DataFrame, xl : bool = False, **kwargs) -> Dict[str, np.nda
     predictor = Koina(**kwargs)
     if xl:
         results_a, results_b = predictor.predict_xl(data)
-        return results_a, results_b 
+        return results_a, results_b
     else:
         results = predictor.predict(data)
-        return results 
+        return results
 
 
 def parse_fragment_labels(
@@ -71,7 +71,9 @@ def parse_fragment_labels(
     return {"type": fragment_type_array, "number": fragment_number_array, "charge": fragment_charge_array}
 
 
-def _prepare_alignment_df(library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool = False, xl : bool = False) -> Spectra:
+def _prepare_alignment_df(
+    library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool = False, xl: bool = False
+) -> Spectra:
     """
     Prepare an alignment DataFrame from the given Spectra library.
 
@@ -102,7 +104,9 @@ def _prepare_alignment_df(library: Spectra, ce_range: Tuple[int, int], group_by_
     return alignment_library
 
 
-def ce_calibration(library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool, xl : bool = False, **server_kwargs ) -> Spectra:
+def ce_calibration(
+    library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool, xl: bool = False, **server_kwargs
+) -> Spectra:
     """
     Calculate best collision energy for peptide property predictions.
 
@@ -119,13 +123,13 @@ def ce_calibration(library: Spectra, ce_range: Tuple[int, int], group_by_charge:
     """
     alignment_library = _prepare_alignment_df(library, ce_range=ce_range, group_by_charge=group_by_charge, xl=xl)
     if xl:
-        intensities_a, intensities_b = predict(alignment_library.obs, xl = True, **server_kwargs)
+        intensities_a, intensities_b = predict(alignment_library.obs, xl=True, **server_kwargs)
         alignment_library.add_matrix(intensities_a["intensities"], FragmentType.PRED_A)
         alignment_library.add_matrix(intensities_b["intensities"], FragmentType.PRED_B)
     else:
         intensities = predict(alignment_library.obs, **server_kwargs)
         alignment_library.add_matrix(intensities["intensities"], FragmentType.PRED)
-    
+
     _alignment(alignment_library, xl=xl)
     return alignment_library
 
@@ -148,13 +152,12 @@ def _alignment(alignment_library: Spectra, xl: bool = False):
         sm_b = SimilarityMetrics(pred_intensity_b, raw_intensity_b)
         alignment_library.add_column(sm_a.spectral_angle(raw_intensity_a, pred_intensity_a, 0), "SPECTRAL_ANGLE_A")
         alignment_library.add_column(sm_b.spectral_angle(raw_intensity_b, pred_intensity_b, 0), "SPECTRAL_ANGLE_B")
-        alignment_library.add_column((alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2, "SPECTRAL_ANGLE")
+        alignment_library.add_column(
+            (alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2,
+            "SPECTRAL_ANGLE",
+        )
     else:
         pred_intensity = alignment_library.get_matrix(FragmentType.PRED)[0]
         raw_intensity = alignment_library.get_matrix(FragmentType.RAW)[0]
         sm = SimilarityMetrics(pred_intensity, raw_intensity)
         alignment_library.add_column(sm.spectral_angle(raw_intensity, pred_intensity, 0), "SPECTRAL_ANGLE")
-
-
-
-
diff --git a/oktoberfest/preprocessing/preprocessing.py b/oktoberfest/preprocessing/preprocessing.py
@@ -14,7 +14,7 @@
 from spectrum_io.d import convert_d_hdf, read_and_aggregate_timstof
 from spectrum_io.file import csv
 from spectrum_io.raw import ThermoRaw
-from spectrum_io.search_result import Mascot, MaxQuant, MSFragger, Sage, Xisearch
+from spectrum_io.search_result import Mascot, MaxQuant, MSFragger, Sage, Scout, Xisearch
 from spectrum_io.spectral_library.digest import get_peptide_to_protein_map
 
 from ..data.spectra import FragmentType, Spectra
@@ -198,6 +198,7 @@ def filter_peptides(
 
     return peptides[peptide_filter]
 
+
 def filter_xl_peptides(peptides: pd.DataFrame, min_length: int, max_length: int, max_charge: int) -> pd.DataFrame:
     """
     Filter xl search results using given constraints.
@@ -216,19 +217,20 @@ def filter_xl_peptides(peptides: pd.DataFrame, min_length: int, max_length: int,
         df = peptides.obs
     else:
         df = peptides
-        
+
     peptide_filter = (
-    (df["PEPTIDE_LENGTH_A"] <= max_length)
-    & (df["PEPTIDE_LENGTH_B"] <= max_length)
-    & (df["PEPTIDE_LENGTH_A"] >= min_length)
-    & (df["PEPTIDE_LENGTH_B"] >= min_length)
-    & (df["PRECURSOR_CHARGE"] <= max_charge)
-    & (df["MODIFIED_SEQUENCE_A"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]"))
-    & (df["MODIFIED_SEQUENCE_B"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]"))
+        (df["PEPTIDE_LENGTH_A"] <= max_length)
+        & (df["PEPTIDE_LENGTH_B"] <= max_length)
+        & (df["PEPTIDE_LENGTH_A"] >= min_length)
+        & (df["PEPTIDE_LENGTH_B"] >= min_length)
+        & (df["PRECURSOR_CHARGE"] <= max_charge)
+        & (df["MODIFIED_SEQUENCE_A"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]"))
+        & (df["MODIFIED_SEQUENCE_B"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]"))
     )
 
     return peptides[peptide_filter]
 
+
 def process_and_filter_spectra_data(library: Spectra, model: str, tmt_label: Optional[str] = None) -> Spectra:
     """
     Process and filter the spectra data in the given SpectralLibrary object.
@@ -318,6 +320,8 @@ def convert_search(
         search_result = Sage
     elif search_engine == "xisearch":
         search_result = Xisearch
+    elif search_engine == "scout":
+        search_result = Scout
     else:
         raise ValueError(f"Unknown search engine provided: {search_engine}")
 
@@ -581,6 +585,7 @@ def annotate_spectral_library(
 
     return aspec
 
+
 def annotate_spectral_library_xl(psms: Spectra, mass_tol: Optional[float] = None, unit_mass_tol: Optional[str] = None):
     """
     Annotate spectral library with peaks and mass for cross-linked peptides.

diff --git a/oktoberfest/rescore/rescore.py b/oktoberfest/rescore/rescore.py
@@ -2,10 +2,11 @@
 import subprocess
 from pathlib import Path
 from typing import List, Optional, Union
-import scipy.sparse as sp
+
 import mokapot
 import numpy as np
 import pandas as pd
+import scipy.sparse as sp
 from spectrum_fundamentals.metrics.percolator import Percolator
 
 from ..data import FragmentType, Spectra
@@ -34,21 +35,21 @@ def generate_features(
     :param regression_method: The regression method to use for iRT alignment
     """
     if xl:
-        pred_a= library.get_matrix(FragmentType.PRED_A)[0]
-        pred_b= library.get_matrix(FragmentType.PRED_B)[0]
-        raw_a= library.get_matrix(FragmentType.RAW_A)[0]
-        raw_b= library.get_matrix(FragmentType.RAW_B)[0]
-        mz_a= library.get_matrix(FragmentType.MZ_A)[0]
-        mz_b= library.get_matrix(FragmentType.MZ_B)[0]
+        pred_a = library.get_matrix(FragmentType.PRED_A)[0]
+        pred_b = library.get_matrix(FragmentType.PRED_B)[0]
+        raw_a = library.get_matrix(FragmentType.RAW_A)[0]
+        raw_b = library.get_matrix(FragmentType.RAW_B)[0]
+        mz_a = library.get_matrix(FragmentType.MZ_A)[0]
+        mz_b = library.get_matrix(FragmentType.MZ_B)[0]
         perc_features = Percolator(
-        metadata=library.get_meta_data().reset_index(drop=True),
-        pred_intensities=sp.hstack([pred_a, pred_b]),
-        true_intensities=sp.hstack([raw_a, raw_b]),
-        mz=sp.hstack([mz_a, mz_b]),
-        input_type=search_type,
-        all_features_flag=all_features,
-        regression_method=regression_method,
-    )
+            metadata=library.get_meta_data().reset_index(drop=True),
+            pred_intensities=sp.hstack([pred_a, pred_b]),
+            true_intensities=sp.hstack([raw_a, raw_b]),
+            mz=sp.hstack([mz_a, mz_b]),
+            input_type=search_type,
+            all_features_flag=all_features,
+            regression_method=regression_method,
+        )
     else:
         perc_features = Percolator(
             metadata=library.get_meta_data().reset_index(drop=True),
@@ -108,6 +109,7 @@ def rescore_with_percolator(
     num_threads: int = 3,
     test_fdr: float = 0.01,
     train_fdr: float = 0.01,
+    xl: bool = False,
 ):
     """
     Rescore using percolator.
@@ -140,18 +142,23 @@ def rescore_with_percolator(
     target_peptides = output_folder / f"{file_prefix}.percolator.peptides.txt"
     decoy_peptides = output_folder / f"{file_prefix}.percolator.decoy.peptides.txt"
     log_file = output_folder / f"{file_prefix}.log"
-
-    cmd = f"percolator --weights {weights_file} \
-                    --num-threads {num_threads} \
-                    --subset-max-train 500000 \
-                    --post-processing-tdc \
-                    --testFDR {test_fdr} \
-                    --trainFDR {train_fdr} \
+    if xl:
+        cmd = f"percolator --weights {weights_file} \
                     --results-psms {target_psms} \
                     --decoy-results-psms {decoy_psms} \
-                    --results-peptides {target_peptides} \
-                    --decoy-results-peptides {decoy_peptides} \
                     {input_file} 2> {log_file}"
+    else:
+        cmd = f"percolator --weights {weights_file} \
+                        --num-threads {num_threads} \
+                        --subset-max-train 500000 \
+                        --post-processing-tdc \
+                        --testFDR {test_fdr} \
+                        --trainFDR {train_fdr} \
+                        --results-psms {target_psms} \
+                        --decoy-results-psms {decoy_psms} \
+                        --results-peptides {target_peptides} \
+                        --decoy-results-peptides {decoy_peptides} \
+                        {input_file} 2> {log_file}"
 
     logger.info(f"Starting percolator with command {cmd}")
     subprocess.run(cmd, shell=True, check=True)