Skip to content

Commit

Permalink
xl_fdr of uxls fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
mostafakalhor committed Apr 26, 2024
1 parent ae7f423 commit 4d8e00b
Show file tree
Hide file tree
Showing 6 changed files with 416 additions and 248 deletions.
24 changes: 12 additions & 12 deletions oktoberfest/data/spectra.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@
class FragmentType(Enum):
"""FragmentType class to enumerate pred, raw, and mz."""

PRED = 1
PRED = 1
PRED_A = 2
PRED_B = 3
RAW = 4
RAW_A = 5
RAW_B = 6
MZ = 7
MZ_A = 8
MZ_B = 9
RAW = 4
RAW_A = 5
RAW_B = 6
MZ = 7
MZ_A = 8
MZ_B = 9


class Spectra(anndata.AnnData):
Expand Down Expand Up @@ -65,7 +65,7 @@ def _gen_vars_df(xl: bool = False) -> pd.DataFrame:
else:
max_range = 30
ion_nums = np.repeat(np.arange(1, max_range), 6)
ion_charge = np.tile([1, 2, 3], (max_range-1) * 2)
ion_charge = np.tile([1, 2, 3], (max_range - 1) * 2)
temp_cols = []
for size in range(1, max_range):
for typ in ["Y", "B"]:
Expand All @@ -89,7 +89,7 @@ def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> List[str
if xl:
max_range = 59
else:
max_range = 30
max_range = 30
for i in range(1, max_range):
for column in Spectra.COLUMNS_FRAGMENT_ION:
columns.append(prefix + "_" + column.replace("1", str(i)))
Expand All @@ -107,19 +107,19 @@ def _resolve_prefix(fragment_type: FragmentType) -> str:
if fragment_type.value == 1:
prefix = Spectra.INTENSITY_PRED_PREFIX
elif fragment_type.value == 2:
prefix = Spectra.INTENSITY_PRED_PREFIX_A
prefix = Spectra.INTENSITY_PRED_PREFIX_A
elif fragment_type.value == 3:
prefix = Spectra.INTENSITY_PRED_PREFIX_B
elif fragment_type.value == 4:
prefix = Spectra.INTENSITY_COLUMN_PREFIX
prefix = Spectra.INTENSITY_COLUMN_PREFIX
elif fragment_type.value == 5:
prefix = Spectra.INTENSITY_COLUMN_PREFIX_A
elif fragment_type.value == 6:
prefix = Spectra.INTENSITY_COLUMN_PREFIX_B
elif fragment_type.value == 7:
prefix = Spectra.MZ_COLUMN_PREFIX
elif fragment_type.value == 8:
prefix = Spectra.MZ_COLUMN_PREFIX_A
prefix = Spectra.MZ_COLUMN_PREFIX_A
else:
prefix = Spectra.MZ_COLUMN_PREFIX_B
return prefix
Expand Down
12 changes: 8 additions & 4 deletions oktoberfest/predict/koina.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import time
import warnings
from functools import partial
from typing import Dict, Generator, KeysView, List, Optional, Union
from typing import Dict, Generator, KeysView, List, Optional, Union, Tuple

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -35,9 +35,14 @@
alternative_column_map_xl_switched = {
"peptide_sequences_1": "MODIFIED_SEQUENCE_B",
"peptide_sequences_2": "MODIFIED_SEQUENCE_A",
**{key: value for key, value in alternative_column_map_xl.items() if key not in ["peptide_sequences_1", "peptide_sequences_2"]}
**{
key: value
for key, value in alternative_column_map_xl.items()
if key not in ["peptide_sequences_1", "peptide_sequences_2"]
},
}


class Koina:
"""A class for interacting with Koina models for inference."""

Expand Down Expand Up @@ -495,7 +500,7 @@ def predict_xl(
data: Union[Dict[str, np.ndarray], pd.DataFrame],
_async: bool = True,
debug=False,
) -> Dict[str, np.ndarray]:
) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
"""
Perform inference on the xl data using the Koina model.
Expand Down Expand Up @@ -539,7 +544,6 @@ def predict_xl(
else:
return self.__predict_sequential(data_1), self.__predict_sequential(data_2)


def __predict_async(self, data: Dict[str, np.ndarray], debug=False) -> Dict[str, np.ndarray]:
"""
Perform asynchronous inference on the given data using the Koina model.
Expand Down
27 changes: 15 additions & 12 deletions oktoberfest/predict/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
logger = logging.getLogger(__name__)


def predict(data: pd.DataFrame, xl : bool = False, **kwargs) -> Dict[str, np.ndarray]:
def predict(data: pd.DataFrame, xl: bool = False, **kwargs) -> Dict[str, np.ndarray]:
"""
Retrieve predictions from koina.
Expand All @@ -29,10 +29,10 @@ def predict(data: pd.DataFrame, xl : bool = False, **kwargs) -> Dict[str, np.nda
predictor = Koina(**kwargs)
if xl:
results_a, results_b = predictor.predict_xl(data)
return results_a, results_b
return results_a, results_b
else:
results = predictor.predict(data)
return results
return results


def parse_fragment_labels(
Expand Down Expand Up @@ -71,7 +71,9 @@ def parse_fragment_labels(
return {"type": fragment_type_array, "number": fragment_number_array, "charge": fragment_charge_array}


def _prepare_alignment_df(library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool = False, xl : bool = False) -> Spectra:
def _prepare_alignment_df(
library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool = False, xl: bool = False
) -> Spectra:
"""
Prepare an alignment DataFrame from the given Spectra library.
Expand Down Expand Up @@ -102,7 +104,9 @@ def _prepare_alignment_df(library: Spectra, ce_range: Tuple[int, int], group_by_
return alignment_library


def ce_calibration(library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool, xl : bool = False, **server_kwargs ) -> Spectra:
def ce_calibration(
library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool, xl: bool = False, **server_kwargs
) -> Spectra:
"""
Calculate best collision energy for peptide property predictions.
Expand All @@ -119,13 +123,13 @@ def ce_calibration(library: Spectra, ce_range: Tuple[int, int], group_by_charge:
"""
alignment_library = _prepare_alignment_df(library, ce_range=ce_range, group_by_charge=group_by_charge, xl=xl)
if xl:
intensities_a, intensities_b = predict(alignment_library.obs, xl = True, **server_kwargs)
intensities_a, intensities_b = predict(alignment_library.obs, xl=True, **server_kwargs)
alignment_library.add_matrix(intensities_a["intensities"], FragmentType.PRED_A)
alignment_library.add_matrix(intensities_b["intensities"], FragmentType.PRED_B)
else:
intensities = predict(alignment_library.obs, **server_kwargs)
alignment_library.add_matrix(intensities["intensities"], FragmentType.PRED)

_alignment(alignment_library, xl=xl)
return alignment_library

Expand All @@ -148,13 +152,12 @@ def _alignment(alignment_library: Spectra, xl: bool = False):
sm_b = SimilarityMetrics(pred_intensity_b, raw_intensity_b)
alignment_library.add_column(sm_a.spectral_angle(raw_intensity_a, pred_intensity_a, 0), "SPECTRAL_ANGLE_A")
alignment_library.add_column(sm_b.spectral_angle(raw_intensity_b, pred_intensity_b, 0), "SPECTRAL_ANGLE_B")
alignment_library.add_column((alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2, "SPECTRAL_ANGLE")
alignment_library.add_column(
(alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2,
"SPECTRAL_ANGLE",
)
else:
pred_intensity = alignment_library.get_matrix(FragmentType.PRED)[0]
raw_intensity = alignment_library.get_matrix(FragmentType.RAW)[0]
sm = SimilarityMetrics(pred_intensity, raw_intensity)
alignment_library.add_column(sm.spectral_angle(raw_intensity, pred_intensity, 0), "SPECTRAL_ANGLE")




23 changes: 14 additions & 9 deletions oktoberfest/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from spectrum_io.d import convert_d_hdf, read_and_aggregate_timstof
from spectrum_io.file import csv
from spectrum_io.raw import ThermoRaw
from spectrum_io.search_result import Mascot, MaxQuant, MSFragger, Sage, Xisearch
from spectrum_io.search_result import Mascot, MaxQuant, MSFragger, Sage, Scout, Xisearch
from spectrum_io.spectral_library.digest import get_peptide_to_protein_map

from ..data.spectra import FragmentType, Spectra
Expand Down Expand Up @@ -198,6 +198,7 @@ def filter_peptides(

return peptides[peptide_filter]


def filter_xl_peptides(peptides: pd.DataFrame, min_length: int, max_length: int, max_charge: int) -> pd.DataFrame:
"""
Filter xl search results using given constraints.
Expand All @@ -216,19 +217,20 @@ def filter_xl_peptides(peptides: pd.DataFrame, min_length: int, max_length: int,
df = peptides.obs
else:
df = peptides

peptide_filter = (
(df["PEPTIDE_LENGTH_A"] <= max_length)
& (df["PEPTIDE_LENGTH_B"] <= max_length)
& (df["PEPTIDE_LENGTH_A"] >= min_length)
& (df["PEPTIDE_LENGTH_B"] >= min_length)
& (df["PRECURSOR_CHARGE"] <= max_charge)
& (df["MODIFIED_SEQUENCE_A"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]"))
& (df["MODIFIED_SEQUENCE_B"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]"))
(df["PEPTIDE_LENGTH_A"] <= max_length)
& (df["PEPTIDE_LENGTH_B"] <= max_length)
& (df["PEPTIDE_LENGTH_A"] >= min_length)
& (df["PEPTIDE_LENGTH_B"] >= min_length)
& (df["PRECURSOR_CHARGE"] <= max_charge)
& (df["MODIFIED_SEQUENCE_A"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]"))
& (df["MODIFIED_SEQUENCE_B"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]"))
)

return peptides[peptide_filter]


def process_and_filter_spectra_data(library: Spectra, model: str, tmt_label: Optional[str] = None) -> Spectra:
"""
Process and filter the spectra data in the given SpectralLibrary object.
Expand Down Expand Up @@ -318,6 +320,8 @@ def convert_search(
search_result = Sage
elif search_engine == "xisearch":
search_result = Xisearch
elif search_engine == "scout":
search_result = Scout
else:
raise ValueError(f"Unknown search engine provided: {search_engine}")

Expand Down Expand Up @@ -581,6 +585,7 @@ def annotate_spectral_library(

return aspec


def annotate_spectral_library_xl(psms: Spectra, mass_tol: Optional[float] = None, unit_mass_tol: Optional[str] = None):
"""
Annotate spectral library with peaks and mass for cross-linked peptides.
Expand Down
55 changes: 31 additions & 24 deletions oktoberfest/rescore/rescore.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
import subprocess
from pathlib import Path
from typing import List, Optional, Union
import scipy.sparse as sp

import mokapot
import numpy as np
import pandas as pd
import scipy.sparse as sp
from spectrum_fundamentals.metrics.percolator import Percolator

from ..data import FragmentType, Spectra
Expand Down Expand Up @@ -34,21 +35,21 @@ def generate_features(
:param regression_method: The regression method to use for iRT alignment
"""
if xl:
pred_a= library.get_matrix(FragmentType.PRED_A)[0]
pred_b= library.get_matrix(FragmentType.PRED_B)[0]
raw_a= library.get_matrix(FragmentType.RAW_A)[0]
raw_b= library.get_matrix(FragmentType.RAW_B)[0]
mz_a= library.get_matrix(FragmentType.MZ_A)[0]
mz_b= library.get_matrix(FragmentType.MZ_B)[0]
pred_a = library.get_matrix(FragmentType.PRED_A)[0]
pred_b = library.get_matrix(FragmentType.PRED_B)[0]
raw_a = library.get_matrix(FragmentType.RAW_A)[0]
raw_b = library.get_matrix(FragmentType.RAW_B)[0]
mz_a = library.get_matrix(FragmentType.MZ_A)[0]
mz_b = library.get_matrix(FragmentType.MZ_B)[0]
perc_features = Percolator(
metadata=library.get_meta_data().reset_index(drop=True),
pred_intensities=sp.hstack([pred_a, pred_b]),
true_intensities=sp.hstack([raw_a, raw_b]),
mz=sp.hstack([mz_a, mz_b]),
input_type=search_type,
all_features_flag=all_features,
regression_method=regression_method,
)
metadata=library.get_meta_data().reset_index(drop=True),
pred_intensities=sp.hstack([pred_a, pred_b]),
true_intensities=sp.hstack([raw_a, raw_b]),
mz=sp.hstack([mz_a, mz_b]),
input_type=search_type,
all_features_flag=all_features,
regression_method=regression_method,
)
else:
perc_features = Percolator(
metadata=library.get_meta_data().reset_index(drop=True),
Expand Down Expand Up @@ -108,6 +109,7 @@ def rescore_with_percolator(
num_threads: int = 3,
test_fdr: float = 0.01,
train_fdr: float = 0.01,
xl: bool = False,
):
"""
Rescore using percolator.
Expand Down Expand Up @@ -140,18 +142,23 @@ def rescore_with_percolator(
target_peptides = output_folder / f"{file_prefix}.percolator.peptides.txt"
decoy_peptides = output_folder / f"{file_prefix}.percolator.decoy.peptides.txt"
log_file = output_folder / f"{file_prefix}.log"

cmd = f"percolator --weights {weights_file} \
--num-threads {num_threads} \
--subset-max-train 500000 \
--post-processing-tdc \
--testFDR {test_fdr} \
--trainFDR {train_fdr} \
if xl:
cmd = f"percolator --weights {weights_file} \
--results-psms {target_psms} \
--decoy-results-psms {decoy_psms} \
--results-peptides {target_peptides} \
--decoy-results-peptides {decoy_peptides} \
{input_file} 2> {log_file}"
else:
cmd = f"percolator --weights {weights_file} \
--num-threads {num_threads} \
--subset-max-train 500000 \
--post-processing-tdc \
--testFDR {test_fdr} \
--trainFDR {train_fdr} \
--results-psms {target_psms} \
--decoy-results-psms {decoy_psms} \
--results-peptides {target_peptides} \
--decoy-results-peptides {decoy_peptides} \
{input_file} 2> {log_file}"

logger.info(f"Starting percolator with command {cmd}")
subprocess.run(cmd, shell=True, check=True)
Expand Down
Loading

0 comments on commit 4d8e00b

Please sign in to comment.