From 4d8e00ba5e22944947369c156e1146ce4736da21 Mon Sep 17 00:00:00 2001 From: Mostafa Kalhor Date: Fri, 26 Apr 2024 15:10:44 +0000 Subject: [PATCH] xl_fdr of uxls fixed --- oktoberfest/data/spectra.py | 24 +- oktoberfest/predict/koina.py | 12 +- oktoberfest/predict/predict.py | 27 +- oktoberfest/preprocessing/preprocessing.py | 23 +- oktoberfest/rescore/rescore.py | 55 ++- oktoberfest/runner.py | 523 +++++++++++++-------- 6 files changed, 416 insertions(+), 248 deletions(-) diff --git a/oktoberfest/data/spectra.py b/oktoberfest/data/spectra.py index 43dd002c..ca4723a5 100644 --- a/oktoberfest/data/spectra.py +++ b/oktoberfest/data/spectra.py @@ -19,15 +19,15 @@ class FragmentType(Enum): """FragmentType class to enumerate pred, raw, and mz.""" - PRED = 1 + PRED = 1 PRED_A = 2 PRED_B = 3 - RAW = 4 - RAW_A = 5 - RAW_B = 6 - MZ = 7 - MZ_A = 8 - MZ_B = 9 + RAW = 4 + RAW_A = 5 + RAW_B = 6 + MZ = 7 + MZ_A = 8 + MZ_B = 9 class Spectra(anndata.AnnData): @@ -65,7 +65,7 @@ def _gen_vars_df(xl: bool = False) -> pd.DataFrame: else: max_range = 30 ion_nums = np.repeat(np.arange(1, max_range), 6) - ion_charge = np.tile([1, 2, 3], (max_range-1) * 2) + ion_charge = np.tile([1, 2, 3], (max_range - 1) * 2) temp_cols = [] for size in range(1, max_range): for typ in ["Y", "B"]: @@ -89,7 +89,7 @@ def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> List[str if xl: max_range = 59 else: - max_range = 30 + max_range = 30 for i in range(1, max_range): for column in Spectra.COLUMNS_FRAGMENT_ION: columns.append(prefix + "_" + column.replace("1", str(i))) @@ -107,11 +107,11 @@ def _resolve_prefix(fragment_type: FragmentType) -> str: if fragment_type.value == 1: prefix = Spectra.INTENSITY_PRED_PREFIX elif fragment_type.value == 2: - prefix = Spectra.INTENSITY_PRED_PREFIX_A + prefix = Spectra.INTENSITY_PRED_PREFIX_A elif fragment_type.value == 3: prefix = Spectra.INTENSITY_PRED_PREFIX_B elif fragment_type.value == 4: - prefix = Spectra.INTENSITY_COLUMN_PREFIX + prefix = Spectra.INTENSITY_COLUMN_PREFIX elif fragment_type.value == 5: prefix = Spectra.INTENSITY_COLUMN_PREFIX_A elif fragment_type.value == 6: @@ -119,7 +119,7 @@ def _resolve_prefix(fragment_type: FragmentType) -> str: elif fragment_type.value == 7: prefix = Spectra.MZ_COLUMN_PREFIX elif fragment_type.value == 8: - prefix = Spectra.MZ_COLUMN_PREFIX_A + prefix = Spectra.MZ_COLUMN_PREFIX_A else: prefix = Spectra.MZ_COLUMN_PREFIX_B return prefix diff --git a/oktoberfest/predict/koina.py b/oktoberfest/predict/koina.py index 3cb4dcac..01e99309 100644 --- a/oktoberfest/predict/koina.py +++ b/oktoberfest/predict/koina.py @@ -1,7 +1,7 @@ import time import warnings from functools import partial -from typing import Dict, Generator, KeysView, List, Optional, Union +from typing import Dict, Generator, KeysView, List, Optional, Union, Tuple import numpy as np import pandas as pd @@ -35,9 +35,14 @@ alternative_column_map_xl_switched = { "peptide_sequences_1": "MODIFIED_SEQUENCE_B", "peptide_sequences_2": "MODIFIED_SEQUENCE_A", - **{key: value for key, value in alternative_column_map_xl.items() if key not in ["peptide_sequences_1", "peptide_sequences_2"]} + **{ + key: value + for key, value in alternative_column_map_xl.items() + if key not in ["peptide_sequences_1", "peptide_sequences_2"] + }, } + class Koina: """A class for interacting with Koina models for inference.""" @@ -495,7 +500,7 @@ def predict_xl( data: Union[Dict[str, np.ndarray], pd.DataFrame], _async: bool = True, debug=False, - ) -> Dict[str, np.ndarray]: + ) -> Tuple[Dict[str, np.ndarray], 
Dict[str, np.ndarray]]: """ Perform inference on the xl data using the Koina model. @@ -539,7 +544,6 @@ def predict_xl( else: return self.__predict_sequential(data_1), self.__predict_sequential(data_2) - def __predict_async(self, data: Dict[str, np.ndarray], debug=False) -> Dict[str, np.ndarray]: """ Perform asynchronous inference on the given data using the Koina model. diff --git a/oktoberfest/predict/predict.py b/oktoberfest/predict/predict.py index 1688e64e..f83d7fa6 100644 --- a/oktoberfest/predict/predict.py +++ b/oktoberfest/predict/predict.py @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) -def predict(data: pd.DataFrame, xl : bool = False, **kwargs) -> Dict[str, np.ndarray]: +def predict(data: pd.DataFrame, xl: bool = False, **kwargs) -> Dict[str, np.ndarray]: """ Retrieve predictions from koina. @@ -29,10 +29,10 @@ def predict(data: pd.DataFrame, xl : bool = False, **kwargs) -> Dict[str, np.nda predictor = Koina(**kwargs) if xl: results_a, results_b = predictor.predict_xl(data) - return results_a, results_b + return results_a, results_b else: results = predictor.predict(data) - return results + return results def parse_fragment_labels( @@ -71,7 +71,9 @@ def parse_fragment_labels( return {"type": fragment_type_array, "number": fragment_number_array, "charge": fragment_charge_array} -def _prepare_alignment_df(library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool = False, xl : bool = False) -> Spectra: +def _prepare_alignment_df( + library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool = False, xl: bool = False +) -> Spectra: """ Prepare an alignment DataFrame from the given Spectra library. @@ -102,7 +104,9 @@ def _prepare_alignment_df(library: Spectra, ce_range: Tuple[int, int], group_by_ return alignment_library -def ce_calibration(library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool, xl : bool = False, **server_kwargs ) -> Spectra: +def ce_calibration( + library: Spectra, ce_range: Tuple[int, int], group_by_charge: bool, xl: bool = False, **server_kwargs +) -> Spectra: """ Calculate best collision energy for peptide property predictions. 
@@ -119,13 +123,13 @@ def ce_calibration(library: Spectra, ce_range: Tuple[int, int], group_by_charge: """ alignment_library = _prepare_alignment_df(library, ce_range=ce_range, group_by_charge=group_by_charge, xl=xl) if xl: - intensities_a, intensities_b = predict(alignment_library.obs, xl = True, **server_kwargs) + intensities_a, intensities_b = predict(alignment_library.obs, xl=True, **server_kwargs) alignment_library.add_matrix(intensities_a["intensities"], FragmentType.PRED_A) alignment_library.add_matrix(intensities_b["intensities"], FragmentType.PRED_B) else: intensities = predict(alignment_library.obs, **server_kwargs) alignment_library.add_matrix(intensities["intensities"], FragmentType.PRED) - + _alignment(alignment_library, xl=xl) return alignment_library @@ -148,13 +152,12 @@ def _alignment(alignment_library: Spectra, xl: bool = False): sm_b = SimilarityMetrics(pred_intensity_b, raw_intensity_b) alignment_library.add_column(sm_a.spectral_angle(raw_intensity_a, pred_intensity_a, 0), "SPECTRAL_ANGLE_A") alignment_library.add_column(sm_b.spectral_angle(raw_intensity_b, pred_intensity_b, 0), "SPECTRAL_ANGLE_B") - alignment_library.add_column((alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2, "SPECTRAL_ANGLE") + alignment_library.add_column( + (alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2, + "SPECTRAL_ANGLE", + ) else: pred_intensity = alignment_library.get_matrix(FragmentType.PRED)[0] raw_intensity = alignment_library.get_matrix(FragmentType.RAW)[0] sm = SimilarityMetrics(pred_intensity, raw_intensity) alignment_library.add_column(sm.spectral_angle(raw_intensity, pred_intensity, 0), "SPECTRAL_ANGLE") - - - - diff --git a/oktoberfest/preprocessing/preprocessing.py b/oktoberfest/preprocessing/preprocessing.py index 510801ce..1798f43a 100644 --- a/oktoberfest/preprocessing/preprocessing.py +++ b/oktoberfest/preprocessing/preprocessing.py @@ -14,7 +14,7 @@ from spectrum_io.d import convert_d_hdf, read_and_aggregate_timstof from spectrum_io.file import csv from spectrum_io.raw import ThermoRaw -from spectrum_io.search_result import Mascot, MaxQuant, MSFragger, Sage, Xisearch +from spectrum_io.search_result import Mascot, MaxQuant, MSFragger, Sage, Scout, Xisearch from spectrum_io.spectral_library.digest import get_peptide_to_protein_map from ..data.spectra import FragmentType, Spectra @@ -198,6 +198,7 @@ def filter_peptides( return peptides[peptide_filter] + def filter_xl_peptides(peptides: pd.DataFrame, min_length: int, max_length: int, max_charge: int) -> pd.DataFrame: """ Filter xl search results using given constraints. 
@@ -216,19 +217,20 @@ def filter_xl_peptides(peptides: pd.DataFrame, min_length: int, max_length: int, df = peptides.obs else: df = peptides - + peptide_filter = ( - (df["PEPTIDE_LENGTH_A"] <= max_length) - & (df["PEPTIDE_LENGTH_B"] <= max_length) - & (df["PEPTIDE_LENGTH_A"] >= min_length) - & (df["PEPTIDE_LENGTH_B"] >= min_length) - & (df["PRECURSOR_CHARGE"] <= max_charge) - & (df["MODIFIED_SEQUENCE_A"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]")) - & (df["MODIFIED_SEQUENCE_B"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]")) + (df["PEPTIDE_LENGTH_A"] <= max_length) + & (df["PEPTIDE_LENGTH_B"] <= max_length) + & (df["PEPTIDE_LENGTH_A"] >= min_length) + & (df["PEPTIDE_LENGTH_B"] >= min_length) + & (df["PRECURSOR_CHARGE"] <= max_charge) + & (df["MODIFIED_SEQUENCE_A"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]")) + & (df["MODIFIED_SEQUENCE_B"].str.contains(r"\[UNIMOD\:1896\]|\[UNIMOD\:1884\]")) ) return peptides[peptide_filter] + def process_and_filter_spectra_data(library: Spectra, model: str, tmt_label: Optional[str] = None) -> Spectra: """ Process and filter the spectra data in the given SpectralLibrary object. @@ -318,6 +320,8 @@ def convert_search( search_result = Sage elif search_engine == "xisearch": search_result = Xisearch + elif search_engine == "scout": + search_result = Scout else: raise ValueError(f"Unknown search engine provided: {search_engine}") @@ -581,6 +585,7 @@ def annotate_spectral_library( return aspec + def annotate_spectral_library_xl(psms: Spectra, mass_tol: Optional[float] = None, unit_mass_tol: Optional[str] = None): """ Annotate spectral library with peaks and mass for cross-linked peptides. diff --git a/oktoberfest/rescore/rescore.py b/oktoberfest/rescore/rescore.py index aae1c237..a82a08df 100644 --- a/oktoberfest/rescore/rescore.py +++ b/oktoberfest/rescore/rescore.py @@ -2,10 +2,11 @@ import subprocess from pathlib import Path from typing import List, Optional, Union -import scipy.sparse as sp + import mokapot import numpy as np import pandas as pd +import scipy.sparse as sp from spectrum_fundamentals.metrics.percolator import Percolator from ..data import FragmentType, Spectra @@ -34,21 +35,21 @@ def generate_features( :param regression_method: The regression method to use for iRT alignment """ if xl: - pred_a= library.get_matrix(FragmentType.PRED_A)[0] - pred_b= library.get_matrix(FragmentType.PRED_B)[0] - raw_a= library.get_matrix(FragmentType.RAW_A)[0] - raw_b= library.get_matrix(FragmentType.RAW_B)[0] - mz_a= library.get_matrix(FragmentType.MZ_A)[0] - mz_b= library.get_matrix(FragmentType.MZ_B)[0] + pred_a = library.get_matrix(FragmentType.PRED_A)[0] + pred_b = library.get_matrix(FragmentType.PRED_B)[0] + raw_a = library.get_matrix(FragmentType.RAW_A)[0] + raw_b = library.get_matrix(FragmentType.RAW_B)[0] + mz_a = library.get_matrix(FragmentType.MZ_A)[0] + mz_b = library.get_matrix(FragmentType.MZ_B)[0] perc_features = Percolator( - metadata=library.get_meta_data().reset_index(drop=True), - pred_intensities=sp.hstack([pred_a, pred_b]), - true_intensities=sp.hstack([raw_a, raw_b]), - mz=sp.hstack([mz_a, mz_b]), - input_type=search_type, - all_features_flag=all_features, - regression_method=regression_method, - ) + metadata=library.get_meta_data().reset_index(drop=True), + pred_intensities=sp.hstack([pred_a, pred_b]), + true_intensities=sp.hstack([raw_a, raw_b]), + mz=sp.hstack([mz_a, mz_b]), + input_type=search_type, + all_features_flag=all_features, + regression_method=regression_method, + ) else: perc_features = Percolator( 
metadata=library.get_meta_data().reset_index(drop=True), @@ -108,6 +109,7 @@ def rescore_with_percolator( num_threads: int = 3, test_fdr: float = 0.01, train_fdr: float = 0.01, + xl: bool = False, ): """ Rescore using percolator. @@ -140,18 +142,23 @@ def rescore_with_percolator( target_peptides = output_folder / f"{file_prefix}.percolator.peptides.txt" decoy_peptides = output_folder / f"{file_prefix}.percolator.decoy.peptides.txt" log_file = output_folder / f"{file_prefix}.log" - - cmd = f"percolator --weights {weights_file} \ - --num-threads {num_threads} \ - --subset-max-train 500000 \ - --post-processing-tdc \ - --testFDR {test_fdr} \ - --trainFDR {train_fdr} \ + if xl: + cmd = f"percolator --weights {weights_file} \ --results-psms {target_psms} \ --decoy-results-psms {decoy_psms} \ - --results-peptides {target_peptides} \ - --decoy-results-peptides {decoy_peptides} \ {input_file} 2> {log_file}" + else: + cmd = f"percolator --weights {weights_file} \ + --num-threads {num_threads} \ + --subset-max-train 500000 \ + --post-processing-tdc \ + --testFDR {test_fdr} \ + --trainFDR {train_fdr} \ + --results-psms {target_psms} \ + --decoy-results-psms {decoy_psms} \ + --results-peptides {target_peptides} \ + --decoy-results-peptides {decoy_peptides} \ + {input_file} 2> {log_file}" logger.info(f"Starting percolator with command {cmd}") subprocess.run(cmd, shell=True, check=True) diff --git a/oktoberfest/runner.py b/oktoberfest/runner.py index 8f5bd880..f72b463d 100644 --- a/oktoberfest/runner.py +++ b/oktoberfest/runner.py @@ -9,8 +9,9 @@ from multiprocessing import Manager, Process, pool from pathlib import Path from typing import Dict, List, Optional, Tuple, Type, Union -import pandas as pd + import numpy as np +import pandas as pd from sklearn.linear_model import LinearRegression, RANSACRegressor from spectrum_io.spectral_library import MSP, DLib, SpectralLibrary, Spectronaut from tqdm.auto import tqdm @@ -126,9 +127,13 @@ def _annotate_and_get_library(spectra_file: Path, config: Config, tims_meta_file search = pp.load_search(config.output / "msms" / spectra_file.with_suffix(".rescore").name) library = pp.merge_spectra_and_peptides(spectra, search) if "xl" in config.models["intensity"].lower(): - aspec = pp.annotate_spectral_library_xl(library, mass_tol=config.mass_tolerance, unit_mass_tol=config.unit_mass_tolerance) + aspec = pp.annotate_spectral_library_xl( + library, mass_tol=config.mass_tolerance, unit_mass_tol=config.unit_mass_tolerance + ) else: - paspecp.annotate_spectral_library(library, mass_tol=config.mass_tolerance, unit_mass_tol=config.unit_mass_tolerance) + paspecp.annotate_spectral_library( + library, mass_tol=config.mass_tolerance, unit_mass_tol=config.unit_mass_tolerance + ) aspec.write_as_hdf5(hdf5_path) # write_metadata_annotation return aspec @@ -145,7 +150,7 @@ def _get_best_ce(library: Spectra, spectra_file: Path, config: Config): } use_ransac_model = config.use_ransac_model if "xl" in config.models["intensity"].lower(): - alignment_library = pr.ce_calibration(library, config.ce_range, use_ransac_model, xl =True, **server_kwargs ) + alignment_library = pr.ce_calibration(library, config.ce_range, use_ransac_model, xl=True, **server_kwargs) else: alignment_library = pr.ce_calibration(library, config.ce_range, use_ransac_model, **server_kwargs) @@ -488,22 +493,22 @@ def _calculate_features(spectra_file: Path, config: Config, xl: bool = False): "ssl": config.ssl, } if xl: - pred_intensities_a, pred_intensities_b = pr.predict( - data=library.obs, - 
model_name=config.models["intensity"], - xl=True, - **predict_kwargs, - ) + pred_intensities_a, pred_intensities_b = pr.predict( + data=library.obs, + model_name=config.models["intensity"], + xl=True, + **predict_kwargs, + ) library.add_matrix(pred_intensities_a["intensities"], FragmentType.PRED_A) library.add_matrix(pred_intensities_b["intensities"], FragmentType.PRED_B) library.write_as_hdf5(config.output / "data" / spectra_file.with_suffix(".mzml.pred.hdf5").name) predict_step.mark_done() else: pred_intensities = pr.predict( - data=library.obs, - model_name=config.models["intensity"], - **predict_kwargs, - ) + data=library.obs, + model_name=config.models["intensity"], + **predict_kwargs, + ) pred_irts = pr.predict(data=library.obs, model_name=config.models["irt"], **predict_kwargs) library.add_matrix(pred_intensities["intensities"], FragmentType.PRED) library.add_column(pred_irts["irt"].squeeze(), name="PREDICTED_IRT") @@ -534,7 +539,7 @@ def _calculate_features(spectra_file: Path, config: Config, xl: bool = False): calc_feature_step.mark_done() -def _rescore(fdr_dir: Path, config: Config): +def _rescore(fdr_dir: Path, config: Config, xl : bool = False): """ High level rescore function for original and rescore. @@ -547,128 +552,240 @@ def _rescore(fdr_dir: Path, config: Config): if config.fdr_estimation_method == "percolator": if not rescore_original_step.is_done(): - re.rescore_with_percolator(input_file=fdr_dir / "original.tab", output_folder=fdr_dir) + re.rescore_with_percolator(input_file=fdr_dir / "original.tab", output_folder=fdr_dir, xl = xl) rescore_original_step.mark_done() if not rescore_prosit_step.is_done(): - re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir) + re.rescore_with_percolator(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir, xl = xl) rescore_prosit_step.mark_done() elif config.fdr_estimation_method == "mokapot": if not rescore_original_step.is_done(): - re.rescore_with_mokapot(input_file=fdr_dir / "original.tab", output_folder=fdr_dir) + re.rescore_with_mokapot(input_file=fdr_dir / "original.tab", output_folder=fdr_dir, xl = xl) rescore_original_step.mark_done() if not rescore_prosit_step.is_done(): - re.rescore_with_mokapot(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir) + re.rescore_with_mokapot(input_file=fdr_dir / "rescore.tab", output_folder=fdr_dir, xl = xl) rescore_prosit_step.mark_done() else: raise ValueError( 'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"' ) -def xl_fdr(df: pd.DataFrame, score: str) -> pd.DataFrame: + +def xl_fdr(df: pd.DataFrame, score: str) -> pd.DataFrame: """ - "calculate and add fdr_xl to the DataFrame : (TD-DD)/(TT)". + "calculate and add fdr_xl to the DataFrame : (TD-DD)/(TT)". :param df: DataFrame containing the data. 
- :param score: Column name containing the scores used for calculating FDR + :param score: Column name containing the scores used for calculating FDR :return: DataFrame with the new column 'fdr' """ df = df.sort_values(by=score, ascending=False) - df['TD_sum'] = (df['label'] == 'TD').cumsum() - df['DD_sum'] = (df['label'] == 'DD').cumsum() - df['TT_sum'] = (df['label'] == 'TT').cumsum() - df['mokapot q-value'] = (df['TD_sum'] - df['DD_sum']) / df['TT_sum'] - df.loc[df['TT_sum'] == 0, 'mokapot q-value'] = 0 # Handling division by zero - df = df.drop(['TD_sum', 'DD_sum', 'TT_sum'], axis=1) + df["TD_sum"] = (df["label"] == "TD").cumsum() + df["DD_sum"] = (df["label"] == "DD").cumsum() + df["TT_sum"] = (df["label"] == "TT").cumsum() + df["q-value"] = (df["TD_sum"] - df["DD_sum"]) / df["TT_sum"] + df.loc[df["TT_sum"] == 0, "q-value"] = 0 # Handling division by zero + df = df.drop(["TD_sum", "DD_sum", "TT_sum"], axis=1) return df -def xl_preprocessing_plot_unique_xl(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str): - columns_to_keep_mokapot = ["SpecId_a", "label", "ScanNr_a", "filename_a", "Peptide_a", "mokapot score", "mokapot q-value", "mokapot q-value_a", "mokapot q-value_b", "Proteins_b", "mod_pep_a_b", "mod_pep_b_b"] +def xl_between_or_self(df: pd.DataFrame, score: str) -> pd.DataFrame: + mokapot_csms_rescore_between =df[df["fdr_group"]=="between"] + mokapot_csms_rescore_self =df[df["fdr_group"]=="self"] + mokapot_csms_rescore_between= xl_fdr(mokapot_csms_rescore_between, score=score) + mokapot_csms_rescore_self= xl_fdr(mokapot_csms_rescore_self, score=score) + mokapot_csms_rescore = pd.concat([mokapot_csms_rescore_between, mokapot_csms_rescore_self], axis=0) + return mokapot_csms_rescore + + +def xl_preprocessing_plot_unique_xl(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str, percolator_or_mokapot: str): + mokapot_csms["is_target"] = mokapot_csms["label"] + mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms.csv") + columns_to_keep_mokapot = [ + "PSMId_a", + "label", + "is_target", + "scan_number_a", + "filename_a", + "peptide_a", + "mokapot score", + "q-value", + "q-value_a", + "q-value_b", + "proteinIds_b", + "mod_pep_a_b", + "fdr_group_a", + "mod_pep_b_b", + ] mokapot_csms = mokapot_csms[columns_to_keep_mokapot] - mokapot_csms = mokapot_csms.rename(columns={'SpecId_a': 'SpecId', 'ScanNr_a': 'ScanNr', 'filename_a': 'filename', 'Peptide_a': 'Peptide', 'Proteins_b': 'Proteins', 'mod_pep_a_b': 'mod_pep_a', 'mod_pep_b_b': 'mod_pep_b'}) - mokapot_csms['label'] = mokapot_csms['label'].replace({'TT': True, 'TD': False, 'DD': False}) - mokapot_csms['base_pep_a'] = mokapot_csms['mod_pep_a'].str.replace(r'\[.*?\]', '', regex=True) - mokapot_csms['base_pep_b'] = mokapot_csms['mod_pep_b'].str.replace(r'\[.*?\]', '', regex=True) - mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_10.csv") - mokapot_csms = mokapot_csms.sort_values(by='mokapot score', ascending = False) - swap_mask = mokapot_csms['base_pep_a'] > mokapot_csms['base_pep_b'] - mokapot_csms.loc[swap_mask, 'min_seq'] = mokapot_csms['base_pep_b'] - mokapot_csms.loc[swap_mask, 'max_seq'] = mokapot_csms['base_pep_a'] - mokapot_csms.loc[~swap_mask, 'min_seq'] = mokapot_csms['base_pep_a'] - mokapot_csms.loc[~swap_mask, 'max_seq'] = mokapot_csms['base_pep_b'] - mokapot_uniq_xls = mokapot_csms.drop_duplicates(subset=['min_seq', 'max_seq']) - xl_fdr(mokapot_uniq_xls, "mokapot score") - mokapot_uniq_xls_true = mokapot_uniq_xls[mokapot_uniq_xls['label'] == True] - 
mokapot_uniq_xls_decoy = mokapot_uniq_xls[mokapot_uniq_xls['label'] == False] + + mokapot_csms = mokapot_csms.rename( + columns={ + "PSMId_a": "SpecId", + "scan_number_a": "ScanNr", + "filename_a": "filename", + "peptide_a": "Peptide", + "proteinIds_b": "Proteins", + "mod_pep_a_b": "mod_pep_a", + "fdr_group_a": "fdr_group", + "mod_pep_b_b": "mod_pep_b", + } + ) + mokapot_csms["label"] = mokapot_csms["label"].replace({"TT": True, "TD": False, "DD": False}) + mokapot_csms["base_pep_a"] = mokapot_csms["mod_pep_a"].str.replace(r"\[.*?\]", "", regex=True) + mokapot_csms["base_pep_b"] = mokapot_csms["mod_pep_b"].str.replace(r"\[.*?\]", "", regex=True) + mokapot_csms = mokapot_csms.sort_values(by="mokapot score", ascending=False) + swap_mask = mokapot_csms["base_pep_a"] > mokapot_csms["base_pep_b"] + mokapot_csms.loc[swap_mask, "min_seq"] = mokapot_csms["base_pep_b"] + mokapot_csms.loc[swap_mask, "max_seq"] = mokapot_csms["base_pep_a"] + mokapot_csms.loc[~swap_mask, "min_seq"] = mokapot_csms["base_pep_a"] + mokapot_csms.loc[~swap_mask, "max_seq"] = mokapot_csms["base_pep_b"] + mokapot_uniq_xls = mokapot_csms.drop_duplicates(subset=["min_seq", "max_seq"]) + xl_between_or_self(mokapot_uniq_xls, "mokapot score") + mokapot_uniq_xls_true = mokapot_uniq_xls[mokapot_uniq_xls["label"] == True] + mokapot_uniq_xls_decoy = mokapot_uniq_xls[mokapot_uniq_xls["label"] == False] if original_or_rescore == "original": - mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.mokapot.peptides.txt", sep='\t', index=False) - mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.peptides.txt", sep='\t', index=False) + if percolator_or_mokapot == "percolator": + mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.percolator.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.percolator.decoy.peptides.txt", sep="\t", index=False) + else: + mokapot_uniq_xls_true.to_csv(featrures_dir + "/original.mokapot.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.peptides.txt", sep="\t", index=False) else: - mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.mokapot.peptides.txt", sep='\t', index=False) - mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.peptides.txt", sep='\t', index=False) - - -def xl_preprocessing_plot_csm(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str): - columns_to_keep_mokapot = ["SpecId_a", "label", "ScanNr_a", "filename_a", "Peptide_a", "mokapot score", "mokapot q-value", "mokapot q-value_a", "mokapot q-value_b", "Proteins_b"] + if percolator_or_mokapot == "percolator": + mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.percolator.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.percolator.decoy.peptides.txt", sep="\t", index=False) + else: + mokapot_uniq_xls_true.to_csv(featrures_dir + "/rescore.mokapot.peptides.txt", sep="\t", index=False) + mokapot_uniq_xls_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.peptides.txt", sep="\t", index=False) + + +def xl_preprocessing_plot_csm(featrures_dir: Path, mokapot_csms: pd.DataFrame, original_or_rescore: str, percolator_or_mokapot: str ): + columns_to_keep_mokapot = [ + "PSMId_a", + "label", + "scan_number_a", + "filename_a", + "peptide_a", + "mokapot score", + "q-value", + "q-value_a", + "q-value_b", + "fdr_group_a", + "proteinIds_b", + ] + mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_2.csv") mokapot_csms = 
mokapot_csms[columns_to_keep_mokapot] - mokapot_csms = mokapot_csms.rename(columns={'SpecId_a': 'SpecId', 'ScanNr_a': 'ScanNr', 'filename_a': 'filename', 'Peptide_a': 'Peptide', 'Proteins_b': 'Proteins'}) - mokapot_csms['label'] = mokapot_csms['label'].replace({'TT': True, 'TD': False, 'DD': False}) - mokapot_csms_true = mokapot_csms[mokapot_csms['label'] == True] - mokapot_csms_decoy = mokapot_csms[mokapot_csms['label'] == False] + mokapot_csms = mokapot_csms.rename( + columns={ + "PSMId_a": "SpecId", + "scan_number_a": "ScanNr", + "filename_a": "filename", + "peptide_a": "Peptide", + "fdr_group_a": "fdr_group", + "proteinIds_b": "Proteins", + } + ) + mokapot_csms["is_target"] = mokapot_csms["label"] + mokapot_csms["label"] = mokapot_csms["label"].replace({"TT": True, "TD": False, "DD": False}) + mokapot_csms_true = mokapot_csms[mokapot_csms["label"] == True] + mokapot_csms_true.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_true.csv") + + mokapot_csms_decoy = mokapot_csms[mokapot_csms["label"] == False] if original_or_rescore == "original": - mokapot_csms_true.to_csv(featrures_dir + "/original.mokapot.psms.txt", sep='\t', index=False) - mokapot_csms_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", sep='\t', index=False) + if percolator_or_mokapot == "percolator": + mokapot_csms_true.to_csv(featrures_dir + "/original.percolator.psms.txt", sep="\t", index=False) + mokapot_csms_decoy.to_csv(featrures_dir + "/original.percolator.decoy.psms.txt", sep="\t", index=False) + else: + mokapot_csms_true.to_csv(featrures_dir + "/original.mokapot.psms.txt", sep="\t", index=False) + mokapot_csms_decoy.to_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", sep="\t", index=False) else: - mokapot_csms_true.to_csv(featrures_dir + "/rescore.mokapot.psms.txt", sep='\t', index=False) - mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", sep='\t', index=False) - + if percolator_or_mokapot == "percolator": + mokapot_csms_true.to_csv(featrures_dir + "/rescore.percolator.psms.txt", sep="\t", index=False) + mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.percolator.decoy.psms.txt", sep="\t", index=False) + else: + mokapot_csms_true.to_csv(featrures_dir + "/rescore.mokapot.psms.txt", sep="\t", index=False) + mokapot_csms_decoy.to_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", sep="\t", index=False) + -def xl_psm_to_csm(featrures_dir: Path, original_or_rescore: str ): +def xl_psm_to_csm(featrures_dir: Path, original_or_rescore: str, percolator_or_mokapot: str): def get_label(row): - if row['is_decoy_p1_a'] == 'False' and row['is_decoy_p2_a'] == 'False': - return 'TT' - elif row['is_decoy_p1_a'] == 'True' and row['is_decoy_p2_a'] == 'True': - return 'DD' + if row["is_decoy_p1_a"] == "False" and row["is_decoy_p2_a"] == "False": + return "TT" + elif row["is_decoy_p1_a"] == "True" and row["is_decoy_p2_a"] == "True": + return "DD" else: - return 'TD' - + return "TD" + def min_score(row): - return min(row['mokapot score_a'], row['mokapot score_b']) - + return min(row["score_a"], row["score_b"]) + if original_or_rescore == "original": - decoy_psms = pd.read_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", delimiter='\t') - target_psms = pd.read_csv(featrures_dir + "/original.mokapot.psms.txt", delimiter='\t') - else: - decoy_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", delimiter='\t') - target_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.psms.txt", delimiter='\t') + if percolator_or_mokapot == "percolator": + 
decoy_psms = pd.read_csv(featrures_dir + "/original.percolator.decoy.psms.txt", delimiter="\t") + target_psms = pd.read_csv(featrures_dir + "/original.percolator.psms.txt", delimiter="\t") + psm_id = "PSMId" + else: + decoy_psms = pd.read_csv(featrures_dir + "/original.mokapot.decoy.psms.txt", delimiter="\t") + target_psms = pd.read_csv(featrures_dir + "/original.mokapot.psms.txt", delimiter="\t") + psm_id = "SpecId" - split_data = target_psms['SpecId'].str.rsplit('-', n=10, expand=True) - new_columns = ["raw_file","scan_number", "mod_pep_a", "mod_pep_b", "charge", - "decoy_p1", "is_decoy_p1", "decoy_p2", "is_decoy_p2", "index"] + else: + if percolator_or_mokapot == "percolator": + decoy_psms = pd.read_csv(featrures_dir + "/rescore.percolator.decoy.psms.txt", delimiter="\t") + target_psms = pd.read_csv(featrures_dir + "/rescore.percolator.psms.txt", delimiter="\t") + psm_id = "PSMId" + else: + decoy_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.decoy.psms.txt", delimiter="\t") + target_psms = pd.read_csv(featrures_dir + "/rescore.mokapot.psms.txt", delimiter="\t") + psm_id = "SpecId" + + split_data = target_psms[psm_id].str.rsplit("-", n=13, expand=True) + new_columns = [ + "raw_file", + "scan_number", + "mod_pep_a", + "mod_pep_b", + "charge", + "decoy_p1", + "is_decoy_p1", + "decoy_p2", + "is_decoy_p2", + "fdr_group", + "base_sequence_p1", + "base_sequence_p2", + "index", + ] split_data.columns = new_columns df_psm_target = pd.concat([target_psms, split_data], axis=1) - split_data = decoy_psms['SpecId'].str.rsplit('-', n=10, expand=True) - new_columns = ["raw_file","scan_number", "mod_pep_a", "mod_pep_b", "charge", - "decoy_p1", "is_decoy_p1", "decoy_p2", "is_decoy_p2", "index"] + split_data = decoy_psms[psm_id].str.rsplit("-", n=13, expand=True) split_data.columns = new_columns df_psm_decoy = pd.concat([decoy_psms, split_data], axis=1) - df_mokapot = pd.concat([df_psm_decoy , df_psm_target], axis = 0) - df_mokapot[['index_csm', '_', 'which_pep',]] = df_mokapot['index'].str.split('_', expand=True) - df_mokapot.drop(columns=['index', '_', 'decoy_p1', 'decoy_p2'], inplace=True) - df_pep_1 = df_mokapot[df_mokapot['which_pep'] == "1"].copy() - df_pep_2 = df_mokapot[df_mokapot['which_pep'] == "2"].copy() - df_pep_1.drop(columns=['which_pep'], inplace=True) - df_pep_2.drop(columns=['which_pep'], inplace=True) - df_pep_1.columns = [col + '_a' if col != 'index_csm' else col for col in df_pep_1.columns] - df_pep_2.columns = [col + '_b' if col != 'index_csm' else col for col in df_pep_2.columns] - mokapot_csms = pd.merge(df_pep_1, df_pep_2, on='index_csm') - mokapot_csms['mokapot score'] = mokapot_csms.apply(min_score, axis=1) - mokapot_csms['label'] = mokapot_csms.apply(get_label, axis=1) - mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms.csv") + df_mokapot = pd.concat([df_psm_decoy, df_psm_target], axis=0) + df_mokapot[ + [ + "index_csm", + "_", + "which_pep", + ] + ] = df_mokapot[ + "index" + ].str.split("_", expand=True) + df_mokapot.drop(columns=["index", "_", "decoy_p1", "decoy_p2"], inplace=True) + df_pep_1 = df_mokapot[df_mokapot["which_pep"] == "1"].copy() + df_pep_2 = df_mokapot[df_mokapot["which_pep"] == "2"].copy() + df_pep_1.drop(columns=["which_pep"], inplace=True) + df_pep_2.drop(columns=["which_pep"], inplace=True) + df_pep_1.columns = [col + "_a" if col != "index_csm" else col for col in df_pep_1.columns] + df_pep_2.columns = [col + "_b" if col != "index_csm" else col for col in df_pep_2.columns] + mokapot_csms = pd.merge(df_pep_1, df_pep_2, 
on="index_csm") + mokapot_csms["mokapot score"] = mokapot_csms.apply(min_score, axis=1) + mokapot_csms["label"] = mokapot_csms.apply(get_label, axis=1) + mokapot_csms.rename(columns={'fdr_group_b': 'fdr_group'}, inplace=True) + mokapot_csms.to_csv("/cmnfs/data/proteomics/XL/Ribosome/mokapot_csms_5.csv") return mokapot_csms -def prepare_rescore_xl_psm_level(featrures_dir: Path, original_or_rescore: str ): +def prepare_rescore_xl_psm_level(featrures_dir: Path, original_or_rescore: str): def extract_label_pep_a(specid): if "-decoy_p1-True-" in specid: return -1 @@ -676,6 +793,7 @@ def extract_label_pep_a(specid): return 1 else: return None + def extract_label_pep_b(specid): if "-decoy_p2-True-" in specid: return -1 @@ -683,98 +801,121 @@ def extract_label_pep_b(specid): return 1 else: return None - - columns_to_remove_psm_rescore= ["run_name", - "scan_number", - "precursor_mass", - "precursor_charge", - "crosslinker_name", - "base_sequence_p1", - "aa_len_p1", - "link_pos_p1", - "linked_aa_p1", - "mods_p1", - "mod_pos_p1", - "base_sequence_p2", - "aa_len_p2", - "link_pos_p2", - "linked_aa_p2", - "mods_p2", - "mod_pos_p2", - "linear", - "match_score", - "decoy", - "RAW_FILE", - "MASS", - "PRECURSOR_CHARGE", - "CROSSLINKER_TYPE", - "REVERSE", - "SCAN_NUMBER", - "SEQUENCE_A", - "SEQUENCE_B", - "Modifications_A", - "Modifications_B", - "CROSSLINKER_POSITION_A", - "CROSSLINKER_POSITION_B", - "ModificationPositions1", - "MZ_RANGE", - "ModificationPositions2", - "MODIFIED_SEQUENCE_A", - "MODIFIED_SEQUENCE_B", - "FRAGMENTATION", - "INSTRUMENT_TYPES", - "MASS_ANALYZER", - "Proteins", - "spectral_angle", - "SCORE", - "Unnamed: 0" + + columns_to_remove_psm_rescore = [ + "run_name", + "scan_number", + "precursor_mass", + "precursor_charge", + "crosslinker_name", + "aa_len_p1", + "link_pos_p1", + "linked_aa_p1", + "mods_p1", + "mod_pos_p1", + "aa_len_p2", + "link_pos_p2", + "linked_aa_p2", + "mods_p2", + "mod_pos_p2", + "linear", + "match_score", + "decoy", + "RAW_FILE", + "MASS", + "PRECURSOR_CHARGE", + "CROSSLINKER_TYPE", + "REVERSE", + "SCAN_NUMBER", + "SEQUENCE_A", + "SEQUENCE_B", + "Modifications_A", + "Modifications_B", + "CROSSLINKER_POSITION_A", + "CROSSLINKER_POSITION_B", + "ModificationPositions1", + "MZ_RANGE", + "ModificationPositions2", + "MODIFIED_SEQUENCE_A", + "MODIFIED_SEQUENCE_B", + "FRAGMENTATION", + "INSTRUMENT_TYPES", + "MASS_ANALYZER", + "Proteins", + "spectral_angle", + "SCORE", + "Unnamed: 0", ] - columns_to_remove_psm_original = list(set(columns_to_remove_psm_rescore + ["match_score"]) - set(["spectral_angle"])) + columns_to_remove_psm_original = list(set(columns_to_remove_psm_rescore + ["match_score"]) - {"spectral_angle"}) if original_or_rescore == "original": columns_to_remove_psm = columns_to_remove_psm_original - rescore_tab_file = pd.read_csv(featrures_dir + "/original.tab", sep="\t") + rescore_tab_file = pd.read_csv(featrures_dir + "/original.tab", sep="\t") else: columns_to_remove_psm = columns_to_remove_psm_rescore - rescore_tab_file = pd.read_csv(featrures_dir + "/rescore.tab", sep="\t") - + rescore_tab_file = pd.read_csv(featrures_dir + "/rescore.tab", sep="\t") rescore_tab_file.drop(columns=columns_to_remove_psm, inplace=True) rescore_tab_file = rescore_tab_file.fillna(0) - string_columns = ['SpecId', 'Peptide', 'Proteins', 'protein_p1', 'protein_p2', 'filename'] - rescore_tab_file_numeric_columns = [col for col in rescore_tab_file.columns if col not in string_columns] - rescore_tab_file[rescore_tab_file_numeric_columns] = 
rescore_tab_file[rescore_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce') + string_columns = ["SpecId", "Peptide", "Proteins", "protein_p1", "protein_p2", "filename", "fdr_group", "base_sequence_p1", "base_sequence_p2"] + rescore_tab_file_numeric_columns = [col for col in rescore_tab_file.columns if col not in string_columns] + rescore_tab_file[rescore_tab_file_numeric_columns] = rescore_tab_file[rescore_tab_file_numeric_columns].apply( + pd.to_numeric, errors="coerce" + ) rescore_tab_file = rescore_tab_file.reset_index(drop=True) - rescore_tab_file['Proteins'] = 'p1_' + rescore_tab_file['protein_p1'].astype(str) + '_p2_' + rescore_tab_file['protein_p2'].astype(str) - rescore_tab_file['SpecId'] = ( - rescore_tab_file['SpecId'] + '-decoy_p1-' + - rescore_tab_file['decoy_p1'].astype(str) + '-decoy_p2-' + - rescore_tab_file['decoy_p2'].astype(str) + '-' + - rescore_tab_file.index.astype(str) # Adding index to SpecId -) - rescore_tab_file.drop(columns=[ 'protein_p1', 'protein_p2', 'decoy_p1', 'decoy_p2'], inplace=True) - columns_feature_pep_a = [col for col in rescore_tab_file.columns if col.endswith('_a') or col.endswith('_A') or not (col.endswith('_b') or col.endswith('_B'))] - columns_feature_pep_b = [col for col in rescore_tab_file.columns if col.endswith('_b') or col.endswith('_B') or not (col.endswith('_a') or col.endswith('_A'))] + rescore_tab_file["Proteins"] = ( + "p1_" + rescore_tab_file["protein_p1"].astype(str) + "_p2_" + rescore_tab_file["protein_p2"].astype(str) + ) + rescore_tab_file["SpecId"] = ( + rescore_tab_file["SpecId"] + + "-decoy_p1-" + + rescore_tab_file["decoy_p1"].astype(str) + + "-decoy_p2-" + + rescore_tab_file["decoy_p2"].astype(str) + + "-" + + rescore_tab_file["fdr_group"].astype(str) + + "-" + + rescore_tab_file["base_sequence_p1"].astype(str) + + "-" + + rescore_tab_file["base_sequence_p2"].astype(str) + + "-" + + rescore_tab_file.index.astype(str) # Adding index to SpecId + ) + rescore_tab_file.drop(columns=["protein_p1", "protein_p2", "decoy_p1", "decoy_p2", "fdr_group","base_sequence_p1", "base_sequence_p2"], inplace=True) + columns_feature_pep_a = [ + col + for col in rescore_tab_file.columns + if col.endswith("_a") or col.endswith("_A") or not (col.endswith("_b") or col.endswith("_B")) + ] + columns_feature_pep_b = [ + col + for col in rescore_tab_file.columns + if col.endswith("_b") or col.endswith("_B") or not (col.endswith("_a") or col.endswith("_A")) + ] rescore_tab_file_a = rescore_tab_file.loc[:, columns_feature_pep_a] rescore_tab_file_b = rescore_tab_file.loc[:, columns_feature_pep_b] - rescore_tab_file_a['SpecId'] = rescore_tab_file_a['SpecId'] + '_' + rescore_tab_file_a.index.astype(str) - rescore_tab_file_b['SpecId'] = rescore_tab_file_b['SpecId'] + '_' + rescore_tab_file_b.index.astype(str) - #1 means pep a and 2 means pep b - rescore_tab_file_a['SpecId'] = rescore_tab_file_a['SpecId'] + '_1' - rescore_tab_file_b['SpecId'] = rescore_tab_file_b['SpecId'] + '_2' - rescore_tab_file_a.columns = [col[:-2] if col.endswith(('_a', '_A')) else col for col in rescore_tab_file_a.columns] - rescore_tab_file_b.columns = [col[:-2] if col.endswith(('_b', '_B')) else col for col in rescore_tab_file_b.columns] - rescore_tab_file_a['Label'] = rescore_tab_file_a['SpecId'].apply(extract_label_pep_a) - rescore_tab_file_b['Label'] = rescore_tab_file_b['SpecId'].apply(extract_label_pep_b) - string_columns = ['SpecId', 'Peptide', 'Proteins', 'filename'] - rescore_tab_file_numeric_columns = [col for col in rescore_tab_file_a.columns if col not in 
string_columns] - rescore_tab_file_a[rescore_tab_file_numeric_columns] = rescore_tab_file_a[rescore_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce') - rescore_tab_file_b[rescore_tab_file_numeric_columns] = rescore_tab_file_b[rescore_tab_file_numeric_columns].apply(pd.to_numeric, errors='coerce') + rescore_tab_file_a["SpecId"] = rescore_tab_file_a["SpecId"] + "_" + rescore_tab_file_a.index.astype(str) + rescore_tab_file_b["SpecId"] = rescore_tab_file_b["SpecId"] + "_" + rescore_tab_file_b.index.astype(str) + # 1 means pep a and 2 means pep b + rescore_tab_file_a["SpecId"] = rescore_tab_file_a["SpecId"] + "_1" + rescore_tab_file_b["SpecId"] = rescore_tab_file_b["SpecId"] + "_2" + rescore_tab_file_a.columns = [col[:-2] if col.endswith(("_a", "_A")) else col for col in rescore_tab_file_a.columns] + rescore_tab_file_b.columns = [col[:-2] if col.endswith(("_b", "_B")) else col for col in rescore_tab_file_b.columns] + rescore_tab_file_a["Label"] = rescore_tab_file_a["SpecId"].apply(extract_label_pep_a) + rescore_tab_file_b["Label"] = rescore_tab_file_b["SpecId"].apply(extract_label_pep_b) + string_columns = ["SpecId", "Peptide", "Proteins", "filename"] + rescore_tab_file_numeric_columns = [col for col in rescore_tab_file_a.columns if col not in string_columns] + rescore_tab_file_a[rescore_tab_file_numeric_columns] = rescore_tab_file_a[rescore_tab_file_numeric_columns].apply( + pd.to_numeric, errors="coerce" + ) + rescore_tab_file_b[rescore_tab_file_numeric_columns] = rescore_tab_file_b[rescore_tab_file_numeric_columns].apply( + pd.to_numeric, errors="coerce" + ) # change ExpMass of rescore_tab_file_a - max_ExpMass = rescore_tab_file_a['ExpMass'].max() - rescore_tab_file_b['ExpMass'] += max_ExpMass + max_ExpMass = rescore_tab_file_a["ExpMass"].max() + rescore_tab_file_b["ExpMass"] += max_ExpMass input_mokapot_psm = pd.concat([rescore_tab_file_a, rescore_tab_file_b], axis=0, ignore_index=True) - input_mokapot_psm['Proteins'].fillna("unknown", inplace=True) + input_mokapot_psm["Proteins"].fillna("unknown", inplace=True) return input_mokapot_psm @@ -816,7 +957,6 @@ def run_rescoring(config_path: Union[str, Path]): fdr_dir = config.output / "results" / config.fdr_estimation_method original_tab_files = [fdr_dir / spectra_file.with_suffix(".original.tab").name for spectra_file in spectra_files] rescore_tab_files = [fdr_dir / spectra_file.with_suffix(".rescore.tab").name for spectra_file in spectra_files] - prepare_tab_original_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_original") prepare_tab_rescore_step = ProcessStep(config.output, f"{config.fdr_estimation_method}_prepare_tab_prosit") @@ -830,23 +970,32 @@ def run_rescoring(config_path: Union[str, Path]): logger.info("Merging input tab files for rescoring with peptide property prediction") re.merge_input(tab_files=rescore_tab_files, output_file=fdr_dir / "rescore.tab") prepare_tab_rescore_step.mark_done() - + # rescoring - if "xl" in config.models["intensity"].lower(): #xl-psm-level + if "xl" in config.models["intensity"].lower(): # xl-psm-level input_mokapot_psm_rescore = prepare_rescore_xl_psm_level(str(fdr_dir), "rescore") input_mokapot_psm_rescore.to_csv(str(fdr_dir) + "/rescore.tab", sep="\t", index=None) input_mokapot_psm_original = prepare_rescore_xl_psm_level(str(fdr_dir), "original") input_mokapot_psm_original.to_csv(str(fdr_dir) + "/original.tab", sep="\t", index=None) - _rescore(fdr_dir, config) - mokapot_csms_rescore = xl_psm_to_csm(str(fdr_dir), "rescore") - mokapot_csms_original = 
xl_psm_to_csm(str(fdr_dir), "original") - mokapot_csms_rescore = xl_fdr(mokapot_csms_rescore, score = "mokapot score") - mokapot_csms_original = xl_fdr(mokapot_csms_original, score = "mokapot score") - xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_rescore, "rescore") - xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_original, "original") - xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_rescore, "rescore") - xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_original, "original") - + _rescore(fdr_dir, config, xl=True) + if config.fdr_estimation_method == "percolator": + mokapot_csms_rescore = xl_psm_to_csm(str(fdr_dir), "rescore", "percolator") + mokapot_csms_original = xl_psm_to_csm(str(fdr_dir), "original", "percolator") + mokapot_csms_rescore = xl_between_or_self(mokapot_csms_rescore, score="mokapot score") + mokapot_csms_original = xl_between_or_self(mokapot_csms_original, score="mokapot score") + xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_rescore, "rescore", "percolator") + xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_original, "original", "percolator") + xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_rescore, "rescore", "percolator") + xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_original, "original", "percolator") + else: + mokapot_csms_rescore = xl_psm_to_csm(str(fdr_dir), "rescore", "mokapot") + mokapot_csms_original = xl_psm_to_csm(str(fdr_dir), "original", "mokapot") + mokapot_csms_rescore = xl_between_or_self(mokapot_csms_rescore, score="mokapot score") + mokapot_csms_original = xl_between_or_self(mokapot_csms_original, score="mokapot score") + xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_rescore, "rescore", "mokapot") + xl_preprocessing_plot_csm(str(fdr_dir), mokapot_csms_original, "original", "mokapot") + xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_rescore, "rescore", "mokapot") + xl_preprocessing_plot_unique_xl(str(fdr_dir), mokapot_csms_original, "original", "mokapot") else: _rescore(fdr_dir, config)
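
Notes on the cross-link rescoring flow introduced above, with small self-contained Python sketches.

With the xl flag, predict() (backed by Koina.predict_xl) now returns one intensity dictionary per cross-linked peptide instead of a single dictionary. The stub below only illustrates that calling convention and the array shape downstream code consumes; it does not talk to a Koina server, and the 348-column width is an assumption derived from the 58 fragment positions * 6 ion columns laid out for xl models in spectra.py.

    from typing import Dict, Tuple, Union

    import numpy as np


    def predict_stub(
        n_psms: int, xl: bool = False
    ) -> Union[Dict[str, np.ndarray], Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]]:
        def fake_result() -> Dict[str, np.ndarray]:
            # one row per PSM/CSM, one column per annotated fragment ion
            return {"intensities": np.random.rand(n_psms, 348)}

        if xl:
            return fake_result(), fake_result()  # peptide A, peptide B
        return fake_result()


    intensities_a, intensities_b = predict_stub(10, xl=True)  # cross-linked: two matrices
    intensities = predict_stub(10)  # linear: one matrix, as before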
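
During CE calibration for cross-linked spectra, _alignment computes a spectral angle per peptide (SPECTRAL_ANGLE_A, SPECTRAL_ANGLE_B) and averages the two into SPECTRAL_ANGLE. The toy version below assumes the standard normalised spectral contrast angle; the real metric comes from spectrum_fundamentals' SimilarityMetrics and additionally handles ion masking, so this is only a sketch of how the per-peptide angles are combined.

    import numpy as np


    def spectral_angle(pred: np.ndarray, true: np.ndarray) -> np.ndarray:
        # normalised spectral contrast angle per spectrum (rows = spectra)
        pred_n = pred / np.linalg.norm(pred, axis=1, keepdims=True)
        true_n = true / np.linalg.norm(true, axis=1, keepdims=True)
        cos = np.clip(np.sum(pred_n * true_n, axis=1), -1.0, 1.0)
        return 1 - 2 * np.arccos(cos) / np.pi


    rng = np.random.default_rng(0)
    pred_a, raw_a = rng.random((5, 348)), rng.random((5, 348))
    pred_b, raw_b = rng.random((5, 348)), rng.random((5, 348))
    csm_sa = (spectral_angle(pred_a, raw_a) + spectral_angle(pred_b, raw_b)) / 2  # per-CSM SPECTRAL_ANGLE
    print(csm_sa)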
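
rescore_with_percolator now assembles a reduced command line when xl is set: the peptide-level result files and the threading/TDC/FDR tuning options are only passed for linear PSMs. Below is a sketch of that switch as an argument list; the flags are the ones appearing in the patch, while the weights/input file names are placeholders following the prefix pattern used there, and the option order is not significant.

    from pathlib import Path
    from typing import List


    def build_percolator_cmd(
        fdr_dir: Path, prefix: str, xl: bool, num_threads: int = 3, test_fdr: float = 0.01, train_fdr: float = 0.01
    ) -> List[str]:
        cmd = [
            "percolator",
            "--weights", str(fdr_dir / f"{prefix}.percolator.weights.csv"),
            "--results-psms", str(fdr_dir / f"{prefix}.percolator.psms.txt"),
            "--decoy-results-psms", str(fdr_dir / f"{prefix}.percolator.decoy.psms.txt"),
        ]
        if not xl:
            # peptide-level outputs and tuning options are only used for linear PSMs
            cmd += [
                "--num-threads", str(num_threads),
                "--subset-max-train", "500000",
                "--post-processing-tdc",
                "--testFDR", str(test_fdr),
                "--trainFDR", str(train_fdr),
                "--results-peptides", str(fdr_dir / f"{prefix}.percolator.peptides.txt"),
                "--decoy-results-peptides", str(fdr_dir / f"{prefix}.percolator.decoy.peptides.txt"),
            ]
        return cmd + [str(fdr_dir / f"{prefix}.tab")]


    print(" ".join(build_percolator_cmd(Path("results/percolator"), "rescore", xl=True)))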
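
prepare_rescore_xl_psm_level turns every CSM row of original.tab / rescore.tab into two PSM rows so that percolator or mokapot scores each peptide independently: peptide-specific columns (suffixes _a/_A and _b/_B) are separated, shared columns are copied to both rows, and the SpecId is extended with _1 / _2 so the rows can be re-paired later (the patch additionally offsets ExpMass for the second peptide's rows). A condensed illustration with made-up feature names:

    import pandas as pd


    def split_csm_features(csms: pd.DataFrame) -> pd.DataFrame:
        def take(side: str, tag: str) -> pd.DataFrame:
            other = "_b" if side == "_a" else "_a"
            keep = [c for c in csms.columns if not c.lower().endswith(other)]
            part = csms[keep].copy()
            part.columns = [c[:-2] if c.lower().endswith(side) else c for c in part.columns]
            part["SpecId"] = part["SpecId"] + tag  # _1 = peptide a, _2 = peptide b
            return part

        return pd.concat([take("_a", "_1"), take("_b", "_2")], ignore_index=True)


    csms = pd.DataFrame(
        {"SpecId": ["run1-1234"], "spectral_angle_a": [0.81], "spectral_angle_b": [0.64], "ExpMass": [2350.1]}
    )
    print(split_csm_features(csms))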
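
xl_psm_to_csm reverses that split after rescoring: the two peptide-level result rows belonging to one CSM are merged on a shared CSM index, the CSM score is the worse (minimum) of the two peptide scores, and the label becomes TT, TD or DD depending on how many of the two peptides are decoys. The essential logic, with simplified column names standing in for the suffixed ones produced in the patch:

    import pandas as pd


    def pair_psms_to_csms(psms: pd.DataFrame) -> pd.DataFrame:
        # psms: one row per peptide with columns csm_id, which_pep (1 or 2), score, is_decoy
        pep_a = psms[psms["which_pep"] == 1].set_index("csm_id").add_suffix("_a")
        pep_b = psms[psms["which_pep"] == 2].set_index("csm_id").add_suffix("_b")
        csms = pep_a.join(pep_b, how="inner").reset_index()
        csms["score"] = csms[["score_a", "score_b"]].min(axis=1)  # a CSM is only as good as its weaker peptide
        csms["label"] = (csms["is_decoy_a"].astype(int) + csms["is_decoy_b"]).map({0: "TT", 1: "TD", 2: "DD"})
        return csms


    psms = pd.DataFrame(
        {
            "csm_id": [0, 0, 1, 1],
            "which_pep": [1, 2, 1, 2],
            "score": [3.2, 1.7, 2.5, 2.9],
            "is_decoy": [False, False, False, True],
        }
    )
    print(pair_psms_to_csms(psms)[["csm_id", "score", "label"]])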
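
Finally, the cross-link FDR itself: xl_fdr sorts the paired CSMs by score and reports the running estimate (TD - DD) / TT as a q-value column, and xl_between_or_self applies it separately to between- and self-links before concatenating the results. A minimal standalone sketch using the same "label", "q-value" and "fdr_group" column names and an illustrative score column:

    import pandas as pd


    def xl_fdr_sketch(df: pd.DataFrame, score: str = "score") -> pd.DataFrame:
        # best-scoring CSMs first, then running counts of the three label classes
        df = df.sort_values(by=score, ascending=False).copy()
        td = (df["label"] == "TD").cumsum()
        dd = (df["label"] == "DD").cumsum()
        tt = (df["label"] == "TT").cumsum()
        df["q-value"] = (td - dd) / tt
        df.loc[tt == 0, "q-value"] = 0  # guard against division by zero before the first TT
        return df


    def grouped_xl_fdr_sketch(df: pd.DataFrame, score: str = "score") -> pd.DataFrame:
        # between- and self-links are treated as separate populations, as in xl_between_or_self
        return pd.concat([xl_fdr_sketch(part, score=score) for _, part in df.groupby("fdr_group")], axis=0)


    toy = pd.DataFrame(
        {
            "label": ["TT", "TT", "TD", "DD", "TT"],
            "score": [9.3, 8.1, 7.4, 6.0, 5.2],
            "fdr_group": ["between", "self", "between", "between", "self"],
        }
    )
    print(grouped_xl_fdr_sketch(toy))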