Skip to content

Commit

Permalink
Merge pull request #148 from wilhelm-lab/release/0.6.2
Browse files Browse the repository at this point in the history
Release/0.6.2
  • Loading branch information
picciama authored Sep 12, 2024
2 parents cb45899 + a6f2325 commit 236425a
Show file tree
Hide file tree
Showing 13 changed files with 109 additions and 131 deletions.
2 changes: 1 addition & 1 deletion .cookietemple.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ full_name: Mario Picciani
email: [email protected]
project_name: spectrum_io
project_short_description: IO related functionalities for oktoberfest.
version: 0.6.1
version: 0.6.2
license: MIT
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name-template: "0.6.1 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.6.1 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.6.2 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.6.2 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/publish_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
poetry build --ansi
- name: Publish package on PyPI
uses: pypa/gh-action-pypi-publish@v1.9.0
uses: pypa/gh-action-pypi-publish@v1.10.1
with:
# TODO COOKIETEMPLE: Configure your PyPI Token to enable automatic deployment to PyPi on releases
# https://help.github.com/en/actions/configuring-and-managing-workflows/creating-and-storing-encrypted-secrets
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/run_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ jobs:
with:
name: coverage-data
path: ".coverage.*"
include-hidden-files: "true"

- name: Upload documentation
if: matrix.session == 'docs-build'
Expand Down
2 changes: 1 addition & 1 deletion cookietemple.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.1
current_version = 0.6.2

[bumpversion_files_whitelisted]
init_file = spectrum_io/__init__.py
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.6.1"
version = "0.6.2"
# The full version, including alpha/beta/rc tags.
release = "0.6.1"
release = "0.6.2"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "spectrum_io"
version = "0.6.1" # <<COOKIETEMPLE_FORCE_BUMP>>
version = "0.6.2" # <<COOKIETEMPLE_FORCE_BUMP>>
description = "IO related functionalities for oktoberfest."
authors = ["Wilhelmlab at Technical University of Munich"]
license = "MIT"
Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
__author__ = """The Oktoberfest development team (Wilhelmlab at Technical University of Munich)"""
__copyright__ = f"Copyright {datetime.now():%Y}, Wilhelmlab at Technical University of Munich"
__license__ = "MIT"
__version__ = "0.6.1"
__version__ = "0.6.2"

import logging
import logging.handlers
Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


@click.command()
@click.version_option(version="0.6.1", message=click.style("spectrum_io Version: 0.6.1"))
@click.version_option(version="0.6.2", message=click.style("spectrum_io Version: 0.6.2"))
def main() -> None:
"""spectrum_io."""

Expand Down
1 change: 1 addition & 0 deletions spectrum_io/search_result/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from .mascot import Mascot
from .maxquant import MaxQuant
from .msamanda import MSAmanda
from .msfragger import MSFragger
from .sage import Sage
from .xisearch import Xisearch
207 changes: 92 additions & 115 deletions spectrum_io/search_result/msamanda.py
Original file line number Diff line number Diff line change
@@ -1,125 +1,102 @@
import logging
from pathlib import Path
from typing import Union
from typing import Dict, Optional

import pandas as pd
from spectrum_fundamentals.constants import PARTICLE_MASSES

from .search_results import filter_valid_prosit_sequences
from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods

logger = logging.getLogger(__name__)


def _update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame:
    """
    Update columns of df to work with Prosit.

    Validates modifications, rewrites the sequence into UNIMOD/ProForma notation,
    computes the neutral precursor mass, and returns only the columns Prosit needs.

    :param df: df to modify
    :return: modified df as pd.DataFrame
    """

    # Raise for any modification other than oxidation / carbamidomethylation;
    # modstring looks like "pos(ModName);pos(ModName)..." — only the name part is checked.
    def _check_unsupported_mods(modstring: str):
        if modstring:
            for mod in modstring.split(";"):
                if not mod.split("(", 1)[1].startswith(("Oxidation", "Carbamidomethyl")):
                    raise AssertionError(f"Unsupported modification: {mod}")

    # Lowercase residues are MSAmanda's markers for modified aminoacids:
    # m = oxidized methionine, c = carbamidomethylated cysteine.
    def _make_mod_seq(seq: str):
        seq = seq.replace("m", "M[UNIMOD:35]")
        seq = seq.replace("c", "C[UNIMOD:4]")
        return seq

    # Decoy hits are prefixed with "REV_" in the protein accession.
    df["REVERSE"] = df["Protein Accessions"].str.startswith("REV_")
    # Neutral (uncharged) mass from observed m/z: charge * m/z minus the proton masses.
    df["MASS"] = df["m/z"] * df["PRECURSOR_CHARGE"] - (df["PRECURSOR_CHARGE"] * PARTICLE_MASSES["PROTON"])
    # Validation only — the returned series is intentionally discarded;
    # _check_unsupported_mods raises on anything unsupported.
    df["Modifications"].fillna("").apply(_check_unsupported_mods)
    df["MODIFIED_SEQUENCE"] = df["SEQUENCE"].apply(_make_mod_seq)
    df["SEQUENCE"] = df["SEQUENCE"].str.upper()
    df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()
    # Strip the file extension from the raw file name.
    df["RAW_FILE"] = df["RAW_FILE"].str.replace(r"\.\w+$", "", regex=True)

    return df[
        [
            "SCORE",
            "MASS",
            "PRECURSOR_CHARGE",
            "RAW_FILE",
            "SCAN_NUMBER",
            "REVERSE",
            "MODIFIED_SEQUENCE",
            "SEQUENCE",
            "PEPTIDE_LENGTH",
        ]
    ]


def _remove_decoys_in_targets(full_df):
    """
    Drop decoy PSMs that collide with a target PSM on the same spectrum and sequence.

    A PSM is identified by (scan number, raw file, modified sequence); whenever that
    key occurs more than once, the decoy (REVERSE) copies are removed so only the
    target remains.

    :param full_df: dataframe of PSMs with SCAN_NUMBER, RAW_FILE, MODIFIED_SEQUENCE and REVERSE columns
    :raises AssertionError: if duplicate target PSMs remain after decoy removal
    :return: the deduplicated dataframe
    """
    psm_key = ["SCAN_NUMBER", "RAW_FILE", "MODIFIED_SEQUENCE"]
    duplicated_psms = full_df[psm_key].duplicated(keep=False)
    logger.info(f"Removing {sum(duplicated_psms)} duplicated PSMs...")
    # keep a row unless it is both a decoy and part of a duplicated key
    full_df = full_df[~(full_df["REVERSE"] & duplicated_psms)]
    if any(full_df[psm_key].duplicated(keep=False)):
        raise AssertionError("There are duplicate target PSMs. This is a bug of msamanda!")
    logger.info(f"{len(full_df)} remaining PSMs")

    return full_df


def read_result(path: Union[str, Path], suffix: str = "output.csv") -> pd.DataFrame:
"""
Function to read a msms txt and perform some basic formatting.
:param path: path to msms.txt to read
:param suffix: Optional suffix to determine which fileresult files should be taken from the supplied path
:raises FileNotFoundError: If the supplied path is not found
:raises AssertionError: If the supplied path does not contain any files matching the provided suffix.
:return: pd.DataFrame with the formatted data
"""
if isinstance(path, str):
path = Path(path)
if path.is_file():
pathlist = [path]
elif path.is_dir():
pathlist = list(path.glob(f"*{suffix}"))
if not pathlist:
raise AssertionError(f"The directory does not contain any files that match the pattern *{suffix}")
else:
raise FileNotFoundError(f"{path} does not exist.")

df_list = []
for output_file in pathlist:
logger.info(f"Reading {output_file}...")
df = pd.read_csv(
output_file,
sep="\t",
skiprows=1,
usecols=[
"Scan Number",
"Sequence",
"Modifications",
"Filename",
"Amanda Score",
"m/z",
"Charge",
"Protein Accessions",
],
class MSAmanda(SearchResults):
"""Handle search results from MSAmanda."""

@property
def standard_mods(self):
"""Standard modifications that are always applied if not otherwise specified."""
return {"m": 35, "c": 4}

def read_result(
self, tmt_label: str = "", custom_mods: Optional[Dict[str, int]] = None, suffix: str = "output.csv"
) -> pd.DataFrame:
"""
Function to read a msms txt and perform some basic formatting.
:param tmt_label: optional tmt label as str
:param custom_mods: optional dictionary mapping Sage-specific mod pattern to UNIMOD IDs.
If None, static carbamidomethylation of cytein and variable oxidation of methionine
are mapped automatically. To avoid this, explicitely provide an empty dictionary.
:param suffix: Optional suffix to determine which fileresult files should be taken from the supplied path
:raises FileNotFoundError: If the supplied path is not found
:raises AssertionError: If the supplied path does not contain any files matching the provided suffix.
:raises NotImplementedError: If tmt label was supplied.
:return: pd.DataFrame with the formatted data
"""
parsed_mods = parse_mods(self.standard_mods | (custom_mods or {}))
print(parsed_mods)
if tmt_label:
raise NotImplementedError("TMT is not supported for MSAmanda. Please open an issue on github.")

if self.path.is_file():
pathlist = [self.path]
print("hooray")
elif self.path.is_dir():
pathlist = list(self.path.glob(f"*{suffix}"))
if not pathlist:
raise AssertionError(f"The directory does not contain any files that match the pattern *{suffix}")
else:
raise FileNotFoundError(f"{self.path} does not exist.")

df_list = []
for output_file in pathlist:
logger.info(f"Reading {output_file}...")
df_list.append(
pd.read_csv(
output_file,
sep="\t",
skiprows=1,
usecols=[
"Scan Number",
"Sequence",
# "Modifications",
"Protein Accessions",
"Amanda Score",
"m/z",
"Charge",
"Filename",
],
)
)
logger.info(f"Finished reading {output_file}")

self.results = pd.concat(df_list)

self.convert_to_internal(mods=parsed_mods)
return filter_valid_prosit_sequences(self.results)

def convert_to_internal(self, mods: Dict[str, str]):
"""
Convert all columns in the Sage output to the internal format used by Oktoberfest.
:param mods: dictionary mapping Sage-specific mod patterns (keys) to ProForma standard (values)
"""
df = self.results
df["REVERSE"] = df["Protein Accessions"].str.startswith("REV_")
df["m/z"] = df["Charge"] * (df["m/z"] - PARTICLE_MASSES["PROTON"])
df["SEQUENCE"] = df["Sequence"].str.upper()
df.replace({"RAW_FILE": {r"\.\w+$": ""}, "Sequence": mods}, regex=True, inplace=True)
df["PEPTIDE_LENGTH"] = df["SEQUENCE"].str.len()

df.rename(
columns={
"Filename": "RAW_FILE",
"Scan Number": "SCAN_NUMBER",
"Sequence": "MODIFIED_SEQUENCE",
"Charge": "PRECURSOR_CHARGE",
"m/z": "MASS",
"Amanda Score": "SCORE",
"Protein Accessions": "PROTEINS",
},
inplace=True,
)
df.columns = [
"SCAN_NUMBER",
"SEQUENCE",
"Modifications",
"Protein Accessions",
"SCORE",
"m/z",
"PRECURSOR_CHARGE",
"RAW_FILE",
]
logger.info(f"Finished reading {output_file}")

df = _update_columns_for_prosit(df)
df = filter_valid_prosit_sequences(df)
df_list.append(df)

if len(df_list) == 1:
full_df = df_list[0]
full_df = pd.concat(df_list, axis=0)
full_df = _remove_decoys_in_targets(full_df)
return full_df
5 changes: 3 additions & 2 deletions spectrum_io/search_result/search_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def parse_mods(mods: Dict[str, int]) -> Dict[str, str]:
f"Replacement {k} not understood. Replacements must be strings and follow " f"the pattern {key_pattern}"
)
if k[0].isalpha():
unimod_regex_map[re.escape(k)] = f"{k[0]}[UNIMOD:{v}]"
unimod_regex_map[re.escape(k)] = f"{k[0].upper()}[UNIMOD:{v}]"
continue
if k[0] == "^":
unimod_regex_map[f"^{re.escape(k[1:])}"] = f"[UNIMOD:{v}]-"
Expand All @@ -87,7 +87,8 @@ def parse_mods(mods: Dict[str, int]) -> Dict[str, str]:
f"Replacement {k} not understood. {k[0]} is not a valid aminoacid. "
f"Replacements most follow the pattern {key_pattern}"
)
return unimod_regex_map
# sort descending by length to avoid replacement of single-aa keys within larger keys
return dict(sorted(unimod_regex_map.items(), key=lambda x: -len(x[0])))


class SearchResults:
Expand Down
6 changes: 2 additions & 4 deletions tests/unit_tests/test_sage.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,9 @@ class TestSage(unittest.TestCase):
def test_read_sage(self):
"""Test function for reading sage results and transforming to Prosit format."""
expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal.csv"
internal_search_results_df = Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result(
tmt_label="tmt"
)
search_results_df = Sage(Path(__file__).parent / "data" / "sage_output.tsv").read_result(tmt_label="tmt")
expected_df = pd.read_csv(expected_sage_internal_path)
pd.testing.assert_frame_equal(internal_search_results_df[COLUMNS], expected_df[COLUMNS])
pd.testing.assert_frame_equal(search_results_df[COLUMNS], expected_df[COLUMNS])

def test_read_sage_with_custom_mods(self):
"""Test function for reading sage results with custom mods and transforming to Prosit format ."""
Expand Down

0 comments on commit 236425a

Please sign in to comment.