From e840749e36a4d493314c6793d4615ed8463cca35 Mon Sep 17 00:00:00 2001 From: Joshua Klein Date: Fri, 19 Jul 2024 12:56:33 -0400 Subject: [PATCH] feature: add support for parsing ProForma style formulas --- Makefile | 6 ++ implementations/python/mzpaf/annotation.py | 10 +- .../mzpaf/data/reference_molecules.json | 46 +++++++- .../reference_data/reference_molecules.md | 102 +++++++++--------- 4 files changed, 110 insertions(+), 54 deletions(-) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..31dc404 --- /dev/null +++ b/Makefile @@ -0,0 +1,6 @@ + +all: publish-references + +publish-references: + cp ./specification/reference_data/reference_molecules.json ./implementations/python/mzpaf/data/reference_molecules.json + cd ./specification/reference_data/ && python reference_mol_to_md.py \ No newline at end of file diff --git a/implementations/python/mzpaf/annotation.py b/implementations/python/mzpaf/annotation.py index 4641207..4ac92a1 100644 --- a/implementations/python/mzpaf/annotation.py +++ b/implementations/python/mzpaf/annotation.py @@ -14,9 +14,10 @@ except ImportError: Composition = None try: - from pyteomics.proforma import ProForma + from pyteomics.proforma import (ProForma, FormulaModification) except ImportError: ProForma = None + FormulaModification = None from .reference import ReferenceMolecule @@ -34,7 +35,7 @@ (?P<reference_label>[^\]]+) \]) ))| - (?:f\{(?P<formula>[A-Za-z0-9]+)\})| + (?:f\{(?P<formula>[A-Za-z0-9\[\]]+)\})| (?:_\{ (?P<named_compound>[^\{\}\s,/]+) \})| @@ -723,6 +724,11 @@ def _populate_from_dict(self, data): self.formula = descr['formula'] return self + def to_composition(self) -> "Composition": + if Composition is None: + raise ImportError("Cannot use `to_composition` without `pyteomics`") + return FormulaModification(self.formula).resolve()['composition'] + class SMILESAnnotation(IonAnnotationBase): __slots__ = ("smiles", ) diff --git a/implementations/python/mzpaf/data/reference_molecules.json 
b/implementations/python/mzpaf/data/reference_molecules.json index e162be2..1985f86 100644 --- a/implementations/python/mzpaf/data/reference_molecules.json +++ b/implementations/python/mzpaf/data/reference_molecules.json @@ -27,228 +27,272 @@ "TMT126": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C8N1H15", "ion_mz": 126.127726 }, "TMT127N": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C8[15N1]H15", "ion_mz": 127.124761 }, "TMT127C": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C7[13C1]N1H15", "ion_mz": 127.131081 }, "TMT128N": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C7[13C1][15N1]H15", "ion_mz": 128.128116 }, "TMT128C": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C6[13C2]N1H15", "ion_mz": 128.134436 }, "TMT129N": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C6[13C2][15N1]H15", "ion_mz": 129.131471 }, "TMT129C": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C5[13C3]N1H15", "ion_mz": 129.13779 }, "TMT130N": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C5[13C3][15N1]H15", "ion_mz": 130.134825 }, "TMT130C": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C4[13C4]N1H15", "ion_mz": 130.141145 }, "TMT131N": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C4[13C4][15N1]H15", "ion_mz": 131.13818 }, "TMT131C": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C3[13C5]N1H15", "ion_mz": 131.1445 }, "TMT132N": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C3[13C5][15N1]H15", "ion_mz": 132.141535 }, "TMT132C": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C2[13C6]N1H15", "ion_mz": 122.147855 }, "TMT133N": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C2[13C6][15N1]H15", 
"ion_mz": 133.14489 }, "TMT133C": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C1[13C7]N1H15", "ion_mz": 133.15121 }, "TMT134N": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C1[13C7][15N1]H15", "ion_mz": 134.148245 }, "TMT134C": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "[13C8]N1H15", "ion_mz": 134.154565 }, "TMT135N": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "[13C8][15N1]H15", "ion_mz": 135.1516 }, "TMTzero": { "label_type": "TMTzero", "molecule_type": "reporter+balance", + "chemical_formula": "C12H20N2O2", "neutral_mass": 224.152478, "ion_mz": 225.15975447 }, "TMTpro_zero": { "label_type": "TMTpro_zero", "molecule_type": "reporter+balance", + "chemical_formula": "C15H25N3O3", "neutral_mass": 295.189592, "ion_mz": 296.1968685 }, "TMT2plex": { "label_type": "TMT2plex", "molecule_type": "reporter+balance", + "chemical_formula": "C11[13C1]H20N2O2", "neutral_mass": 225.155833, "ion_mz": 226.16310947 }, "TMT6plex": { "label_type": "TMT6plex", "molecule_type": "reporter+balance", + "chemical_formula": "C8[13C5]H20N1[15N1]O2", "neutral_mass": 229.162932, "ion_mz": 230.17020847 }, "TMTpro": { "label_type": "TMTpro", "molecule_type": "reporter+balance", + "chemical_formula": "C8[13C7]H25[15N2]N1O3", "neutral_mass": 304.207146, "ion_mz": 305.21442247 }, "iTRAQ113": { "label_type": "iTRAQ", "molecule_type": "reporter", + "chemical_formula": "C6N2H12", "ion_mz": 113.1078 }, "iTRAQ114": { "label_type": "iTRAQ", "molecule_type": "reporter", + "chemical_formula": "C5[13C1]N2H12", "ion_mz": 114.1112 }, "iTRAQ115": { "label_type": "iTRAQ", "molecule_type": "reporter", + "chemical_formula": "C5[13C1]N1[15N1]H12", "ion_mz": 115.1082 }, "iTRAQ116": { "label_type": "iTRAQ", "molecule_type": "reporter", + "chemical_formula": "C4[13C2]N1[15N1]H12", "ion_mz": 116.1116 }, "iTRAQ117": { "label_type": "iTRAQ", "molecule_type": "reporter", + "chemical_formula": 
"C3[13C3]N1[15N1]H12", "ion_mz": 117.1149 }, "iTRAQ118": { "label_type": "iTRAQ", "molecule_type": "reporter", + "chemical_formula": "C3[13C3][15N2]H12", "ion_mz": 118.112 }, "iTRAQ119": { "label_type": "iTRAQ", "molecule_type": "reporter", + "chemical_formula": "C2[13C4][15N2]H12", "ion_mz": 119.1153 }, "iTRAQ121": { "label_type": "iTRAQ", "molecule_type": "reporter", + "chemical_formula": "[13C6][15N2]H12", "ion_mz": 121.122 }, "iTRAQ4plex": { "label_type": "iTRAQ4plex", "molecule_type": "reporter+balance", + "chemical_formula": "C4[13C3]N1[15N1]O1H12", "neutral_mass": 144.102063, "ion_mz": 145.10933947 }, "iTRAQ8plex": { "label_type": "iTRAQ8plex", "molecule_type": "reporter+balance", + "chemical_formula": "C7[13C7]N3[15N1]O3H24", "neutral_mass": 304.205360, "ion_mz": 305.21263647 }, "TMT126-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C7N1H15", "ion_mz": 114.127725 }, "TMT127N-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C7[15N1]H15", "ion_mz": 115.12476 }, "TMT127C-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C6[13C1]N1H15", "ion_mz": 114.127725 }, "TMT128N-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C6[13C1][15N1]H15", "ion_mz": 115.12476 }, "TMT128C-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C5[13C2]N1H15", "ion_mz": 116.134433 }, "TMT129N-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C5[13C2][15N1]H15", "ion_mz": 117.131468 }, "TMT129C-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C4[13C3]N1H15", "ion_mz": 116.134433 }, "TMT130N-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C4[13C3][15N1]H15", "ion_mz": 117.131468 }, "TMT130C-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C3[13C4]N1H15", "ion_mz": 118.141141 }, "TMT131N-ETD": { "label_type": 
"TMT", "molecule_type": "reporter", + "chemical_formula": "C3[13C4][15N1]H15", "ion_mz": 119.138176 }, "TMT131C-ETD": { "label_type": "TMT", "molecule_type": "reporter", + "chemical_formula": "C2[13C5]N1H15", "ion_mz": 118.141141 } -} +} \ No newline at end of file diff --git a/specification/reference_data/reference_molecules.md b/specification/reference_data/reference_molecules.md index 3ab7d70..e5e780e 100644 --- a/specification/reference_data/reference_molecules.md +++ b/specification/reference_data/reference_molecules.md @@ -1,51 +1,51 @@ -| | molecule_type | neutral_mass | chemical_formula | label_type | ion_mz | -|:------------|:-----------------|---------------:|:-------------------|:-------------|---------:| -| Hex | monosaccharide | 162.053 | C6H10O5 | | | -| HexNAc | monosaccharide | 203.079 | C8H13N1O5 | | | -| dHex | monosaccharide | 146.058 | C6H10O4 | | | -| NeuAc | monosaccharide | 291.095 | C11H17N1O8 | | | -| NeuGc | monosaccharide | 307.09 | C11H17N1O9 | | | -| TMT126 | reporter | | | TMT | 126.128 | -| TMT127N | reporter | | | TMT | 127.125 | -| TMT127C | reporter | | | TMT | 127.131 | -| TMT128N | reporter | | | TMT | 128.128 | -| TMT128C | reporter | | | TMT | 128.134 | -| TMT129N | reporter | | | TMT | 129.131 | -| TMT129C | reporter | | | TMT | 129.138 | -| TMT130N | reporter | | | TMT | 130.135 | -| TMT130C | reporter | | | TMT | 130.141 | -| TMT131N | reporter | | | TMT | 131.138 | -| TMT131C | reporter | | | TMT | 131.144 | -| TMT132N | reporter | | | TMT | 132.142 | -| TMT132C | reporter | | | TMT | 122.148 | -| TMT133N | reporter | | | TMT | 133.145 | -| TMT133C | reporter | | | TMT | 133.151 | -| TMT134N | reporter | | | TMT | 134.148 | -| TMT134C | reporter | | | TMT | 134.155 | -| TMT135N | reporter | | | TMT | 135.152 | -| TMTzero | reporter+balance | 224.152 | | TMTzero | 225.16 | -| TMTpro_zero | reporter+balance | 295.19 | | TMTpro_zero | 296.197 | -| TMT2plex | reporter+balance | 225.156 | | TMT2plex | 226.163 | -| TMT6plex | 
reporter+balance | 229.163 | | TMT6plex | 230.17 | -| TMTpro | reporter+balance | 304.207 | | TMTpro | 305.214 | -| iTRAQ113 | reporter | | | iTRAQ | 113.108 | -| iTRAQ114 | reporter | | | iTRAQ | 114.111 | -| iTRAQ115 | reporter | | | iTRAQ | 115.108 | -| iTRAQ116 | reporter | | | iTRAQ | 116.112 | -| iTRAQ117 | reporter | | | iTRAQ | 117.115 | -| iTRAQ118 | reporter | | | iTRAQ | 118.112 | -| iTRAQ119 | reporter | | | iTRAQ | 119.115 | -| iTRAQ121 | reporter | | | iTRAQ | 121.122 | -| iTRAQ4plex | reporter+balance | 144.102 | | iTRAQ4plex | 145.109 | -| iTRAQ8plex | reporter+balance | 304.205 | | iTRAQ8plex | 305.213 | -| TMT126-ETD | reporter | | | TMT | 114.128 | -| TMT127N-ETD | reporter | | | TMT | 115.125 | -| TMT127C-ETD | reporter | | | TMT | 114.128 | -| TMT128N-ETD | reporter | | | TMT | 115.125 | -| TMT128C-ETD | reporter | | | TMT | 116.134 | -| TMT129N-ETD | reporter | | | TMT | 117.131 | -| TMT129C-ETD | reporter | | | TMT | 116.134 | -| TMT130N-ETD | reporter | | | TMT | 117.131 | -| TMT130C-ETD | reporter | | | TMT | 118.141 | -| TMT131N-ETD | reporter | | | TMT | 119.138 | -| TMT131C-ETD | reporter | | | TMT | 118.141 | \ No newline at end of file +| | molecule_type | neutral_mass | chemical_formula | label_type | ion_mz | +|:------------|:-----------------|---------------:|:----------------------|:-------------|---------:| +| Hex | monosaccharide | 162.053 | C6H10O5 | | | +| HexNAc | monosaccharide | 203.079 | C8H13N1O5 | | | +| dHex | monosaccharide | 146.058 | C6H10O4 | | | +| NeuAc | monosaccharide | 291.095 | C11H17N1O8 | | | +| NeuGc | monosaccharide | 307.09 | C11H17N1O9 | | | +| TMT126 | reporter | | C8N1H15 | TMT | 126.128 | +| TMT127N | reporter | | C8[15N1]H15 | TMT | 127.125 | +| TMT127C | reporter | | C7[13C1]N1H15 | TMT | 127.131 | +| TMT128N | reporter | | C7[13C1][15N1]H15 | TMT | 128.128 | +| TMT128C | reporter | | C6[13C2]N1H15 | TMT | 128.134 | +| TMT129N | reporter | | C6[13C2][15N1]H15 | TMT | 129.131 | +| TMT129C | reporter | 
| C5[13C3]N1H15 | TMT | 129.138 | +| TMT130N | reporter | | C5[13C3][15N1]H15 | TMT | 130.135 | +| TMT130C | reporter | | C4[13C4]N1H15 | TMT | 130.141 | +| TMT131N | reporter | | C4[13C4][15N1]H15 | TMT | 131.138 | +| TMT131C | reporter | | C3[13C5]N1H15 | TMT | 131.144 | +| TMT132N | reporter | | C3[13C5][15N1]H15 | TMT | 132.142 | +| TMT132C | reporter | | C2[13C6]N1H15 | TMT | 122.148 | +| TMT133N | reporter | | C2[13C6][15N1]H15 | TMT | 133.145 | +| TMT133C | reporter | | C1[13C7]N1H15 | TMT | 133.151 | +| TMT134N | reporter | | C1[13C7][15N1]H15 | TMT | 134.148 | +| TMT134C | reporter | | [13C8]N1H15 | TMT | 134.155 | +| TMT135N | reporter | | [13C8][15N1]H15 | TMT | 135.152 | +| TMTzero | reporter+balance | 224.152 | C12H20N2O2 | TMTzero | 225.16 | +| TMTpro_zero | reporter+balance | 295.19 | C15H25N3O3 | TMTpro_zero | 296.197 | +| TMT2plex | reporter+balance | 225.156 | C11[13C1]H20N2O2 | TMT2plex | 226.163 | +| TMT6plex | reporter+balance | 229.163 | C8[13C5]H20N1[15N1]O2 | TMT6plex | 230.17 | +| TMTpro | reporter+balance | 304.207 | C8[13C7]H25[15N2]N1O3 | TMTpro | 305.214 | +| iTRAQ113 | reporter | | C6N2H12 | iTRAQ | 113.108 | +| iTRAQ114 | reporter | | C5[13C1]N2H12 | iTRAQ | 114.111 | +| iTRAQ115 | reporter | | C5[13C1]N1[15N1]H12 | iTRAQ | 115.108 | +| iTRAQ116 | reporter | | C4[13C2]N1[15N1]H12 | iTRAQ | 116.112 | +| iTRAQ117 | reporter | | C3[13C3]N1[15N1]H12 | iTRAQ | 117.115 | +| iTRAQ118 | reporter | | C3[13C3][15N2]H12 | iTRAQ | 118.112 | +| iTRAQ119 | reporter | | C2[13C4][15N2]H12 | iTRAQ | 119.115 | +| iTRAQ121 | reporter | | [13C6][15N2]H12 | iTRAQ | 121.122 | +| iTRAQ4plex | reporter+balance | 144.102 | C4[13C3]N1[15N1]O1H12 | iTRAQ4plex | 145.109 | +| iTRAQ8plex | reporter+balance | 304.205 | C7[13C7]N3[15N1]O3H24 | iTRAQ8plex | 305.213 | +| TMT126-ETD | reporter | | C7N1H15 | TMT | 114.128 | +| TMT127N-ETD | reporter | | C7[15N1]H15 | TMT | 115.125 | +| TMT127C-ETD | reporter | | C6[13C1]N1H15 | TMT | 114.128 | +| TMT128N-ETD | reporter 
| | C6[13C1][15N1]H15 | TMT | 115.125 | +| TMT128C-ETD | reporter | | C5[13C2]N1H15 | TMT | 116.134 | +| TMT129N-ETD | reporter | | C5[13C2][15N1]H15 | TMT | 117.131 | +| TMT129C-ETD | reporter | | C4[13C3]N1H15 | TMT | 116.134 | +| TMT130N-ETD | reporter | | C4[13C3][15N1]H15 | TMT | 117.131 | +| TMT130C-ETD | reporter | | C3[13C4]N1H15 | TMT | 118.141 | +| TMT131N-ETD | reporter | | C3[13C4][15N1]H15 | TMT | 119.138 | +| TMT131C-ETD | reporter | | C2[13C5]N1H15 | TMT | 118.141 | \ No newline at end of file