Removed proteins without any PSMs below the FDR threshold from protei…

…n groups output. Setting the new `--keep_all_proteins` flag reverts to the old behavior.
kusterlab · May 30, 2023 · 012ed0d · 012ed0d
1 parent ec1e431
commit 012ed0d
Show file tree

Hide file tree

Showing 6 changed files with 170 additions and 13 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    },
+    "python.formatting.provider": "none"
+}
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,3 +1,10 @@
+v0.4.0
+- Removed proteins without any PSMs below the FDR threshold. Setting the new `--keep_all_proteins` flag reverts to the old behavior.
+- Added support for supplying multiple evidence files
+- Added support for mokapot format and allow users to specify methods with custom toml files
+- Add `--fasta_contains_decoys` flag and functionality
+- Update llvmlite (#4)
+
 v0.3.5
 - Added rescoring input (e.g. Prosit results) to GUI
 

diff --git a/picked_group_fdr/picked_group_fdr.py b/picked_group_fdr/picked_group_fdr.py
@@ -76,6 +76,10 @@ def parseArgs(argv):
                          help='''File with mapping from peptides to proteotypicity.
                                 ''')
 
+    apars.add_argument('--keep_all_proteins', default=None,
+                         help='''Keep proteins that do not have peptides below the PSM FDR filter.''',
+                         action='store_true')
+
     apars.add_argument('--gene_level',
                          help='Report gene-level statistics instead of protein group-level. This requires the GN= field to be present in the fasta file.',
                          action='store_true')
@@ -170,7 +174,7 @@ def main(argv):
                 peptideInfoList, args.mq_protein_groups, 
                 proteinAnnotations, peptideToProteotypicityMap,
                 config['pickedStrategy'], config['scoreType'], config['grouping'], 
-                plotter)
+                plotter, args.keep_all_proteins)
 
         if args.do_quant:
             doQuantification(config, args, proteinGroupResults, parseId, peptideToProteinMap)
@@ -194,7 +198,8 @@ def getProteinGroupResults(
         pickedStrategy: ProteinCompetitionStrategy,
         scoreType: ProteinScoringStrategy,
         groupingStrategy: ProteinGroupingStrategy, 
-        plotter: Union[Plotter, NoPlotter]):
+        plotter: Union[Plotter, NoPlotter],
+        keep_all_proteins: bool):
     proteinGroups = groupingStrategy.group_proteins(peptideInfoList, mqProteinGroupsFile)
 
     # for razor peptide strategy
@@ -224,7 +229,8 @@ def getProteinGroupResults(
         proteinGroupResults = ProteinGroupResults.from_protein_groups(
                 pickedProteinGroups, pickedProteinGroupPeptideInfos, 
                 proteinScores, reportedQvals, 
-                scoreCutoff, proteinAnnotations)
+                scoreCutoff, proteinAnnotations,
+                keep_all_proteins)
 
     if scoreType.use_proteotypicity:
         proteotypicity.calculateProteotypicityScores(pickedProteinGroups, pickedProteinGroupPeptideInfos, peptideToProteotypicityMap, scoreType, scoreCutoff)

diff --git a/picked_group_fdr/quantification.py b/picked_group_fdr/quantification.py
@@ -33,7 +33,7 @@ def parseArgs(argv):
     apars = argparse.ArgumentParser(
             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
-    apars.add_argument('--mq_evidence', default=None, metavar = "EV", required = True,
+    apars.add_argument('--mq_evidence', default=None, metavar = "EV", required = True, nargs="+",
                          help='''MaxQuant evidence file.''')
 
     apars.add_argument('--mq_protein_groups', default=None, metavar = "PG", required = True,

diff --git a/picked_group_fdr/results.py b/picked_group_fdr/results.py
@@ -63,14 +63,21 @@ def from_mq_protein_groups(cls, row, cols):
                              numberOfProteins, qValue, score, reverse, potentialContaminant, [], [])
 
     @classmethod
-    def from_protein_group(cls, proteinGroup, peptideScores, reportedFdr, proteinScore, scoreCutoff, proteinAnnotations):
-        proteinIds = ";".join(proteinGroup)
-        bestPeptide = sorted([(p[0], p[1]) for p in peptideScores])[0][1]
+    def from_protein_group(cls, proteinGroup, peptideScores, reportedFdr, proteinScore, scoreCutoff, proteinAnnotations, keep_all_proteins):
         numUniquePeptidesPerProtein = cls._get_peptide_counts(peptideScores, scoreCutoff)
-        peptideCountsUnique = ";".join([str(numUniquePeptidesPerProtein[p]) for p in proteinGroup])
-        majorityProteinIds = ";".join([p for p in proteinGroup if numUniquePeptidesPerProtein[p] >= max(numUniquePeptidesPerProtein.values()) / 2])
-        numberOfProteins = len(proteinGroup)
+        peptideCountsUnique = [numUniquePeptidesPerProtein[p] for p in proteinGroup]
+        if sum(peptideCountsUnique) == 0 and not keep_all_proteins:
+            return None
+
+        proteinGroup, peptideCountsUnique = zip(*[(p, num_peptides) for p, num_peptides in zip(proteinGroup, peptideCountsUnique) if num_peptides > 0 or keep_all_proteins])
 
+        bestPeptide = sorted([(p[0], p[1]) for p in peptideScores])[0][1]
+        majorityProteinIds = ";".join([p for p, num_peptides in zip(proteinGroup, peptideCountsUnique) if num_peptides >= max(peptideCountsUnique) / 2])
+        numberOfProteins = len(proteinGroup)
+
+        peptideCountsUnique = ";".join(map(str, peptideCountsUnique))
+        proteinIds = ";".join(proteinGroup)
+
         proteinNames, geneNames, fastaHeaders = list(), list(), list()
         for p in proteinGroup:
             if p in proteinAnnotations:
@@ -193,13 +200,15 @@ def from_protein_groups(cls,
             proteinScores: List[float], 
             reportedQvals: List[float], 
             scoreCutoff: float, 
-            proteinAnnotations) -> ProteinGroupResults:
+            proteinAnnotations,
+            keep_all_proteins: bool) -> ProteinGroupResults:
         protein_group_results = list()
         for proteinGroup, peptideScores, proteinScore, reportedFdr in zip(proteinGroups, proteinGroupPeptideInfos, proteinScores, reportedQvals):
             if helpers.isObsolete(proteinGroup):
                 continue
-            pgr = ProteinGroupResult.from_protein_group(proteinGroup, peptideScores, reportedFdr, proteinScore, scoreCutoff, proteinAnnotations)
-            protein_group_results.append(pgr)
+            pgr = ProteinGroupResult.from_protein_group(proteinGroup, peptideScores, reportedFdr, proteinScore, scoreCutoff, proteinAnnotations, keep_all_proteins)
+            if pgr is not None: # protein groups can get filtered out when they do not have any PSMs below the PSM FDR cutoff
+                protein_group_results.append(pgr)
 
         return cls(protein_group_results)
 
diff --git a/tests/unit_tests/test_results.py b/tests/unit_tests/test_results.py
@@ -0,0 +1,129 @@
+from picked_group_fdr.results import ProteinGroupResult
+
+
+def test_extend():
+    p = ProteinGroupResult()
+    p.extend([1, 2, 3])
+    assert p.extraColumns == [1, 2, 3]
+
+
+def test_append():
+    p = ProteinGroupResult()
+    p.append(1)
+    assert p.extraColumns == [1]
+
+
+def test_from_mq_protein_groups():
+    row = {
+        "Protein IDs": "123",
+        "Majority protein IDs": "456",
+        "Peptide counts (unique)": "3",
+        "Protein names": "protein1",
+        "Gene names": "gene1",
+        "Fasta headers": "header1",
+        "Number of proteins": "1",
+        "Q-value": "0.05",
+        "Score": "10",
+        "Reverse": "+",
+        "Potential contaminant": "",
+    }
+    cols = {
+        "Protein IDs": 0,
+        "Majority protein IDs": 1,
+        "Peptide counts (unique)": 2,
+        "Protein names": 3,
+        "Gene names": 4,
+        "Fasta headers": 5,
+        "Number of proteins": 6,
+        "Q-value": 7,
+        "Score": 8,
+        "Reverse": 9,
+        "Potential contaminant": 10,
+    }
+    p = ProteinGroupResult.from_mq_protein_groups(list(row.values()), cols)
+    assert p.proteinIds == "123"
+    assert p.majorityProteinIds == "456"
+    assert p.peptideCountsUnique == "3"
+    assert p.proteinNames == "protein1"
+    assert p.geneNames == "gene1"
+    assert p.fastaHeaders == "header1"
+    assert p.numberOfProteins == 1
+    assert p.qValue == 0.05
+    assert p.score == 10
+    assert p.reverse == "+"
+    assert p.potentialContaminant == ""
+
+
+def test_from_protein_group():
+    proteinGroup = ["protein1", "protein2"]
+    peptideScores = [
+        (0.05, "peptide1", ["protein1"]),
+        (0.03, "peptide2", ["protein1", "protein2"]),
+    ]
+    reportedFdr = 0.05
+    proteinScore = 10
+    scoreCutoff = 0.01
+    proteinAnnotations = {
+        "protein1": ("protein1", "gene1", "header1"),
+        "protein2": ("protein2", "gene2", "header2"),
+    }
+    keep_all_proteins = False
+    p = ProteinGroupResult.from_protein_group(
+        proteinGroup,
+        peptideScores,
+        reportedFdr,
+        proteinScore,
+        scoreCutoff,
+        proteinAnnotations,
+        keep_all_proteins,
+    )
+    assert p is None
+
+
+def test_from_protein_group_keep_all_proteins():
+    proteinGroup = ["protein1", "protein2"]
+    peptideScores = [
+        (0.05, "peptide1", ["protein1"]),
+        (0.03, "peptide2", ["protein1", "protein2"]),
+    ]
+    reportedFdr = 0.05
+    proteinScore = 10
+    scoreCutoff = 0.01
+    proteinAnnotations = {
+        "protein1": ("protein1", "gene1", "header1"),
+        "protein2": ("protein2", "gene2", "header2"),
+    }
+    keep_all_proteins = True
+    p = ProteinGroupResult.from_protein_group(
+        proteinGroup,
+        peptideScores,
+        reportedFdr,
+        proteinScore,
+        scoreCutoff,
+        proteinAnnotations,
+        keep_all_proteins,
+    )
+    assert p.proteinIds == "protein1;protein2"
+    assert p.majorityProteinIds == "protein1;protein2"
+    assert p.peptideCountsUnique == "0;0"
+    assert p.proteinNames == "protein1;protein2"
+    assert p.geneNames == "gene1;gene2"
+    assert p.fastaHeaders == "header1;header2"
+    assert p.numberOfProteins == 2
+    assert p.qValue == 0.05
+    assert p.score == 10
+    assert p.reverse == ""
+    assert p.potentialContaminant == ""
+
+
+def test_get_peptide_counts():
+    scorePeptidePairs = [
+        (0.05, "peptide1", ["protein1"]),
+        (0.03, "peptide2", ["protein1", "protein2"]),
+        (0.02, "peptide3", ["protein2"]),
+        (0.01, "peptide4", ["protein3"]),
+    ]
+    scoreCutoff = 0.02
+    p = ProteinGroupResult()
+    counts = p._get_peptide_counts(scorePeptidePairs, scoreCutoff)
+    assert counts == {"protein2": 1, "protein3": 1}