Skip to content

Commit

Permalink
Removed proteins without any PSMs below the FDR threshold from protei…
Browse files Browse the repository at this point in the history
…n groups output. Setting the new `--keep_all_proteins` flag reverts to the old behavior.
  • Loading branch information
MatthewThe committed May 30, 2023
1 parent ec1e431 commit 012ed0d
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 13 deletions.
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "none"
}
7 changes: 7 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
v0.4.0
- Removed proteins without any PSMs below the FDR threshold. Setting the new `--keep_all_proteins` flag reverts to the old behavior.
- Added support for supplying multiple evidence files
- Added support for mokapot format and allow users to specify methods with custom toml files
- Add `--fasta_contains_decoys` flag and functionality
- Update llvmlite (#4)

v0.3.5
- Added rescoring input (e.g. Prosit results) to GUI

Expand Down
12 changes: 9 additions & 3 deletions picked_group_fdr/picked_group_fdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ def parseArgs(argv):
help='''File with mapping from peptides to proteotypicity.
''')

apars.add_argument('--keep_all_proteins', default=None,
help='''Keep proteins that do not have peptides below the PSM FDR filter.''',
action='store_true')

apars.add_argument('--gene_level',
help='Report gene-level statistics instead of protein group-level. This requires the GN= field to be present in the fasta file.',
action='store_true')
Expand Down Expand Up @@ -170,7 +174,7 @@ def main(argv):
peptideInfoList, args.mq_protein_groups,
proteinAnnotations, peptideToProteotypicityMap,
config['pickedStrategy'], config['scoreType'], config['grouping'],
plotter)
plotter, args.keep_all_proteins)

if args.do_quant:
doQuantification(config, args, proteinGroupResults, parseId, peptideToProteinMap)
Expand All @@ -194,7 +198,8 @@ def getProteinGroupResults(
pickedStrategy: ProteinCompetitionStrategy,
scoreType: ProteinScoringStrategy,
groupingStrategy: ProteinGroupingStrategy,
plotter: Union[Plotter, NoPlotter]):
plotter: Union[Plotter, NoPlotter],
keep_all_proteins: bool):
proteinGroups = groupingStrategy.group_proteins(peptideInfoList, mqProteinGroupsFile)

# for razor peptide strategy
Expand Down Expand Up @@ -224,7 +229,8 @@ def getProteinGroupResults(
proteinGroupResults = ProteinGroupResults.from_protein_groups(
pickedProteinGroups, pickedProteinGroupPeptideInfos,
proteinScores, reportedQvals,
scoreCutoff, proteinAnnotations)
scoreCutoff, proteinAnnotations,
keep_all_proteins)

if scoreType.use_proteotypicity:
proteotypicity.calculateProteotypicityScores(pickedProteinGroups, pickedProteinGroupPeptideInfos, peptideToProteotypicityMap, scoreType, scoreCutoff)
Expand Down
2 changes: 1 addition & 1 deletion picked_group_fdr/quantification.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def parseArgs(argv):
apars = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)

apars.add_argument('--mq_evidence', default=None, metavar = "EV", required = True,
apars.add_argument('--mq_evidence', default=None, metavar = "EV", required = True, nargs="+",
help='''MaxQuant evidence file.''')

apars.add_argument('--mq_protein_groups', default=None, metavar = "PG", required = True,
Expand Down
27 changes: 18 additions & 9 deletions picked_group_fdr/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,21 @@ def from_mq_protein_groups(cls, row, cols):
numberOfProteins, qValue, score, reverse, potentialContaminant, [], [])

@classmethod
def from_protein_group(cls, proteinGroup, peptideScores, reportedFdr, proteinScore, scoreCutoff, proteinAnnotations):
proteinIds = ";".join(proteinGroup)
bestPeptide = sorted([(p[0], p[1]) for p in peptideScores])[0][1]
def from_protein_group(cls, proteinGroup, peptideScores, reportedFdr, proteinScore, scoreCutoff, proteinAnnotations, keep_all_proteins):
numUniquePeptidesPerProtein = cls._get_peptide_counts(peptideScores, scoreCutoff)
peptideCountsUnique = ";".join([str(numUniquePeptidesPerProtein[p]) for p in proteinGroup])
majorityProteinIds = ";".join([p for p in proteinGroup if numUniquePeptidesPerProtein[p] >= max(numUniquePeptidesPerProtein.values()) / 2])
numberOfProteins = len(proteinGroup)
peptideCountsUnique = [numUniquePeptidesPerProtein[p] for p in proteinGroup]
if sum(peptideCountsUnique) == 0 and not keep_all_proteins:
return None

proteinGroup, peptideCountsUnique = zip(*[(p, num_peptides) for p, num_peptides in zip(proteinGroup, peptideCountsUnique) if num_peptides > 0 or keep_all_proteins])

bestPeptide = sorted([(p[0], p[1]) for p in peptideScores])[0][1]
majorityProteinIds = ";".join([p for p, num_peptides in zip(proteinGroup, peptideCountsUnique) if num_peptides >= max(peptideCountsUnique) / 2])
numberOfProteins = len(proteinGroup)

peptideCountsUnique = ";".join(map(str, peptideCountsUnique))
proteinIds = ";".join(proteinGroup)

proteinNames, geneNames, fastaHeaders = list(), list(), list()
for p in proteinGroup:
if p in proteinAnnotations:
Expand Down Expand Up @@ -193,13 +200,15 @@ def from_protein_groups(cls,
proteinScores: List[float],
reportedQvals: List[float],
scoreCutoff: float,
proteinAnnotations) -> ProteinGroupResults:
proteinAnnotations,
keep_all_proteins: bool) -> ProteinGroupResults:
protein_group_results = list()
for proteinGroup, peptideScores, proteinScore, reportedFdr in zip(proteinGroups, proteinGroupPeptideInfos, proteinScores, reportedQvals):
if helpers.isObsolete(proteinGroup):
continue
pgr = ProteinGroupResult.from_protein_group(proteinGroup, peptideScores, reportedFdr, proteinScore, scoreCutoff, proteinAnnotations)
protein_group_results.append(pgr)
pgr = ProteinGroupResult.from_protein_group(proteinGroup, peptideScores, reportedFdr, proteinScore, scoreCutoff, proteinAnnotations, keep_all_proteins)
if pgr is not None: # protein groups can get filtered out when they do not have any PSMs below the PSM FDR cutoff
protein_group_results.append(pgr)

return cls(protein_group_results)

129 changes: 129 additions & 0 deletions tests/unit_tests/test_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from picked_group_fdr.results import ProteinGroupResult


def test_extend():
p = ProteinGroupResult()
p.extend([1, 2, 3])
assert p.extraColumns == [1, 2, 3]


def test_append():
p = ProteinGroupResult()
p.append(1)
assert p.extraColumns == [1]


def test_from_mq_protein_groups():
row = {
"Protein IDs": "123",
"Majority protein IDs": "456",
"Peptide counts (unique)": "3",
"Protein names": "protein1",
"Gene names": "gene1",
"Fasta headers": "header1",
"Number of proteins": "1",
"Q-value": "0.05",
"Score": "10",
"Reverse": "+",
"Potential contaminant": "",
}
cols = {
"Protein IDs": 0,
"Majority protein IDs": 1,
"Peptide counts (unique)": 2,
"Protein names": 3,
"Gene names": 4,
"Fasta headers": 5,
"Number of proteins": 6,
"Q-value": 7,
"Score": 8,
"Reverse": 9,
"Potential contaminant": 10,
}
p = ProteinGroupResult.from_mq_protein_groups(list(row.values()), cols)
assert p.proteinIds == "123"
assert p.majorityProteinIds == "456"
assert p.peptideCountsUnique == "3"
assert p.proteinNames == "protein1"
assert p.geneNames == "gene1"
assert p.fastaHeaders == "header1"
assert p.numberOfProteins == 1
assert p.qValue == 0.05
assert p.score == 10
assert p.reverse == "+"
assert p.potentialContaminant == ""


def test_from_protein_group():
proteinGroup = ["protein1", "protein2"]
peptideScores = [
(0.05, "peptide1", ["protein1"]),
(0.03, "peptide2", ["protein1", "protein2"]),
]
reportedFdr = 0.05
proteinScore = 10
scoreCutoff = 0.01
proteinAnnotations = {
"protein1": ("protein1", "gene1", "header1"),
"protein2": ("protein2", "gene2", "header2"),
}
keep_all_proteins = False
p = ProteinGroupResult.from_protein_group(
proteinGroup,
peptideScores,
reportedFdr,
proteinScore,
scoreCutoff,
proteinAnnotations,
keep_all_proteins,
)
assert p is None


def test_from_protein_group_keep_all_proteins():
proteinGroup = ["protein1", "protein2"]
peptideScores = [
(0.05, "peptide1", ["protein1"]),
(0.03, "peptide2", ["protein1", "protein2"]),
]
reportedFdr = 0.05
proteinScore = 10
scoreCutoff = 0.01
proteinAnnotations = {
"protein1": ("protein1", "gene1", "header1"),
"protein2": ("protein2", "gene2", "header2"),
}
keep_all_proteins = True
p = ProteinGroupResult.from_protein_group(
proteinGroup,
peptideScores,
reportedFdr,
proteinScore,
scoreCutoff,
proteinAnnotations,
keep_all_proteins,
)
assert p.proteinIds == "protein1;protein2"
assert p.majorityProteinIds == "protein1;protein2"
assert p.peptideCountsUnique == "0;0"
assert p.proteinNames == "protein1;protein2"
assert p.geneNames == "gene1;gene2"
assert p.fastaHeaders == "header1;header2"
assert p.numberOfProteins == 2
assert p.qValue == 0.05
assert p.score == 10
assert p.reverse == ""
assert p.potentialContaminant == ""


def test_get_peptide_counts():
scorePeptidePairs = [
(0.05, "peptide1", ["protein1"]),
(0.03, "peptide2", ["protein1", "protein2"]),
(0.02, "peptide3", ["protein2"]),
(0.01, "peptide4", ["protein3"]),
]
scoreCutoff = 0.02
p = ProteinGroupResult()
counts = p._get_peptide_counts(scorePeptidePairs, scoreCutoff)
assert counts == {"protein2": 1, "protein3": 1}

0 comments on commit 012ed0d

Please sign in to comment.