diff --git a/CHANGELOG.md b/CHANGELOG.md index e6424d4..9694375 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,9 @@ # Changelog +## [0.2.4] - 2024-12-08 +### Added +- Add `-g` to control the minimum number of unique marker genes (default: 1) required for a species to report its genome copies. Increase `-g` (1 -> 2) lowers recall (detection limit: 0.125 -> 0.25) but improves precision. + + ## [0.2.3] - 2024-12-01 ### Fixed - Fix a bug introduced in v0.2.2 causing ties not resolved properly. Results should be identical to v0.2.1. diff --git a/src/melon/__init__.py b/src/melon/__init__.py index 5dbf1e6..9356a22 100644 --- a/src/melon/__init__.py +++ b/src/melon/__init__.py @@ -1,3 +1,3 @@ -__version__ = '0.2.3' +__version__ = '0.2.4' from .melon import GenomeProfiler diff --git a/src/melon/cli.py b/src/melon/cli.py index 1a926fd..11e4fc3 100755 --- a/src/melon/cli.py +++ b/src/melon/cli.py @@ -104,6 +104,14 @@ def cli(argv=sys.argv): default=0.9, help='Min. secondary-to-primary score ratio to report secondary alignments (-p in minimap2). [0.9]') + additional.add_argument( + '-g', + metavar='INT', + type=int, + choices=range(1, 9), + default=1, + help='Min. number of unique marker genes required for a species to report its genome copies. [1]') + additional_em.add_argument( '-a', metavar='INT', @@ -186,7 +194,7 @@ def run(opt): GenomeProfiler(file, opt.db, opt.output, opt.threads).run( db_kraken=opt.db_kraken, skip_profile=opt.skip_profile, skip_clean=opt.skip_clean, max_target_seqs=opt.m, evalue=opt.e, identity=opt.i, subject_cover=opt.s, - secondary_num=opt.n, secondary_ratio=opt.p, + secondary_num=opt.n, secondary_ratio=opt.p, min_markers=opt.g, max_iterations=opt.a, epsilon=opt.c) if index == len(opt.FILE) - 1: diff --git a/src/melon/melon.py b/src/melon/melon.py index 0744201..c6e7323 100644 --- a/src/melon/melon.py +++ b/src/melon/melon.py @@ -289,7 +289,7 @@ def run_em(self, max_iterations=1000, epsilon=1e-10): def run(self, db_kraken=None, skip_profile=False, skip_clean=False, max_target_seqs=25, evalue=1e-15, identity=0, subject_cover=75, - secondary_num=2147483647, secondary_ratio=0.9, + secondary_num=2147483647, secondary_ratio=0.9, min_markers=1, max_iterations=1000, epsilon=1e-10): ''' Run the pipeline. @@ -326,10 +326,15 @@ def run(self, db_kraken=None, skip_profile=False, skip_clean=False, ) for kingdom, replacement in replacements.items()} ## count assigned taxonomic labels - self.hits = [[*hit, self.assignments.get(hit[0], replacements.get(hit[1]))] for hit in self.hits] - counts, total_counts, lineage2identity = defaultdict(lambda: 0), defaultdict(lambda: 0), defaultdict(list) + lineage2rpg = defaultdict(set) + for hit in self.hits: + hit.append(self.assignments.get(hit[0], replacements.get(hit[1]))) + lineage2rpg[hit[-1]].add(hit[2]) + counts, total_counts, lineage2identity = defaultdict(lambda: 0), defaultdict(lambda: 0), defaultdict(list) for hit in self.hits: + if len(lineage2rpg.get(hit[-1])) < min_markers: + hit[-1] = replacements.get(hit[1]) total_counts[hit[1]] += 1 counts[(hit[-1], hit[1])] += 1 lineage2identity[hit[-1]].append(self.identities.get((hit[0], hit[-1]), (0, 0)))