diff --git a/CHANGELOG.md b/CHANGELOG.md
index adbc3a5..104770e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,12 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+### Added
+
+- option to compress JSON output (`-z`/`--compress`)
+- support for targeted sequencing (`-T`/`--targeted`): takes a BED file of the targeted regions to aid read depth estimation [[#28][28]]
+- the script `scripts/combine.py` to combine mykrobe reports into a single CSV file
+
 ## 0.12.1
 
 ### Fixed
@@ -89,6 +95,8 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 eg `katG_S315X`. If gene is on reverse strand, then the wrong new amino acid
 change was being reported [[#127][127]].
 
+[28]: https://github.com/Mykrobe-tools/mykrobe/issues/28
+
 [113]: https://github.com/Mykrobe-tools/mykrobe/issues/113
 [114]: https://github.com/Mykrobe-tools/mykrobe/issues/114
diff --git a/Dockerfile b/Dockerfile
index bf69d36..89aa6ae 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,8 +41,13 @@ RUN git clone --recursive -b geno_kmer_count https://github.com/phelimb/mccortex
 
 # install mykrobe
 WORKDIR "/${PROJECT}"
-RUN python -m pip install requests && python -m pip install . -vv
+RUN python -m pip install requests \
+    && python -m pip install . -vv \
+    && chmod +x /${PROJECT}/scripts/combine.py
 
 # download panels
 RUN mykrobe panels update_metadata \
-    && mykrobe panels update_species all
\ No newline at end of file
+    && mykrobe panels update_species all
+
+# add combine script to path
+ENV PATH="/${PROJECT}/scripts:$PATH"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index effb63e..b620ba2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,7 @@ requests>=2.27.1
 mongoengine>=0.24.1
 cython>=0.29.28
 numpy>=1.22.0
+intervaltree~=3.1
 ## Testing
 tox
 pytest
diff --git a/scripts/combine.py b/scripts/combine.py
new file mode 100755
index 0000000..a03f9be
--- /dev/null
+++ b/scripts/combine.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""This script can be used to combine multiple mykrobe CSV/JSON output files into a
+sheet for easy comparison. The columns are drugs, the rows are samples, and the cells
+are the susceptibility prediction.
+
+USAGE:
+
+# combine two CSV reports into a file called samples.csv
+combine.py -o samples.csv sample1.csv sample2.csv
+
+# combine a JSON report with a CSV report
+combine.py -o samples.csv sample1.json sample2.csv
+
+# combine all JSON and CSV files in a directory
+combine.py -o samples.csv reports/
+"""
+import json
+import sys
+import argparse
+from pathlib import Path
+from typing import Dict
+
+
+def extract_csv_data(path: Path) -> Dict[str, str]:
+    report = dict()
+    with open(path) as fp:
+        header_fields = [s.strip('"') for s in next(fp).rstrip().split(",")]
+        sample_idx = header_fields.index("sample")
+        prediction_idx = header_fields.index("susceptibility")
+        drug_idx = header_fields.index("drug")
+        species_idx = header_fields.index("species")
+        lineage_idx = header_fields.index("lineage")
+
+        samples = set()
+        for row in fp:
+            fields = [s.strip('"') for s in row.rstrip().split(",")]
+            sample = fields[sample_idx]
+            samples.add(sample)
+            drug = fields[drug_idx]
+            pred = fields[prediction_idx]
+            report["species"] = fields[species_idx]
+            report["lineage"] = fields[lineage_idx]
+            report[drug] = pred
+
+    if len(samples) > 1:
+        eprint(f"ERROR: Got more than one sample in {path}")
+        sys.exit(1)
+
+    report["sample"] = sample
+    return report
+
+
+def extract_json_data(path: Path) -> Dict[str, str]:
+    report = dict()
+    with open(path) as fp:
+        data = json.load(fp)
+
+    samples = list(data.keys())
+    if len(samples) > 1:
+        eprint(f"ERROR: Got more than one sample in {path}")
+        sys.exit(1)
+
+    report["sample"] = samples[0]
+
+    for drug, info in data[samples[0]]["susceptibility"].items():
+        pred = info["predict"]
+        report[drug] = pred
+
+    phylo = data[samples[0]].get("phylogenetics", {})
+    report["species"] = ";".join(list(phylo.get("species", {}).keys()))
+    lineages = [
+        s.replace("lineage", "") for s in phylo.get("lineage", {}).get("lineage", {})
+    ]
+    report["lineage"] = ";".join(lineages)
+
+    return report
+
+
+def extract_data(path: Path) -> Dict[str, str]:
+    if path.suffix == ".csv":
+        return extract_csv_data(path)
+    elif path.suffix == ".json":
+        return extract_json_data(path)
+    else:
+        eprint(f"ERROR: {path} is an unknown file type")
+        sys.exit(1)
+
+
+def eprint(msg: str):
+    print(msg, file=sys.stderr)
+
+
+def main():
+    if sys.version_info < (3, 6, 0):
+        eprint("You need python 3.6 or later to run this script")
+        sys.exit(1)
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    parser.add_argument(
+        "src",
+        type=Path,
+        metavar="DIR/FILES",
+        nargs="+",
+        help="A directory containing mykrobe reports, or a list of mykrobe report files",
+    )
+    parser.add_argument(
+        "-d",
+        "--delim",
+        help="Delimiting character to use in the output file [default: %(default)s]",
+        default=",",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="File path to save output to [default: stdout]",
+        type=Path,
+    )
+
+    args = parser.parse_args()
+
+    delim = args.delim
+    input_files = []
+    for p in args.src:
+        if p.is_dir():
+            input_files.extend(p.glob("*.csv"))
+            input_files.extend(p.glob("*.json"))
+        else:
+            input_files.append(p)
+
+    eprint(f"Found {len(input_files)} mykrobe reports")
+
+    drugs = set()
+    reports = dict()
+    for p in input_files:
+        data = extract_data(p)
+        reports[p] = data
+        for k in data:
+            if k in ("sample", "species", "lineage"):
+                continue
+            drugs.add(k)
+
+    fp_out = args.output.open("w") if args.output is not None else sys.stdout
+    print(delim.join(["sample", "species", "lineage", *sorted(drugs)]), file=fp_out)
+
+    for p, data in reports.items():
+        row = []
+        for k in ["sample", "species", "lineage"]:
+            row.append(data[k])
+
+        for drug in sorted(drugs):
+            row.append(data.get(drug, ""))
+
+        print(delim.join(row), file=fp_out)
+
+    if fp_out is not sys.stdout:
+        fp_out.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index 52f0aee..395ded9 100644
--- a/setup.py
+++ b/setup.py
@@ -132,6 +132,7 @@ def read(*names, **kwargs):
         "mongoengine",
         "requests",
         "numpy",
+        "intervaltree~=3.1",
     ],
     entry_points={
         "console_scripts": [
diff --git a/src/mykrobe/cmds/amr.py b/src/mykrobe/cmds/amr.py
index b7351df..081265e 100755
--- a/src/mykrobe/cmds/amr.py
+++ b/src/mykrobe/cmds/amr.py
@@ -1,6 +1,7 @@
 from __future__ import print_function
 
 import copy
+import gzip
 import json
 import logging
 import random
@@ -156,6 +157,7 @@ def ref_data_from_args(args):
             "kmer": args.kmer,
             "version": "custom",
             "species_phylo_group": None,
+            "panel_name": "custom",
         }
     else:
         data_dir = DataDir(args.panels_dir)
@@ -170,6 +172,7 @@ def ref_data_from_args(args):
             "kmer": species_dir.kmer(),
             "version": species_dir.version(),
             "species_phylo_group": species_dir.species_phylo_group(),
+            "panel_name": species_dir.panel_name,
         }
 
     if ref_data["lineage_json"] is None:
@@ -218,14 +221,6 @@ def write_outputs(args, base_json):
         del base_json[args.sample]["lineage_calls"]
         outputs["json"] = json.dumps(base_json, indent=4)
 
-    if len(outputs) == 0:
-        raise ValueError(
-            (
-                f"Output format must be one of: csv,json,json_and_csv. Got "
-                f"'{args.output_format}'"
-            )
-        )
-
     for output_type, output in outputs.items():
         # write to file is specified by user, otherwise send to stdout
         if args.output:
@@ -233,9 +228,17 @@ def write_outputs(args, base_json):
                 outfile = args.output + "." + output_type
             else:
                 outfile = args.output
-            with open(outfile, "w") as f:
-                f.write(output)
+
+            if args.compress and output_type == "json":
+                p = outfile if outfile.split(".")[-1] == "gz" else f"{outfile}.gz"
+                with gzip.open(p, "w") as fout:
+                    fout.write(output.encode("utf-8"))
+            else:
+                with open(outfile, "w") as f:
+                    f.write(output)
         else:
+            if args.compress and output_type == "json":
+                output = gzip.compress(output.encode("utf-8"))
             print(output)
 
 
@@ -259,11 +262,13 @@ def run(parser, args):
         and ref_data["lineage_json"] is None
     ):
         logger.info(
-            "Forcing --report_all_calls because species is 'custom' and options --custom_variant_to_resistance_json,--custom_lineage_json were not used"
+            "Forcing --report_all_calls because species is 'custom' and options "
+            "--custom_variant_to_resistance_json,--custom_lineage_json were not used"
         )
         args.report_all_calls = True
     logger.info(
-        f"Running mykrobe predict using species {args.species}, and panel version {ref_data['version']}"
+        f"Running mykrobe predict using species {args.species}, panel version "
+        f"{ref_data['version']}, and panel {ref_data['panel_name']}"
     )
 
     if args.tmp is None:
@@ -272,10 +277,11 @@ def run(parser, args):
     if args.ont:
         args.expected_error_rate = ONT_E_RATE
         logger.info(
-            f"Set expected error rate to {args.expected_error_rate} because --ont flag was used"
+            f"Set expected error rate to {args.expected_error_rate} because --ont flag "
+            "was used"
         )
         args.ploidy = ONT_PLOIDY
-        logger.info(f"Set ploidy to {args.ploidy} because --ont flag used")
+        logger.info(f"Set ploidy to {args.ploidy} because --ont flag was used")
 
     # Run Cortex
     cp = CoverageParser(
@@ -289,11 +295,12 @@ def run(parser, args):
         tmp_dir=args.tmp,
         skeleton_dir=args.skeleton_dir,
         memory=args.memory,
+        targeted=args.targeted,
     )
     cp.run()
     logger.debug("CoverageParser complete")
complete") - if ref_data["species_phylo_group"] is None: + if ref_data["species_phylo_group"] is None or args.targeted: phylogenetics = {} depths = [cp.estimate_depth()] else: diff --git a/src/mykrobe/parser.py b/src/mykrobe/parser.py index 1fde64b..cec30bb 100644 --- a/src/mykrobe/parser.py +++ b/src/mykrobe/parser.py @@ -196,6 +196,18 @@ def __init__(self, *args, **kwargs): choices=["json", "csv", "json_and_csv"], default="csv", ) +parser_amr.add_argument( + "-T", + "--targeted", + help=f"BED file of regions that sequencing targeted", + type=str, +) +parser_amr.add_argument( + "-z", + "--compress", + action="store_true", + help="Compress JSON output with gzip", +) parser_amr.set_defaults(func=run_subtool) diff --git a/src/mykrobe/species_data/species_dir.py b/src/mykrobe/species_data/species_dir.py index caf0ea4..edc2c46 100644 --- a/src/mykrobe/species_data/species_dir.py +++ b/src/mykrobe/species_data/species_dir.py @@ -14,6 +14,7 @@ def __init__(self, root_dir): if not os.path.exists(self.manifest_json): raise FileNotFoundError(f"Manifest file not found in species directory {self.root_dir}. Expected to find {self.manifest_json}.") self.panel = None + self.panel_name = None self.manifest = load_json(self.manifest_json) self.set_panel(self.default_panel()) diff --git a/src/mykrobe/typing/typer/genotyper.py b/src/mykrobe/typing/typer/genotyper.py index 4ae1ac7..bab171f 100755 --- a/src/mykrobe/typing/typer/genotyper.py +++ b/src/mykrobe/typing/typer/genotyper.py @@ -1,37 +1,34 @@ from __future__ import print_function -import os -import json + import csv -import glob import logging -import subprocess -from copy import copy +import re -from mykrobe import K +from mykrobe.cortex import McCortexGenoRunner from mykrobe.metagenomics import LineagePredictor - -from mykrobe.typing import SequenceProbeCoverage -from mykrobe.typing import VariantProbeCoverage -from mykrobe.typing import ProbeCoverage -from mykrobe.typing import Panel - +from mykrobe.typing import ( + Panel, + ProbeCoverage, + SequenceProbeCoverage, + VariantProbeCoverage, +) +from mykrobe.typing.typer.base import DEFAULT_ERROR_RATE, DEFAULT_MINOR_FREQ from mykrobe.typing.typer.presence import GeneCollectionTyper from mykrobe.typing.typer.variant import VariantTyper -from mykrobe.typing.typer.base import DEFAULT_MINOR_FREQ -from mykrobe.typing.typer.base import DEFAULT_ERROR_RATE - -from mykrobe.variants.schema.models import VariantCallSet +from mykrobe.utils import get_params, load_bed, median, split_var_name from mykrobe.variants.schema.models import Variant -from mykrobe.cortex import McCortexGenoRunner +from mykrobe import K -from mykrobe.utils import get_params -from mykrobe.utils import split_var_name -from mykrobe.utils import median +POS_REGEX = re.compile(r"[ACGT]+(?P\d+)[ACGT]+") logger = logging.getLogger(__name__) +class EmptyBedFile(Exception): + pass + + class CoverageParser(object): def __init__( self, @@ -47,6 +44,7 @@ def __init__( verbose=True, tmp_dir="tmp/", skeleton_dir="atlas/data/skeletons/", + targeted=None, ): self.sample = sample self.seq = seq @@ -64,6 +62,13 @@ def __init__( self.panels = [] self.threads = threads self.memory = memory + if targeted: + self.targeted = load_bed(targeted) + if self.targeted.is_empty(): + raise EmptyBedFile("Targeted BED file has no regions") + logging.info(f"Loaded {len(self.targeted)} regions from targeted BED file") + else: + self.targeted = None for panel_file_path in self.panel_file_paths: panel = Panel(panel_file_path) self.panels.append(panel) @@ -92,18 +97,20 @@ def 
@@ -92,18 +97,20 @@ def _run_cortex(self):
 
     def estimate_depth(self):
         depth = []
-        for variant_covg in self.variant_covgs.values():
+
+        for variant_name, variant_covg in self.variant_covgs.items():
+            var = variant_name.split("-")[-1]
+            if not self._variant_is_in_target_region(var):
+                continue
             if variant_covg.reference_coverage.median_depth > 0:
                 depth.append(variant_covg.reference_coverage.median_depth)
-        for spcs in self.gene_presence_covgs.values():
-            __median_depth = median([spc.median_depth for spc in spcs.values()])
-            if __median_depth > 0:
-                depth.append(__median_depth)
+        if not self.targeted:
+            for spcs in self.gene_presence_covgs.values():
+                __median_depth = median([spc.median_depth for spc in spcs.values()])
+                if __median_depth > 0:
+                    depth.append(__median_depth)
         _median = median(depth)
-        if _median < 1:
-            return 1
-        else:
-            return _median
+        return max(1, _median)
 
     def remove_temporary_files(self):
         self.mc_cortex_runner.remove_temporary_files()
@@ -139,6 +146,7 @@ def _parse_covgs(self):
                 klen,
             ) = self._parse_summary_covgs_row(row)
             allele_name = allele.split("?")[0]
+
             if self._is_variant_panel(allele_name):
                 self._parse_variant_panel(row)
             else:
@@ -262,6 +270,17 @@ def _parse_variant_panel(self, row):
         else:
             raise ValueError("probe_type must be ref or alt")
 
+    def _variant_is_in_target_region(self, variant: str) -> bool:
+        """Checks whether an allele falls within the targeted BED regions"""
+        if not self.targeted:
+            return True
+
+        match = POS_REGEX.search(variant)
+        if not match:  # no genome position, so it isn't a variant probe
+            return False
+        pos = int(match.group("pos")) - 1  # mykrobe panel is 1-based
+        return self.targeted.overlaps(pos)
+
 
 class Genotyper(object):
 
diff --git a/src/mykrobe/utils.py b/src/mykrobe/utils.py
index cc89dbb..095f75a 100644
--- a/src/mykrobe/utils.py
+++ b/src/mykrobe/utils.py
@@ -1,12 +1,14 @@
 import gzip
 import hashlib
 import json
+import logging
 import os
 import re
 from pathlib import Path
 from typing import TextIO, Union
 
 from Bio.Seq import Seq
+from intervaltree import Interval, IntervalTree
 
 
 def check_args(args):
@@ -190,3 +192,35 @@ def get_first_chrom_name(fp: TextIO) -> str:
     if not header.startswith(">"):
         raise ValueError(f"Expected fasta file, but it did not start with '>'")
     return header[1:].split()[0]
+
+
+def load_bed(fpath: str) -> IntervalTree:
+    tree = IntervalTree()
+    with open(fpath) as fp:
+        for line in map(str.rstrip, fp):
+            # skip headers https://en.wikipedia.org/wiki/BED_(file_format)#Header
+            if line.startswith(("#", "browser", "track")):
+                continue
+            try:
+                start, end = line.split("\t")[1:3]
+            except ValueError as err:
+                logging.error(
+                    "BED file must contain at least three tab-separated columns: "
+                    "chromosome, start (zero-based inclusive), end (zero-based "
+                    "exclusive). This BED file has fewer than three columns"
+                )
+                raise err
+            try:
+                start = int(start)
+                end = int(end)
+            except ValueError as err:
+                logging.error(
+                    "BED file must contain at least three tab-separated columns: "
+                    "chromosome, start (zero-based inclusive), end (zero-based "
+                    "exclusive). We could not parse the start/end columns into integers"
+                )
+                raise err
+            iv = Interval(start, end)
+            tree.add(iv)
+
+    return tree