Mykrobe-tools · mbhall88 · Sep 29, 2022 · Sep 29, 2022 · Sep 29, 2022 · Sep 29, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+### Added
+
+- option to compress JSON output (`-z`/`--compress`)
+- specific support for targeted sequencing (`-T`/`--targeted`). Takes a BED file of the regions targeted to aid in the read depth estimation [[#28][28]]
+- The script `scripts/combine.py` to combine mykrobe reports into a single CSV file
+
 ## 0.12.1
 
 ### Fixed
@@ -89,6 +95,8 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
   eg `katG_S315X`. If gene is on reverse strand, then the wrong new amino
   acid change was being reported [[#127][127]].
 
+[28]: https://github.com/Mykrobe-tools/mykrobe/issues/28
+
 [113]: https://github.com/Mykrobe-tools/mykrobe/issues/113
 
 [114]: https://github.com/Mykrobe-tools/mykrobe/issues/114

diff --git a/Dockerfile b/Dockerfile
@@ -41,8 +41,13 @@ RUN git clone --recursive -b geno_kmer_count https://github.com/phelimb/mccortex
 
 # install mykrobe
 WORKDIR "/${PROJECT}"
-RUN python -m pip install requests && python -m pip install . -vv
+RUN python -m pip install requests \
+    && python -m pip install . -vv \
+    && chmod +x /${PROJECT}/scripts/combine.py
 
 # download panels
 RUN mykrobe panels update_metadata \
-    && mykrobe panels update_species all
+    && mykrobe panels update_species all
+
+# add combine script to path \
+ENV PATH="/${PROJECT}/scripts:$PATH"
diff --git a/requirements.txt b/requirements.txt
@@ -5,6 +5,7 @@ requests>=2.27.1
 mongoengine>=0.24.1
 cython>=0.29.28
 numpy>=1.22.0
+intervaltree~=3.1
 ## Testing
 tox
 pytest

diff --git a/scripts/combine.py b/scripts/combine.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""This script can be used to combine multiple mykrobe CSV/JSON output files into a
+a sheet for easy comparison. The columns are drugs, the rows are samples, and the cells
+are the susceptibility prediction.
+
+USAGE:
+
+# combine two CSV reports into a file called samples.csv
+combine.py -o samples.csv sample1.csv sample2.csv
+
+# combine a JSON report with a CSV report
+combine.py -o samples.csv sample1.json sample2.csv
+
+# combine all JSON and CSV files in a directory
+combine.py -o samples.csv reports/
+"""
+import json
+import sys
+import argparse
+from pathlib import Path
+from typing import Dict
+
+
+def extract_csv_data(path: Path) -> Dict[str, str]:
+    report = dict()
+    with open(path) as fp:
+        header_fields = [s.strip('"') for s in next(fp).split(",")]
+        sample_idx = header_fields.index("sample")
+        prediction_idx = header_fields.index("susceptibility")
+        drug_idx = header_fields.index("drug")
+        species_idx = header_fields.index("species")
+        lineage_idx = header_fields.index("lineage")
+
+        samples = set()
+        for row in fp:
+            fields = [s.strip('"') for s in row.split(",")]
+            sample = fields[sample_idx]
+            samples.add(sample)
+            drug = fields[drug_idx]
+            pred = fields[prediction_idx]
+            report["species"] = fields[species_idx]
+            report["lineage"] = fields[lineage_idx]
+            report[drug] = pred
+
+        if len(samples) > 1:
+            eprint(f"ERROR: Got more than one sample in {path}")
+            sys.exit(1)
+
+        report["sample"] = sample
+        print(report)
+        return report
+
+
+def extract_json_data(path: Path) -> Dict[str, str]:
+    report = dict()
+    with open(path) as fp:
+        data = json.load(fp)
+
+    samples = list(data.keys())
+    if len(samples) > 1:
+        eprint(f"ERROR: Got moe than one sample in {path}")
+        sys.exit(1)
+
+    report["sample"] = samples[0]
+
+    for drug, info in data[samples[0]]["susceptibility"].items():
+        pred = info["predict"]
+        report[drug] = pred
+
+    phylo = data[samples[0]].get("phylogenetics", {})
+    report["species"] = ";".join(list(phylo.get("species", {}).keys()))
+    lineages = [
+        s.replace("lineage", "") for s in phylo.get("lineage", {}).get("lineage", {})
+    ]
+    report["lineage"] = ";".join(lineages)
+
+    return report
+
+
+def extract_data(path: Path) -> Dict[str, str]:
+    if path.suffix == ".csv":
+        return extract_csv_data(path)
+    elif path.suffix == ".json":
+        return extract_json_data(path)
+    else:
+        eprint(f"ERROR: {path} is an unknown file type")
+        sys.exit(1)
+
+
+def eprint(msg: str):
+    print(msg, file=sys.stderr)
+
+
+def main():
+    if sys.version_info < (3, 6, 0):
+        eprint("You need python 3.6 or later to run this script")
+        sys.exit(1)
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    parser.add_argument(
+        "src",
+        type=Path,
+        metavar="DIR/FILES",
+        nargs="+",
+        help="A directory containing mykrobe reports, or a list of mykrobe report files",
+    )
+    parser.add_argument(
+        "-d",
+        "--delim",
+        help="Delimiting character to use in the output file [default: %(default)s]",
+        default=",",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="File path to save output to [default: stdout]",
+        type=Path,
+    )
+
+    args = parser.parse_args()
+
+    delim = args.delim
+    input_files = []
+    for p in args.src:
+        if p.is_dir():
+            input_files.append(list(p.glob("*.csv")))
+            input_files.append(list(p.glob("*.json")))
+        else:
+            input_files.append(p)
+
+    eprint(f"Found {len(input_files)} mykrobe reports")
+
+    drugs = set()
+    reports = dict()
+    for p in input_files:
+        data = extract_data(p)
+        reports[p] = data
+        for k in data:
+            if k in ("sample", "species", "lineage"):
+                continue
+            drugs.add(k)
+
+    fp_out = args.output.open("w") if args.output is not None else sys.stdout
+    print(delim.join(["sample", "species", "lineage", *sorted(drugs)]), file=fp_out)
+
+    for p, data in reports.items():
+        row = []
+        for k in ["sample", "species", "lineage"]:
+            row.append(data[k])
+
+        for drug in sorted(drugs):
+            row.append(data.get(drug, ""))
+
+        print(delim.join(row), file=fp_out)
+
+    fp_out.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
@@ -132,6 +132,7 @@ def read(*names, **kwargs):
         "mongoengine",
         "requests",
         "numpy",
+        "intervaltree~=3.1"
     ],
     entry_points={
         "console_scripts": [

diff --git a/src/mykrobe/cmds/amr.py b/src/mykrobe/cmds/amr.py
@@ -1,6 +1,7 @@
 from __future__ import print_function
 
 import copy
+import gzip
 import json
 import logging
 import random
@@ -156,6 +157,7 @@ def ref_data_from_args(args):
             "kmer": args.kmer,
             "version": "custom",
             "species_phylo_group": None,
+            "panel_name": "custom",
         }
     else:
         data_dir = DataDir(args.panels_dir)
@@ -170,6 +172,7 @@ def ref_data_from_args(args):
             "kmer": species_dir.kmer(),
             "version": species_dir.version(),
             "species_phylo_group": species_dir.species_phylo_group(),
+            "panel_name": species_dir.panel_name,
         }
 
     if ref_data["lineage_json"] is None:
@@ -218,24 +221,24 @@ def write_outputs(args, base_json):
             del base_json[args.sample]["lineage_calls"]
         outputs["json"] = json.dumps(base_json, indent=4)
 
-    if len(outputs) == 0:
-        raise ValueError(
-            (
-                f"Output format must be one of: csv,json,json_and_csv. Got "
-                f"'{args.output_format}'"
-            )
-        )
-
     for output_type, output in outputs.items():
         # write to file is specified by user, otherwise send to stdout
         if args.output:
             if args.output_format == "json_and_csv":
                 outfile = args.output + "." + output_type
             else:
                 outfile = args.output
-            with open(outfile, "w") as f:
-                f.write(output)
+
+            if args.compress and output_type == "json":
+                p = outfile if outfile.split(".")[-1] == "gz" else f"{outfile}.gz"
+                with gzip.open(p, "w") as fout:
+                    fout.write(output.encode("utf-8"))
+            else:
+                with open(outfile, "w") as f:
+                    f.write(output)
         else:
+            if args.compress and output_type == "json":
+                output = gzip.compress(output)
             print(output)
 
 
@@ -259,11 +262,13 @@ def run(parser, args):
         and ref_data["lineage_json"] is None
     ):
         logger.info(
-            "Forcing --report_all_calls because species is 'custom' and options --custom_variant_to_resistance_json,--custom_lineage_json were not used"
+            "Forcing --report_all_calls because species is 'custom' and options "
+            "--custom_variant_to_resistance_json,--custom_lineage_json were not used"
         )
         args.report_all_calls = True
     logger.info(
-        f"Running mykrobe predict using species {args.species}, and panel version {ref_data['version']}"
+        f"Running mykrobe predict using species {args.species}, panel version "
+        f"{ref_data['version']}, and panel {ref_data['panel_name']}"
     )
 
     if args.tmp is None:
@@ -272,10 +277,11 @@ def run(parser, args):
     if args.ont:
         args.expected_error_rate = ONT_E_RATE
         logger.info(
-            f"Set expected error rate to {args.expected_error_rate} because --ont flag was used"
+            f"Set expected error rate to {args.expected_error_rate} because --ont flag "
+            f"was used"
         )
         args.ploidy = ONT_PLOIDY
-        logger.info(f"Set ploidy to {args.ploidy} because --ont flag used")
+        logger.info(f"Set ploidy to {args.ploidy} because --ont flag was used")
 
     # Run Cortex
     cp = CoverageParser(
@@ -289,11 +295,12 @@ def run(parser, args):
         tmp_dir=args.tmp,
         skeleton_dir=args.skeleton_dir,
         memory=args.memory,
+        targeted=args.targeted,
     )
     cp.run()
     logger.debug("CoverageParser complete")
 
-    if ref_data["species_phylo_group"] is None:
+    if ref_data["species_phylo_group"] is None or args.targeted:
         phylogenetics = {}
         depths = [cp.estimate_depth()]
     else:

diff --git a/src/mykrobe/parser.py b/src/mykrobe/parser.py
@@ -196,6 +196,18 @@ def __init__(self, *args, **kwargs):
     choices=["json", "csv", "json_and_csv"],
     default="csv",
 )
+parser_amr.add_argument(
+    "-T",
+    "--targeted",
+    help=f"BED file of regions that sequencing targeted",
+    type=str,
+)
+parser_amr.add_argument(
+    "-z",
+    "--compress",
+    action="store_true",
+    help="Compress JSON output with gzip",
+)
 parser_amr.set_defaults(func=run_subtool)
 
 

diff --git a/src/mykrobe/species_data/species_dir.py b/src/mykrobe/species_data/species_dir.py
@@ -14,6 +14,7 @@ def __init__(self, root_dir):
         if not os.path.exists(self.manifest_json):
             raise FileNotFoundError(f"Manifest file not found in species directory {self.root_dir}. Expected to find {self.manifest_json}.")
         self.panel = None
+        self.panel_name = None
         self.manifest = load_json(self.manifest_json)
         self.set_panel(self.default_panel())