From 2f601cd88cdc15f2116159983fe97071992f304f Mon Sep 17 00:00:00 2001 From: Bede Constantinides Date: Thu, 19 Dec 2024 15:29:18 +0000 Subject: [PATCH] New index-related subcommands --- README.md | 56 +++++++++++++------------- src/hostile/aligner.py | 16 ++++++-- src/hostile/cli.py | 57 +++++++++++++++++---------- src/hostile/lib.py | 89 +++++++++++++++++++++++++++++++++++++++--- src/hostile/util.py | 8 ++-- tests/test_all.py | 8 +++- 6 files changed, 169 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 7602eac..cad36e4 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ # Hostile -Hostile accurately removes host sequences from short and long read (meta)genomes, consuming single-read or paired `fastq[.gz]` input. Batteries are included – a human reference genome is downloaded when run for the first time. Hostile is precise by default, removing an [order of magnitude fewer microbial reads](https://log.bede.im/2023/08/29/precise-host-read-removal.html#evaluating-accuracy) than existing approaches while removing >99.5% of real human reads from 1000 Genomes Project samples. For the best possible retention of microbial reads, use an existing index masked against bacterial and/or viral genomes, or make your own using the built-in masking utility. Read headers can be replaced with integers (using `--rename`) for privacy and smaller FASTQs. Heavy lifting is done with fast existing tools (Minimap2/Bowtie2 and Samtools). In benchmarks, bacterial Illumina reads were decontaminated at 32Mbp/s (210k reads/sec) and bacterial ONT reads at 22Mbp/s, using 8 alignment threads. In typical use, Hostile requires 4GB of RAM for decontaminating short reads (Bowtie2) and 13GB for long reads (Minimap2). Further information and benchmarks can be found in the [paper](https://doi.org/10.1093/bioinformatics/btad728) and [blog post](https://log.bede.im/2023/08/29/precise-host-read-removal.html). Please open an issue to report problems or otherwise [reach](https://bsky.app/profile/bedec.bsky.social) [out](mailto:b@bede.im) for help and advice. +Hostile accurately removes host sequences from short and long read (meta)genomes, consuming single-read or paired `fastq[.gz]` input. Batteries are included – a human reference genome is downloaded when run for the first time. Hostile is precise by default, removing an [order of magnitude fewer microbial reads](https://log.bede.im/2023/08/29/precise-host-read-removal.html#evaluating-accuracy) than existing approaches while removing >99.5% of real human reads from 1000 Genomes Project samples. For the best possible retention of microbial reads, use an existing index masked against bacterial and/or viral genomes, or make your own using the built-in masking utility. Read headers can be replaced with integers (using `--rename`) for privacy and smaller FASTQs. Heavy lifting is done with fast existing tools (Minimap2/Bowtie2 and Samtools). In benchmarks, bacterial Illumina reads were decontaminated at 32Mbp/s (210k reads/sec) and bacterial ONT reads at 22Mbp/s, using 8 alignment threads. In typical use, Hostile requires 4GB of RAM for decontaminating short reads (Bowtie2) and 13GB for long reads (Minimap2). Further information and benchmarks can be found in the [paper](https://doi.org/10.1093/bioinformatics/btad728) and [blog post](https://log.bede.im/2023/08/29/precise-host-read-removal.html). Please open an issue to report problems or otherwise [reach](https://bsky.app/profile/bedec.bsky.social) [out](mailto:b@bede.im) for help and advice, and please cite the paper if you use Hostile in your work. @@ -56,29 +56,27 @@ A [Biocontainer image](https://biocontainers.pro/tools/hostile) is also availabl # Long reads hostile clean --fastq1 long.fastq.gz # Creates long.clean.fastq.gz hostile clean --fastq1 --index mouse-mm39 # Use mouse index -hostile clean --fastq1 long.fastq.gz --stdout > long.clean.fastq # Send to stdout -hostile clean --invert --fastq1 long.fastq.gz # Keep only host reads +cat reads.fastq | hostile clean --fastq1 - # Read from stdin +hostile clean --fastq1 long.fastq.gz -o - > long.clean.fastq # Write to stdout +hostile clean --fastq1 long.fastq.gz --invert # Keep only host reads # Short reads hostile clean --fastq1 short.r1.fq.gz --aligner bowtie2 # Single/unpaired hostile clean --fastq1 short.r1.fq.gz --fastq2 short.r2.fq.gz # Paired -hostile clean --fastq1 short.r1.fq.gz --fastq2 short.r2.fq.gz --stdout > clean.fq # Send interleaved read pairs to stdout +cat interleaved.fastq | hostile clean --fastq1 - --fastq2 - # Read interleaved reads from stdin +hostile clean --fastq1 short.r1.fq.gz --fastq2 short.r2.fq.gz -o - > clean.fq # Write interleaved reads to stdout ``` ## Custom indexes -- To download ahead of time and cache the default index (`human-t2t-hla`), run `hostile fetch` -- To list available standard indexes, run `hostile fetch --list` -- To download and cache another standard index, run e.g. `hostile fetch --name human-t2t-hla-argos985`. This will download and cache both short read (Bowtie2) and long read (Minimap2) indexes, unless restricted to one or the other using e.g. `--aligner minimap2`. -- To use a custom genome/index (made with `hostile mask` or otherwise), run `hostile clean` with `--index path/to/genome.fa` (for minimap2) or `--index path/to/bowtie2-index-name` (for Bowtie2). Note that Minimap2 mode accepts a path to a genome in fasta format, whereas Bowtie2 mode accepts a path to a precomputed index, minus the `.x.bt2` suffix. A Bowtie2 index can be built for use with Hostile using e.g. `bowtie2-build genome.fa index-name`. - -- To change where indexes are stored, set the environment variable `HOSTILE_CACHE_DIR` to a directory of your choice. Run `hostile fetch --list` to verify. - -- If (like [EIT Pathogena](https://www.eit-pathogena.com)) you wish to use your own remote repository of indexes, set the environment variable `HOSTILE_REPOSITORY_URL`. Hostile will then look for indexes inside `{HOSTILE_REPOSITORY_URL}/manifest.json`. - -- From version 1.2.0 onwards, Hostile automatically builds and reuses Minimap2 .mmi files to reduce warmup time for long read decontamination. The first time you use a new index you may notice an indexing delay, but thereafter it will be much faster. +- To list available standard indexes, run `hostile index list`. +- To optionally download and cache the default index (`human-t2t-hla`) ahead of time, run `hostile index fetch`. Include `--minimap2` or `--bowtie2` to download only the respective long or short read index rather than both. To download and cache another standard index, provide its name with e.g. `hostile index fetch --name human-t2t-hla-argos985 --minimap2`. +- To use a custom genome/index (made with `hostile mask` or otherwise), run `hostile clean` with `--index path/to/genome.fa` (for minimap2) or `--index path/to/bowtie2-index-name` (for Bowtie2). Note that Minimap2 mode accepts a path to a genome in fasta format or .mmi, whereas Bowtie2 mode accepts a path to a precomputed index, minus the `.x.bt2` suffix. A Bowtie2 index can be built for use with Hostile using e.g. `bowtie2-build genome.fa index-name`. +- To change where indexes are stored, set the environment variable `HOSTILE_CACHE_DIR` to a directory of your choice. Run `hostile index list` to verify. +- If you wish to use your own remote repository of indexes, set the environment variable `HOSTILE_REPOSITORY_URL`. Hostile will then look for indexes inside `{HOSTILE_REPOSITORY_URL}/manifest.json`. +- From version 2.0.0 onwards, Hostile automatically builds and reuses .mmi files to speed up long read decontamination with Minimap2. If building an MMI is interrupted, you may receive an error about index corruption. If this happens, run `hostile index delete --mmi`, or if using a custom index, delete the .mmi created in the same directory. @@ -86,41 +84,41 @@ hostile clean --fastq1 short.r1.fq.gz --fastq2 short.r2.fq.gz --stdout > clean.f ```bash $ hostile clean -h -usage: hostile clean [-h] --fastq1 FASTQ1 [--fastq2 FASTQ2] [--aligner {bowtie2,minimap2,auto}] [--index INDEX] [--invert] [--rename] [--reorder] [--out-dir OUT_DIR] [-s] [-t THREADS] [--force] - [--aligner-args ALIGNER_ARGS] [--airplane] [-d] +usage: hostile clean [-h] --fastq1 FASTQ1 [--fastq2 FASTQ2] [--aligner {bowtie2,minimap2,auto}] [--index INDEX] [--invert] [--rename] [--reorder] [-c] [-o OUTPUT] + [--aligner-args ALIGNER_ARGS] [-t THREADS] [--force] [--airplane] [-d] -Remove reads aligning to an index from fastq[.gz] input files. +Remove reads aligning to an index from fastq[.gz] input files or stdin. options: -h, --help show this help message and exit --fastq1 FASTQ1 path to forward fastq[.gz] file --fastq2 FASTQ2 optional path to reverse fastq[.gz] file (short reads only) - (default: None) + (default: ) --aligner {bowtie2,minimap2,auto} alignment algorithm. Defaults to minimap2 (long read) given fastq1 only or bowtie2 (short read) given fastq1 and fastq2. Override with bowtie2 for single/unpaired short reads (default: auto) --index INDEX name of standard index or path to custom genome (Minimap2) or Bowtie2 index (default: human-t2t-hla) - --invert keep only reads aligning to the target genome (and their mates if applicable) + --invert keep only reads aligning to the index (and their mates if applicable) (default: False) --rename replace read names with incrementing integers (default: False) --reorder ensure deterministic output order (default: False) - --out-dir OUT_DIR path to output directory - (default: /Users/bede/Research/git/hostile) - -s, --stdout send FASTQ to stdout instead of writing fastq.gz file(s). Sends log to stderr instead. Paired output is interleaved + -c, --casava use Casava 1.8+ read header format (default: False) + -o, --output OUTPUT path to output directory or - for stdout + (default: /Users/bede/Research/git/hostile) + --aligner-args ALIGNER_ARGS + additional arguments for alignment + (default: ) -t, --threads THREADS number of alignment threads. A sensible default is chosen automatically - (default: 5) + (default: 10) --force overwrite existing output files (default: False) - --aligner-args ALIGNER_ARGS - additional arguments for alignment - (default: ) - --airplane disable automatic index download + --airplane disable automatic index download (offline mode) (default: False) -d, --debug show debug messages (default: False) @@ -170,7 +168,7 @@ INFO: Cleaning complete Reads sent to stdout, log sent to stderr ```bash -$ hostile clean --fastq1 tests/data/tuberculosis_1_1.fastq.gz --stdout > out.fastq +$ hostile clean --fastq1 tests/data/tuberculosis_1_1.fastq.gz -o - > out.fastq INFO: Hostile v2.0.0. Mode: long read (Minimap2) INFO: Found cached standard index human-t2t-hla (MMI available) INFO: Cleaning… @@ -242,7 +240,7 @@ INFO: Cleaning complete When using stdout mode with paired input, Hostile sends interleaved paired reads to stdout. ```bash -$ hostile clean --fastq1 human_1_1.fastq.gz --fastq2 human_1_2.fastq.gz --stdout > interleaved.fastq +$ hostile clean --fastq1 human_1_1.fastq.gz --fastq2 human_1_2.fastq.gz -o - > interleaved.fastq INFO: Hostile v2.0.0. Mode: paired short read (Bowtie2) INFO: Found cached standard index human-t2t-hla INFO: Cleaning… diff --git a/src/hostile/aligner.py b/src/hostile/aligner.py index dbd7060..e6f256f 100644 --- a/src/hostile/aligner.py +++ b/src/hostile/aligner.py @@ -37,7 +37,9 @@ class Aligner: def __post_init__(self): Path(self.data_dir).mkdir(exist_ok=True, parents=True) - def check_index(self, index: str, airplane: bool = False) -> Path: + def check_index( + self, index: str, airplane: bool = False, build_mmi: bool = False + ) -> Path: """Test aligner and check/download a ref/index if necessary, returning genome or index path""" try: util.run(f"{self.bin_path} --version", cwd=self.data_dir) @@ -79,14 +81,19 @@ def check_index(self, index: str, airplane: bool = False) -> Path: elif self.name == "Minimap2": if Path(f"{index}").is_file(): index_path = Path(index) - logging.info(f"Found custom index {index}") + if get_mmi_path(index_path).is_file(): + logging.info(f"Found cached standard index {index} (MMI available)") + else: + logging.info( + f"Found cached standard index {index} (building MMI cache; do not interrupt)" + ) elif (self.data_dir / f"{index}.fa.gz").is_file(): index_path = self.data_dir / f"{index}.fa.gz" if get_mmi_path(index_path).is_file(): logging.info(f"Found cached standard index {index} (MMI available)") else: logging.info( - f"Found cached standard index {index} (MMI file will be generated)" + f"Found cached standard index {index} (building MMI cache; do not interrupt)" ) elif not airplane and util.fetch_manifest(util.INDEX_REPOSITORY_URL).get( index @@ -106,6 +113,9 @@ def check_index(self, index: str, airplane: bool = False) -> Path: shutil.copy(tmp_path, self.data_dir / f"{index}.fa.gz") index_path = self.data_dir / f"{index}.fa.gz" logging.info(f"Downloaded standard index {index_path}") + mmi_path = get_mmi_path(index_path) + logging.info(f"Building MMI cache; do not interrupt {mmi_path}…") + util.run(f'minimap2 -x map-ont -d "{mmi_path}" "{index_path}"') else: message = f"{index} is neither a valid custom FASTA path, nor a valid standard index name. Mode: long read (Minimap2)" if airplane: diff --git a/src/hostile/cli.py b/src/hostile/cli.py index 822b5cd..044753b 100644 --- a/src/hostile/cli.py +++ b/src/hostile/cli.py @@ -4,7 +4,6 @@ from enum import Enum from pathlib import Path -from typing import Literal import defopt @@ -52,7 +51,7 @@ def clean( :arg aligner_args: additional arguments for alignment :arg threads: number of alignment threads. A sensible default is chosen automatically :arg force: overwrite existing output files - :arg airplane: disable automatic index download + :arg airplane: disable automatic index download (offline mode) :arg debug: show debug messages """ @@ -132,36 +131,52 @@ def mask( ) -def fetch( +def fetch_index( name: str = util.DEFAULT_INDEX_NAME, - aligner: Literal["minimap2", "bowtie2", "both"] = "both", - list: bool = False, + minimap2: bool = False, + bowtie2: bool = False, ) -> None: """ Download and cache indexes from object storage for use with hostile clean :arg name: name of index to download - :arg aligner: aligner(s) for which to download an index - :arg list: list available indexes + :arg minimap2: fetch Minimap2 index + :arg bowtie2: fetch Bowtie2 index """ - logging.info(f"Cache directory: {util.CACHE_DIR}") - logging.info(f"Manifest URL: {util.INDEX_REPOSITORY_URL}/manifest.json") - if list: - manifest = util.fetch_manifest() - for name in manifest.keys(): - print(name) - else: - if aligner == "minimap2" or aligner == "both": - logging.info(f"Looking for Minimap2 index {name}") - lib.ALIGNER.minimap2.value.check_index(name) - if aligner == "bowtie2" or aligner == "both": - logging.info(f"Looking for Bowtie2 index {name}") - lib.ALIGNER.bowtie2.value.check_index(name) + lib.fetch_index(name=name, minimap2=minimap2, bowtie2=bowtie2) + + +def list_indexes(airplane: bool = False): + """ + List available remote and local cached indexes + + :arg airplane: list only local cached indexes (offline mode) + """ + lib.list_indexes(airplane=airplane) + + +def delete_index(name: str = "", all: bool = False, mmi: bool = False) -> None: + """ + Delete cached indexes + + :arg name: name of cached index to delete + :arg all: delete all cached indexes + :arg mmi: delete all cached Minimap2 indexes + """ + lib.delete_index(name=name, all=all, mmi=mmi) def main(): defopt.run( - {"clean": clean, "mask": mask, "fetch": fetch}, + { + "clean": clean, + "mask": mask, + "index": { + "delete": delete_index, + "list": list_indexes, + "fetch": fetch_index, + }, + }, no_negated_flags=True, strict_kwonly=False, ) diff --git a/src/hostile/lib.py b/src/hostile/lib.py index 4865c3e..59f5c1f 100644 --- a/src/hostile/lib.py +++ b/src/hostile/lib.py @@ -6,7 +6,7 @@ from pathlib import Path from hostile import util, __version__ -from hostile.aligner import ALIGNER +from hostile.aligner import ALIGNER, get_mmi_path logging.basicConfig( @@ -381,14 +381,91 @@ def mask( if apply_cmd_run.stderr: logging.info(apply_cmd_run.stderr) - build_masked_index_cmd = f"bowtie2-build --threads '{threads}' '{masked_ref_path}' '{masked_ref_index_path}'" - logging.info(f"Indexing masked reference ({build_masked_index_cmd})") - build_masked_index_cmd_run = util.run(build_masked_index_cmd) + masked_ref_mmi_path = get_mmi_path(masked_ref_path) + build_mm2_masked_index_cmd = ( + f"minimap2 -d '{masked_ref_mmi_path}' '{masked_ref_path}'" + ) + logging.info(f"Building Minimap2 index ({build_mm2_masked_index_cmd})") + build_masked_index_cmd_run = util.run(build_mm2_masked_index_cmd) + + build_bt2_masked_index_cmd = f"bowtie2-build --threads '{threads}' '{masked_ref_path}' '{masked_ref_index_path}'" + logging.info(f"Building Bowtie2 index ({build_bt2_masked_index_cmd})") + build_masked_index_cmd_run = util.run(build_bt2_masked_index_cmd) if build_masked_index_cmd_run.stderr: logging.info(build_masked_index_cmd_run.stderr.strip()) logging.info(f"Masked {n_masked_positions} positions") - logging.info(f"Masked reference genome: {masked_ref_path}") - logging.info(f"Masked reference Bowtie2 index: {masked_ref_index_path}") + logging.info(f"Masked reference: {masked_ref_path}") + logging.info(f"Masked Minimap2 index: {masked_ref_mmi_path}") + logging.info(f"Masked Bowtie2 index: {masked_ref_index_path}") return masked_ref_path, masked_ref_index_path, n_masked_positions + + +def fetch_index( + name: str = util.DEFAULT_INDEX_NAME, + minimap2: bool = False, + bowtie2: bool = False, +) -> None: + if minimap2 or (not minimap2 and not bowtie2): + logging.info(f"Looking for Minimap2 index {name}") + ALIGNER.minimap2.value.check_index(name) + if bowtie2 or (not minimap2 and not bowtie2): + logging.info(f"Looking for Bowtie2 index {name}") + ALIGNER.bowtie2.value.check_index(name) + + +def list_indexes(airplane: bool = False): + total_size_gb = sum( + f.stat().st_size / (1024**3) for f in util.CACHE_DIR.glob("**/*") if f.is_file() + ) + logging.info(f"Remote index: {util.INDEX_REPOSITORY_URL}/manifest.json") + logging.info(f"Local cache: '{util.CACHE_DIR}' ({total_size_gb:.1f}GB total)") + if not airplane: + manifest = util.fetch_manifest() + for name in manifest.keys(): + print(f"Remote\t{name}") + unique_prefixes = set() + for f in util.CACHE_DIR.iterdir(): + if f.is_file(): + name = f.name + if name.endswith(".fa.gz"): + prefix = name[:-6] + elif name.endswith(".bt2"): + prefix = name[:-6] + if prefix.endswith(".rev"): + prefix = prefix[:-4] + else: + continue + unique_prefixes.add(prefix) + for prefix in unique_prefixes: + suffixes = [] + fa_exists = (util.CACHE_DIR / f"{prefix}.fa.gz").exists() + mmi_exists = (util.CACHE_DIR / f"{prefix}.mmi").exists() + bt2_exists = (util.CACHE_DIR / f"{prefix}.1.bt2").exists() + if fa_exists: + suffixes.append(f"Minimap2{' with MMI' if mmi_exists else ''}") + if bt2_exists: + suffixes.append("Bowtie2") + suffix_str = ", ".join(suffixes) + print(f"Local\t{prefix} ({suffix_str})") + + +def delete_index(name: str = "", all: bool = False, mmi: bool = False) -> None: + total_size_gb = sum( + f.stat().st_size / (1024**3) for f in util.CACHE_DIR.glob("**/*") if f.is_file() + ) + logging.info( + f"Local cache: '{util.CACHE_DIR}' ({total_size_gb:.1f}GB total before deletion)" + ) + for f in util.CACHE_DIR.iterdir(): + if f.is_file() and mmi and f.name.endswith(".mmi"): + f.unlink() + logging.info(f"Deleted {f}") + for f in util.CACHE_DIR.iterdir(): + if f.is_file(): + if all or (name and f.name.startswith(name)): + f.unlink() + logging.info(f"Deleted {f}") + if not name and not all and not mmi: + logging.error("Provide an index name (--name NAME) or --all") diff --git a/src/hostile/util.py b/src/hostile/util.py index 4ceca32..e7d8eff 100644 --- a/src/hostile/util.py +++ b/src/hostile/util.py @@ -85,12 +85,12 @@ def handle_alignment_exceptions(exception: subprocess.CalledProcessError) -> Non logging.debug(f"stderr: {exception.stderr}") alignment_successful = False stream_empty = False - if "function mm_idx_load": # Minimap2 index corruption - raise RuntimeError( - "Minimap2 index may be corrupted, run hostile index delete --mmi" - ) if "Error, fewer reads in file specified" in exception.stderr: # Bowtie2 raise RuntimeError("fastq1 and fastq2 contain different numbers of reads") + if "function mm_idx_load" in exception.stderr: # Minimap2 index corruption + raise RuntimeError( + "Minimap2 index appears corrupted, run hostile index delete --mmi. If using a custom index, delete the .mmi file" + ) if 'Failed to read header for "-"' in exception.stderr: stream_empty = True if "overall alignment rate" in exception.stderr: # Bowtie2 diff --git a/tests/test_all.py b/tests/test_all.py index 4e96db5..cc3ee77 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -414,9 +414,10 @@ def test_mask(tmp_path): masked_ref_path, _, _ = lib.mask( reference=data_dir / "sars-cov-2/sars-cov-2.fasta.gz", target=data_dir / "sars-cov-2/partial-for-mask-testing.fa.gz", + output=tmp_path, ) - assert Path("masked/mask.bed").exists() and masked_ref_path.is_file() - shutil.rmtree("masked") + assert (Path(tmp_path) / "mask.bed").exists() + assert (Path(tmp_path) / "mask.bed").exists() and masked_ref_path.exists() def test_mask_performance(tmp_path): @@ -424,6 +425,7 @@ def test_mask_performance(tmp_path): reference=data_dir / "mask/t2t-chm13v2.0-chr21-subset.fa", target=data_dir / "mask/gallid-herpesvirus-2.fa", ) + assert Path("masked/masked.fa").exists() assert str(masked_ref_path) == "masked/masked.fa" assert str(masked_ref_index_path) == "masked/masked" assert n_masked_positions == 2255 @@ -741,6 +743,7 @@ def test_override_cache_dir(tmp_path): env=env, ) assert "custom_directory" in result.stderr + shutil.rmtree("custom_directory") def test_override_cache_dir_paired(tmp_path): @@ -751,6 +754,7 @@ def test_override_cache_dir_paired(tmp_path): env=env, ) assert "custom_directory" in result.stderr + shutil.rmtree("custom_directory") def test_override_repository_url(tmp_path):