From ada627dd9089abcb11bc16b6269bd7a161540bfd Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 21 Jan 2025 22:49:31 +0000 Subject: [PATCH 01/34] On the off chance that something happens to this instance, I should have this online. --- docs/batch.md | 89 +++++++++++++ docs/developer.md | 67 ++++++++++ docs/faqs.md | 15 +++ docs/index.md | 21 +++ docs/installation.md | 82 ++++++++++++ docs/output.md | 107 +++++++++++++++ docs/overview.md | 49 +++++++ docs/run.md | 286 ++++++++++++++++++++++++++++++++++++++++ docs/run_validation.md | 29 ++++ docs/troubleshooting.md | 43 ++++++ docs/usage.md | 122 +++++++++++++++++ 11 files changed, 910 insertions(+) create mode 100644 docs/batch.md create mode 100644 docs/developer.md create mode 100644 docs/faqs.md create mode 100644 docs/index.md create mode 100644 docs/installation.md create mode 100644 docs/output.md create mode 100644 docs/overview.md create mode 100644 docs/run.md create mode 100644 docs/run_validation.md create mode 100644 docs/troubleshooting.md create mode 100644 docs/usage.md diff --git a/docs/batch.md b/docs/batch.md new file mode 100644 index 00000000..f29dc88f --- /dev/null +++ b/docs/batch.md @@ -0,0 +1,89 @@ +# AWS BATCH + +This page is a guide for running the pipeline on [AWS Batch](https://aws.amazon.com/batch/), a tool which allows you to run Nextflow workflows in a highly parallelized and automated manner. The original source of this guide is [this notebook](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html), however this verison will be updated to reflect the current state of the pipeline. + +## 0. Check your permissions + +The most common ways you can run into trouble following these instructions all arise from insufficient AWS permissions. For a smooth experience, make sure you have the following: + +1. **AWS Batch Permissions:** You need to have appropriate permissions to view, modify and run Batch compute environments and job queues. The simplest way to do this is to have your administrator add you to the `AWSBatchFullAccess` IAM policy (at SecureBio, this is enabled for all members of the `TeamMembers` group). +2. **S3 Permissions:** You need to have appropriate permissions to read, write and view the S3 bucket where your workflow data is stored. A useful start here is to have your administrator add you to the `AmazonS3FullAccess` IAM policy. After that, you should make sure you have the appropriate permissions to access the specific bucket you're using for your workflow, including the ability to list, read, write, and delete objects[^s3]. +3. **EC2 Permissions:** You need to have appropriate permissions to create and modify EC2 launch templates and compute environments. The simplest way to do this is to have your administrator add you to the `AmazonEC2FullAccess` IAM policy. +4. **Instance Role Permissions:** You will need to assign an Instance Role when setting up your Batch compute environment. This role should have at least the following permissions: `AmazonEC2ContainerServiceRole`, `AmazonEC2ContainerServiceforEC2Role`, and `AmazonS3FullAccess`. Make sure you can either set up such a role yourself, or have your administrator do so and point you to the role name. (On SecureBio, this role is called `ecsInstanceRole`.) 
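Before moving on, it can save time to confirm from the command line that your credentials are active and that the relevant services respond. The commands below are a quick sketch (the bucket name is a placeholder, and a clean listing does not prove that every permission listed above is in place):

```
# Confirm your credentials resolve to the expected IAM user or role
aws sts get-caller-identity

# Confirm you can list Batch compute environments and job queues
aws batch describe-compute-environments --query 'computeEnvironments[].computeEnvironmentName'
aws batch describe-job-queues --query 'jobQueues[].jobQueueName'

# Confirm you can write to, read from, and delete in your S3 bucket (bucket name is a placeholder)
echo "permission test" | aws s3 cp - s3://YOUR-BUCKET/permission-test.txt
aws s3 ls s3://YOUR-BUCKET/permission-test.txt
aws s3 rm s3://YOUR-BUCKET/permission-test.txt
```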
+ +[^s3]: In more depth, you need the following actions to be enabled for the bucket in question for your IAM user or role: `s3:ListBucket`, `s3:GetBucketLocation`, `s3:GetObject`, `s3:GetObjectAcl`, `s3:PutObject`, `s3:PutObjectAcl`, `s3:PutObjectTagging`, and `s3:DeleteObject`. If you're using a bucket specific to your user, all this is easier if you first have your administrator enable `s3:GetBucketPolicy` and `s3:PutBucketPolicy` for your user. + +## 1. Create an EC2 launch template + +First, you need to create an EC2 launch template that specifies the configuration for EC2 instances to be set up through Batch. To do this on the AWS console, Navigate to EC2 on the Services menu, then: + +1. Select "Launch templates" in the left-hand menu. +2. Click the orange "Create launch template" button. +3. Enter a launch template name (e.g. "will-batch-template") and optionally a description. +4. Check the box under "Auto scaling guidance" +5. Under "Application and OS Images (Amazon Machine Image)", click "Browse more AMIs", then search for "Amazon ECS-Optimized Amazon Linux 2023 (AL2023) x86_64 AMI". + 1. Under "AWS Marketplace AMIs", select "Amazon ECS-Optimized Amazon Linux 2023 (AL2023) x86_64 AMI" by Amazon Web Services. + 2. In the popup, select "Subscribe now" +6. Select an instance type (this isn't hugely important as Batch will modify the instance types provisioned based on the needs of the workflow; I generally select "m5.8xlarge"). +7. Under "Key pair (login)", select "Don't include in launch template" +8. Under "Network settings", select "Create security group" and follow the default settings. +9. Now we come to storage. Configuring this correctly is important to avoid IOPS errors! + 1. The key thing to realize is that, since Batch is spinning up and terminating instances as needed, the usual costs of creating large EBS volumes don't really apply. As such, you can be relatively greedy in provisioning storage for these instances, to minimize the risk of IOPS-related problems with your workflow. + 2. My recommendation is as follows: under "Storage (volumes)", expand the default "Volume 1 (AMI Root)" volume, then enter the following configuration values[^1]: + 1. Size: 1000 GiB + 2. Volume type: gp3 + 3. IOPS: 16000 (maximum for gp3) + 4. Throughput: 1000 +10. Finally, add tags so the cost of your provisioned resources can be tracked more effectively. Add one "Instances" tag (e.g. "will-batch-template-instance") and one "Volumes" tag (e.g. "will-batch-template-volumes"). +11. Leave the other settings as default (mostly "Don't include in launch template") and select "Create launch template". + +[^1]: If you want even more IOPS, you can provision an io2 volume instead of gp3. However, that's beyond the scope of this guide. + +If you want to modify your template after creating it, you can do so by navigating to it in the panel and selecting "Actions" \> "Modify template (create new version)". Be careful to pay attention to which version of the template any dependent resources (e.g. compute environments, see below) are using. + +## 2. Set up a Batch compute environment + +Next, you need to create a compute environment through which jobs can be allocated to instances. To do this on the AWS console, navigate to Batch on the Services menu, then: + +1. Select "Compute environments" in the left-hand menu +2. Click the orange "Create" button +3. Under "Compute environment configuration", select "Amazon Elastic Compute Cloud (Amazon EC2)"[^2]. Then: + 1. 
Under "Orchestration Type", select "Managed". + 2. Enter an environment name (I generally go with something unimaginative like "will-batch-3". + 3. Set up roles: + 1. Under "Service role" select "AWSServiceRoleForBatch". + 2. Under "Instance role" select "ecsInstanceRole", or another role with appropriate permissions (see step 0 above). + 3. If these roles don't exist or aren't available in the drop-down menu, contact your administrator about setting them up for you. +4. Under "Tags", navigate to "EC2 tags" and click the "Add tag" button. Then select a tag name that can be used to uniquely identify your use of the workflow (e.g. "mgs-workflow-will"). This is important to let your administrator keep track of how much money you and the wider team are spending on Batch (whose resource consumption is otherwise somewhat inscrutable). +5. Click the orange "Next" button. Then, under "Instance configuration": + 1. Under "Use EC2 Spot instances", make sure the "Enable using Spot instances" selector is enabled. + 2. Under "Maximum % on-demand price", enter a number between 0 and 100. 100 is a good default. Lower numbers will lower costs but increase the chance of unscheduled instance termination, which will require your workflow to re-run jobs. + 3. Enter integer values under "Minimum vCPUs", "Desired vCPUs" and "Maximum vCPUs". I typically use 0, 0, and 1024. + 4. Under "Allowed instance types", select "optimal" plus whatever other instance families you want to provision. I typically use optimal, m5, and c6a. + 5. Under "Additional configuration": + 1. Specify your EC2 key pair to allow direct ssh'ing into Batch instances (you should very rarely need to do this so you can skip this if you like) + 2. Under "Launch template" select the launch template you configured previously. + 3. Under "Launch template version", enter "\$Default" or "\$Latest" (your preference). +6. Click the orange "Next button", then do so again (i.e. accept defaults for "Network configuration"). + 1. You can configure your own network setup if you like, but that's beyond the scope of this guide. +7. Review your selections, then click the orange "Create compute environment" button. + +[^2]: In the future, I'll investigate running Batch with Fargate for Nextflow workflows. For now, using EC2 gives us greater control over configuration than Fargate, at the cost of additional setup complexity and occasional startup delays. + +## 3. Set up a Batch job queue + +The last step you need to complete on AWS itself is to set up a job queue that Nextflow can use to submit jobs to Batch. To do this on the AWS console, navigate to Batch on the Services menu, then: + +1. Select "job queues" in the left-hand menu. +2. Click the orange "Create" button. +3. Under "Orchestration Type", again select "Amazon Elastic Compute Cloud (Amazon EC2)". +4. Under "Job queue configuration", enter: + 1. A queue name (I generally go with something uncreative like "will-batch-queue-nf") + 2. A job priority (unimportant if you're only using one queue and you're the only one using that queue) + 3. A connected compute environment (select the environment you set up previously from the dropdown menu) +5. Click the orange "Create job queue" button. +6. Success! + +## 4. Run Nextflow with Batch + +Finally, you need to use all the infrastructure you've just set up to actually run a Nextflow workflow! We recommend using our test dataset to get started. [Click here to see how to run the pipeline on the test dataset](./testing.md#user-guide). 
\ No newline at end of file
diff --git a/docs/developer.md b/docs/developer.md
new file mode 100644
index 00000000..4a7ad5aa
--- /dev/null
+++ b/docs/developer.md
@@ -0,0 +1,67 @@
+# Developer guide
+This section is solely for developers of the pipeline. We thank you greatly for your work!
+
+All of our testing is done using [nf-test](https://www.nf-test.com/), which is a testing framework that allows us to run tests locally and on GitHub Actions. All test datasets are stored in the `s3://nao-testing` S3 bucket, and must be made public for nf-test (and anyone else) to access.
+
+As of right now we have the following test datasets ([more details here](#available-test-datasets)):
+
+- `gold-standard-test`: A small test dataset to test the run workflow.
+
+### Contributing to the pipeline
+
+When developing new features, always create a new branch whose name starts with your name, followed by a one- or two-word description of the feature you're working on, separated by underscores. For example, `will_streaming` or `harmon_remove_trimmomatic`. This branch should be made from the `dev` branch.
+
+>[!CAUTION]
+> Do not make pull requests from the `dev` or `main` branches.
+
+By default, all changes are made on local branches and pulled into `dev`; after enough changes have accumulated in `dev`, it is merged into `main`.
+
+#### Pull request requirements
+
+To have a pull request accepted, you must do the following:
+
+1. **Write new tests** for the changes that you make using `nf-test`, if those tests don't already exist. At the very least, these tests should check that the new implementation runs to completion; however, tests that verify the output on the test dataset are strongly encouraged.
+2. **Pass all existing tests locally** by running `nf-test test tests`. *If you make any changes that affect the output of the pipeline, you must list the resulting changes to the test-dataset output in your pull request.*
+3. **Update the `CHANGELOG.md` file** with the changes that you are making. More information on how to update the `CHANGELOG.md` file can be found [here](./version_schema.md).
+4. **Pass automated tests on GitHub Actions**. These run automatically when you open a pull request.
+5. **Be reviewed by a maintainer**.
+
+More information on how to run nf-test locally can be found [here](#running-tests-locally-with-nf-test).
+
+### Everything testing
+
+#### Making a test dataset
+
+To make a new test dataset, copy the dataset to `s3://nao-testing/[name-of-test-dataset]`, where `[name-of-test-dataset]` is the name of the test dataset you want to create (ask Harmon or Will for permission to add to the bucket):
+
+```
+aws s3 cp /path/to/my_dataset s3://nao-testing/my_dataset/ --acl public-read
+```
+
+> [!NOTE]
+> Any time you update a test dataset, you must make it public again.
+
+#### Running tests locally with `nf-test`
+
+By default, we use the [gold standard test](#gold-standard-test-s3nao-testinggold-standard-test) dataset to test the run workflow. To run the tests locally, you need to make sure that you have a big enough EC2 instance. We recommend the `m5.xlarge` with at least `32GB` of EBS storage, as this machine closely reflects the VMs on GitHub Actions. Once you have an instance, run `nf-test test tests`, which will run all tests in the `tests` directory.
If you want to run a specific test, you can either reference the name of the test file, or the tag of the test you want to run: + +``` +nf-test test tests/main.test.nf # Runs all tests in the main.test.nf file +nf-test test --tag run # Runs the run workflow +``` + +The intended results for the run workflow can be found in following directory `test-data/gold-standard-results`. Should the `run_output` test fail, you can diff the resulting files of that test, with the files in this folder to find the differences. + +> [!NOTE] +> Periodically delete docker images to free up space on your instance. Running the following command will delete all docker images on your system: +> ``` +> docker kill $(docker ps -q) 2>/dev/null || true +> docker rm $(docker ps -a -q) 2>/dev/null || true +> docker rmi $(docker images -q) -f 2>/dev/null || true +> docker system prune -af --volumes +> ``` + +### Available test datasets + +#### Gold standard test (`s3://nao-testing/gold-standard-test/`) +This is the default test dataset that we use for testing the run workflow. It is a small dataset that contains XXX reads from the [Yang 2020](https://www.sciencedirect.com/science/article/abs/pii/S0048969720358514?via%3Dihub) study. The code to generate this data can be found [here](https://github.com/naobservatory/generate-test-dataset/tree/harmon-dev-edits). \ No newline at end of file diff --git a/docs/faqs.md b/docs/faqs.md new file mode 100644 index 00000000..ad64c606 --- /dev/null +++ b/docs/faqs.md @@ -0,0 +1,15 @@ +# FAQs + +This page is a collection of frequently asked questions about the pipeline. + +## How do I run the pipeline on a new dataset? +See [Usage](usage.md) for more information. + +## How do I run the pipeline on AWS Batch? +See [Batch](batch.md) for more information. + +## Does the pipeline work on all operating systems? +By default, we've only tested the pipeline on Linux-based EC2 instances. For other operating systems, such as MACOS or Windows, we cannot guarantee that the pipeline will work. + +## How can I contribute to the pipeline? +See [Developer](developer.md) for more information. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..7560399f --- /dev/null +++ b/docs/index.md @@ -0,0 +1,21 @@ +# Index workflow + +The index workflow generates static index files from reference data. These reference data and indices don't depend on the reads processed by the run workflow, so a set of indices built by the index workflow can be used by multiple instantiations of the run workflow — no need to rebuild indices for every set of reads. The index workflow should be run once per reasonable length of time, balancing two considerations: Many of these index/reference files are derived from publicly available reference genomes or other resources, and should thus be updated and re-run periodically as new versions of these become available; however, to keep results comparable across datasets analyzed with the run workflow, this should be done relatively rarely. + +Key inputs to the index workflow include: +- A TSV specifying names and taxonomic IDs (taxids) for host taxa for which to search for host-infecting viruses. +- A URL linking to a suitable Kraken2 database for taxonomic profiling (typically the [latest release](https://benlangmead.github.io/aws-indexes/k2) of the `k2_standard` database). 
+- URLS for up-to-date releases of reference genomes for various common contaminant species that can confound the identification of HV reads (currently [human](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9606), [cow](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9913), [pig](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9823), [carp](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=7962)[^carp], [mouse](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=10090), [*E. coli*](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=562)). +- URLs to sequence databases for small and large ribosomal subunits from [SILVA](https://www.arb-silva.de/download/arb-files/). +- Up-to-date links to [VirusHostDB](https://www.genome.jp/virushostdb) and [NCBI Taxonomy](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_archive/). + +[^carp]: The carp genome is included as a last-ditch attempt to [capture any remaining Illumina adapter sequences](https://dgg32.medium.com/carp-in-the-soil-1168818d2191) before moving on to HV identification. I'm not especially confident this is helpful. + +Given these inputs, the index workflow: +- Generates a TSV of viral taxa, incorporating taxonomic information from NCBI, and annotates each taxon with infection status for each host taxon of interest (using Virus-Host-DB). +- Makes Bowtie2 indices from (1) all host-infecting viral genomes in Genbank[^genbank], (2) the human genome, (3) common non-human contaminants, plus BBMap indices for (2) and (3). +- Downloads and extracts local copies of (1) the BLAST nt database, (2) the specified Kraken2 DB, (3) the SILVA rRNA reference files. + +[^genbank]: Excluding transgenic, contaminated, or erroneous sequences, which are excluded according to a list of sequence ID patterns specified in the config file. + +Run the index workflow by setting `mode = "index"` in the relevant config file. For more information, see `workflows/index.nf` and the associated subworkflows and modules. diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 00000000..d8e27163 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,82 @@ +# Installation & setup + +This section describes how to install the pipeline and set up your environment to run it. + +This pipeline is generally optimized and tested to run on linux machines, we specifically use EC2 instances for running this pipeline, but this code could be adopted for MACOS or Windows machines. + + +> [!TIP] +> If someone else in your organization already uses this pipeline, it's likely they've already run the index workflow and generated an output directory. If this is the case, you can reduce costs and increase reproducibility by using theirs instead of generating your own. If you want to do this, skip this step, and edit `configs/run.config` such that `params.ref_dir` points to `INDEX_DIR/output`. + + + +- [Installation \& setup](#installation--setup) + - [1. Install dependencies](#1-install-dependencies) + - [2. Configure AWS \& Docker](#2-configure-aws--docker) + - [3. Clone this repository](#3-clone-this-repository) + - [4. Run index/reference workflow](#4-run-indexreference-workflow) + + +## 1. Install dependencies + +To run this workflow with full functionality, you need access to the following dependencies: + +1. **SDKMAN!:** To install the SDKMAN! Java SDK manager, follow the installation instructions available [here](https://sdkman.io/install). +2. 
**Nextflow:** To install the workflow management framework, follow the installation instructions available [here](https://www.nextflow.io/docs/latest/getstarted.html), beginning by installing a recommended Java distribution through SDKMAN!. Pipeline version 2.5.0+ requires Nextflow version 24.10.0+. +2. **Docker:** To install Docker Engine for command-line use, follow the installation instructions available [here](https://docs.docker.com/engine/install/) (or [here](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-docker.html) for installation on an AWS EC2 instance). +3. **AWS CLI:** If not already installed, install the AWS CLI by following the instructions available [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html). +4. **Git:** To install the Git version control tool, follow the installation instructions available [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). +5. **nf-test**: To install nf-test, follow the install instructions available [here](https://www.nf-test.com/installation/). + +## 2. Configure AWS & Docker + +To run the workflow using AWS S3 for the working and output directories, you first need to [configure AWS access](https://www.nextflow.io/docs/latest/aws.html). To do this, you need to create a file at `~/.aws/config` **or** `~/.aws/credentials` specifying your access key ID and secret access key, e.g. + +`~/.aws/config`: +``` +[default] +region = us-east-1 +output = table +tcp_keepalive = true +aws_access_key_id = +aws_secret_access_key = +``` + +`~/.aws/credentials`: +``` +[default] +aws_access_key_id = +aws_secret_access_key = +``` + +Next, you need to make sure your user is configured to use Docker. To do this, create the `docker` user group and add your current user to it: + +``` +sudo groupadd docker +sudo usermod -aG docker $USER +newgrp docker +docker run hello-world +``` + +#### 3. Clone this repository + +Clone this repo into a new directory as normal. + +#### 4. Run index/reference workflow +Create a new directory outside the repo directory and copy over the index workflow config file as `nextflow.config` in that directory: + +``` +mkdir index +cd index +cp REPO_DIR/configs/index.config nextflow.config +``` + +Next, edit `nextflow.config` such that `params.base_dir` points to the directory (likely on S3) where you want to store your index files. (You shouldn't need to modify anything else about the config file at this stage.) + +Next, call `nextflow run` pointing at the repo directory (NB: *not* `main.nf` or any other workflow file; pointing to the directory will cause Nextflow to automatically run `main.nf` from that directory.): + +``` +nextflow run PATH_TO_REPO_DIR -resume +``` + +Wait for the workflow to run to completion; this is likely to take several hours at least. \ No newline at end of file diff --git a/docs/output.md b/docs/output.md new file mode 100644 index 00000000..637ddb80 --- /dev/null +++ b/docs/output.md @@ -0,0 +1,107 @@ +# Outputs + +If the pipeline runs to completion, the following output files are expected. In the future, we will add more specific information about the outputs, including in-depth descriptions of the columns in the output files. 
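Most of the files described below are gzipped TSVs, so they can be previewed directly from the command line without loading them into a downstream analysis. For example (paths are illustrative; substitute wherever your output directory lives, locally or on S3):

```
# Preview the first few rows of a results table from a local output directory
zcat output/results/qc_basic_stats.tsv.gz | head -n 5

# Or stream the same table from S3 without a full download (path is a placeholder)
aws s3 cp s3://YOUR-BUCKET/YOUR-RUN/output/results/qc_basic_stats.tsv.gz - | zcat | head -n 5
```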
+ +All pipeline output can be found in the `output` directory, which is broken into four subdirectories: + +- `input`: Directory containing saved input information (useful for trying to reproduce someone else's results) +- `logging`: Log files containing meta-level information about the pipeline run itself. +- `intermediates`: Intermediate files produced by key stages in the run workflow, saved for nonstandard downstream analysis. +- `results`: Directory containing processed results files for standard downstream analysis. + +## Run workflow + +Main heading represents the folder name, and subheadings represent a description of the file's usage. If the file is not in the heading folder name, the relative path is given. + +### `input` + +- `adapters.fasta`: FASTA file of adapter sequences used for adapter screening. +- `run-params.json`: JSON file giving all the parameters passed to the pipeline. +- `index-params.json`: JSON file giving parameters used to generate index directory (`params.ref_dir`). +- `samplesheet.csv`: Copy of the samplesheet file used to configure the pipeline (specified by `params.sample_sheet`). + +### `logging` + +- `pipeline-version.txt`: Version of the pipeline used for the run. +- `time.txt`: Start time of the run. +- `trace.txt`: Tab delimited log of all the information for each task run in the pipeline including runtime, memory usage, exit status, etc. Can be used to create an execution timeline using the the script `bin/plot-timeline-script.R` after the pipeline has finished running. More information regarding the trace file format can be found [here](https://www.nextflow.io/docs/latest/reports.html#trace-file). +- `pipeline-version-index.txt`: Version of pipeline used to generate index directory (`params.ref_dir`). + +### `intermediates` + +- `reads/raw_viral`: Directory containing raw reads corresponding to those reads that survive initial viral screening with BBDuk. + +### `results` + +#### `qc` +- `total_reads_qc.tsv.gz`: Total number of reads processed at each stage of the preprocessing phase (`stage`) +- `qc_basic_stats.tsv.gz`: Summary statistics for each sample at each stage of the preprocessing phase (`stage`), including: + - GC content (`percent GC`); + - Average read length (`mean_seq_len`); + - Number of read pairs (`n_read pairs`); + - Approximate number of base pairs in reads (`n_bases_approx`); + - Percent duplicates as measured by FASTQC (`percent_duplicates`); + - Pass/fail scores for each test conducted by FASTQC. +- `qc_adapter_stats.tsv.gz`: Adapter statistics calculated by FASTQC for each sample and preprocessing stage, given as a percentage of reads containing adapter content (`pc_adapters`) at each position along the read (`position`) for each adapter detected (`adapter`) for each read in the read pair (`read_pair`). +- `qc_quality_base_stats.tsv.gz`: Per-base read-quality statistics calculated by FASTQC for each sample and preprocessing stage, given as the mean Phred score (`mean_phred_score`) at each position along the read (`position`) for each read in the read pair (`read_pair`). +- `qc_quality_sequence_stats.tsv.gz`: Per-sequence read-quality statistics calculated by FASTQC for each sample and preprocessing stage, given as the number of reads (`n_sequences`) with a given mean Phred score (`mean_phred_score`) for each read in the read pair (`read_pair`). + +#### `viral identification` +- `virus_hits_db.tsv.gz`: TSV output by the viral identification phase, giving information about each read pair assigned to a host-infecting virus. 
+- `virus_clade_counts.tsv.gz`: Summary of the previous file giving the number of HV read pairs mapped to each viral taxon. Includes both read pairs mapped directly to that taxon (`n_reads_direct`) and to that taxon plus all descendent taxa (`n_reads_clade`). + +#### `taxonomic identification` +- `kraken_reports_merged.tsv.gz`: Kraken output reports in TSV format, labeled by sample and ribosomal status. +- `bracken_reports_merged.tsv.gz`: Bracken output reports in TSV format, labeled by sample and ribosomal status. + +#### `blast` +- `blast_hits_paired.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: + - The number of reads in the read pair with high-scoring matches to that taxid (`n_reads`). + - The best bitscores of alignments to that taxid for each matching read (`bitscore_max` and `bitscore_min`)[^bitscore]. +- `blast_forward_only.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: +- `blast_reverse_only.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: + +[^bitscore]: If only one read aligns to the target, these two fields will be identical. If not, they will give the higher and lower of the best bitscores for the two reads in the pair.. + + +## Index workflow + +Main heading represents the folder name, and subheadings describes the tool that consumes the file. Files that are consumed by multiple tools or are not consumed by any tools are put in the `General` subheading. If the file is not in the heading folder name, the relative path is given. + +### `input` + +- `index-params.json`: JSON file giving all the parameters passed to the pipeline (useful for trying to reproduce someone else's results). + +### `logging` + +- `pipeline-version.txt`: Version of the pipeline with which index directory was created. +- `time.txt`: Start time of index workflow run. +- `trace.txt`: Nextflow trace file containing logging information for each process performed during the workflow run. + +### `results` + +#### `General` + +- `total-virus-db-annotated.tsv.gz`: Database generated from NCBI taxonomy and Virus-Host-DB giving taxonomy and host-infection information for each viral taxon. +- `taxonomy-nodes.dmp`: Taxonomy dump file from NCBI mapping between taxids and their parents in the NCBI taxonomy tree structure. +- `taxonomy-names.dmp`: Taxonomy dump file from NCBI mapping between taxids and taxon names. + +#### `BLAST` + +- `core_nt`: Directory containing extracted BLAST database files for BLASTing against core_nt. + +#### `Bowtie2` + +- `bt2-virus-index`: Directory containing Bowtie2 index for host-infecting viral genomes. +- `bt2-human-index`: Directory containing Bowtie2 index for the human genome. +- `bt2-other-index`: Directory containing Bowtie2 index for other contaminant sequences. +- `virus-genome-metadata-gid.tsv.gz`: Genome metadata file generated during download of HV genomes from viral Genbank, annotated additionally with Genome IDs used by Bowtie2 (allowing mapping between genome ID and taxid). + +#### `Kraken2` + +- `kraken_db`: Directory containing Kraken2 reference database (default: Most recent version of Standard). + +#### `BBduk` + +- `virus-genomes-filtered.fasta.gz`: FASTA file containing host-infecting viral genomes downloaded from viral Genbank (filtered to remove transgenic, contaminated, or erroneous sequences). 
+- `ribo-ref-concat.fasta.gz`: Reference database of ribosomal LSU and SSU sequences from SILVA.
diff --git a/docs/overview.md b/docs/overview.md
new file mode 100644
index 00000000..4734e185
--- /dev/null
+++ b/docs/overview.md
@@ -0,0 +1,49 @@
+# Overview
+
+The pipeline currently consists of three workflows:
+
+- `run`: Performs the main analysis, including viral identification, taxonomic profiling, and optional BLAST validation.
+- `index`: Creates indices and reference files used by the `run` workflow.[^1]
+- `run_validation`: Performs BLAST validation on any set of reads to verify their taxonomic classification.[^2]
+
+[^1]: The `index` workflow is intended to be run first, after which many instantiations of the `run` workflow can use the same index output files.
+[^2]: The `run_validation` workflow is intended to be run after the `run` workflow if the optional BLAST validation was not selected during the `run` workflow. Typically, this workflow is run on a subset of the host viral reads identified in the `run` workflow, to evaluate the sensitivity and specificity of the viral identification process.
+
+The `run` workflow consists of four parts:
+
+- **Viral identification**: Reads from viruses infecting specific host taxa of interest (default: vertebrate viruses) are sensitively and specifically identified using a multi-step pipeline based around k-mer filtering, adapter trimming, read alignment, and taxonomic classification.
+- **Taxonomic profile**: A random subset of reads (default: 1M/sample) undergoes adapter trimming, ribosomal classification, and taxonomic classification.
+- **QC**: The total number of reads is recorded, then a subset of reads (default: 1M/sample) undergoes quality assessment, both before and after adapter trimming.
+- **(Optional) BLAST validation**: Putative host viral reads from part 1 (either all reads or a subset) are checked against `core_nt` to evaluate the sensitivity and specificity of the viral identification process.
+
+The following diagram provides a high-level overview of the `run` workflow (each blue box is a subworkflow):
+
+```mermaid
+---
+title: RUN WORKFLOW
+config:
+  layout: horizontal
+  look: handDrawn
+  theme: default
+---
+flowchart LR
+A[Input Files] --> |Raw reads|B[LOAD_SAMPLESHEET]
+B --> C[COUNT_TOTAL_READS] & D[EXTRACT_VIRAL_READS] & E[SUBSET_TRIM]
+D -.-> |*Optional*| F[BLAST_VIRAL]
+E --> |Trim and subset reads| G[RUN_QC] & H[PROFILE]
+E --> |Subset reads|G[RUN_QC]
+%% Adjust layout by placing subgraphs in specific order
+subgraph "Viral identification"
+D
+end
+subgraph "Taxonomic Profile"
+H
+end
+subgraph "QC"
+C
+G
+end
+subgraph "BLAST Validation"
+F
+end
+```
\ No newline at end of file
diff --git a/docs/run.md b/docs/run.md
new file mode 100644
index 00000000..5b5b2468
--- /dev/null
+++ b/docs/run.md
@@ -0,0 +1,286 @@
+# RUN WORKFLOW
+
+The `run` workflow is responsible for the primary analysis of the pipeline.
+ +```mermaid +--- +title: RUN WORKFLOW +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Input Files] --> |Raw reads|B[LOAD_SAMPLESHEET] +B --> C[COUNT_TOTAL_READS] & D[EXTRACT_VIRAL_READS] & E[SUBSET_TRIM] +D -.-> |*Optional*| F[BLAST_VIRAL] +E --> |Trim and subset reads| G[RUN_QC] & H[PROFILE] +E --> |Subset reads|G[RUN_QC] +%% Adjust layout by placing subgraphs in specific order +subgraph "Viral identification" +D +end +subgraph "Taxonomic Profile" +H +end +subgraph "QC" +C +G +end +subgraph "BLAST Validation" +F +end +``` + +By default, we define the host-infecting viruses to be screened against all vertebrate-infecting viruses and we don't perform BLAST validation. Corresponding config file: `configs/run.config`. + +I've broken down the `run` workflow into a series of subworkflows, each of which performs a specific task: +1. [Setup workflows](#setup-workflows): Prepare the input data for analysis + - [Load data into channels (LOAD_SAMPLESHEET)](#load-data-into-channels-load_samplesheet) + - [Subset and trim reads (SUBSET_TRIM)](#subset-and-trim-reads-subset_trim) +2. [Analysis workflows](#analysis-workflows): Perform the primary analysis + - [Viral identification phase (EXTRACT_VIRAL_READS)](#viral-identification-phase-extract_viral_reads) + - [Taxonomic profiling phase (PROFILE)](#taxonomic-profiling-phase-profile) + - [BLAST validation phase (BLAST_VIRAL)](#blast-validation-phase-blast_viral) +3. [QC workflows](#qc-workflows): Conduct quality control on the analysis results + - [Count total reads (COUNT_TOTAL_READS)](#count-total-reads-count_total_reads) + - [QC and output phase (RUN_QC)](#qc-and-output-phase-run_qc) +4. [Helper workflows](#helper-workflows): Helper workflows that are used by the workflows above + - [Taxonomic assignment (TAXONOMY)](#taxonomic-assignment-taxonomy) + - [QC (QC)](#qc-qc) + + + +## Setup workflows + +### Load data into channels (LOAD_SAMPLESHEET) +This workflow loads the samplesheet and creates a channel containing the samplesheet data. It also creates a channel containing the group data, which is used to group reads into samples. No diagram is provided for this workflow, as it's a simple channel creation workflow. + +Output: +- `samplesheet`: A channel containing the samplesheet data. +- `group`: A channel containing the group data. +- `start_time_str`: A channel containing the start time of the workflow. + +### Subset and trim reads (SUBSET_TRIM) +This workflow subsets the reads to a target number (default 1M/sample), and trims adapters with [FASTP](https://github.com/OpenGene/fastp), which both screens for adapters and trims low-quality and low-complexity sequences. + +```mermaid +--- +title: TRIM_AND_SUBSET +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Raw reads] --> B[SUBSET] +B --> C[FASTP] +``` + +1. Subset reads using [Seqtk](https://github.com/lh3/seqtk) +2. Trim reads using [FASTP](https://github.com/OpenGene/fastp) + +Output: +- `subset_reads`: A channel containing the subsetted reads. +- `trimmed_subset_reads`: A channel containing the subsetted and trimmed reads. + +## Analysis workflows + +### Viral identification phase (EXTRACT_VIRAL_READS) + +The goal of this phase is to sensitively, specifically, and efficiently identify reads arising from host-infecting viruses. It uses a multi-step process designed to keep computational costs low while minimizing false-positive and false-negative errors. 
+ +```mermaid +--- +title: EXTRACT_VIRAL_READS +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Raw reads] --> B["BBDuk
(viral index)"] +B --> C[FASTP] +C --> D[Cutadapt] +D --> E["Bowtie2
(viral index)"] +E --> F["Bowtie2
(human index)"] +F --> G["Bowtie2
(other contaminants index)"] +G --> H[TAXONOMY] +H --> I[Process & merge output] + +subgraph "Filter for viral reads" +B +E +end +subgraph "Trim and clean reads" +C +D +end +subgraph "Remove contaminants" +F +G +end +``` + +1. To begin with, the raw reads are screened against a database of vertebrate-infecting viral genomes generated from Genbank by the index workflow. This initial screen is performed using [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), which flags any read that contains at least three 21-mers matching any host-infecting viral genome. The purpose of this initial screen is to rapidly and sensitively identify putative vertebrate-infecting viral reads while discarding the vast majority of non-HV reads, reducing the cost associated with the rest of this phase. +2. Surviving reads undergo additional adapter trimming with [FASTP](https://github.com/OpenGene/fastp) and [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) to remove any residual adapter contamination that might lead to false positive results. +3. Next, reads are aligned to the previously-mentioned database of vertebrate-infecting viral genomes with [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) using quite permissive parameters designed to capture as many putative HV reads as possible. The SAM and FASTQ files are processed to generate new read files containing any read pair for which at least one read matches the HV database. +4. The output of the previous step is passed to a further filtering step, in which reads matching a series of common contaminant sequences are removed. This is done by aligning surviving reads to these contaminants using Bowtie2 in series[^filter]. Contaminants to be screened against include reference genomes from human, cow, pig, carp, mouse and *E. coli*, as well as various genetic engineering vectors. +5. Surviving read pairs are then taxonomically profiled using the [TAXONOMY workflow](#taxonomic-assignment-taxonomy), which deduplicates, merges and profiles the reads. +6. Finally, reads are assigned a final vertebrate-infecting virus status if they: + - Are classified as vertebrate-infecting virus by both Bowtie2 and Kraken2; or + - Are unassigned by Kraken and align to an vertebrate-infecting virus taxon with Bowtie2 with an alignment score above a user-specifed threshold[^threshold]. + + +Output: +- `virus_hits_db.tsv.gz`: A TSV file containing information for each read pair, including whether it was classified as vertebrate-infecting virus, and the taxonomic assignment of the read pair. +- `virus_clade_counts.tsv.gz`: A TSV file containing the number of read pairs mapping to each detected vertebrate-infecting virus taxon. + +[^filter]: We've found in past investigations that the two aligners detect different contaminant sequences, and aligning against both is more effective at avoiding false positives than either in isolation. + +[^threshold]: Specifically, Kraken-unassigned read pairs are classed as HV if, for either read in the pair, S/ln(L) >= T, where S is the best-match Bowtie2 alignment score for that read, L is the length of the read, and T is the value of `params.bt2_score_threshold` specified in the config file. + +### Taxonomic profiling phase (PROFILE) + +The goal of this phase is to give an overview of the taxonomic composition of the cleaned reads from the preprocessing phase. 
In particular, it gives an estimate of (i) the fraction of ribosomal reads in the dataset, (ii) the taxonomic breakdown of the dataset at the domain level[^eukarya], and (iii) more detailed abundance estimates for lower-level taxa. + +```mermaid +--- +title: PROFILE +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Subset & trimmed reads] --> B["BBDuk
(SILVA index)"] +B --> |Ribosomal reads| C[TAXONOMY] +B --> |Non-ribosomal reads| D[TAXONOMY] +C --> E[Merge output] +D --> E +subgraph "Ribosomal classification" +B +end +``` +1. These subset reads are then separated into ribosomal and non-ribosomal read groups using BBDuk, by searching for ribosomal k-mers from the SILVA database obtained during the index workflow. +2. The output of the previous step is passed to the [TAXONOMY workflow](#taxonomic-assignment-taxonomy), which returns back the Kraken2 and Bracken outputs. + +[^eukarya]: As human is the only eukaryotic genome included in the Standard reference database for Kraken2, all sequences assigned to that domain can be assigned to *Homo sapiens*. + +### *Optional: BLAST validation phase (BLAST_VIRAL)* + +To evaluate the performance of the process described in the viral identification phase, it's useful to get some ground-truth information about whether the host viral assignments made in that subworkflow are correct. To do this, we use [BLASTN](https://blast.ncbi.nlm.nih.gov/Blast.cgi) to align the putative host viral reads output by the previous phase against the `core_nt` database, then process the output to check whether each sequence had a high-scoring alignment to at least one HV sequence. For computational tractability, this can be performed on only a subset of surviving host viral reads (specified by `params.blast_hv_fraction`)[^blast]. + +```mermaid +--- +title: BLAST_VIRAL +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Reads] --> B["Subset reads
(*Optional*)"] +B --> |Forward read|C[BLASTN] +B --> |Reverse read|D[BLASTN] +C --> E[Filter BLASTN output] +D --> F[Filter BLASTN output] +E & F --> G[Process & merge output] +``` + +1. Reads are subset if `params.blast_hv_fraction` is less than 1. +2. Forward and reverse reads are aligned separately with BLASTN. +3. BLASTN outputs are filtered to keep only the best-scoring alignment for each read. +4. Output from both reads are combined into a single file, with columns for the read ID, the subject taxid, and the alignment score. + +[^blast]: Setting `params.blast_hv_fraction` to 0 skips this step altogether. + +## QC workflows + +### Count total reads (COUNT_TOTAL_READS) +This phase counts the total number of reads in the input files, then merges them all into one file. No diagram is provided for this workflow, as it's a simple channel creation workflow. + +### QC and output phase (RUN_QC) +This phase conducts quality control on the subsetted reads, both the raw and cleaned reads. + +```mermaid +--- +title: RUN_QC +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Subset reads] --> B[QC] +C[Subset and trimmed reads] --> D[QC] +B & D --> E[Process & merge output] +``` + +In this phase, the raw and cleaned reads from the preprocessing phase are analyzed with [FASTQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and [MultiQC](https://multiqc.info/). This allows us to assess important metrics (e.g. read quality, read numbers, adapter content) in the raw input, and how these metrics are affected by cleaning. Relevant metrics are then extracted from MultiQC outputs into easy-to-parse TSV files[^tsvs] for downstream processing. + + +[^tsvs]: Specifically, `qc_basic_stats.tsv.gz`, `qc_adapter_stats.tsv.gz`, `qc_quality_base_stats.tsv.gz` and `qc_quality_sequence_stats.tsv.gz`. + +## Helper workflows + +### Taxonomic assignment (TAXONOMY) + +This workflow performs taxonomic assignment on a set of reads. + +```mermaid +--- +title: TAXONOMY +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Reads] --> B[CLUMPIFY_PAIRED] +B --> C[BBMERGE] +C --> D[CLUMPIFY_SINGLE] +D --> E[KRAKEN] +E --> F[BRACKEN] +subgraph "Read deduplication" +B +D +end +subgraph "Paired reads to single reads" +C +end +subgraph "Taxonomic profiling" +E +F +end +``` + +1. Reads are deduplicated with [Clumpify](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/clumpify-guide/)[^dedup] +2. Merged into single sequences through a combination of [BBMerge](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmerge-guide/)[^merge]. Read pairs that fail to merge with BBMerge are concatenated end-to-end with an intervening "N" base. +3. Reads are deduplicated once again with Clumpify, but this time they're single end reads. +4. Reads are passed to [Kraken2](https://ccb.jhu.edu/software/kraken2/) for taxonomic assignment, using the reference database obtained in the index workflow. We record whether each read was (1) assigned to a host-infecting virus taxon with Kraken, (2) assigned to a non-HV taxon with Kraken, or (3) not assigned to any taxon. All reads in category (2) are filtered out. + +[^dedup]: Which identifies and collapses groups of reads that are identical modulo some specified error rate. +[^merge]: Which aligns and merges read pairs with significant overlap. + +### QC (QC) +This phase conducts quality control on any set of reads. 
+```mermaid +--- +title: QC +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Reads] --> B[FASTQ] +B --> C[MULTIQC] +C --> D[SUMMARIZE_MULTIQC_PAIR] +D --> E[Process & merge output] +``` + +1. Run FASTQC on each pair of read files +2. Extract data with MultiQC for each pair of read files +3. Summarize MultiQC information for each pair of read files +4. Process and merge outputs \ No newline at end of file diff --git a/docs/run_validation.md b/docs/run_validation.md new file mode 100644 index 00000000..e7704f7e --- /dev/null +++ b/docs/run_validation.md @@ -0,0 +1,29 @@ +# Run validation + +## BLAST validation phase (BLAST_VIRAL) + +To evaluate the performance of the process described in the viral identification phase, it's useful to get some ground-truth information about whether the host viral assignments made in that subworkflow are correct. To do this, we use [BLASTN](https://blast.ncbi.nlm.nih.gov/Blast.cgi) to align the putative host viral reads output by the previous phase against the `core_nt` database, then process the output to check whether each sequence had a high-scoring alignment to at least one HV sequence. For computational tractability, this can be performed on only a subset of surviving host viral reads (specified by `params.blast_hv_fraction`)[^blast]. + +```mermaid +--- +title: BLAST_VIRAL +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Reads] --> B["Subset reads
(*Optional*)"] +B --> |Forward read|C[BLASTN] +B --> |Reverse read|D[BLASTN] +C --> E[Filter BLASTN output] +D --> F[Filter BLASTN output] +E & F --> G[Process & merge output] +``` + +1. Reads are subset if `params.blast_hv_fraction` is less than 1. +2. Forward and reverse reads are aligned separately with BLASTN. +3. BLASTN outputs are filtered to keep only the best-scoring alignment for each read. +4. Output from both reads are combined into a single file, with columns for the read ID, the subject taxid, and the alignment score. + +[^blast]: Setting `params.blast_hv_fraction` to 0 skips this step altogether. \ No newline at end of file diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 00000000..ed1ce2ab --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,43 @@ + +## Troubleshooting + +We break down the issues that can arise into two categories: + +- Nextflow isn't able to run +- Nextflow runs then fails + +We discuss each of these issues below. + +### Nextflow isn't able to run + +**The most common sources of errors are AWS permission issues.** Whether you're using the `batch` profile or not, you should make sure you have all AWS permissions, as specified in [our Batch tutorial](./batch.md#step-0-set-up-your-aws-credentials). + +Next, make sure Nextflow has access to your AWS credentials (if not you may get `AccessDenied` errors): + +``` +eval "$(aws configure export-credentials --format env)" +``` + +### Nextflow runs then fails + + +#### None of my jobs are running + +The first time a pipeline is run on any dataset, it can take a while for the jobs to start (10-15 minutes at most). If after this period of time, none of your jobs have started, then this may be another AWS permission issue. + +#### Docker image failure + +Another common issue is for processes to fail with some variation of the following Docker-related error: + +``` +docker: failed to register layer: write /usr/lib/jvm/java-11-openjdk-amd64/lib/modules: **no space left on device**. +``` + +This is a fairly well-known problem, which can arise even when there is substantial free storage space accessible to your EC2 instance. Following the steps recommended [here](https://www.baeldung.com/linux/docker-fix-no-space-error) or [here](https://forums.docker.com/t/docker-no-space-left-on-device/69205) typically resolves the issue, either by deleting Docker assets to free up space (e.g. via `docker system prune --all --force`) or by giving Docker more space. + +#### Insufficient resources + +This an issue we're actively working on. Sometimes your jobs may fail due to insufficient memory or CPU resources. To avoid this, you can either change your overall resources or change the resources for a specific process: + +- **Change your overall resources**: You can change the resources for your entire pipeline by editing the `configs/resources.config` file. However, note that this will probably also require you to change the reference files and indices generated by the `index` workflow. [See here for more information on compute resource requirements](./usage#compute-resource-requirements). +- **Change the resources for a specific process**: You can change the resources for a specific process by changing the label of that process to another label that is in the `configs/resources.config` file or you can define a new label in the `configs/resources.config` file. 
diff --git a/docs/usage.md b/docs/usage.md
new file mode 100644
index 00000000..eb3e41c6
--- /dev/null
+++ b/docs/usage.md
@@ -0,0 +1,122 @@
+# Usage
+
+To run any of the workflows, you must first choose a profile and have data to run the pipeline on. To help with this process, we provide a small test dataset to run through the `run` workflow.
+
+
+- [Usage](#usage)
+  - [Profiles and modes](#profiles-and-modes)
+    - [Compute resource requirements](#compute-resource-requirements)
+  - [Running on new data](#running-on-new-data)
+  - [Tutorial: Running the `run` workflow on a test dataset](#tutorial-running-the-run-workflow-on-a-test-dataset)
+    - [Setup](#setup)
+    - [Profile specific instructions](#profile-specific-instructions)
+      - [`ec2_local`](#ec2_local)
+      - [`ec2_s3`](#ec2_s3)
+      - [`batch`](#batch)
+
+
+## Profiles and modes
+
+The pipeline has three main workflows: `INDEX`, `RUN`, and `RUN_VALIDATION`. Each of these calls its corresponding subworkflows, which are located in the `subworkflows` directory.
+
+The pipeline can be run in multiple ways by modifying various configuration variables specified in `configs/profiles.config`. Currently, three profiles are implemented, all of which assume the workflow is being launched from an AWS EC2 instance:
+
+- `batch (default)`: Most efficient way to run the pipeline.
+  - This profile is the default and attempts to run the pipeline with AWS Batch. This is the quickest and most efficient way to run the pipeline, but requires significant additional setup not described in this repo. To set up AWS Batch for this pipeline, follow the instructions [here](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html) (steps 1-3), then modify your config file to point `process.queue` to the name of your Batch job queue.
+- `ec2_local`: Simple and can be relatively fast, but is bottlenecked by your instance's CPU and memory allocations.
+  - This profile attempts to run the whole workflow locally on your EC2 instance, storing intermediate and output files on instance-linked block storage. This is simple and can be relatively fast, but is bottlenecked by your instance's CPU and memory allocations; in particular, if you don't use an instance with very high memory, the pipeline is likely to fail when loading the Kraken2 reference DB.
+- `ec2_s3`: Avoids storage issues on your EC2 instance, but is still constrained by your instance's memory allocation.
+  - This profile runs the pipeline on your EC2 instance, but attempts to read and write files to a specified S3 directory. This avoids problems caused by insufficient storage on your EC2 instance, but (1) is significantly slower and (2) is still constrained by your instance's memory allocation.
+
+To run the pipeline with a specified profile, run `nextflow run PATH_TO_REPO_DIR -profile PROFILE_NAME -resume`. Calling the pipeline without specifying a profile will run the `batch` profile by default. Future example commands in this README will assume you are using Batch; if you want to use a different profile instead, you'll need to modify the commands accordingly.
+
+As of the current pipeline, you must have at least 128GB of memory and 32 cores to run the pipeline if you're using `ec2_local` or `ec2_s3`. This is because we use the whole Kraken DB, which is large (128GB), and some processes consume 64 cores. Similarly, if you would like to run BLAST, you must have at least 256GB of memory.
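Before launching with `ec2_local` or `ec2_s3`, it's worth confirming that your instance actually meets these requirements. A quick check from the shell:

```
nproc      # available cores
free -h    # total and available memory
df -h .    # free disk space in the current (launch) directory
```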
+> [!TIP]
+> It's highly recommended that you always run `nextflow run` with the `-resume` option enabled. It doesn't do any harm if you haven't run a workflow before, and getting into the habit will help you avoid much sadness when you want to resume a run without rerunning all your jobs.
+
+### Compute resource requirements
+
+To run the pipeline as-is, you need at least 128GB of memory and 64 cores. This is because we use the whole Kraken DB, which is large (128GB), and some processes consume 64 cores. Similarly, if you would like to run BLAST, you must have at least 256GB of memory.
+
+To change the compute resources for a process, you can modify the `resources.config` file. This file specifies the compute resources for each process based on the label of the process. For example, to change the compute resources for the `kraken` process, you can edit the entry for that process's label in the `resources.config` file.
+
+If you do change the resources, note that you'll probably also need to change the reference files and indices generated by the `index` workflow to match.
+
+
+## Running on new data
+
+To run the workflow on new data, you need:
+
+1. Accessible raw data files in gzipped FASTQ format, named appropriately.
+2. A sample sheet file specifying the samples, along with paths to the forward and reverse read files for each sample. `generate_samplesheet.sh` (see below) can make this for you.
+3. A config file in a clean launch directory, pointing to:
+   - The base directory in which to put the working and output directories (`params.base_dir`).
+   - The directory containing the outputs of the reference workflow (`params.ref_dir`).
+   - The sample sheet (`params.sample_sheet`).
+   - Various other parameter values.
+
+> [!NOTE]
+> The samplesheet must have the following format for each row:
+> - First column: Sample ID
+> - Second column: Path to FASTQ file 1, which should be the forward read for this sample
+> - Third column: Path to FASTQ file 2, which should be the reverse read for this sample
+>
+> The easiest way to get this file is by using the `generate_samplesheet.sh` script. As input, this script takes a path to raw FASTQ files (`dir_path`), plus forward (`forward_suffix`) and reverse (`reverse_suffix`) read suffixes, both of which support regex, and an optional output path (`output_path`). Those using data from S3 should make sure to pass the `--s3` flag. If you would like to group samples by some metadata, you can either pass a path to a CSV file with the header `sample,group`, where each row gives a sample name and the group it belongs to (`group_file`); edit the samplesheet manually after generation (easier when a groups CSV isn't readily available); or provide the `--group_across_illumina_lanes` option if a single library was run across a single Illumina flowcell. As output, the script generates a CSV file (named `samplesheet.csv` by default), which can be used as input for the pipeline.
+>
+> For example:
+> ```
+> ../bin/generate_samplesheet.sh \
+>    --s3 \
+>    --dir_path s3://nao-restricted/MJ-2024-10-21/raw/ \
+>    --forward_suffix _1 \
+>    --reverse_suffix _2
+> ```
+
+If running on Batch, a good process for starting the pipeline on a new dataset is as follows:
+
+1. Process the raw data to have appropriate filenames (see above) and deposit it in an accessible S3 directory.
+2. Create a clean launch directory and copy `configs/run.config` to a file named `nextflow.config` in that directory.
+3. Create a sample sheet in that launch directory (see above).
+4.
Edit `nextflow.config` to specify each item in `params` as appropriate, as well as setting `process.queue` to the appropriate Batch queue. +5. Run `nextflow run PATH_TO_REPO_DIR -resume`. +6. Navigate to `{params.base_dir}/output` to view and download output files. + + +## Tutorial: Running the `run` workflow on a test dataset + +To confirm that the pipeline works in your hands, we provide a small test dataset (`s3://nao-testing/gold-standard-test/raw/`) to run through the `run` workflow. Feel free to use any profile, but we recommend using the `ec2_local` profile as long as [you have the resources](usage.md#compute-resource-requirements) to handle it. + +When running with any profile, there is a shared setup that you need to do, before running the workflow. The shared setup is the same for all profiles, and is described below, followed by profile specific instructions. + +### Setup + +1. Create a new directory outside the repo directory and copy over the run workflow config file as `nextflow.config` in that directory: + +``` +mkdir launch +cd launch +cp REPO_DIR/configs/run.config nextflow.config +``` + +2. Edit `nextflow.config` to set `params.ref_dir` to the index directory you chose or created above (specifically `PATH_TO_REF_DIR/output`) +3. Set the samplesheet path to the test dataset samplesheet `${projectDir}/test-data/samplesheet.csv` + +### Profile specific instructions + +#### `ec2_local` + +4. Within this directory, run `nextflow run -profile ec2_local .. -resume`. Wait for the workflow to finish. +5. Inspect the `output` directory to view the processed output files. + +#### `ec2_s3` + +4. Edit `nextflow.config` to set `params.base_dir` to the S3 directory of your choice. +5. Still within that directory, run `nextflow run -profile ec2_s3 .. -resume`. +6. Wait for the workflow to finish, and inspect the output on S3. + +#### `batch` + +4. Edit `nextflow.config` to set `params.base_dir` to the S3 directory of your choice and `process.queue` to the name of your Batch job queue. +5. Still within that directory, run `nextflow run -profile batch .. -resume` (or simply `nextflow run .. -resume`). +6. Wait for the workflow to finish, and inspect the output on S3. \ No newline at end of file From 6cc739e4a5cd645ef0b93423653e9e19265fcbf1 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 22 Jan 2025 15:30:48 +0000 Subject: [PATCH 02/34] Updated these slightly. --- docs/installation.md | 4 +--- docs/run.md | 24 ++++++------------------ docs/run_validation.md | 34 +++++++++++++++++++++++++++++----- docs/usage.md | 10 ++++------ 4 files changed, 40 insertions(+), 32 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index d8e27163..297ea8a1 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,8 +1,6 @@ # Installation & setup -This section describes how to install the pipeline and set up your environment to run it. - -This pipeline is generally optimized and tested to run on linux machines, we specifically use EC2 instances for running this pipeline, but this code could be adopted for MACOS or Windows machines. +This section describes how to install the pipeline and set up your environment to run it. This pipeline is generally optimized and tested to run on linux machines, we specifically use EC2 instances for running this pipeline, but this code could be adopted for MACOS or Windows machines. 
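Before going further, it can be worth confirming that you are on a Linux machine and that the tools used throughout this guide are visible on your PATH. This is a minimal sanity check, not an exhaustive dependency list (see the installation steps on this page for that):

```
uname -s             # expect "Linux"
nextflow -version    # workflow manager used to launch the pipeline
docker --version     # container runtime
aws --version        # AWS CLI, used for S3 and Batch access
git --version        # needed to clone the repository
```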
> [!TIP] diff --git a/docs/run.md b/docs/run.md index 5b5b2468..e87a56c7 100644 --- a/docs/run.md +++ b/docs/run.md @@ -56,11 +56,6 @@ I've broken down the `run` workflow into a series of subworkflows, each of which ### Load data into channels (LOAD_SAMPLESHEET) This workflow loads the samplesheet and creates a channel containing the samplesheet data. It also creates a channel containing the group data, which is used to group reads into samples. No diagram is provided for this workflow, as it's a simple channel creation workflow. -Output: -- `samplesheet`: A channel containing the samplesheet data. -- `group`: A channel containing the group data. -- `start_time_str`: A channel containing the start time of the workflow. - ### Subset and trim reads (SUBSET_TRIM) This workflow subsets the reads to a target number (default 1M/sample), and trims adapters with [FASTP](https://github.com/OpenGene/fastp), which both screens for adapters and trims low-quality and low-complexity sequences. @@ -80,10 +75,6 @@ B --> C[FASTP] 1. Subset reads using [Seqtk](https://github.com/lh3/seqtk) 2. Trim reads using [FASTP](https://github.com/OpenGene/fastp) -Output: -- `subset_reads`: A channel containing the subsetted reads. -- `trimmed_subset_reads`: A channel containing the subsetted and trimmed reads. - ## Analysis workflows ### Viral identification phase (EXTRACT_VIRAL_READS) @@ -131,11 +122,6 @@ end - Are classified as vertebrate-infecting virus by both Bowtie2 and Kraken2; or - Are unassigned by Kraken and align to an vertebrate-infecting virus taxon with Bowtie2 with an alignment score above a user-specifed threshold[^threshold]. - -Output: -- `virus_hits_db.tsv.gz`: A TSV file containing information for each read pair, including whether it was classified as vertebrate-infecting virus, and the taxonomic assignment of the read pair. -- `virus_clade_counts.tsv.gz`: A TSV file containing the number of read pairs mapping to each detected vertebrate-infecting virus taxon. - [^filter]: We've found in past investigations that the two aligners detect different contaminant sequences, and aligning against both is more effective at avoiding false positives than either in isolation. [^threshold]: Specifically, Kraken-unassigned read pairs are classed as HV if, for either read in the pair, S/ln(L) >= T, where S is the best-match Bowtie2 alignment score for that read, L is the length of the read, and T is the value of `params.bt2_score_threshold` specified in the config file. @@ -180,15 +166,17 @@ config: theme: default --- flowchart LR -A[Reads] --> B["Subset reads
(*Optional*)"] -B --> |Forward read|C[BLASTN] -B --> |Reverse read|D[BLASTN] +A[Reads] -.-> |Optional|B[Subset reads] +A --> |Forward read|C[BLASTN] +A --> |Reverse read|D[BLASTN] +B -.-> |Forward read|C[BLASTN] +B -.-> |Reverse read|D[BLASTN] C --> E[Filter BLASTN output] D --> F[Filter BLASTN output] E & F --> G[Process & merge output] ``` -1. Reads are subset if `params.blast_hv_fraction` is less than 1. +1. Reads are subset if `params.blast_hv_fraction` is less than 1, else if `params.blast_hv_fraction` is 1, then BLAST is run on all host viral reads. 2. Forward and reverse reads are aligned separately with BLASTN. 3. BLASTN outputs are filtered to keep only the best-scoring alignment for each read. 4. Output from both reads are combined into a single file, with columns for the read ID, the subject taxid, and the alignment score. diff --git a/docs/run_validation.md b/docs/run_validation.md index e7704f7e..2d2207f4 100644 --- a/docs/run_validation.md +++ b/docs/run_validation.md @@ -1,8 +1,30 @@ # Run validation +This pipeline allows you to run BLAST validation on the output of the RUN workflow or on any reads. We use [BLASTN](https://blast.ncbi.nlm.nih.gov/Blast.cgi) to align the putative host viral reads output by the previous phase against the `core_nt` database, then process the output to check whether each sequence had a high-scoring alignment to at least one host viral sequence. + +```mermaid +--- +title: RUN_VALIDATION +config: + layout: horizontal + look: handDrawn + theme: default +--- +flowchart LR +A[Reads] --> C["BLAST_VIRAL"] +B["Run workflow viral reads"] --> C["BLAST_VIRAL"] +subgraph "Input (choose one)" +A +B +end +``` + +1. Reads can be provided using the output of the host viral identification phase of the RUN workflow, or directly from the user. +2. Run the [BLAST_VIRAL](#blast-validation-phase-blast_viral) subworkflow, which BLASTS the forward and reverse reads separately and then merges the results. + ## BLAST validation phase (BLAST_VIRAL) -To evaluate the performance of the process described in the viral identification phase, it's useful to get some ground-truth information about whether the host viral assignments made in that subworkflow are correct. To do this, we use [BLASTN](https://blast.ncbi.nlm.nih.gov/Blast.cgi) to align the putative host viral reads output by the previous phase against the `core_nt` database, then process the output to check whether each sequence had a high-scoring alignment to at least one HV sequence. For computational tractability, this can be performed on only a subset of surviving host viral reads (specified by `params.blast_hv_fraction`)[^blast]. +To evaluate the performance of the process described in the viral identification phase, it's useful to get some ground-truth information about whether the host viral assignments made in that subworkflow are correct. For computational tractability, this can be performed on only a subset of surviving host viral reads (specified by `params.blast_hv_fraction`)[^blast]. ```mermaid --- @@ -13,15 +35,17 @@ config: theme: default --- flowchart LR -A[Reads] --> B["Subset reads
(*Optional*)"] -B --> |Forward read|C[BLASTN] -B --> |Reverse read|D[BLASTN] +A[Reads] -.-> |Optional|B[Subset reads] +A --> |Forward read|C[BLASTN] +A --> |Reverse read|D[BLASTN] +B -.-> |Forward read|C[BLASTN] +B -.-> |Reverse read|D[BLASTN] C --> E[Filter BLASTN output] D --> F[Filter BLASTN output] E & F --> G[Process & merge output] ``` -1. Reads are subset if `params.blast_hv_fraction` is less than 1. +1. Reads are subset if `params.blast_hv_fraction` is less than 1, else if `params.blast_hv_fraction` is 1, then BLAST is run on all host viral reads. 2. Forward and reverse reads are aligned separately with BLASTN. 3. BLASTN outputs are filtered to keep only the best-scoring alignment for each read. 4. Output from both reads are combined into a single file, with columns for the read ID, the subject taxid, and the alignment score. diff --git a/docs/usage.md b/docs/usage.md index eb3e41c6..6cdf7ab6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -21,17 +21,15 @@ The pipeline has three main workflows: `INDEX`, `RUN`, and `RUN_VALIDATION`. Eac The pipeline can be run in multiple ways by modifying various configuration variables specified in `configs/profiles.config`. Currently, three profiles are implemented, all of which assume the workflow is being launched from an AWS EC2 instance: -- `batch (default)`: Most efficient way to run the pipeline - - This profile is the default and attempts to run the pipeline with AWS Batch. This is the quickest and most efficient way to run the pipeline, but requires significant additional setup not described in this repo. To set up AWS Batch for this pipeline, follow the instructions [here](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html) (steps 1-3), then modify your config file to point `process.queue` to the name of your Batch job queue. -- `ec2_local`: Simple and can be relatively fast, but is bottlenecked by your instance's CPU and memory allocations. +- `batch (default)`: **Most efficient way to run the pipeline** + - This profile is the default and attempts to run the pipeline with AWS Batch. This is the quickest and most efficient way to run the pipeline, but requires significant additional setup not described in this repo. To set up AWS Batch for this pipeline, follow the instructions [here](./batch.md) (steps 1-3), then modify your config file to point `process.queue` to the name of your Batch job queue. +- `ec2_local`: **Simple and can be relatively fast, but is bottlenecked by your instance's CPU and memory allocations.** - This profile attempts to run the whole workflow locally on your EC2 instance, storing intermediate and outflow files on instance-linked block storage. This is simple and can be relatively fast, but is bottlenecked by your instance's CPU and memory allocations; in particular, if you don't use an instance with very high memory, the pipeline is likely to fail when loading the Kraken2 reference DB. -- `ec2_s3`: Avoids storage issues on your EC2 instance, but is still constrained by your instance's memory allocation. +- `ec2_s3`: **Avoids storage issues on your EC2 instance, but is still constrained by your instance's memory allocation.** - This profile runs the pipeline on your EC2 instance, but attempts to read and write files to a specified S3 directory. This avoids problems caused by insufficient storage on your EC2 instance, but (1) is significantly slower and (2) is still constrained by your instance's memory allocation. 
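The details of these three profiles live in `configs/profiles.config`. As a rough illustration of how profiles of this kind are declared in Nextflow — this is a hedged sketch, not the pipeline's actual file, and the queue name and bucket below are placeholders — a profile block generally looks like:

```
profiles {
    batch {
        process.executor = 'awsbatch'             // submit jobs to AWS Batch
        process.queue    = 'YOUR-BATCH-QUEUE'     // placeholder: your Batch job queue
        workDir          = 's3://your-bucket/work/'
    }
    ec2_local {
        process.executor = 'local'                // run everything on this instance
        docker.enabled   = true
    }
    ec2_s3 {
        process.executor = 'local'
        workDir          = 's3://your-bucket/work/'   // placeholder S3 working directory
        docker.enabled   = true
    }
}
```

Selecting a profile with `-profile` simply switches which of these blocks is applied on top of the base configuration.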
To run the pipeline with a specified profile, run `nextflow run PATH_TO_REPO_DIR -profile PROFILE_NAME -resume`. Calling the pipeline without specifying a profile will run the `batch` profile by default. Future example commands in this README will assume you are using Batch; if you want to instead use a different profile, you'll need to modify the commands accordingly. -As of the current pipeline, you must have at least 128GB of memory and 32 cores to run the pipeline if you're running ec2_local or ec2_s3. This is because we use the whole KrakenDB whihc is large (128GB) and some for processes consume 64 cores. Simiarly, if one would like to run BLAST, they must have at least 256GB of memory. - > [!TIP] > It's highly recommended that you always run `nextflow run` with the `-resume` option enabled. It doesn't do any harm if you haven't run a workflow before, and getting into the habit will help you avoid much sadness when you want to resume it without rerunning all your jobs. From 08bc40dea9640aeba0c52eba04983ebed1dc7cdf Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Wed, 22 Jan 2025 14:49:47 -0500 Subject: [PATCH 03/34] Updated batch docs --- docs/batch.md | 53 +++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/docs/batch.md b/docs/batch.md index f29dc88f..b7f7ba3f 100644 --- a/docs/batch.md +++ b/docs/batch.md @@ -1,65 +1,68 @@ -# AWS BATCH +# Running the pipeline on AWS Batch -This page is a guide for running the pipeline on [AWS Batch](https://aws.amazon.com/batch/), a tool which allows you to run Nextflow workflows in a highly parallelized and automated manner. The original source of this guide is [this notebook](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html), however this verison will be updated to reflect the current state of the pipeline. +This page contains recommendations for running the pipeline on [AWS Batch](https://aws.amazon.com/batch/), a tool which allows you to run Nextflow workflows in a highly parallelized and automated manner[^notebook]. -## 0. Check your permissions +[^notebook]: The original source of this guide is [this notebook](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html). This version will be updated to reflect changes to the pipeline and online resources. -The most common ways you can run into trouble following these instructions all arise from insufficient AWS permissions. For a smooth experience, make sure you have the following: +## Step 1: Check your AWS permissions -1. **AWS Batch Permissions:** You need to have appropriate permissions to view, modify and run Batch compute environments and job queues. The simplest way to do this is to have your administrator add you to the `AWSBatchFullAccess` IAM policy (at SecureBio, this is enabled for all members of the `TeamMembers` group). -2. **S3 Permissions:** You need to have appropriate permissions to read, write and view the S3 bucket where your workflow data is stored. A useful start here is to have your administrator add you to the `AmazonS3FullAccess` IAM policy. After that, you should make sure you have the appropriate permissions to access the specific bucket you're using for your workflow, including the ability to list, read, write, and delete objects[^s3]. +The most common failure modes in setting up Batch to work with Nextflow arise from insufficient AWS permissions. If you run into trouble at any point, make sure you have the following: + + +1. 
**AWS Batch Permissions:** You need to have appropriate permissions to view, modify and run Batch compute environments and job queues. The simplest way to do this is to have your administrator add you to the `AWSBatchFullAccess` IAM policy. +2. **S3 Permissions:** You need to have appropriate permissions to read, write and view the S3 bucket where your workflow data is stored, including the ability to list, read, write, and delete objects[^s3]. 3. **EC2 Permissions:** You need to have appropriate permissions to create and modify EC2 launch templates and compute environments. The simplest way to do this is to have your administrator add you to the `AmazonEC2FullAccess` IAM policy. -4. **Instance Role Permissions:** You will need to assign an Instance Role when setting up your Batch compute environment. This role should have at least the following permissions: `AmazonEC2ContainerServiceRole`, `AmazonEC2ContainerServiceforEC2Role`, and `AmazonS3FullAccess`. Make sure you can either set up such a role yourself, or have your administrator do so and point you to the role name. (On SecureBio, this role is called `ecsInstanceRole`.) +4. **Instance Role Permissions:** You will need to assign an Instance Role when setting up your Batch compute environment. This role should have at least the following permissions: `AmazonEC2ContainerServiceRole`, `AmazonEC2ContainerServiceforEC2Role`, and `AmazonS3FullAccess`. Make sure you can either set up such a role yourself, or have your administrator do so and point you to the role name. [^s3]: In more depth, you need the following actions to be enabled for the bucket in question for your IAM user or role: `s3:ListBucket`, `s3:GetBucketLocation`, `s3:GetObject`, `s3:GetObjectAcl`, `s3:PutObject`, `s3:PutObjectAcl`, `s3:PutObjectTagging`, and `s3:DeleteObject`. If you're using a bucket specific to your user, all this is easier if you first have your administrator enable `s3:GetBucketPolicy` and `s3:PutBucketPolicy` for your user. -## 1. Create an EC2 launch template +## Step 2: Create an EC2 launch template First, you need to create an EC2 launch template that specifies the configuration for EC2 instances to be set up through Batch. To do this on the AWS console, Navigate to EC2 on the Services menu, then: 1. Select "Launch templates" in the left-hand menu. 2. Click the orange "Create launch template" button. -3. Enter a launch template name (e.g. "will-batch-template") and optionally a description. +3. Enter a launch template name (e.g. "YOURNAME-batch-template") and optionally a description. 4. Check the box under "Auto scaling guidance" 5. Under "Application and OS Images (Amazon Machine Image)", click "Browse more AMIs", then search for "Amazon ECS-Optimized Amazon Linux 2023 (AL2023) x86_64 AMI". - 1. Under "AWS Marketplace AMIs", select "Amazon ECS-Optimized Amazon Linux 2023 (AL2023) x86_64 AMI" by Amazon Web Services. - 2. In the popup, select "Subscribe now" -6. Select an instance type (this isn't hugely important as Batch will modify the instance types provisioned based on the needs of the workflow; I generally select "m5.8xlarge"). + 1. Under "AWS Marketplace AMIs", select "Amazon ECS-Optimized Amazon Linux 2023 (AL2023) x86_64 AMI" by Amazon Web Services. (Make sure you select the container by AWS itself and not some third-party source!) + 2. In the popup, select "Subscribe now". As this is a free container, you shouldn't need any special permissions or payment to do this. +6. 
Select an instance type (this isn't hugely important as Batch will modify the instance types provisioned based on the needs of the workflow; we recommend "m5.8xlarge"). 7. Under "Key pair (login)", select "Don't include in launch template" 8. Under "Network settings", select "Create security group" and follow the default settings. 9. Now we come to storage. Configuring this correctly is important to avoid IOPS errors! 1. The key thing to realize is that, since Batch is spinning up and terminating instances as needed, the usual costs of creating large EBS volumes don't really apply. As such, you can be relatively greedy in provisioning storage for these instances, to minimize the risk of IOPS-related problems with your workflow. - 2. My recommendation is as follows: under "Storage (volumes)", expand the default "Volume 1 (AMI Root)" volume, then enter the following configuration values[^1]: + 2. Our recommendation is as follows: under "Storage (volumes)", expand the default "Volume 1 (AMI Root)" volume, then enter the following configuration values[^iops]: 1. Size: 1000 GiB 2. Volume type: gp3 3. IOPS: 16000 (maximum for gp3) 4. Throughput: 1000 -10. Finally, add tags so the cost of your provisioned resources can be tracked more effectively. Add one "Instances" tag (e.g. "will-batch-template-instance") and one "Volumes" tag (e.g. "will-batch-template-volumes"). +10. Finally, add tags so the cost of your provisioned resources can be tracked more effectively. Add one "Instances" tag (e.g. "YOURNAME-batch-template-instance") and one "Volumes" tag (e.g. "YOURNAME-batch-template-volumes"). 11. Leave the other settings as default (mostly "Don't include in launch template") and select "Create launch template". -[^1]: If you want even more IOPS, you can provision an io2 volume instead of gp3. However, that's beyond the scope of this guide. +[^iops]: If you want even more IOPS, you can provision an io2 volume instead of gp3. However, that's beyond the scope of this guide. If you want to modify your template after creating it, you can do so by navigating to it in the panel and selecting "Actions" \> "Modify template (create new version)". Be careful to pay attention to which version of the template any dependent resources (e.g. compute environments, see below) are using. -## 2. Set up a Batch compute environment +## Step 3: Set up a Batch compute environment Next, you need to create a compute environment through which jobs can be allocated to instances. To do this on the AWS console, navigate to Batch on the Services menu, then: 1. Select "Compute environments" in the left-hand menu 2. Click the orange "Create" button -3. Under "Compute environment configuration", select "Amazon Elastic Compute Cloud (Amazon EC2)"[^2]. Then: +3. Under "Compute environment configuration", select "Amazon Elastic Compute Cloud (Amazon EC2)"[^fargate]. Then: 1. Under "Orchestration Type", select "Managed". - 2. Enter an environment name (I generally go with something unimaginative like "will-batch-3". + 2. Enter an environment name (e.g. "YOURNAME-batch-1"). 3. Set up roles: 1. Under "Service role" select "AWSServiceRoleForBatch". - 2. Under "Instance role" select "ecsInstanceRole", or another role with appropriate permissions (see step 0 above). + 2. Under "Instance role" select the instance role set up in Step 1, or another role with appropriate permissions. 3. If these roles don't exist or aren't available in the drop-down menu, contact your administrator about setting them up for you. -4. 
Under "Tags", navigate to "EC2 tags" and click the "Add tag" button. Then select a tag name that can be used to uniquely identify your use of the workflow (e.g. "mgs-workflow-will"). This is important to let your administrator keep track of how much money you and the wider team are spending on Batch (whose resource consumption is otherwise somewhat inscrutable). +4. Under "Tags", navigate to "EC2 tags" and click the "Add tag" button. Then select a tag name that can be used to uniquely identify your use of the workflow (e.g. "mgs-workflow-YOURNAME"). This is important to let your administrator keep track of how much money you and the wider team are spending on Batch (whose resource consumption is otherwise somewhat inscrutable). 5. Click the orange "Next" button. Then, under "Instance configuration": 1. Under "Use EC2 Spot instances", make sure the "Enable using Spot instances" selector is enabled. 2. Under "Maximum % on-demand price", enter a number between 0 and 100. 100 is a good default. Lower numbers will lower costs but increase the chance of unscheduled instance termination, which will require your workflow to re-run jobs. - 3. Enter integer values under "Minimum vCPUs", "Desired vCPUs" and "Maximum vCPUs". I typically use 0, 0, and 1024. - 4. Under "Allowed instance types", select "optimal" plus whatever other instance families you want to provision. I typically use optimal, m5, and c6a. + 3. Enter integer values under "Minimum vCPUs", "Desired vCPUs" and "Maximum vCPUs". We recommend 0, 0, and 1024 as good defaults. + 4. Under "Allowed instance types", select "optimal" plus whatever other instance families you want to provision. We recommend optimal, m5, and c6a. 5. Under "Additional configuration": 1. Specify your EC2 key pair to allow direct ssh'ing into Batch instances (you should very rarely need to do this so you can skip this if you like) 2. Under "Launch template" select the launch template you configured previously. @@ -68,7 +71,7 @@ Next, you need to create a compute environment through which jobs can be allocat 1. You can configure your own network setup if you like, but that's beyond the scope of this guide. 7. Review your selections, then click the orange "Create compute environment" button. -[^2]: In the future, I'll investigate running Batch with Fargate for Nextflow workflows. For now, using EC2 gives us greater control over configuration than Fargate, at the cost of additional setup complexity and occasional startup delays. +[^fargate]: In the future, we'll investigate running Batch with Fargate for Nextflow workflows. For now, using EC2 gives us greater control over configuration than Fargate, at the cost of additional setup complexity and occasional startup delays. ## 3. Set up a Batch job queue @@ -78,7 +81,7 @@ The last step you need to complete on AWS itself is to set up a job queue that N 2. Click the orange "Create" button. 3. Under "Orchestration Type", again select "Amazon Elastic Compute Cloud (Amazon EC2)". 4. Under "Job queue configuration", enter: - 1. A queue name (I generally go with something uncreative like "will-batch-queue-nf") + 1. A queue name (e.g. "YOURNAME-batch-queue-nf") 2. A job priority (unimportant if you're only using one queue and you're the only one using that queue) 3. A connected compute environment (select the environment you set up previously from the dropdown menu) 5. Click the orange "Create job queue" button. @@ -86,4 +89,4 @@ The last step you need to complete on AWS itself is to set up a job queue that N ## 4. 
Run Nextflow with Batch -Finally, you need to use all the infrastructure you've just set up to actually run a Nextflow workflow! We recommend using our test dataset to get started. [Click here to see how to run the pipeline on the test dataset](./testing.md#user-guide). \ No newline at end of file +Finally, you need to use all the infrastructure you've just set up to actually run a Nextflow workflow! We recommend using our test dataset to get started. [Click here to see how to run the pipeline on the test dataset](./testing.md#user-guide). From 717ca3c0a7a635a72337d0ecce8f35f9c5b3dcc4 Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Wed, 22 Jan 2025 15:10:26 -0500 Subject: [PATCH 04/34] Edited installation docs --- docs/installation.md | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 297ea8a1..a1832b69 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,11 +1,9 @@ # Installation & setup -This section describes how to install the pipeline and set up your environment to run it. This pipeline is generally optimized and tested to run on linux machines, we specifically use EC2 instances for running this pipeline, but this code could be adopted for MACOS or Windows machines. - - -> [!TIP] -> If someone else in your organization already uses this pipeline, it's likely they've already run the index workflow and generated an output directory. If this is the case, you can reduce costs and increase reproducibility by using theirs instead of generating your own. If you want to do this, skip this step, and edit `configs/run.config` such that `params.ref_dir` points to `INDEX_DIR/output`. +This page describes how to install the pipeline and configure your computing environment to run it. +> [!IMPORTANT] +> This pipeline has been developed to run on Linux environments, primarily on AWS EC2 instances. It has not been tested or debugged on MacOS or Windows machines, and is not guaranteed to work on those environments. - [Installation \& setup](#installation--setup) @@ -47,6 +45,14 @@ aws_access_key_id = aws_secret_access_key = ``` +> [!TIP] +> If you encounter `AccessDenied` errors after doing this, you may also need to export these keys as environment variables before running Nextflow: +> +> ``` +> eval "$(aws configure export-credentials --format env)" +> ``` + + Next, you need to make sure your user is configured to use Docker. To do this, create the `docker` user group and add your current user to it: ``` @@ -61,6 +67,10 @@ docker run hello-world Clone this repo into a new directory as normal. #### 4. Run index/reference workflow + +> [!TIP] +> If someone else in your organization already uses this pipeline, it's likely they've already run the index workflow and generated an output directory. If this is the case, you can reduce costs and increase reproducibility by using theirs instead of generating your own. If you want to do this, skip this step, and edit `configs/run.config` such that `params.ref_dir` points to `INDEX_DIR/output`. + Create a new directory outside the repo directory and copy over the index workflow config file as `nextflow.config` in that directory: ``` @@ -71,10 +81,13 @@ cp REPO_DIR/configs/index.config nextflow.config Next, edit `nextflow.config` such that `params.base_dir` points to the directory (likely on S3) where you want to store your index files. (You shouldn't need to modify anything else about the config file at this stage.) 
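For illustration, the relevant line in `nextflow.config` might end up looking something like the sketch below; the bucket name and prefix are placeholders for wherever you want the index to live:

```
params.base_dir = "s3://your-bucket/mgs-index"
```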
-Next, call `nextflow run` pointing at the repo directory (NB: *not* `main.nf` or any other workflow file; pointing to the directory will cause Nextflow to automatically run `main.nf` from that directory.): +Next, call `nextflow run` pointing at the repo directory: ``` nextflow run PATH_TO_REPO_DIR -resume ``` -Wait for the workflow to run to completion; this is likely to take several hours at least. \ No newline at end of file +> [!TIP] +> You don't need to point `nextflow run` at `main.nf` any other workflow file; pointing to the directory will cause Nextflow to automatically run `main.nf` from that directory. + +Wait for the workflow to run to completion; this is likely to take several hours at least. From 1e83724ec9e40e2e0807879edae461ecd425aa90 Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Wed, 22 Jan 2025 15:12:20 -0500 Subject: [PATCH 05/34] Removed TOC from installation doc (not rendering properly, and not really needed) --- docs/installation.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index a1832b69..bf97f0ff 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -5,14 +5,6 @@ This page describes how to install the pipeline and configure your computing env > [!IMPORTANT] > This pipeline has been developed to run on Linux environments, primarily on AWS EC2 instances. It has not been tested or debugged on MacOS or Windows machines, and is not guaranteed to work on those environments. - -- [Installation \& setup](#installation--setup) - - [1. Install dependencies](#1-install-dependencies) - - [2. Configure AWS \& Docker](#2-configure-aws--docker) - - [3. Clone this repository](#3-clone-this-repository) - - [4. Run index/reference workflow](#4-run-indexreference-workflow) - - ## 1. Install dependencies To run this workflow with full functionality, you need access to the following dependencies: From 4b8d6869bea50c27c3848877b14c19b2f77eed80 Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Wed, 22 Jan 2025 16:18:15 -0500 Subject: [PATCH 06/34] Updated usage and installation docs --- docs/installation.md | 32 ++++++++- docs/usage.md | 158 ++++++++++++++++++++----------------------- 2 files changed, 102 insertions(+), 88 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index bf97f0ff..7814620c 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -54,11 +54,11 @@ newgrp docker docker run hello-world ``` -#### 3. Clone this repository +## 3. Clone this repository Clone this repo into a new directory as normal. -#### 4. Run index/reference workflow +## 4. Run index/reference workflow > [!TIP] > If someone else in your organization already uses this pipeline, it's likely they've already run the index workflow and generated an output directory. If this is the case, you can reduce costs and increase reproducibility by using theirs instead of generating your own. If you want to do this, skip this step, and edit `configs/run.config` such that `params.ref_dir` points to `INDEX_DIR/output`. @@ -83,3 +83,31 @@ nextflow run PATH_TO_REPO_DIR -resume > You don't need to point `nextflow run` at `main.nf` any other workflow file; pointing to the directory will cause Nextflow to automatically run `main.nf` from that directory. Wait for the workflow to run to completion; this is likely to take several hours at least. + +## 5. 
Run the pipeline on test data + +To confirm that the pipeline works in your hands, we recommend running it on a small test dataset, such as the one provided at `s3://nao-testing/gold-standard-test/raw/`, before running it on larger input data. To do this with our test dataset, follow the instructions below, or do it yourself according to the directions given [here](./docs/usage.md). + +1. Prepare the launch directory: + - Create a clean launch directory outside the repository directory. + - Copy over the run workflow config file to a new file in the launch directory labeled `nextflow.config`. + - Copy the test-data sample sheet from the repository directory to the launch directory. + +``` +mkdir launch +cd launch +cp REPO_DIR/configs/run.config nextflow.config +cp REPO_DIR/test-data/samplesheet.csv samplesheet.csv +``` + +2. Edit the config file (`nextflow.config`): + - Edit `params.ref_dir` to point to the index directory you chose or created above (specifically `PATH_TO_REF_DIR/output`) + - Edit `params.base_dir` to point to where you would like the pipeline to save intermediate and final pipeline outputs. +3. Choose a profile as described [here](./docs/usage.md). +4. Run the pipeline from the launch directory: + +``` +nextflow run -resume -profile REPO_DIR +``` + +Once the pipeline is complete, output and logging files will be available in the `output` subdirectory of the base directory specified in the config file. diff --git a/docs/usage.md b/docs/usage.md index 6cdf7ab6..ff348858 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,120 +1,106 @@ -# Usage +# Pipeline Usage -To run any of the workflows you must choose a profile, and then have data to run the pipeline on. To help with this process, we provide a small test dataset to run through the `run` workflow. +This page describes the process of running the pipeline's [core workflow](./docs/run.md) on available data. - -- [Usage](#usage) - - [Profiles and modes](#profiles-and-modes) - - [Compute resource requirements](#compute-resource-requirements) - - [Running on new data](#running-on-new-data) - - [Tutorial: Running the `run` workflow on a test dataset](#tutorial-running-the-run-workflow-on-a-test-dataset) - - [Setup](#setup) - - [Profile specific instructions](#profile-specific-instructions) - - [`ec2_local`](#ec2_local) - - [`ec2_s3`](#ec2_s3) - - [`batch`](#batch) - +> [!IMPORTANT] +> Before following the instructions on this page, make sure you have followed the [installation and setup instructions](./docs/installation.md), including running the [index workflow](./docs/index.md) or otherwise having a complete and up-to-date index directory in an accessible location. -## Profiles and modes +> [!IMPORTANT] +> Currently, the pipeline only accepts paired short-read data; single-end and Oxford Nanopore versions are under development but are not ready for general use. -The pipeline has three main workflows: `INDEX`, `RUN`, and `RUN_VALIDATION`. Each of these calls their corresponding subworkflows, which are located in the `subworkflows` directory. +## 1. Preparing input files -The pipeline can be run in multiple ways by modifying various configuration variables specified in `configs/profiles.config`. Currently, three profiles are implemented, all of which assume the workflow is being launched from an AWS EC2 instance: - -- `batch (default)`: **Most efficient way to run the pipeline** - - This profile is the default and attempts to run the pipeline with AWS Batch. 
This is the quickest and most efficient way to run the pipeline, but requires significant additional setup not described in this repo. To set up AWS Batch for this pipeline, follow the instructions [here](./batch.md) (steps 1-3), then modify your config file to point `process.queue` to the name of your Batch job queue. -- `ec2_local`: **Simple and can be relatively fast, but is bottlenecked by your instance's CPU and memory allocations.** - - This profile attempts to run the whole workflow locally on your EC2 instance, storing intermediate and outflow files on instance-linked block storage. This is simple and can be relatively fast, but is bottlenecked by your instance's CPU and memory allocations; in particular, if you don't use an instance with very high memory, the pipeline is likely to fail when loading the Kraken2 reference DB. -- `ec2_s3`: **Avoids storage issues on your EC2 instance, but is still constrained by your instance's memory allocation.** - - This profile runs the pipeline on your EC2 instance, but attempts to read and write files to a specified S3 directory. This avoids problems caused by insufficient storage on your EC2 instance, but (1) is significantly slower and (2) is still constrained by your instance's memory allocation. +To run the workflow on new data, you need: -To run the pipeline with a specified profile, run `nextflow run PATH_TO_REPO_DIR -profile PROFILE_NAME -resume`. Calling the pipeline without specifying a profile will run the `batch` profile by default. Future example commands in this README will assume you are using Batch; if you want to instead use a different profile, you'll need to modify the commands accordingly. +1. Accessible **raw data** files in Gzipped FASTQ format, named appropriately. +2. A **sample sheet** file specifying the samples to be analyzed, along with paths to the forward and reverse read files for each sample. `bin/generate_samplesheet.sh` (see below) can make this for you. +3. A **config file** in a clean launch directory, pointing to: + - The base directory in which to put the working and output directories (`params.base_dir`). + - The directory containing the outputs of the reference workflow (`params.ref_dir`). + - The sample sheet (`params.sample_sheet`). + - Various other parameter values. > [!TIP] -> It's highly recommended that you always run `nextflow run` with the `-resume` option enabled. It doesn't do any harm if you haven't run a workflow before, and getting into the habit will help you avoid much sadness when you want to resume it without rerunning all your jobs. +> We recommend starting each Nextflow pipeline run in a clean launch directory, containing only your sample sheet and config file. -### Compute resource requirements +### 1.1. The sample sheet -To run the pipeline as is you need at least 128GB of memory and 64 cores. This is because we use the whole KrakenDB whihc is large (128GB) and some for processes consume 64 cores. Simiarly, if one would like to run BLAST, they must have at least 256GB of memory. +The sample sheet must be an uncompressed CSV file with the following fields: -To change the compute resources for a process, you can modify the `resources.config` file. This file specifies the compute resources for each process based on the label of the process. 
For example, to change the compute resources for the `kraken` process, you can add the following to the `resources.config` file: +- First column: Sample ID +- Second column: Path to FASTQ file 1 which should be the forward read for this sample +- Third column: Path to FASTQ file 2 which should be the reverse read for this sample -In the case that you change the resources, you'll need to also change the index. +The easiest way to generate this file is typically using `dev/generate_samplesheet.sh`. This script takes in a path to a directory containing raw FASTQ files (`dir_path`) along with forward (`forward_suffix`) and reverse (`reverse_suffix`) read suffixes (both of which support regex) and an optional output path (`output_path`). The `--s3` flag indicates that the target directory is specified as an S3 path. As output, the script generates a CSV file (names `samplesheet.csv` by default) which can be used as input for the pipeline. +For example: +``` +../bin/generate_samplesheet.sh \ + --s3 + --dir_path s3://nao-restricted/MJ-2024-10-21/raw/ \ + --forward_suffix _1 \ + --reverse_suffix _2 +``` -## Running on new data +In addition, the script can also add an additional `group` column that can be used to combine reads from different files that should be processed together (e.g. for deduplication). There are two options for doing this: -To run the workflow on new data, you need: +- Provide a path to a CSV file containing `sample` and `group` headers, specifying the mapping between samples and groups. +- Specify the `--group_across_illumina_lanes` option if the target directory contains data from one or more libraries split across lanes on an Illumina flowcell. -1. Accessible raw data files in Gzipped FASTQ format, named appropriately. -2. A sample sheet file specifying the samples, along with paths to the forward and reverse read files for each sample. `generate_samplesheet.sh` (see below) can make this for you. -3. A config file in a clean launch directory, pointing to: - - The base directory in which to put the working and output directories (`params.base_dir`). - - The directory containing the outputs of the reference workflow (`params.ref_dir`). - - The sample sheet (`params.sample_sheet`). - - Various other parameter values. +Alternatively, the sample sheet can be manually edited to provide the `groups` column prior to running the pipeline. -> [!NOTE] -> The samplesheet must have the following format for each row: -> - First column: Sample ID -> - Second column: Path to FASTQ file 1 which should be the forward read for this sample -> - Third column: Path to FASTQ file 2 which should be the reverse read for this sample -> -> The easiest way to get this file is by using the `generate_samplesheet.sh` script. As input, this script takes a path to raw FASTQ files (`dir_path`), and forward (`forward_suffix`) and reverse (`reverse_suffix`) read suffixes, both of which support regex, and an optional output path (`output_path`). Those using data from s3 should make sure to pass the `s3` parameter. Those who would like to group samples by some metadata can pass a path to a CSV file containing a header column named `sample,group`, where each row gives the sample name and the group to group by (`group_file`), edit the samplesheet manually after generation (since manually editing the samplesheet will be easier when the groups CSV isn't readily available), or provide the --group_across_illumina_lanes option if a single library was run across a single Illumina flowcell. 
As output, the script generates a CSV file named (`samplesheet.csv` by default), which can be used as input for the pipeline. -> -> For example: -> ``` -> ../bin/generate_samplesheet.sh \ -> --s3 -> --dir_path s3://nao-restricted/MJ-2024-10-21/raw/ \ -> --forward_suffix _1 \ -> --reverse_suffix _2 -> ``` +### 1.2. The config file -If running on Batch, a good process for starting the pipeline on a new dataset is as follows: +The config file specifies parameters and other configuration options used by Nextflow in executing the pipeline. To create a config file for your pipeline run, copy `configs/run.config` into your launch directory as a file named `nextflow.config`, then modify the file as follows: + - Make sure `params.mode = "run"`; this instructs the pipeline to execute the [core run workflow](./docs/run.md). + - Edit `params.ref_dir` to point to the directory containing the outputs of the reference workflow. + - Edit `params.sample_sheet` to point to your sample sheet. + - Edit `params.base_dir` to point to the directory in which Nextflow should put the pipeline working and output directories. + - Edit `params.grouping` to specify whether to group samples together for common processing, based on the `group` column in the sample sheet. + - If running on AWS Batch (see below), edit `process.queue` to the name of your Batch job queue. -1. Process the raw data to have appropriate filenames (see above) and deposit it in an accessible S3 directory. -2. Create a clean launch directory and copy `configs/run.config` to a file named `nextflow.config` in that directory. -3. Create a sample sheet in that launch directory (see above) -4. Edit `nextflow.config` to specify each item in `params` as appropriate, as well as setting `process.queue` to the appropriate Batch queue. -5. Run `nextflow run PATH_TO_REPO_DIR -resume`. -6. Navigate to `{params.base_dir}/output` to view and download output files. +Most other entries in the config file can be left at their default values for most runs. See [here](./docs/config.md) for a full description of config file parameters and their meanings. +## 2. Choosing a profile -## Tutorial: Running the `run` workflow on a test dataset +The pipeline can be run in multiple ways by modifying various configuration variables specified in `configs/profiles.config`. Currently, three profiles are implemented, all of which assume the workflow is being launched from an AWS EC2 instance: -To confirm that the pipeline works in your hands, we provide a small test dataset (`s3://nao-testing/gold-standard-test/raw/`) to run through the `run` workflow. Feel free to use any profile, but we recommend using the `ec2_local` profile as long as [you have the resources](usage.md#compute-resource-requirements) to handle it. +- `batch (default)`: **Most reliable way to run the pipeline** + - This profile is the default and attempts to run the pipeline with AWS Batch. This is the most reliable and convenient way to run the pipeline, but requires significant additional setup (described [here](./batch.md)). Before running the pipeline using this profile, make sure `process.queue` in your config file is pointing to the correct Batch job queue. +- `ec2_local`: **Requires the least setup, but is bottlenecked by your instance's compute, memory and storage.** + - This profile attempts to run the whole pipeline locally on your EC2 instance, storing all files on instance-linked block storage. + - This is simple and can be relatively fast, but requires large CPU, memory and storage allocations. 
In particular, if you don't use an instance with very high memory, the pipeline is likely to fail when loading the Kraken2 reference DB. +- `ec2_s3`: **Avoids storage issues on your EC2 instance, but is still constrained by local compute and memory.** + - This profile runs the pipeline on your EC2 instance, but attempts to read and write files to a specified S3 directory. This avoids problems arising from insufficient local storage, but (a) is significantly slower and (b) is still constrained by local compute and memory allocations. -When running with any profile, there is a shared setup that you need to do, before running the workflow. The shared setup is the same for all profiles, and is described below, followed by profile specific instructions. +To run the pipeline with a specified profile, run `nextflow run PATH_TO_REPO_DIR -profile PROFILE_NAME -resume`. Calling the pipeline without specifying a profile will run the `batch` profile by default. Future example commands in this README will assume you are using Batch; if you want to instead use a different profile, you'll need to modify the commands accordingly. -### Setup +> [!TIP] +> It's highly recommended that you always run `nextflow run` with the `-resume` option enabled. It doesn't do any harm if you haven't run a workflow before, and getting into the habit will help you avoid much sadness when you want to resume it without rerunning all your jobs. -1. Create a new directory outside the repo directory and copy over the run workflow config file as `nextflow.config` in that directory: +### Compute resource requirements -``` -mkdir launch -cd launch -cp REPO_DIR/configs/run.config nextflow.config -``` +To run the pipeline as is you need at least 128GB of memory and 64 cores. This is because we use the whole KrakenDB whihc is large (128GB) and some for processes consume 64 cores. Simiarly, if one would like to run BLAST, they must have at least 256GB of memory. -2. Edit `nextflow.config` to set `params.ref_dir` to the index directory you chose or created above (specifically `PATH_TO_REF_DIR/output`) -3. Set the samplesheet path to the test dataset samplesheet `${projectDir}/test-data/samplesheet.csv` +To change the compute resources for a process, you can modify the `resources.config` file. This file specifies the compute resources for each process based on the label of the process. For example, to change the compute resources for the `kraken` process, you can add the following to the `resources.config` file: -### Profile specific instructions +In the case that you change the resources, you'll need to also change the index. -#### `ec2_local` +## 3. Running the pipeline -4. Within this directory, run `nextflow run -profile ec2_local .. -resume`. Wait for the workflow to finish. -5. Inspect the `output` directory to view the processed output files. +After creating your sample sheet and config files and choosing a profile, navigate to the launch directory containing your config file. You can then run the pipeline as follows: -#### `ec2_s3` +``` +nextflow run -resume -profile PATH/TO/PIPELINE/DIR +``` -4. Edit `nextflow.config` to set `params.base_dir` to the S3 directory of your choice. -5. Still within that directory, run `nextflow run -profile ec2_s3 .. -resume`. -6. Wait for the workflow to finish, and inspect the output on S3. +where PATH/TO/PIPELINE/DIR specifies the path to the directory containing the pipeline files from this repository (in particular, `main.nf`) from the launch directory. 
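As a concrete example, if the repository were cloned to `~/mgs-workflow` and you wanted to run with the `ec2_s3` profile, the command would look like this (the path and profile here are illustrative):

```
nextflow run ~/mgs-workflow -profile ec2_s3 -resume
```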
-#### `batch` +> [!TIP] +> If you are running the pipeline with its default profile (`batch`) you can omit the `-profile` declaration and simply write: +> +> ``` +> nextflow run -resume PATH/TO/PIPELINE/DIR +> ``` -4. Edit `nextflow.config` to set `params.base_dir` to the S3 directory of your choice and `process.queue` to the name of your Batch job queue. -5. Still within that directory, run `nextflow run -profile batch .. -resume` (or simply `nextflow run .. -resume`). -6. Wait for the workflow to finish, and inspect the output on S3. \ No newline at end of file +Once the pipeline has finished, output and logging files will be available in the `output` subdirectory of the base directory specified in the config file. From 32e3ca993035b0262c98a2c6855bbbc576657e77 Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Wed, 22 Jan 2025 16:21:16 -0500 Subject: [PATCH 07/34] Minor edit to usage.md --- docs/usage.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index ff348858..e319e57c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -75,9 +75,6 @@ The pipeline can be run in multiple ways by modifying various configuration vari To run the pipeline with a specified profile, run `nextflow run PATH_TO_REPO_DIR -profile PROFILE_NAME -resume`. Calling the pipeline without specifying a profile will run the `batch` profile by default. Future example commands in this README will assume you are using Batch; if you want to instead use a different profile, you'll need to modify the commands accordingly. -> [!TIP] -> It's highly recommended that you always run `nextflow run` with the `-resume` option enabled. It doesn't do any harm if you haven't run a workflow before, and getting into the habit will help you avoid much sadness when you want to resume it without rerunning all your jobs. - ### Compute resource requirements To run the pipeline as is you need at least 128GB of memory and 64 cores. This is because we use the whole KrakenDB whihc is large (128GB) and some for processes consume 64 cores. Simiarly, if one would like to run BLAST, they must have at least 256GB of memory. @@ -103,4 +100,7 @@ where PATH/TO/PIPELINE/DIR specifies the path to the directory containing the pi > nextflow run -resume PATH/TO/PIPELINE/DIR > ``` +> [!TIP] +> It's highly recommended that you always run `nextflow run` with the `-resume` option enabled. It doesn't do any harm if you haven't run a workflow before, and getting into the habit will help you avoid much sadness when you want to resume it without rerunning all your jobs. + Once the pipeline has finished, output and logging files will be available in the `output` subdirectory of the base directory specified in the config file. From 1cd0986e5629a3665ba20b69aeb9fd79e34c7cfe Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Wed, 22 Jan 2025 18:25:59 -0500 Subject: [PATCH 08/34] Updated docs/run.md --- docs/run.md | 85 +++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/docs/run.md b/docs/run.md index e87a56c7..0331d613 100644 --- a/docs/run.md +++ b/docs/run.md @@ -1,6 +1,8 @@ # RUN WORKFLOW -The `run` workflow is responsible for the primary analysis of the pipeline. +This page describes the structure and function of the `run` workflow, which is responsible for the primary analysis of the pipeline. For usage instructions, see [here](./docs/usage.md). 
+ +## Workflow structure ```mermaid --- @@ -11,11 +13,11 @@ config: theme: default --- flowchart LR -A[Input Files] --> |Raw reads|B[LOAD_SAMPLESHEET] +A[Raw input reads] --> B[LOAD_SAMPLESHEET] B --> C[COUNT_TOTAL_READS] & D[EXTRACT_VIRAL_READS] & E[SUBSET_TRIM] -D -.-> |*Optional*| F[BLAST_VIRAL] -E --> |Trim and subset reads| G[RUN_QC] & H[PROFILE] -E --> |Subset reads|G[RUN_QC] +D -.-> F[BLAST_VIRAL] +E --> |Subsetted reads|G[RUN_QC] +E --> |Trimmed subsetted reads| G[RUN_QC] & H[PROFILE] %% Adjust layout by placing subgraphs in specific order subgraph "Viral identification" D @@ -27,37 +29,38 @@ subgraph "QC" C G end -subgraph "BLAST Validation" +subgraph "BLAST Validation (optional)" F end ``` -By default, we define the host-infecting viruses to be screened against all vertebrate-infecting viruses and we don't perform BLAST validation. Corresponding config file: `configs/run.config`. +At a high level, the `run` workflow carries out two main analyses on raw input reads: sensitive and specific identification of vertebrate-infecting viral reads, and high-level taxonomic profiling. Optionally, it can also perform BLAST validation on putative vertebrate-infecting viral reads; however, this is slow and expensive and not performed by default. + +To perform these functions, the workflow runs a series of subworkflows responsible for different tasks: -I've broken down the `run` workflow into a series of subworkflows, each of which performs a specific task: -1. [Setup workflows](#setup-workflows): Prepare the input data for analysis +1. [Setup subworkflows](#setup-subworkflows): Prepare the input data for analysis - [Load data into channels (LOAD_SAMPLESHEET)](#load-data-into-channels-load_samplesheet) - [Subset and trim reads (SUBSET_TRIM)](#subset-and-trim-reads-subset_trim) -2. [Analysis workflows](#analysis-workflows): Perform the primary analysis - - [Viral identification phase (EXTRACT_VIRAL_READS)](#viral-identification-phase-extract_viral_reads) - - [Taxonomic profiling phase (PROFILE)](#taxonomic-profiling-phase-profile) - - [BLAST validation phase (BLAST_VIRAL)](#blast-validation-phase-blast_viral) -3. [QC workflows](#qc-workflows): Conduct quality control on the analysis results +2. [Analysis subworkflows](#analysis-subworkflows): Perform the primary analysis + - [Viral identification (EXTRACT_VIRAL_READS)](#viral-identification-extract_viral_reads) + - [Taxonomic profiling (PROFILE)](#taxonomic-profiling-profile) + - [BLAST validation (BLAST_VIRAL)](#blast-validation-blast_viral) +3. [QC subworkflows](#qc-subworkflows): Conduct quality control on the analysis results - [Count total reads (COUNT_TOTAL_READS)](#count-total-reads-count_total_reads) - - [QC and output phase (RUN_QC)](#qc-and-output-phase-run_qc) -4. [Helper workflows](#helper-workflows): Helper workflows that are used by the workflows above + - [QC and output (RUN_QC)](#qc-and-output-run_qc) +4. [Helper subworkflows](#helper-subworkflows): Helper workflows that are used by other subworkflows in this list - [Taxonomic assignment (TAXONOMY)](#taxonomic-assignment-taxonomy) - [QC (QC)](#qc-qc) - - -## Setup workflows +## Setup subworkflows ### Load data into channels (LOAD_SAMPLESHEET) -This workflow loads the samplesheet and creates a channel containing the samplesheet data. It also creates a channel containing the group data, which is used to group reads into samples. No diagram is provided for this workflow, as it's a simple channel creation workflow. 
+This subworkflow loads the samplesheet and creates a channel containing the samplesheet data, in the structure expected by the pipeline. If provided, it also creates a channel containing the grouping information used to combine samples for downstream analysis. (No diagram is provided for this subworkflow.) ### Subset and trim reads (SUBSET_TRIM) -This workflow subsets the reads to a target number (default 1M/sample), and trims adapters with [FASTP](https://github.com/OpenGene/fastp), which both screens for adapters and trims low-quality and low-complexity sequences. +This subworkflow uses [Seqtk](https://github.com/lh3/seqtk) to randomly subsample the input reads to a target number[^target] (default 1 million read pairs per sample) to save time and and compute on downstream steps while still providing a reliable statistical picture of the overall sample. Following downsampling, reads undergo adapter trimming and quality screening with [FASTP](https://github.com/OpenGene/fastp), which both screens for adapters and trims low-quality and low-complexity sequences. + +[^target]: More precisely, the subworkflow uses the total read count and target read number to calculate a fraction $p$ of the input reads that should be retained, then keeps each read from the input data with probability $p$. Since each read is kept or discarded independently of the others, the final read count will not exactly match the target number; however, it will be very close for sufficiently large input files. ```mermaid --- @@ -68,18 +71,15 @@ config: theme: default --- flowchart LR -A[Raw reads] --> B[SUBSET] -B --> C[FASTP] +A[Raw reads] --> B[Subset with Seqtk] +B --> C[Trim with FASTP] ``` -1. Subset reads using [Seqtk](https://github.com/lh3/seqtk) -2. Trim reads using [FASTP](https://github.com/OpenGene/fastp) - -## Analysis workflows +## Analysis subworkflows -### Viral identification phase (EXTRACT_VIRAL_READS) +### Viral identification (EXTRACT_VIRAL_READS) -The goal of this phase is to sensitively, specifically, and efficiently identify reads arising from host-infecting viruses. It uses a multi-step process designed to keep computational costs low while minimizing false-positive and false-negative errors. +The goal of this subworkflow is to sensitively, specifically, and efficiently identify reads arising from host-infecting viruses. It uses a multi-step process designed to keep computational costs low while minimizing false-positive and false-negative errors. ```mermaid --- @@ -113,22 +113,20 @@ G end ``` -1. To begin with, the raw reads are screened against a database of vertebrate-infecting viral genomes generated from Genbank by the index workflow. This initial screen is performed using [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), which flags any read that contains at least three 21-mers matching any host-infecting viral genome. The purpose of this initial screen is to rapidly and sensitively identify putative vertebrate-infecting viral reads while discarding the vast majority of non-HV reads, reducing the cost associated with the rest of this phase. +1. To begin with, the raw reads are screened against a database of vertebrate-infecting viral genomes generated from Genbank by the index workflow. This initial screen is performed using [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), which flags any read that contains at least one 24-mer matching any vertebrate-infecting viral genome. 
The purpose of this initial screen is to rapidly and sensitively identify putative vertebrate-infecting viral reads while discarding the vast majority of non-HV reads, reducing the cost associated with the rest of this phase.
 2. Surviving reads undergo additional adapter trimming with [FASTP](https://github.com/OpenGene/fastp) and [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) to remove any residual adapter contamination that might lead to false positive results.
 3. Next, reads are aligned to the previously-mentioned database of vertebrate-infecting viral genomes with [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) using quite permissive parameters designed to capture as many putative HV reads as possible. The SAM and FASTQ files are processed to generate new read files containing any read pair for which at least one read matches the HV database.
-4. The output of the previous step is passed to a further filtering step, in which reads matching a series of common contaminant sequences are removed. This is done by aligning surviving reads to these contaminants using Bowtie2 in series[^filter]. Contaminants to be screened against include reference genomes from human, cow, pig, carp, mouse and *E. coli*, as well as various genetic engineering vectors.
-5. Surviving read pairs are then taxonomically profiled using the [TAXONOMY workflow](#taxonomic-assignment-taxonomy), which deduplicates, merges and profiles the reads.
+4. The output of the previous step is passed to a further filtering step, in which reads matching a series of common contaminant sequences are removed. This is done by aligning surviving reads to these contaminants using Bowtie2 in series. Contaminants to be screened against include reference genomes from human, cow, pig, carp, mouse and *E. coli*, as well as various genetic engineering vectors.
+5. Surviving read pairs are then taxonomically profiled using the [TAXONOMY subworkflow](#taxonomic-assignment-taxonomy), which deduplicates, merges and profiles the reads.
 6. Finally, reads are assigned a final vertebrate-infecting virus status if they:
    - Are classified as vertebrate-infecting virus by both Bowtie2 and Kraken2; or
    - Are unassigned by Kraken and align to a vertebrate-infecting virus taxon with Bowtie2, with an alignment score above a user-specified threshold[^threshold].

-[^filter]: We've found in past investigations that the two aligners detect different contaminant sequences, and aligning against both is more effective at avoiding false positives than either in isolation.
-
 [^threshold]: Specifically, Kraken-unassigned read pairs are classed as HV if, for either read in the pair, S/ln(L) >= T, where S is the best-match Bowtie2 alignment score for that read, L is the length of the read, and T is the value of `params.bt2_score_threshold` specified in the config file.

-### Taxonomic profiling phase (PROFILE)
+### Taxonomic profiling (PROFILE)

-The goal of this phase is to give an overview of the taxonomic composition of the cleaned reads from the preprocessing phase. In particular, it gives an estimate of (i) the fraction of ribosomal reads in the dataset, (ii) the taxonomic breakdown of the dataset at the domain level[^eukarya], and (iii) more detailed abundance estimates for lower-level taxa.
+The goal of this subworkflow is to give an overview of the taxonomic composition of the cleaned and subset reads from the preprocessing phase.
In particular, it gives an estimate of (i) the fraction of ribosomal reads in the dataset, (ii) the taxonomic breakdown of the dataset at the domain level[^eukarya], and (iii) more detailed abundance estimates for lower-level taxa. ```mermaid --- @@ -148,8 +146,8 @@ subgraph "Ribosomal classification" B end ``` -1. These subset reads are then separated into ribosomal and non-ribosomal read groups using BBDuk, by searching for ribosomal k-mers from the SILVA database obtained during the index workflow. -2. The output of the previous step is passed to the [TAXONOMY workflow](#taxonomic-assignment-taxonomy), which returns back the Kraken2 and Bracken outputs. + +To do this, reads from SUBSET_CLEAN are separated into ribosomal and non-ribosomal read groups using BBDuk, by searching for ribosomal k-mers from the SILVA database generated by the index workflow. The ribosomal and non-ribosomal reads are then passed separately to [TAXONOMY workflow](#taxonomic-assignment-taxonomy), which returns back the Kraken2 and Bracken outputs. These are then annotated and merged across samples to produce single output files. [^eukarya]: As human is the only eukaryotic genome included in the Standard reference database for Kraken2, all sequences assigned to that domain can be assigned to *Homo sapiens*. @@ -186,10 +184,10 @@ E & F --> G[Process & merge output] ## QC workflows ### Count total reads (COUNT_TOTAL_READS) -This phase counts the total number of reads in the input files, then merges them all into one file. No diagram is provided for this workflow, as it's a simple channel creation workflow. +This subworkflow counts the total number of reads in the input files, then merges read counts from all samples into one output TSV. (No diagram is provided for this subworkflow.) ### QC and output phase (RUN_QC) -This phase conducts quality control on the subsetted reads, both the raw and cleaned reads. +This subworkflow generates quality metrics on the raw and cleaned read subsets output by SUBSET_TRIM. Reads are analyzed with [FASTQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and [MultiQC](https://multiqc.info/), then extracted into easy-to-parse TSV files[^tsvs] for downstream processing. ```mermaid --- @@ -205,16 +203,13 @@ C[Subset and trimmed reads] --> D[QC] B & D --> E[Process & merge output] ``` -In this phase, the raw and cleaned reads from the preprocessing phase are analyzed with [FASTQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and [MultiQC](https://multiqc.info/). This allows us to assess important metrics (e.g. read quality, read numbers, adapter content) in the raw input, and how these metrics are affected by cleaning. Relevant metrics are then extracted from MultiQC outputs into easy-to-parse TSV files[^tsvs] for downstream processing. - - [^tsvs]: Specifically, `qc_basic_stats.tsv.gz`, `qc_adapter_stats.tsv.gz`, `qc_quality_base_stats.tsv.gz` and `qc_quality_sequence_stats.tsv.gz`. ## Helper workflows ### Taxonomic assignment (TAXONOMY) -This workflow performs taxonomic assignment on a set of reads. +This subworkflow performs taxonomic assignment on a set of reads using [Kraken2](https://ccb.jhu.edu/software/kraken2/). It is called by both the PROFILE and EXTRACT_VIRAL_READS subworkflows. ```mermaid --- @@ -271,4 +266,4 @@ D --> E[Process & merge output] 1. Run FASTQC on each pair of read files 2. Extract data with MultiQC for each pair of read files 3. Summarize MultiQC information for each pair of read files -4. 
Process and merge outputs \ No newline at end of file +4. Process and merge outputs From b3054bf1852af2e6be5d96be43310ed64958d714 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 23 Jan 2025 19:50:51 +0000 Subject: [PATCH 09/34] Removed files, will bring t hem back later --- docs/developer.md | 67 -------------------------- docs/faqs.md | 15 ------ docs/index.md | 21 -------- docs/output.md | 107 ----------------------------------------- docs/overview.md | 49 ------------------- docs/run_validation.md | 53 -------------------- 6 files changed, 312 deletions(-) delete mode 100644 docs/developer.md delete mode 100644 docs/faqs.md delete mode 100644 docs/index.md delete mode 100644 docs/output.md delete mode 100644 docs/overview.md delete mode 100644 docs/run_validation.md diff --git a/docs/developer.md b/docs/developer.md deleted file mode 100644 index 4a7ad5aa..00000000 --- a/docs/developer.md +++ /dev/null @@ -1,67 +0,0 @@ -# Developer guide -This section is soley for developers of the pipeline. We thank you greatly for your work! - -All of our testing is done using [nf-test](https://www.nf-test.com/), which is a testing framework that allows us to run tests locally and on Github Actions. All tests dataset are stored in the `s3://nao-testing` s3 bucket, and must be made public for nf-test (and anyone) to access. - -As of right now we have the following test datasets ([more details here](#available-test-datasets)): - -- `gold-standard-test`: A small test dataset to test the run workflow. - -### Contributing to the pipeline - -When developing new features, always make sure to make a new branch that starts with your name, then is followed by a one or two word description of the feature you're working on by underscores. For example, `will_streaming` or `harmon_remove_trimmomatic`. This branch should be made from the `dev` branch. - ->[!CAUTION] -> Do not make pull requests from the `dev` or `main` branches. - -By default, all changes are made to local brnaches, pulled into `dev` then after enough `dev` changes, merged into `main`. - -#### Pull request requirements - -To have a pull request accepted, you must do the following: - -1. **Write new tests** for the changes that you make using `nf-test` if those tests don't already exist. At the very least, these tests should check that the new implementation runs to completion, however tests that verify the output of the test dataset are strongly encouraged. -2. **Pass all existing tests locally** by running `nf-test test tests`. *If you make any changes that affect the output of the pipeline, you must list the changes that have occured in the output of the test dataset during your pull request.* -3. **Update the `CHANGELOG.md` file** with the changes that you are making. More information on how to update the `CHANGELOG.md` file can be found [here](./version_schema.md). -4. **Pass automated tests on Github Actions**. These run automatically when you open a pull request. -5. **Reviewed by a maintainer**. - -More information on how to run nf-test locally can be found [here](#running-tests-locally-with-nf-test). 
- -### Everything testing - -#### Making a test dataset - -To make a new test dataset, copy the test dataset to `s3://nao-testing/[name-of-test-dataset]` where `[name-of-test-dataset]` is the name of the test dataset you want to create (request Harmon, or Will for permission to add to the bucket): - -``` -aws s3 cp /path/to/my_dataset s3://nao-testing/my_dataset/ --acl public-read -``` - -> [!NOTE] -> Any time you update a test dataset, you must make it public again. - -#### Running tests locally with `nf-test` - -By default, we use the [gold standard test](#gold-standard-test-s3nao-testinggold-standard-test) dataset to test the run workflow. To run the tests locally, you need to make sure that you have a big enough ec2-instance. We recommend the `m5.xlarge` with at least `32GB` of EBS storage, as this machine closely reflects the VMs on Github Actions. Once you have an instance, run `nf-test test tests`, which will run all tests in the `tests` directory. If you want to run a specific test, you can either reference the name of the test file, or the tag of the test you want to run: - -``` -nf-test test tests/main.test.nf # Runs all tests in the main.test.nf file -nf-test test --tag run # Runs the run workflow -``` - -The intended results for the run workflow can be found in following directory `test-data/gold-standard-results`. Should the `run_output` test fail, you can diff the resulting files of that test, with the files in this folder to find the differences. - -> [!NOTE] -> Periodically delete docker images to free up space on your instance. Running the following command will delete all docker images on your system: -> ``` -> docker kill $(docker ps -q) 2>/dev/null || true -> docker rm $(docker ps -a -q) 2>/dev/null || true -> docker rmi $(docker images -q) -f 2>/dev/null || true -> docker system prune -af --volumes -> ``` - -### Available test datasets - -#### Gold standard test (`s3://nao-testing/gold-standard-test/`) -This is the default test dataset that we use for testing the run workflow. It is a small dataset that contains XXX reads from the [Yang 2020](https://www.sciencedirect.com/science/article/abs/pii/S0048969720358514?via%3Dihub) study. The code to generate this data can be found [here](https://github.com/naobservatory/generate-test-dataset/tree/harmon-dev-edits). \ No newline at end of file diff --git a/docs/faqs.md b/docs/faqs.md deleted file mode 100644 index ad64c606..00000000 --- a/docs/faqs.md +++ /dev/null @@ -1,15 +0,0 @@ -# FAQs - -This page is a collection of frequently asked questions about the pipeline. - -## How do I run the pipeline on a new dataset? -See [Usage](usage.md) for more information. - -## How do I run the pipeline on AWS Batch? -See [Batch](batch.md) for more information. - -## Does the pipeline work on all operating systems? -By default, we've only tested the pipeline on Linux-based EC2 instances. For other operating systems, such as MACOS or Windows, we cannot guarantee that the pipeline will work. - -## How can I contribute to the pipeline? -See [Developer](developer.md) for more information. diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 7560399f..00000000 --- a/docs/index.md +++ /dev/null @@ -1,21 +0,0 @@ -# Index workflow - -The index workflow generates static index files from reference data. 
These reference data and indices don't depend on the reads processed by the run workflow, so a set of indices built by the index workflow can be used by multiple instantiations of the run workflow — no need to rebuild indices for every set of reads. The index workflow should be run once per reasonable length of time, balancing two considerations: Many of these index/reference files are derived from publicly available reference genomes or other resources, and should thus be updated and re-run periodically as new versions of these become available; however, to keep results comparable across datasets analyzed with the run workflow, this should be done relatively rarely. - -Key inputs to the index workflow include: -- A TSV specifying names and taxonomic IDs (taxids) for host taxa for which to search for host-infecting viruses. -- A URL linking to a suitable Kraken2 database for taxonomic profiling (typically the [latest release](https://benlangmead.github.io/aws-indexes/k2) of the `k2_standard` database). -- URLS for up-to-date releases of reference genomes for various common contaminant species that can confound the identification of HV reads (currently [human](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9606), [cow](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9913), [pig](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9823), [carp](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=7962)[^carp], [mouse](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=10090), [*E. coli*](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=562)). -- URLs to sequence databases for small and large ribosomal subunits from [SILVA](https://www.arb-silva.de/download/arb-files/). -- Up-to-date links to [VirusHostDB](https://www.genome.jp/virushostdb) and [NCBI Taxonomy](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_archive/). - -[^carp]: The carp genome is included as a last-ditch attempt to [capture any remaining Illumina adapter sequences](https://dgg32.medium.com/carp-in-the-soil-1168818d2191) before moving on to HV identification. I'm not especially confident this is helpful. - -Given these inputs, the index workflow: -- Generates a TSV of viral taxa, incorporating taxonomic information from NCBI, and annotates each taxon with infection status for each host taxon of interest (using Virus-Host-DB). -- Makes Bowtie2 indices from (1) all host-infecting viral genomes in Genbank[^genbank], (2) the human genome, (3) common non-human contaminants, plus BBMap indices for (2) and (3). -- Downloads and extracts local copies of (1) the BLAST nt database, (2) the specified Kraken2 DB, (3) the SILVA rRNA reference files. - -[^genbank]: Excluding transgenic, contaminated, or erroneous sequences, which are excluded according to a list of sequence ID patterns specified in the config file. - -Run the index workflow by setting `mode = "index"` in the relevant config file. For more information, see `workflows/index.nf` and the associated subworkflows and modules. diff --git a/docs/output.md b/docs/output.md deleted file mode 100644 index 637ddb80..00000000 --- a/docs/output.md +++ /dev/null @@ -1,107 +0,0 @@ -# Outputs - -If the pipeline runs to completion, the following output files are expected. In the future, we will add more specific information about the outputs, including in-depth descriptions of the columns in the output files. 
- -All pipeline output can be found in the `output` directory, which is broken into four subdirectories: - -- `input`: Directory containing saved input information (useful for trying to reproduce someone else's results) -- `logging`: Log files containing meta-level information about the pipeline run itself. -- `intermediates`: Intermediate files produced by key stages in the run workflow, saved for nonstandard downstream analysis. -- `results`: Directory containing processed results files for standard downstream analysis. - -## Run workflow - -Main heading represents the folder name, and subheadings represent a description of the file's usage. If the file is not in the heading folder name, the relative path is given. - -### `input` - -- `adapters.fasta`: FASTA file of adapter sequences used for adapter screening. -- `run-params.json`: JSON file giving all the parameters passed to the pipeline. -- `index-params.json`: JSON file giving parameters used to generate index directory (`params.ref_dir`). -- `samplesheet.csv`: Copy of the samplesheet file used to configure the pipeline (specified by `params.sample_sheet`). - -### `logging` - -- `pipeline-version.txt`: Version of the pipeline used for the run. -- `time.txt`: Start time of the run. -- `trace.txt`: Tab delimited log of all the information for each task run in the pipeline including runtime, memory usage, exit status, etc. Can be used to create an execution timeline using the the script `bin/plot-timeline-script.R` after the pipeline has finished running. More information regarding the trace file format can be found [here](https://www.nextflow.io/docs/latest/reports.html#trace-file). -- `pipeline-version-index.txt`: Version of pipeline used to generate index directory (`params.ref_dir`). - -### `intermediates` - -- `reads/raw_viral`: Directory containing raw reads corresponding to those reads that survive initial viral screening with BBDuk. - -### `results` - -#### `qc` -- `total_reads_qc.tsv.gz`: Total number of reads processed at each stage of the preprocessing phase (`stage`) -- `qc_basic_stats.tsv.gz`: Summary statistics for each sample at each stage of the preprocessing phase (`stage`), including: - - GC content (`percent GC`); - - Average read length (`mean_seq_len`); - - Number of read pairs (`n_read pairs`); - - Approximate number of base pairs in reads (`n_bases_approx`); - - Percent duplicates as measured by FASTQC (`percent_duplicates`); - - Pass/fail scores for each test conducted by FASTQC. -- `qc_adapter_stats.tsv.gz`: Adapter statistics calculated by FASTQC for each sample and preprocessing stage, given as a percentage of reads containing adapter content (`pc_adapters`) at each position along the read (`position`) for each adapter detected (`adapter`) for each read in the read pair (`read_pair`). -- `qc_quality_base_stats.tsv.gz`: Per-base read-quality statistics calculated by FASTQC for each sample and preprocessing stage, given as the mean Phred score (`mean_phred_score`) at each position along the read (`position`) for each read in the read pair (`read_pair`). -- `qc_quality_sequence_stats.tsv.gz`: Per-sequence read-quality statistics calculated by FASTQC for each sample and preprocessing stage, given as the number of reads (`n_sequences`) with a given mean Phred score (`mean_phred_score`) for each read in the read pair (`read_pair`). - -#### `viral identification` -- `virus_hits_db.tsv.gz`: TSV output by the viral identification phase, giving information about each read pair assigned to a host-infecting virus. 
-- `virus_clade_counts.tsv.gz`: Summary of the previous file giving the number of HV read pairs mapped to each viral taxon. Includes both read pairs mapped directly to that taxon (`n_reads_direct`) and to that taxon plus all descendent taxa (`n_reads_clade`). - -#### `taxonomic identification` -- `kraken_reports_merged.tsv.gz`: Kraken output reports in TSV format, labeled by sample and ribosomal status. -- `bracken_reports_merged.tsv.gz`: Bracken output reports in TSV format, labeled by sample and ribosomal status. - -#### `blast` -- `blast_hits_paired.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: - - The number of reads in the read pair with high-scoring matches to that taxid (`n_reads`). - - The best bitscores of alignments to that taxid for each matching read (`bitscore_max` and `bitscore_min`)[^bitscore]. -- `blast_forward_only.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: -- `blast_reverse_only.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: - -[^bitscore]: If only one read aligns to the target, these two fields will be identical. If not, they will give the higher and lower of the best bitscores for the two reads in the pair.. - - -## Index workflow - -Main heading represents the folder name, and subheadings describes the tool that consumes the file. Files that are consumed by multiple tools or are not consumed by any tools are put in the `General` subheading. If the file is not in the heading folder name, the relative path is given. - -### `input` - -- `index-params.json`: JSON file giving all the parameters passed to the pipeline (useful for trying to reproduce someone else's results). - -### `logging` - -- `pipeline-version.txt`: Version of the pipeline with which index directory was created. -- `time.txt`: Start time of index workflow run. -- `trace.txt`: Nextflow trace file containing logging information for each process performed during the workflow run. - -### `results` - -#### `General` - -- `total-virus-db-annotated.tsv.gz`: Database generated from NCBI taxonomy and Virus-Host-DB giving taxonomy and host-infection information for each viral taxon. -- `taxonomy-nodes.dmp`: Taxonomy dump file from NCBI mapping between taxids and their parents in the NCBI taxonomy tree structure. -- `taxonomy-names.dmp`: Taxonomy dump file from NCBI mapping between taxids and taxon names. - -#### `BLAST` - -- `core_nt`: Directory containing extracted BLAST database files for BLASTing against core_nt. - -#### `Bowtie2` - -- `bt2-virus-index`: Directory containing Bowtie2 index for host-infecting viral genomes. -- `bt2-human-index`: Directory containing Bowtie2 index for the human genome. -- `bt2-other-index`: Directory containing Bowtie2 index for other contaminant sequences. -- `virus-genome-metadata-gid.tsv.gz`: Genome metadata file generated during download of HV genomes from viral Genbank, annotated additionally with Genome IDs used by Bowtie2 (allowing mapping between genome ID and taxid). - -#### `Kraken2` - -- `kraken_db`: Directory containing Kraken2 reference database (default: Most recent version of Standard). - -#### `BBduk` - -- `virus-genomes-filtered.fasta.gz`: FASTA file containing host-infecting viral genomes downloaded from viral Genbank (filtered to remove transgenic, contaminated, or erroneous sequences). 
-- `ribo-ref-concat.fasta.gz`: Reference database of ribosomal LSU and SSU sequences from SILVA. diff --git a/docs/overview.md b/docs/overview.md deleted file mode 100644 index 4734e185..00000000 --- a/docs/overview.md +++ /dev/null @@ -1,49 +0,0 @@ -# Overview - -The pipeline currently consists of three workflows: - -- `run`: Performs the main analysis, including viral identification, taxonomic profiling, and optional BLAST validation. -- `index`: Creates indices and reference files used by the `run` workflow.[^1] -- `run_validation`: Performs BLAST validation on any set of reads to verify their taxonomic classification.[^2] - -[^1]: The `index` workflow is intended to be run first, after which many instantiations of the `run` workflow can use the same index output files. -[^2]: The `run_validation` workflow is intended to be run after the `run` workflow if the optional BLAST validation was not selected during the `run` workflow. Typically, this workflow is run on a subset of the host viral reads identified in the `run` workflow, to evaluate the sensitivity and specificity of the viral identification process. - -The `run` workflow consists of four parts: - -- **Viral identification**: Reads from viruses infecting specific host taxa of interest (default: vertebrate viruses) are sensititvely and specifically identified using a multi-step pipeline based around k-mer filtering, adapter trimming, read alignment, and taxonomic classification. -- **Taxonomic profile**: A random subset of reads (default: 1M/sample) undergo adapter trimming, ribosomal classification, and taxonomic classification. -- **QC**: The total number of reads are recorded, then a subset of reads (default 1M/sample) undergo quality assessment, both before and after adapter trimming. -- **(Optional) BLAST validation**: Putative host viral reads from part 1 (either all reads or a subset) are checked aginst `core_nt` to evaluate the sensitivity and specificity of the viral identification process. - -The following diagram provides a high-level overview of the `run` workflow (each blue box is a subworkflow): - -```mermaid ---- -title: RUN WORKFLOW -config: - layout: horizontal - look: handDrawn - theme: defaultr ---- -flowchart LR -A[Input Files] --> |Raw reads|B[LOAD_SAMPLESHEET] -B --> C[COUNT_TOTAL_READS] & D[EXTRACT_VIRAL_READS] & E[SUBSET_TRIM] -D -.-> |*Optional*| F[BLAST_VIRAL] -E --> |Trim and subset reads| G[RUN_QC] & H[PROFILE] -E --> |Subset reads|G[RUN_QC] -%% Adjust layout by placing subgraphs in specific order -subgraph "Viral identification" -D -end -subgraph "Taxonomic Profile" -H -end -subgraph "QC" -C -G -end -subgraph "BLAST Validation" -F -end -``` \ No newline at end of file diff --git a/docs/run_validation.md b/docs/run_validation.md deleted file mode 100644 index 2d2207f4..00000000 --- a/docs/run_validation.md +++ /dev/null @@ -1,53 +0,0 @@ -# Run validation - -This pipeline allows you to run BLAST validation on the output of the RUN workflow or on any reads. We use [BLASTN](https://blast.ncbi.nlm.nih.gov/Blast.cgi) to align the putative host viral reads output by the previous phase against the `core_nt` database, then process the output to check whether each sequence had a high-scoring alignment to at least one host viral sequence. - -```mermaid ---- -title: RUN_VALIDATION -config: - layout: horizontal - look: handDrawn - theme: default ---- -flowchart LR -A[Reads] --> C["BLAST_VIRAL"] -B["Run workflow viral reads"] --> C["BLAST_VIRAL"] -subgraph "Input (choose one)" -A -B -end -``` - -1. 
Reads can be provided using the output of the host viral identification phase of the RUN workflow, or directly from the user. -2. Run the [BLAST_VIRAL](#blast-validation-phase-blast_viral) subworkflow, which BLASTS the forward and reverse reads separately and then merges the results. - -## BLAST validation phase (BLAST_VIRAL) - -To evaluate the performance of the process described in the viral identification phase, it's useful to get some ground-truth information about whether the host viral assignments made in that subworkflow are correct. For computational tractability, this can be performed on only a subset of surviving host viral reads (specified by `params.blast_hv_fraction`)[^blast]. - -```mermaid ---- -title: BLAST_VIRAL -config: - layout: horizontal - look: handDrawn - theme: default ---- -flowchart LR -A[Reads] -.-> |Optional|B[Subset reads] -A --> |Forward read|C[BLASTN] -A --> |Reverse read|D[BLASTN] -B -.-> |Forward read|C[BLASTN] -B -.-> |Reverse read|D[BLASTN] -C --> E[Filter BLASTN output] -D --> F[Filter BLASTN output] -E & F --> G[Process & merge output] -``` - -1. Reads are subset if `params.blast_hv_fraction` is less than 1, else if `params.blast_hv_fraction` is 1, then BLAST is run on all host viral reads. -2. Forward and reverse reads are aligned separately with BLASTN. -3. BLASTN outputs are filtered to keep only the best-scoring alignment for each read. -4. Output from both reads are combined into a single file, with columns for the read ID, the subject taxid, and the alignment score. - -[^blast]: Setting `params.blast_hv_fraction` to 0 skips this step altogether. \ No newline at end of file From 49694b98b634c2aed25d2e9986a7f0cf7fb0324e Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 23 Jan 2025 22:19:52 +0000 Subject: [PATCH 10/34] updated usage document. --- docs/usage.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index e319e57c..f51ec3bf 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -25,41 +25,42 @@ To run the workflow on new data, you need: ### 1.1. The sample sheet -The sample sheet must be an uncompressed CSV file with the following fields: +The sample sheet must be an uncompressed CSV file with the following headers in the order specified: -- First column: Sample ID -- Second column: Path to FASTQ file 1 which should be the forward read for this sample -- Third column: Path to FASTQ file 2 which should be the reverse read for this sample +- `sample` (1st column): Sample ID +- `fastq_1` (2nd column): Path to FASTQ file 1 which should be the forward read for this sample +- `fastq_2` (3rd column): Path to FASTQ file 2 which should be the reverse read for this sample +- `group` (4th column): Group ID (optional) -The easiest way to generate this file is typically using `dev/generate_samplesheet.sh`. This script takes in a path to a directory containing raw FASTQ files (`dir_path`) along with forward (`forward_suffix`) and reverse (`reverse_suffix`) read suffixes (both of which support regex) and an optional output path (`output_path`). The `--s3` flag indicates that the target directory is specified as an S3 path. As output, the script generates a CSV file (names `samplesheet.csv` by default) which can be used as input for the pipeline. +The easiest way to generate this file is typically using `dev/generate_samplesheet.py`. 
This script takes in a path to a directory containing raw FASTQ files (`dir_path`) along with forward (`forward_suffix`) and reverse (`reverse_suffix`) read suffixes (both of which support regex) and an optional output path (`output_path`). The `--s3` flag indicates that the target directory is specified as an S3 path. As output, the script generates a CSV file (names `samplesheet.csv` by default) which can be used as input for the pipeline. For example: ``` -../bin/generate_samplesheet.sh \ +./bin/generate_samplesheet.py \ --s3 - --dir_path s3://nao-restricted/MJ-2024-10-21/raw/ \ - --forward_suffix _1 \ - --reverse_suffix _2 + --dir_path s3://nao-testing/gold-standard-test/raw/ \ + --forward_suffix _R1 \ + --reverse_suffix _R2 ``` In addition, the script can also add an additional `group` column that can be used to combine reads from different files that should be processed together (e.g. for deduplication). There are two options for doing this: -- Provide a path to a CSV file containing `sample` and `group` headers, specifying the mapping between samples and groups. -- Specify the `--group_across_illumina_lanes` option if the target directory contains data from one or more libraries split across lanes on an Illumina flowcell. +- Provide a path to a CSV file containing `sample` and `group` headers (in that order), specifying the mapping between samples and groups. +- Specify the `--group_across_illumina_lanes` option if the target directory contains data from one or more libraries split across lanes on an Illumina flowcell (i.e. group all reads with the following pattern 'Lnnn' where 'n' is any digit from 0 to 9). -Alternatively, the sample sheet can be manually edited to provide the `groups` column prior to running the pipeline. +Alternatively, the sample sheet can be manually edited to provide the `group` column prior to running the pipeline. ### 1.2. The config file The config file specifies parameters and other configuration options used by Nextflow in executing the pipeline. To create a config file for your pipeline run, copy `configs/run.config` into your launch directory as a file named `nextflow.config`, then modify the file as follows: - - Make sure `params.mode = "run"`; this instructs the pipeline to execute the [core run workflow](./docs/run.md). + - Make sure `params.mode = "run"`; this instructs the pipeline to execute the [core run workflow](./run.md). - Edit `params.ref_dir` to point to the directory containing the outputs of the reference workflow. - Edit `params.sample_sheet` to point to your sample sheet. - Edit `params.base_dir` to point to the directory in which Nextflow should put the pipeline working and output directories. - Edit `params.grouping` to specify whether to group samples together for common processing, based on the `group` column in the sample sheet. - If running on AWS Batch (see below), edit `process.queue` to the name of your Batch job queue. -Most other entries in the config file can be left at their default values for most runs. See [here](./docs/config.md) for a full description of config file parameters and their meanings. +Most other entries in the config file can be left at their default values for most runs. See [here](./config.md) for a full description of config file parameters and their meanings. ## 2. Choosing a profile @@ -75,14 +76,6 @@ The pipeline can be run in multiple ways by modifying various configuration vari To run the pipeline with a specified profile, run `nextflow run PATH_TO_REPO_DIR -profile PROFILE_NAME -resume`. 
Calling the pipeline without specifying a profile will run the `batch` profile by default. Future example commands in this README will assume you are using Batch; if you want to instead use a different profile, you'll need to modify the commands accordingly. -### Compute resource requirements - -To run the pipeline as is you need at least 128GB of memory and 64 cores. This is because we use the whole KrakenDB whihc is large (128GB) and some for processes consume 64 cores. Simiarly, if one would like to run BLAST, they must have at least 256GB of memory. - -To change the compute resources for a process, you can modify the `resources.config` file. This file specifies the compute resources for each process based on the label of the process. For example, to change the compute resources for the `kraken` process, you can add the following to the `resources.config` file: - -In the case that you change the resources, you'll need to also change the index. - ## 3. Running the pipeline After creating your sample sheet and config files and choosing a profile, navigate to the launch directory containing your config file. You can then run the pipeline as follows: @@ -104,3 +97,10 @@ where PATH/TO/PIPELINE/DIR specifies the path to the directory containing the pi > It's highly recommended that you always run `nextflow run` with the `-resume` option enabled. It doesn't do any harm if you haven't run a workflow before, and getting into the habit will help you avoid much sadness when you want to resume it without rerunning all your jobs. Once the pipeline has finished, output and logging files will be available in the `output` subdirectory of the base directory specified in the config file. + + +## 4. Cleaning up + +Running nextflow pipelines will create a large number of files in the working directory. To get rid of these files, you can remove them manually or by running the `nextflow clean` command in the launch directory. + +If you are running the pipeline using `ec2_local` or `ec2_s3` profiles, you will also want to clean up the docker images and containers created by the pipeline as these can take up a lot of space. This can be done by running `docker system prune -a` which will remove all unused docker images and containers on your system. \ No newline at end of file From 2a979593d63320d0766373fa7007ecae8c3a565b Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 23 Jan 2025 22:22:33 +0000 Subject: [PATCH 11/34] Updated reference to test dataset. --- docs/installation.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/installation.md b/docs/installation.md index 7814620c..1575ef50 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -111,3 +111,24 @@ nextflow run -resume -profile REPO_DIR ``` Once the pipeline is complete, output and logging files will be available in the `output` subdirectory of the base directory specified in the config file. + + + + + + + + + + + + + + +### Compute resource requirements + +To run the pipeline as is you need at least 128GB of memory and 64 cores. This is because we use the whole KrakenDB whihc is large (128GB) and some for processes consume 64 cores. Simiarly, if one would like to run BLAST, they must have at least 256GB of memory. + +To change the compute resources for a process, you can modify the `resources.config` file. This file specifies the compute resources for each process based on the label of the process. 
For example, to change the compute resources for the `kraken` process, you can add the following to the `resources.config` file:
+
+In the case that you change the resources, you'll need to also change the index.
\ No newline at end of file

From c8308370f1ff067ecbc46d1200342b5e66e6c987 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Thu, 23 Jan 2025 22:27:28 +0000
Subject: [PATCH 12/34] Ooops commited installation, but meant to do this, I'll fix that in a bit.

---
 docs/batch.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/batch.md b/docs/batch.md
index b7f7ba3f..437da385 100644
--- a/docs/batch.md
+++ b/docs/batch.md
@@ -89,4 +89,4 @@ The last step you need to complete on AWS itself is to set up a job queue that N

 ## 4. Run Nextflow with Batch

-Finally, you need to use all the infrastructure you've just set up to actually run a Nextflow workflow! We recommend using our test dataset to get started. [Click here to see how to run the pipeline on the test dataset](./testing.md#user-guide).
+Finally, you need to use all the infrastructure you've just set up to actually run a Nextflow workflow! We recommend using our test dataset to get started. [Click here to see how to run the pipeline on the test dataset](./installation.md#5-run-the-pipeline-on-test-data).

From 3c4e3ae06c6bbb0f994e909e6561239d30a16c61 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Fri, 24 Jan 2025 14:42:49 +0000
Subject: [PATCH 13/34] Updates

---
 docs/config.md       | 23 ++++++++++++++++++++++
 docs/installation.md | 47 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 63 insertions(+), 7 deletions(-)
 create mode 100644 docs/config.md

diff --git a/docs/config.md b/docs/config.md
new file mode 100644
index 00000000..4867144c
--- /dev/null
+++ b/docs/config.md
@@ -0,0 +1,23 @@
+# Configurations
+
+Configurations are specified by `.config` files. These files are used to specify parameters and other configuration options used by Nextflow in executing the pipeline.
+
+All configurations are stored in the `configs` directory.
+
+# Workflow-specific configurations
+
+## Run workflow (`configs/run.config`)
+
+- `params.mode = "run"`: This instructs the pipeline to execute the [core run workflow](./run.md).
+- `params.base_dir`: The parent directory for the pipeline working and output directories.
+- `params.ref_dir`: The directory containing the outputs of the `index` workflow.
+- `params.sample_sheet`: The path to the sample sheet.
+- `params.adapters`: The path to the adapter file for adapter trimming.
+- `params.grouping`: Whether to group samples by the `group` column in the sample sheet.
+- `params.n_reads_profile`: The number of reads per sample to run through taxonomic profiling.
+- `params.bt2_score_threshold`: The normalized score threshold for calling a host-infecting virus read (typically 15 or 20).
+- `params.blast_viral_fraction`: The fraction of putative host-infecting virus reads to BLAST vs nt (0 = don't run BLAST).
+- `params.quality_encoding`: The FASTQ quality encoding (probably phred33, maybe phred64).
+- `params.fuzzy_match_alignment_duplicates`: Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2).
+- `params.host_taxon`: The taxon to use for host-infecting virus identification.
+- `params.blast_db_prefix`: The prefix for the BLAST database to use for host-infecting virus identification.
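+
+To illustrate how these parameters fit together, the sketch below shows roughly what the `params` block of a launch-directory `nextflow.config` (copied from `configs/run.config`) might contain. Only a subset of the parameters listed above is shown, and all values are placeholders rather than recommended defaults; consult `configs/run.config` itself for the authoritative settings:
+
+```
+params {
+    mode = "run"
+    base_dir = "s3://your-bucket/your-project"   // working and output directories go under here (placeholder)
+    ref_dir = "s3://your-bucket/index/output"    // output directory of the index workflow (placeholder)
+    sample_sheet = "samplesheet.csv"
+    grouping = false
+    n_reads_profile = 1000000                    // reads per sample used for taxonomic profiling
+    bt2_score_threshold = 20                     // normalized score threshold for viral read calling
+    blast_viral_fraction = 0                     // 0 = skip BLAST validation
+}
+```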
diff --git a/docs/installation.md b/docs/installation.md index 1575ef50..c6502135 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -58,7 +58,45 @@ docker run hello-world Clone this repo into a new directory as normal. -## 4. Run index/reference workflow +## 4. Run the pipeline locally via our test suite + +To verify the pipeline works correctly, run the test suite locally. You'll need: + +- **Recommended EC2 Instance:** `m5.xlarge` + - 4 CPU cores + - 14GB RAM + - At least 32GB EBS storage + +This instance type matches the GitHub Actions virtual machines where CI/CD tests run. + +To run the tests: +```bash +cd /path/to/repo +nf-test test tests/ +``` + +## 5. Prepare compute resources for running the pipeline on real data + +The pipeline has significant compute requirements: + +- **Base Requirements:** + - 128GB RAM + - 64 CPU cores + - Required for running the standard pipeline with the full Kraken database (128GB) + +- **Additional Requirements for BLAST:** + - 256GB RAM when running either the `run` or `run_validation` workflows with BLAST enabled + +Ensure your computing environment meets these requirements: +- For `ec2_local` or `ec2_s3` profiles: Select an EC2 instance with sufficient resources +- For `batch` profile: Configure AWS Batch to scale to these specifications + +> [!NOTE] +> The following recommendations are based on using the default reference files produced as a result of the `index` workflow. If you do not have access to this much compute, you can modify the `index` workflow to create smaller reference files, however this will reduce the accuracy of the pipeline. +> +> In the situation that you do use smaller reference files, you can modify the required resources by changing the resource specifications in the `config/resources.config` file. + +## 6. Run index/reference workflow > [!TIP] > If someone else in your organization already uses this pipeline, it's likely they've already run the index workflow and generated an output directory. If this is the case, you can reduce costs and increase reproducibility by using theirs instead of generating your own. If you want to do this, skip this step, and edit `configs/run.config` such that `params.ref_dir` points to `INDEX_DIR/output`. @@ -84,7 +122,7 @@ nextflow run PATH_TO_REPO_DIR -resume Wait for the workflow to run to completion; this is likely to take several hours at least. -## 5. Run the pipeline on test data +## 7. Run the pipeline on test data To confirm that the pipeline works in your hands, we recommend running it on a small test dataset, such as the one provided at `s3://nao-testing/gold-standard-test/raw/`, before running it on larger input data. To do this with our test dataset, follow the instructions below, or do it yourself according to the directions given [here](./docs/usage.md). @@ -127,8 +165,3 @@ Once the pipeline is complete, output and logging files will be available in the ### Compute resource requirements -To run the pipeline as is you need at least 128GB of memory and 64 cores. This is because we use the whole KrakenDB whihc is large (128GB) and some for processes consume 64 cores. Simiarly, if one would like to run BLAST, they must have at least 256GB of memory. - -To change the compute resources for a process, you can modify the `resources.config` file. This file specifies the compute resources for each process based on the label of the process. 
For example, to change the compute resources for the `kraken` process, you can add the following to the `resources.config` file: - -In the case that you change the resources, you'll need to also change the index. \ No newline at end of file From 78568adeb277345f49d6f3d7ba16d751d5c32354 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 24 Jan 2025 14:45:40 +0000 Subject: [PATCH 14/34] Forgot to remove troubleshooting but this is gone now. --- docs/troubleshooting.md | 43 ----------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 docs/troubleshooting.md diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md deleted file mode 100644 index ed1ce2ab..00000000 --- a/docs/troubleshooting.md +++ /dev/null @@ -1,43 +0,0 @@ - -## Troubleshooting - -We break down the issues that can arise into two categories: - -- Nextflow isn't able to run -- Nextflow runs then fails - -We discuss each of these issues below. - -### Nextflow isn't able to run - -**The most common sources of errors are AWS permission issues.** Whether you're using the `batch` profile or not, you should make sure you have all AWS permissions, as specified in [our Batch tutorial](./batch.md#step-0-set-up-your-aws-credentials). - -Next, make sure Nextflow has access to your AWS credentials (if not you may get `AccessDenied` errors): - -``` -eval "$(aws configure export-credentials --format env)" -``` - -### Nextflow runs then fails - - -#### None of my jobs are running - -The first time a pipeline is run on any dataset, it can take a while for the jobs to start (10-15 minutes at most). If after this period of time, none of your jobs have started, then this may be another AWS permission issue. - -#### Docker image failure - -Another common issue is for processes to fail with some variation of the following Docker-related error: - -``` -docker: failed to register layer: write /usr/lib/jvm/java-11-openjdk-amd64/lib/modules: **no space left on device**. -``` - -This is a fairly well-known problem, which can arise even when there is substantial free storage space accessible to your EC2 instance. Following the steps recommended [here](https://www.baeldung.com/linux/docker-fix-no-space-error) or [here](https://forums.docker.com/t/docker-no-space-left-on-device/69205) typically resolves the issue, either by deleting Docker assets to free up space (e.g. via `docker system prune --all --force`) or by giving Docker more space. - -#### Insufficient resources - -This an issue we're actively working on. Sometimes your jobs may fail due to insufficient memory or CPU resources. To avoid this, you can either change your overall resources or change the resources for a specific process: - -- **Change your overall resources**: You can change the resources for your entire pipeline by editing the `configs/resources.config` file. However, note that this will probably also require you to change the reference files and indices generated by the `index` workflow. [See here for more information on compute resource requirements](./usage#compute-resource-requirements). -- **Change the resources for a specific process**: You can change the resources for a specific process by changing the label of that process to another label that is in the `configs/resources.config` file or you can define a new label in the `configs/resources.config` file. From 6a2b2795f7e30bbd619dce79d24eafaaf1ab53ba Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 24 Jan 2025 14:53:01 +0000 Subject: [PATCH 15/34] Updated run workflow. 
--- docs/run.md | 224 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 130 insertions(+), 94 deletions(-) diff --git a/docs/run.md b/docs/run.md index 0331d613..0ca44e51 100644 --- a/docs/run.md +++ b/docs/run.md @@ -9,15 +9,15 @@ This page describes the structure and function of the `run` workflow, which is r title: RUN WORKFLOW config: layout: horizontal - look: handDrawn - theme: default --- flowchart LR -A[Raw input reads] --> B[LOAD_SAMPLESHEET] +A(Raw reads) --> B[LOAD_SAMPLESHEET] B --> C[COUNT_TOTAL_READS] & D[EXTRACT_VIRAL_READS] & E[SUBSET_TRIM] -D -.-> F[BLAST_VIRAL] -E --> |Subsetted reads|G[RUN_QC] -E --> |Trimmed subsetted reads| G[RUN_QC] & H[PROFILE] +D -.-> |Optional|F[BLAST_VIRAL] +E --> I(Subset reads) +E --> J(Subset trimmed reads) +J --> G[RUN_QC] & H[PROFILE] +I --> G[RUN_QC] %% Adjust layout by placing subgraphs in specific order subgraph "Viral identification" D @@ -29,7 +29,7 @@ subgraph "QC" C G end -subgraph "BLAST Validation (optional)" +subgraph "BLAST Validation" F end ``` @@ -41,16 +41,16 @@ To perform these functions, the workflow runs a series of subworkflows responsib 1. [Setup subworkflows](#setup-subworkflows): Prepare the input data for analysis - [Load data into channels (LOAD_SAMPLESHEET)](#load-data-into-channels-load_samplesheet) - [Subset and trim reads (SUBSET_TRIM)](#subset-and-trim-reads-subset_trim) -2. [Analysis subworkflows](#analysis-subworkflows): Perform the primary analysis +2. [Helper subworkflows](#helper-subworkflows): Helper workflows that are used by other subworkflows in this list + - [Taxonomic assignment (TAXONOMY)](#taxonomic-assignment-taxonomy) + - [QC (QC)](#qc-qc) +3. [Analysis subworkflows](#analysis-subworkflows): Perform the primary analysis - [Viral identification (EXTRACT_VIRAL_READS)](#viral-identification-extract_viral_reads) - [Taxonomic profiling (PROFILE)](#taxonomic-profiling-profile) - [BLAST validation (BLAST_VIRAL)](#blast-validation-blast_viral) -3. [QC subworkflows](#qc-subworkflows): Conduct quality control on the analysis results +4. [QC subworkflows](#qc-subworkflows): Conduct quality control on the analysis results - [Count total reads (COUNT_TOTAL_READS)](#count-total-reads-count_total_reads) - [QC and output (RUN_QC)](#qc-and-output-run_qc) -4. [Helper subworkflows](#helper-subworkflows): Helper workflows that are used by other subworkflows in this list - - [Taxonomic assignment (TAXONOMY)](#taxonomic-assignment-taxonomy) - - [QC (QC)](#qc-qc) ## Setup subworkflows @@ -67,14 +67,90 @@ This subworkflow uses [Seqtk](https://github.com/lh3/seqtk) to randomly subsampl title: TRIM_AND_SUBSET config: layout: horizontal - look: handDrawn - theme: default --- flowchart LR -A[Raw reads] --> B[Subset with Seqtk] +A(Raw reads) --> B[Subset with Seqtk] B --> C[Trim with FASTP] +B --> D(Subset reads) +C --> E(Subset trimmed reads) +style A fill:#fff,stroke:#000 +style D fill:#000,color:#fff,stroke:#000 +style E fill:#000,color:#fff,stroke:#000 +``` + +## Helper workflows + +### Taxonomic assignment (TAXONOMY) + +This subworkflow performs taxonomic assignment on a set of reads using [Kraken2](https://ccb.jhu.edu/software/kraken2/). It is called by both the [PROFILE](#taxonomic-profiling-profile) and [EXTRACT_VIRAL_READS](#viral-identification-extract_viral_reads) subworkflows. 
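+
+Conceptually, the merge-and-classify core of this subworkflow resembles the command sketch below. This is an illustration only, with placeholder file and database paths; the pipeline's actual Nextflow processes wrap these tools with their own parameters:
+
+```
+# Merge overlapping read pairs into single sequences (unmerged pairs are kept separately)
+bbmerge.sh in1=sample_R1.fastq in2=sample_R2.fastq out=merged.fastq outu=unmerged.fastq
+
+# Classify merged reads against the Kraken2 reference database from the index workflow
+kraken2 --db /path/to/kraken_db --threads 8 --report sample.kreport --output sample.kraken merged.fastq
+
+# Re-estimate taxon abundances from the Kraken2 report with Bracken
+bracken -d /path/to/kraken_db -i sample.kreport -o sample.bracken
+```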
+ +```mermaid +--- +title: TAXONOMY +config: + layout: horizontal +--- +flowchart LR +A(Reads) -.-> |Optional|B[CLUMPIFY_PAIRED] +A --> C[BBMERGE] +B -.-> C[BBMERGE] +C -.-> D[CLUMPIFY_SINGLE] +C --> E[KRAKEN] +D -.-> E[KRAKEN] +E --> G(Kraken output) +E --> F[BRACKEN] +F --> H(Bracken output) +subgraph "Read deduplication" +B +D +end +subgraph "Paired reads to single reads" +C +end +subgraph "Taxonomic profiling" +E +F +end +style A fill:#fff,stroke:#000 +style G fill:#000,color:#fff,stroke:#000 +style H fill:#000,color:#fff,stroke:#000 ``` +1. If enabled, reads are first deduplicated using [Clumpify](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/clumpify-guide/)[^dedup] +2. Read pairs are then processed with [BBMerge](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmerge-guide/)[^merge] to create single sequences. Any pairs that cannot be merged are joined end-to-end with an "N" base between them. +3. When deduplication is enabled, the merged reads undergo a second round of Clumpify deduplication. +4. Finally, reads are taxonomically classified using [Kraken2](https://ccb.jhu.edu/software/kraken2/) with the reference database from the index workflow. [Bracken](https://ccb.jhu.edu/software/bracken/) then processes these results to generate detailed taxonomic abundance estimates. + +[^dedup]: Which identifies and collapses groups of reads that are identical modulo some specified error rate. +[^merge]: Which aligns and merges read pairs with significant overlap. + +### QC (QC) +This phase conducts quality control on any set of reads. It is called by [RUN_QC](#qc-and-output-run_qc) twice, once for the subset reads and once for the subset trimmed reads. +```mermaid +--- +title: QC +config: + layout: horizontal +--- +flowchart LR +A(Reads) --> B[FASTQ] +B --> C[MULTIQC] +C --> D[SUMMARIZE_MULTIQC_PAIR] +D --> E[Process & merge output] +E --> F(QC basic stats) & G(QC adapter stats) & H(QC quality base stats) & I(QC quality sequence stats) +style A fill:#fff,stroke:#000 +style F fill:#000,color:#fff,stroke:#000 +style G fill:#000,color:#fff,stroke:#000 +style H fill:#000,color:#fff,stroke:#000 +style I fill:#000,color:#fff,stroke:#000 +``` + +1. Run FASTQC on each pair of read files +2. Extract data with MultiQC for each pair of read files +3. Summarize MultiQC information for each pair of read files +4. Process and merge outputs + + ## Analysis subworkflows ### Viral identification (EXTRACT_VIRAL_READS) @@ -86,18 +162,21 @@ The goal of this subworkflow is to sensitively, specifically, and efficiently id title: EXTRACT_VIRAL_READS config: layout: horizontal - look: handDrawn - theme: default --- flowchart LR -A[Raw reads] --> B["BBDuk
(viral index)"] +A(Raw reads) --> B["BBDuk
(viral index)"] +B --> M(Raw reads after BBDuk) B --> C[FASTP] C --> D[Cutadapt] D --> E["Bowtie2
(viral index)"] E --> F["Bowtie2
(human index)"] F --> G["Bowtie2
(other contaminants index)"] G --> H[TAXONOMY] -H --> I[Process & merge output] +H --> I[Filter by alignment score and assignment] +E --> I +I --> J(Viral reads with metadata) +I --> K(Viral clade counts) +I --> N("Forward and reverse fasta file of all viral reads") subgraph "Filter for viral reads" B @@ -111,6 +190,11 @@ subgraph "Remove contaminants" F G end +style A fill:#fff,stroke:#000 +style J fill:#000,color:#fff,stroke:#000 +style K fill:#000,color:#fff,stroke:#000 +style N fill:#000,color:#fff,stroke:#000 +style M fill:#000,color:#fff,stroke:#000 ``` 1. To begin with, the raw reads are screened against a database of vertebrate-infecting viral genomes generated from Genbank by the index workflow. This initial screen is performed using [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), which flags any read that contains at least one 24-mer matching any vertebrate-infecting viral genome. The purpose of this initial screen is to rapidly and sensitively identify putative vertebrate-infecting viral reads while discarding the vast majority of non-HV reads, reducing the cost associated with the rest of this phase. @@ -133,18 +217,21 @@ The goal of this subworkflow is to give an overview of the taxonomic composition title: PROFILE config: layout: horizontal - look: handDrawn - theme: default --- flowchart LR -A[Subset & trimmed reads] --> B["BBDuk
(SILVA index)"] +A("Subset trimmed reads
(SUBSET_TRIM)") --> B["BBDuk
(SILVA index)"] B --> |Ribosomal reads| C[TAXONOMY] B --> |Non-ribosomal reads| D[TAXONOMY] C --> E[Merge output] D --> E +E --> F(kraken_reports_merged) +E --> G(bracken_reports_merged) subgraph "Ribosomal classification" B end +style A fill:#fff,stroke:#000 +style F fill:#000,color:#fff,stroke:#000 +style G fill:#000,color:#fff,stroke:#000 ``` To do this, reads from SUBSET_CLEAN are separated into ribosomal and non-ribosomal read groups using BBDuk, by searching for ribosomal k-mers from the SILVA database generated by the index workflow. The ribosomal and non-ribosomal reads are then passed separately to [TAXONOMY workflow](#taxonomic-assignment-taxonomy), which returns back the Kraken2 and Bracken outputs. These are then annotated and merged across samples to produce single output files. @@ -160,18 +247,21 @@ To evaluate the performance of the process described in the viral identification title: BLAST_VIRAL config: layout: horizontal - look: handDrawn - theme: default --- flowchart LR -A[Reads] -.-> |Optional|B[Subset reads] -A --> |Forward read|C[BLASTN] -A --> |Reverse read|D[BLASTN] -B -.-> |Forward read|C[BLASTN] -B -.-> |Reverse read|D[BLASTN] +A("Forward and reverse fasta file of all viral reads
(EXTRACT_VIRAL_READS)") -.-> |Optional|B[Subset with Seqtk] +B --> |Forward read|C[BLASTN] +B --> |Reverse read|D[BLASTN] C --> E[Filter BLASTN output] D --> F[Filter BLASTN output] E & F --> G[Process & merge output] +G --> H(BLAST assignment with metadata) +G --> I(Forward read used for BLAST) +G --> J(Reverse read used for BLAST) +style A fill:#fff,stroke:#000 +style H fill:#000,color:#fff,stroke:#000 +style I fill:#000,color:#fff,stroke:#000 +style J fill:#000,color:#fff,stroke:#000 ``` 1. Reads are subset if `params.blast_hv_fraction` is less than 1, else if `params.blast_hv_fraction` is 1, then BLAST is run on all host viral reads. @@ -194,76 +284,22 @@ This subworkflow generates quality metrics on the raw and cleaned read subsets o title: RUN_QC config: layout: horizontal - look: handDrawn - theme: default --- flowchart LR A[Subset reads] --> B[QC] -C[Subset and trimmed reads] --> D[QC] +C(Subset trimmed reads) --> D[QC] B & D --> E[Process & merge output] +E --> F(Combined QC basic stats) +E --> G(Combined QC adapter stats) +E --> H(Combined QC quality base stats) +E --> I(Combined QC quality sequence stats) +style A fill:#fff,stroke:#000 +style C fill:#fff,stroke:#000 +style F fill:#000,color:#fff,stroke:#000 +style G fill:#000,color:#fff,stroke:#000 +style H fill:#000,color:#fff,stroke:#000 +style I fill:#000,color:#fff,stroke:#000 ``` [^tsvs]: Specifically, `qc_basic_stats.tsv.gz`, `qc_adapter_stats.tsv.gz`, `qc_quality_base_stats.tsv.gz` and `qc_quality_sequence_stats.tsv.gz`. -## Helper workflows - -### Taxonomic assignment (TAXONOMY) - -This subworkflow performs taxonomic assignment on a set of reads using [Kraken2](https://ccb.jhu.edu/software/kraken2/). It is called by both the PROFILE and EXTRACT_VIRAL_READS subworkflows. - -```mermaid ---- -title: TAXONOMY -config: - layout: horizontal - look: handDrawn - theme: default ---- -flowchart LR -A[Reads] --> B[CLUMPIFY_PAIRED] -B --> C[BBMERGE] -C --> D[CLUMPIFY_SINGLE] -D --> E[KRAKEN] -E --> F[BRACKEN] -subgraph "Read deduplication" -B -D -end -subgraph "Paired reads to single reads" -C -end -subgraph "Taxonomic profiling" -E -F -end -``` - -1. Reads are deduplicated with [Clumpify](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/clumpify-guide/)[^dedup] -2. Merged into single sequences through a combination of [BBMerge](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmerge-guide/)[^merge]. Read pairs that fail to merge with BBMerge are concatenated end-to-end with an intervening "N" base. -3. Reads are deduplicated once again with Clumpify, but this time they're single end reads. -4. Reads are passed to [Kraken2](https://ccb.jhu.edu/software/kraken2/) for taxonomic assignment, using the reference database obtained in the index workflow. We record whether each read was (1) assigned to a host-infecting virus taxon with Kraken, (2) assigned to a non-HV taxon with Kraken, or (3) not assigned to any taxon. All reads in category (2) are filtered out. - -[^dedup]: Which identifies and collapses groups of reads that are identical modulo some specified error rate. -[^merge]: Which aligns and merges read pairs with significant overlap. - -### QC (QC) -This phase conducts quality control on any set of reads. -```mermaid ---- -title: QC -config: - layout: horizontal - look: handDrawn - theme: default ---- -flowchart LR -A[Reads] --> B[FASTQ] -B --> C[MULTIQC] -C --> D[SUMMARIZE_MULTIQC_PAIR] -D --> E[Process & merge output] -``` - -1. Run FASTQC on each pair of read files -2. 
Extract data with MultiQC for each pair of read files
-3. Summarize MultiQC information for each pair of read files
-4. Process and merge outputs

From 51b2b15cd919520cd6d847dbb93f859de4f5f44d Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Fri, 24 Jan 2025 15:04:01 +0000
Subject: [PATCH 16/34] Updated these to give more information on the different parameters.

---
 docs/config.md       | 25 ++++++++++++++++++++++++-
 docs/installation.md |  2 +-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/docs/config.md b/docs/config.md
index 4867144c..48081824 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -8,7 +8,7 @@ All configurations are stored in the `configs` directory.
 
 ## Run workflow (`configs/run.config`)
 
-- `params.mode = "run"`: This instructs the pipeline to execute the [core run workflow](./run.md).
+- `params.mode = "run"`: This instructs the pipeline to execute the [core run workflow](../workflows/run.nf).
 - `params.base_dir`: The parent directory for the pipeline working and output directories.
 - `params.ref_dir`: The directory containing the outputs of the `index` workflow.
 - `params.sample_sheet`: The path to the sample sheet.
@@ -21,3 +21,26 @@ All configurations are stored in the `configs` directory.
 - `params.fuzzy_match_alignment_duplicates`: Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2).
 - `params.host_taxon`: The taxon to use for host-infecting virus identification.
 - `params.blast_db_prefix`: The prefix for the BLAST database to use for host-infecting virus identification.
+
+## Index workflow (`configs/index.config`)
+
+- `params.mode = "index"`: This instructs the pipeline to execute the [index workflow](../workflows/index.nf).
+- `params.base_dir`: The parent directory for the pipeline working and output directories.
+- `params.taxonomy_url`: The URL for the NCBI taxonomy dump.
+- `params.virus_host_db_url`: The URL for the viral host database.
+- `params.human_url`: The URL for the human genome, which is used to build the human index we use to screen out human reads.
+- `params.genome_urls`: The URLs for the genomes of common contaminant species, which are used to build the other-contaminants index we use to screen out common contaminants.
+- `params.ssu_url`: The URL for the SILVA SSU reference database, which we use to build the ribosomal index for classifying reads as ribosomal or not.
+- `params.lsu_url`: The URL for the SILVA LSU reference database, which we use to build the ribosomal index for classifying reads as ribosomal or not.
+- `params.host_taxon_db`: The path to the host taxon database, which we use to establish a host-taxon relationship for the viral genome database.
+- `params.contaminants`: The path to the reads that are common contaminants, which we screen out.
+- `params.adapters`: The path to the common adapter sequences, which we use to clean the viral reference genomes that we screen for.
+- `params.genome_patterns_exclude`: The path to the genome patterns to exclude.
+- `params.kraken_db`: The path to the Kraken reference database, which we use to taxonomically classify reads.
+- `params.blast_db_name`: The name of the BLAST database used during BLAST validation.
+- `params.ncbi_viral_params`: The parameters to use for the NCBI viral genome database, which we use to build our viral indices for BBDuk and Bowtie2.
+- `params.virus_taxid`: The taxid for the virus to use for building the viral genome database. +- `params.viral_taxids_exclude`: The taxids to exclude from the viral genome database. +- `params.host_taxa_screen`: The host taxa to screen for when building the viral genome database. +- `params.kraken_memory`: Initalizes the `run` workflow params to avoid warnings (DO NOT CHANGE) +- `params.classify_dedup_subset`: Initalizes the `run` workflow params to avoid warnings (DO NOT CHANGE) diff --git a/docs/installation.md b/docs/installation.md index c6502135..38be49d9 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -109,7 +109,7 @@ cd index cp REPO_DIR/configs/index.config nextflow.config ``` -Next, edit `nextflow.config` such that `params.base_dir` points to the directory (likely on S3) where you want to store your index files. (You shouldn't need to modify anything else about the config file at this stage.) +Next, edit `nextflow.config` such that `params.base_dir` points to the directory (likely on S3) where you want to store your index files. (You shouldn't need to modify anything else about the config file at this stage. However, if you'd like to, you can [learn more about what each parameter does here](./config.md).) Next, call `nextflow run` pointing at the repo directory: From abda8fbedfe139784ef32c4896ca89b795f51d30 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 24 Jan 2025 15:15:09 +0000 Subject: [PATCH 17/34] Updating these in the meantime, need to update the output. --- README.md | 434 ++++++---------------------------------- docs/index.md | 21 ++ docs/troubleshooting.md | 53 +++++ 3 files changed, 134 insertions(+), 374 deletions(-) create mode 100644 docs/index.md create mode 100644 docs/troubleshooting.md diff --git a/README.md b/README.md index da472224..9b0dd18d 100644 --- a/README.md +++ b/README.md @@ -2,377 +2,63 @@ This Nextflow pipeline is designed to process metagenomic sequencing data, characterize overall taxonomic composition, and identify and quantify reads mapping to viruses infecting certain host taxa of interest. It was developed as part of the [Nucleic Acid Observatory](https://naobservatory.org/) project. - - -- [Pipeline description](#pipeline-description) - * [Overview](#overview) - * [Index workflow](#index-workflow) - * [Run workflow](#run-workflow) - + [Preprocessing phase](#preprocessing-phase) - + [Viral identification phase](#viral-identification-phase) - + [Taxonomic profiling phase](#taxonomic-profiling-phase) - + [BLAST validation phase](#blast-validation-phase) - + [QC and output phase](#qc-and-output-phase) - * [Pipeline outputs](#pipeline-outputs) - + [Index workflow](#index-workflow-1) - + [Run workflow](#run-workflow-1) -- [Using the workflow](#using-the-workflow) - * [Profiles and modes](#profiles-and-modes) - * [Installation & setup](#installation--setup) - + [1. Install dependencies](#1-install-dependencies) - + [2. Configure AWS & Docker](#2-configure-aws--docker) - + [3. Clone this repository](#3-clone-this-repository) - + [4. Run index/reference workflow](#4-run-indexreference-workflow) - * [Testing & validation](#testing--validation) - * [Running on new data](#running-on-new-data) -- [Run tests using `nf-test` before making pull requests](#run-tests-using-nf-test-before-making-pull-requests) -- [Troubleshooting](#troubleshooting) - - - -## Pipeline description - -### Overview - -The pipeline currently consists of two workflows, an **index** workflow and a **run** workflow. 
The former creates index and reference files that are used by the latter, and is intended to be run first, after which many instantiations of the latter workflow can use the same index output files. - -The run workflow then consists of five phases: - -1. A **preprocessing phase**, in which input files undergo adapter & quality trimming. -2. A **viral identification phase**, in which a custom multi-step pipeline based around BBDuk, Bowtie2 and Kraken2 is used to sensitively and specifically identify reads from viruses infecting specific host taxa of interest ("host-infecting virus reads", or HV reads). -3. A **taxonomic profiling phase**, in which a random subset of reads (default 1M/sample) undergo ribosomal classification with BBDuk, followed by broader taxonomic classification with Kraken2. -4. An optional **BLAST validation phase**, in which putative HV reads from phase 2 are checked against nt to evaluate the sensitivity and specificity of the HV identification process. -5. A final **QC and output phase**, in which FASTQC, MultiQC and other tools are used to assess the quality of the data produced by the pipeline at various steps and produce summarized output data for downstream analysis and visualization. - -A slight simplification of the overall process is given by this diagram: - -![A flowchart summarizing the pipeline's run workflow.](/readme-workflow-diagram.png) - -### Index workflow - -The index workflow generates static index files from reference data. These reference data and indices don't depend on the reads processed by the run workflow, so a set of indices built by the index workflow can be used by multiple instantiations of the run workflow — no need to rebuild indices for every set of reads. The index workflow should be run once per reasonable length of time, balancing two considerations: Many of these index/reference files are derived from publicly available reference genomes or other resources, and should thus be updated and re-run periodically as new versions of these become available; however, to keep results comparable across datasets analyzed with the run workflow, this should be done relatively rarely. - -Key inputs to the index workflow include: -- A TSV specifying names and taxonomic IDs (taxids) for host taxa for which to search for host-infecting viruses. -- A URL linking to a suitable Kraken2 database for taxonomic profiling (typically the [latest release](https://benlangmead.github.io/aws-indexes/k2) of the `k2_standard` database). -- URLS for up-to-date releases of reference genomes for various common contaminant species that can confound the identification of HV reads (currently [human](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9606), [cow](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9913), [pig](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9823), [carp](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=7962)[^carp], [mouse](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=10090), [*E. coli*](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=562)). -- URLs to sequence databases for small and large ribosomal subunits from [SILVA](https://www.arb-silva.de/download/arb-files/). -- Up-to-date links to [VirusHostDB](https://www.genome.jp/virushostdb) and [NCBI Taxonomy](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_archive/). 
- -[^carp]: The carp genome is included as a last-ditch attempt to [capture any remaining Illumina adapter sequences](https://dgg32.medium.com/carp-in-the-soil-1168818d2191) before moving on to HV identification. I'm not especially confident this is helpful. - -Given these inputs, the index workflow: -- Generates a TSV of viral taxa, incorporating taxonomic information from NCBI, and annotates each taxon with infection status for each host taxon of interest (using Virus-Host-DB). -- Makes Bowtie2 indices from (1) all host-infecting viral genomes in Genbank[^genbank], (2) the human genome, (3) common non-human contaminants, plus BBMap indices for (2) and (3). -- Downloads and extracts local copies of (1) the BLAST nt database, (2) the specified Kraken2 DB, (3) the SILVA rRNA reference files. - -[^genbank]: Excluding transgenic, contaminated, or erroneous sequences, which are excluded according to a list of sequence ID patterns specified in the config file. - -Run the index workflow by setting `mode = "index"` in the relevant config file. For more information, see `workflows/index.nf` and the associated subworkflows and modules. - -### Run workflow - -#### Preprocessing phase - -The run workflow begins by undergoing cleaning by [FASTP](https://github.com/OpenGene/fastp), which both screens for adapters and trims low-quality and low-complexity sequences. The output of FASTP is then passed on to the other parts of the run workflow. - -#### Viral identification phase - -The goal of this phase is to sensitively, specifically, and efficiently identify reads arising from host-infecting viruses. It uses a multi-step process designed to keep computational costs low while minimizing false-positive and false-negative errors. - -1. To begin with, the cleaned reads produced by the preprocessing phase are screened against a database of host-infecting viral genomes generated from Genbank by the index workflow. This initial screen is performed using [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), which flags any read that contains at least three 21-mers matching any host-infecting viral genome. The purpose of this initial screen is to rapidly and sensitively identify putative host-infecting viral reads while discarding the vast majority of non-HV reads, reducing the cost associated with the rest of this phase. -2. Surviving reads undergo additional adapter trimming with [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) & [Trimmomatic](https://github.com/timflutre/trimmomatic) to remove any residual adapter contamination that might lead to false positive results. -3. Next, reads are aligned to the previously-mentioned database of HV genomes with [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) using quite permissive parameters designed to capture as many putative HV reads as possible. The SAM and FASTQ files are processed to generate new read files containing any read pair for which at least one read matches the HV database. -4. The output of the previous step is passed to a further filtering step, in which reads matching a series of common contaminant sequences are removed. This is done by aligning surviving reads to these contaminants using both Bowtie2 and [BBMap](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmap-guide/) in series[^filter]. Contaminants to be screened against include reference genomes from human, cow, pig, carp, mouse and *E. coli*, as well as various genetic engineering vectors. 
-5. Surviving read pairs are deduplicated with [Clumpify](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/clumpify-guide/)[^dedup] and merged into single sequences through a combination of [BBMerge](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbmerge-guide/)[^merge]. Read pairs that fail to merge with BBMerge are concatenated end-to-end with an intervening "N" base. -6. Deduplicated and merged reads are passed to [Kraken2](https://ccb.jhu.edu/software/kraken2/) for taxonomic assignment, using the reference database obtained in the index workflow. We record whether each read was (1) assigned to a host-infecting virus taxon with Kraken, (2) assigned to a non-HV taxon with Kraken, or (3) not assigned to any taxon. All reads in category (2) are filtered out. -7. Finally, reads are assigned a final HV status if they: - - Are classified as HV by both Bowtie2 and Kraken2; or - - Are unassigned by Kraken and align to an HV taxon with Bowtie2 with an alignment score above a user-specifed threshold[^threshold]. - -Following HV status assignment, information for each read pair is processed into a TSV file available in the output directory as `virus_hits_db.tsv.gz`. Finally, the number of read pairs mapping to each detected HV taxon is counted and output as `virus_clade_counts.tsv.gz` for downstream use. - -[^filter]: We've found in past investigations that the two aligners detect different contaminant sequences, and aligning against both is more effective at avoiding false positives than either in isolation. -[^dedup]: Which identifies and collapses groups of reads that are identical modulo some specified error rate. -[^merge]: Which aligns and merges read pairs with significant overlap. -[^threshold]: Specifically, Kraken-unassigned read pairs are classed as HV if, for either read in the pair, S/ln(L) >= T, where S is the best-match Bowtie2 alignment score for that read, L is the length of the read, and T is the value of `params.bt2_score_threshold` specified in the config file. - -#### Taxonomic profiling phase - -The goal of this phase is to give an overview of the taxonomic composition of the cleaned reads from the preprocessing phase. In particular, it gives an estimate of (i) the fraction of ribosomal reads in the dataset, (ii) the taxonomic breakdown of the dataset at the domain level[^eukarya], and (iii) more detailed abundance estimates for lower-level taxa. - -To maintain efficiency, the reads from the preprocessing phase are first subset, by default to 1 million reads per sample. These subset reads are then separated into ribosomal and non-ribosomal read groups using BBDuk, by searching for ribosomal k-mers from the SILVA database obtained during the index workflow. Both sets of read pairs are then merged into single sequences using BBMerge as described above, then are passed to Kraken2 for taxonomic assignment. The output of Kraken2 is then passed to [Bracken](https://ccb.jhu.edu/software/bracken/) for correction and summarization, and the outputs of both Kraken and Bracken are merged across samples to produce single output files. - -[^eukarya]: As human is the only eukaryotic genome included in the Standard reference database for Kraken2, all sequences assigned to that domain can be assigned to *Homo sapiens*. 
- -#### BLAST validation phase - -To evaluate the performance of the process described in the viral identification phase, it's useful to get some ground-truth information about whether the HV assignments made in that subworkflow are correct. To do this, we use [BLASTN](https://blast.ncbi.nlm.nih.gov/Blast.cgi) to align the putative HV reads output by the previous phase against the core_nt database, then process the output to check whether each sequence had a high-scoring alignment to at least one HV sequence. For computational tractability, this can be performed on only a subset of surviving HV reads (specified by `params.blast_hv_fraction`)[^blast]. - -[^blast]: Setting `params.blast_hv_fraction` to 0 skips this step altogether. - -Currently, this subworkflow performs the BLAST alignment, filters & summarizes the output to a manageable size, and saves the result to the output directory as `blast_hits_paired.tsv.gz`. Actual assessment of HV status and performance evaluation is currently performed external to this workflow. - -#### QC and output phase - -In this phase, the raw and cleaned reads from the preprocessing phase are analyzed with [FASTQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and [MultiQC](https://multiqc.info/). This allows us to assess important metrics (e.g. read quality, read numbers, adapter content) in the raw input, and how these metrics are affected by cleaning. Relevant metrics are then extracted from MultiQC outputs into easy-to-parse TSV files[^tsvs] for downstream processing. - -[^tsvs]: Specifically, `qc_basic_stats.tsv.gz`, `qc_adapter_stats.tsv.gz`, `qc_quality_base_stats.tsv.gz` and `qc_quality_sequence_stats.tsv.gz`. - -### Pipeline outputs - -If the pipeline runs to completion, the following output files are expected. - -#### Index workflow - -1. `output/input/index-params.json`: JSON file giving all the parameters passed to the pipeline (useful for trying to reproduce someone else's results). -2. `output/logging/pipeline-version.txt`: Version of the pipeline with which index directory was created. -3. `output/logging/time.txt`: Start time of index workflow run. -4. `output/logging/trace.txt`: Nextflow trace file containing logging information for each process performed during the workflow run. -5. `output/results/core_nt`: Directory containing extracted BLAST database files for BLASTing against core_nt. -6. `output/results/bt2-virus-index`: Directory containing Bowtie2 index for host-infecting viral genomes. -7. `output/results/bt2-human-index`: Directory containing Bowtie2 index for the human genome. -8. `output/results/bt2-other-index`: Directory containing Bowtie2 index for other contaminant sequences. -9. `output/results/bbm-human-index`: Directory containing BBMap index for the human genome. -10. `output/results/bbm-other-index`: Directory containing BBMap index for other contaminant sequences. -11. `output/results/kraken_db`: Directory containing Kraken2 reference database (by default, the most recent version of Standard). -12. `output/results/virus-genomes-filtered.fasta.gz`: FASTA file containg host-infecting viral genomes downloaded from viral Genbank (filtered to remove transgenic, contaminated, or erroneous sequences). -13. `virus-genome-metadata-gid.tsv.gz`: Genome metadata file generated during download of HV genomes from viral Genbank, annotated additionally with Genome IDs used by Bowtie2 (allowing mapping between genome ID and taxid). -14. 
`output/results/ribo-ref-concat.fasta.gz`: Reference database of ribosomal LSU and SSU sequences from SILVA. -15. `output/results/taxonomy-nodes.dmp`: Taxonomy dump file from NCBI mapping between taxids and their parents in the NCBI taxonomy tree structure. -16. `output/results/taxonomy-names.dmp`: Taxonomy dump file from NCBI mapping between taxids and taxon names. -17. `output/results/total-virus-db-annotated.tsv.gz`: Database generated from NCBI taxonomy and Virus-Host-DB giving taxonomy and host-infection information for each viral taxon. - -#### Run workflow - -1. `output/input`: Directory containing saved input information (useful for trying to reproduce someone else's results) - 1. `adapters.fasta`: FASTA file of adapter sequences used for adapter screening. - 2. `run-params.json`: JSON file giving all the parameters passed to the pipeline. - 3. `index-params.json`: JSON file giving parameters used to generate index directory (`params.ref_dir`). - 4. `samplesheet.csv`: Copy of the samplesheet file used to configure the pipeline (specified by `params.sample_sheet`). -2. `output/logging`: Log files containing meta-level information about the pipeline run itself. - 1. `pipeline-version.txt`: Version of the pipeline used for the run. - 2. `time.txt`: Start time of the run. - 3. `trace.txt`: Tab delimited log of all the information for each task run in the pipeline including runtime, memory usage, exit status, etc. Can be used to create an execution timeline using the the script `bin/plot-timeline-script.R` after the pipeline has finished running. More information regarding the trace file format can be found [here](https://www.nextflow.io/docs/latest/reports.html#trace-file). - 4. `pipeline-version-index.txt`: Version of pipeline used to generate index directory (`params.ref_dir`). -3. `output/intermediates`: Intermediate files produced by key stages in the run workflow, saved for nonstandard downstream analysis. - 1. `reads/cleaned`: Directory containing paired FASTQ files for cleaned reads (i.e. the output of the preprocessing phase described above). - 2. `reads/raw_viral`: Directory containing raw reads corresponding to those reads that survive initial viral screening with BBDuk. -4. `output/results`: Directory containing processed results files for standard downstream analysis. - 1. `qc_basic_stats.tsv.gz`: Summary statistics for each sample at each stage of the preprocessing phase (`stage`), including: - - GC content (`percent GC`); - - Average read length (`mean_seq_len`); - - Number of read pairs (`n_read pairs`); - - Approximate number of base pairs in reads (`n_bases_approx`); - - Percent duplicates as measured by FASTQC (`percent_duplicates`); - - Pass/fail scores for each test conducted by FASTQC. - 2. `qc_adapter_stats.tsv.gz`: Adapter statistics calculated by FASTQC for each sample and preprocessing stage, given as a percentage of reads containing adapter content (`pc_adapters`) at each position along the read (`position`) for each adapter detected (`adapter`) for each read in the read pair (`read_pair`). - 3. `qc_quality_base_stats.tsv.gz`: Per-base read-quality statistics calculated by FASTQC for each sample and preprocessing stage, given as the mean Phred score (`mean_phred_score`) at each position along the read (`position`) for each read in the read pair (`read_pair`). - 4. 
`qc_quality_sequence_stats.tsv.gz`: Per-sequence read-quality statistics calculated by FASTQC for each sample and preprocessing stage, given as the number of reads (`n_sequences`) with a given mean Phred score (`mean_phred_score`) for each read in the read pair (`read_pair`). - 5. `virus_hits_db.tsv.gz`: TSV output by the viral identification phase, giving information about each read pair assigned to a host-infecting virus. - 6. `virus_clade_counts.tsv.gz`: Summary of the previous file giving the number of HV read pairs mapped to each viral taxon. Includes both read pairs mapped directly to that taxon (`n_reads_direct`) and to that taxon plus all descendent taxa (`n_reads_clade`). - 3. `blast_hits_paired.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: - - The number of reads in the read pair with high-scoring matches to that taxid (`n_reads`). - - The best bitscores of alignments to that taxid for each matching read (`bitscore_max` and `bitscore_min`)[^bitscore]. - 8. `kraken_reports_merged.tsv.gz`: Kraken output reports in TSV format, labeled by sample and ribosomal status. - 9. `bracken_reports_merged.tsv.gz`: Bracken output reports in TSV format, labeled by sample and ribosomal status. - -[^bitscore]: If only one read aligns to the target, these two fields will be identical. If not, they will give the higher and lower of the best bitscores for the two reads in the pair.. - -## Using the workflow - -### Profiles and modes - -The pipeline can be run in two modes, "index" and "run", each of which calls the corresponding workflow from the `workflows` directory. - -For both modes, the pipeline can be run in multiple ways by modifying various configuration variables specified in `configs/profiles.config`. Currently, three profiles are implemented, all of which assume the workflow is being launched from an AWS EC2 instance: - -- The `batch` profile is the default and attempts to run the pipeline with AWS Batch. This is the quickest and most efficient way to run the pipeline, but requires significant additional setup not described in this repo. To set up AWS Batch for this pipeline, follow the instructions [here](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html) (steps 1-3), then modify your config file to point `process.queue` to the name of your Batch job queue. -- The `ec2_local` profile attempts to run the whole workflow locally on your EC2 instance, storing intermediate and outflow files on instance-linked block storage. This is simple and can be relatively fast, but is bottlenecked by your instance's CPU and memory allocations; in particular, if you don't use an instance with very high memory, the pipeline is likely to fail when loading the Kraken2 reference DB. -- The `ec2_s3` profile runs the pipeline on your EC2 instance, but attempts to read and write files to a specified S3 directory. This avoids problems caused by insufficient storage on your EC2 instance, but (1) is significantly slower and (2) is still constrained by your instance's memory allocation. - -To run the pipeline with a specified profile, run `nextflow run PATH_TO_REPO_DIR -profile PROFILE_NAME -resume`. Calling the pipeline without specifying a profile will run the `batch` profile by default. Future example commands in this README will assume you are using Batch; if you want to instead use a different profile, you'll need to modify the commands accordingly. 
- -> [!TIP] -> It's highly recommended that you always run `nextflow run` with the `-resume` option enabled. It doesn't do any harm if you haven't run a workflow before, and getting into the habit will help you avoid much sadness when you want to resume it without rerunning all your jobs. - -### Installation & setup - -#### 1. Install dependencies - -To run this workflow with full functionality, you need access to the following dependencies: - -1. **SDKMAN!:** To install the SDKMAN! Java SDK manager, follow the installation instructions available [here](https://sdkman.io/install). -2. **Nextflow:** To install the workflow management framework, follow the installation instructions available [here](https://www.nextflow.io/docs/latest/getstarted.html), beginning by installing a recommended Java distribution through SDKMAN!. Pipeline version 2.5.0+ requires Nextflow version 24.10.0+. -2. **Docker:** To install Docker Engine for command-line use, follow the installation instructions available [here](https://docs.docker.com/engine/install/) (or [here](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-docker.html) for installation on an AWS EC2 instance). -3. **AWS CLI:** If not already installed, install the AWS CLI by following the instructions available [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html). -4. **Git:** To install the Git version control tool, follow the installation instructions available [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). -5. **nf-test**: To install nf-test, follow the install instructions available [here](https://www.nf-test.com/installation/). - -#### 2. Configure AWS & Docker - -To run the workflow using AWS S3 for the working and output directories, you first need to [configure AWS access](https://www.nextflow.io/docs/latest/aws.html). To do this, you need to create a file at `~/.aws/config` or `~/.aws/credentials` specifying your access key ID and secret access key, e.g. - -`~/.aws/config`: -``` -[default] -region = us-east-1 -output = table -tcp_keepalive = true -aws_access_key_id = -aws_secret_access_key = -``` - -`~/.aws/credentials`: -``` -[default] -aws_access_key_id = -aws_secret_access_key = -``` - -If you encounter `AccessDenied` errors after doing this, you may also need to export these keys as environment variables before running Nextflow: - -``` -eval "$(aws configure export-credentials --format env)" -``` - -Next, you need to make sure your user is configured to use Docker. To do this, create the `docker` user group and add your current user to it: - -``` -sudo groupadd docker -sudo usermod -aG docker $USER -newgrp docker -docker run hello-world -``` - -#### 3. Clone this repository - -Clone this repo into a new directory as normal. - -#### 4. Run index/reference workflow - -> [!TIP] -> If someone else in your organization already uses this pipeline, it's likely they've already run the index workflow and generated an output directory. If this is the case, you can reduce costs and increase reproducibility by using theirs instead of generating your own. If you want to do this, skip this step, and edit `configs/run.config` such that `params.ref_dir` points to `INDEX_DIR/output`. 
- -Create a new directory outside the repo directory and copy over the index workflow config file as `nextflow.config` in that directory: - -``` -mkdir index -cd index -cp REPO_DIR/configs/index.config nextflow.config -``` - -Next, edit `nextflow.config` such that `params.base_dir` points to the directory (likely on S3) where you want to store your index files. (You shouldn't need to modify anything else about the config file at this stage.) - -Next, call `nextflow run` pointing at the repo directory (NB: *not* `main.nf` or any other workflow file; pointing to the directory will cause Nextflow to automatically run `main.nf` from that directory.): - -``` -nextflow run PATH_TO_REPO_DIR -resume -``` - -Wait for the workflow to run to completion; this is likely to take several hours at least. - -### Testing & validation - -To confirm that the pipeline works in your hands, we provide a small test dataset (`s3://nao-testing/gold-standard-test/raw/`) to run through the run workflow. This can be used to test any of the pipeline profiles described above. - -If your EC2 instance has the resources to handle it, the simplest way to start using the pipeline is to run the test data through it locally on that instance (i.e. without using S3). To do this: - -1. Create a new directory outside the repo directory and copy over the run workflow config file as `nextflow.config` in that directory: - -``` -mkdir launch -cd launch -cp REPO_DIR/configs/run.config nextflow.config -``` - -2. Edit `nextflow.config` to set `params.ref_dir` to the index directory you chose or created above (specifically `PATH_TO_REF_DIR/output`). -3. Then set the samplesheet path to the test dataset samplesheet `${projectDir}/test-data/samplesheet.csv`. -4. Within this directory, run `nextflow run -profile ec2_local .. -resume`. Wait for the workflow to finish. -5. Inspect the `output` directory to view the processed output files. - -If this is successful, the next level of complexity is to run the workflow with a working directory on S3. To do this: - -1. Edit `nextflow.config` to set `params.base_dir` to the S3 directory of your choice. -2. Still within that directory, run `nextflow run -profile ec2_s3 .. -resume`. -3. Wait for the workflow to finish, and inspect the output on S3. - -Finally, you can run the test dataset through the pipeline on AWS Batch. To do this, configure Batch as described [here](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html) (steps 1-3), then: - -1. Edit `nextflow.config` to set `params.base_dir` to a different S3 directory of your choice and `process.queue` to the name of your Batch job queue. -2. Still within that directory, run `nextflow run -profile batch .. -resume` (or simply `nextflow run .. -resume`). -3. Wait for the workflow to finish, and inspect the output on S3. - -### Running on new data - -To run the workflow on another dataset, you need: - -1. Accessible raw data files in Gzipped FASTQ format, named appropriately. -2. A sample sheet file specifying the samples, along with paths to the forward and reverse read files for each sample. `generate_samplesheet.sh` (see below) can make this for you. -3. A config file in a clean launch directory, pointing to: - - The base directory in which to put the working and output directories (`params.base_dir`). - - The directory containing the outputs of the reference workflow (`params.ref_dir`). - - The sample sheet (`params.sample_sheet`). - - Various other parameter values. 
- -> [!NOTE] -> The samplesheet must have the following format for each row: -> - First column: Sample ID -> - Second column: Path to FASTQ file 1 which should be the forward read for this sample -> - Third column: Path to FASTQ file 2 which should be the reverse read for this sample -> -> The easiest way to get this file is by using the `generate_samplesheet.sh` script. As input, this script takes a path to raw FASTQ files (`dir_path`), and forward (`forward_suffix`) and reverse (`reverse_suffix`) read suffixes, both of which support regex, and an optional output path (`output_path`). Those using data from s3 should make sure to pass the `s3` parameter. Those who would like to group samples by some metadata can pass a path to a CSV file containing a header column named `sample,group`, where each row gives the sample name and the group to group by (`group_file`), edit the samplesheet manually after generation (since manually editing the samplesheet will be easier when the groups CSV isn't readily available), or provide the --group_across_illumina_lanes option if a single library was run across a single Illumina flowcell. As output, the script generates a CSV file named (`samplesheet.csv` by default), which can be used as input for the pipeline. -> -> For example: -> ``` -> ../bin/generate_samplesheet.sh \ -> --s3 -> --dir_path s3://nao-restricted/MJ-2024-10-21/raw/ \ -> --forward_suffix _1 \ -> --reverse_suffix _2 -> ``` - -If running on Batch, a good process for starting the pipeline on a new dataset is as follows: - -1. Process the raw data to have appropriate filenames (see above) and deposit it in an accessible S3 directory. -2. Create a clean launch directory and copy `configs/run.config` to a file named `nextflow.config` in that directory. -3. Create a sample sheet in that launch directory (see above) -4. Edit `nextflow.config` to specify each item in `params` as appropriate, as well as setting `process.queue` to the appropriate Batch queue. -5. Run `nextflow run PATH_TO_REPO_DIR -resume`. -6. Navigate to `{params.base_dir}/output` to view and download output files. - -## Run tests using `nf-test` before making pull requests - -During the development process, we now request that users run the pipeline using `nf-test` locally before making pull requests (a test will be run automatically on the PR, but it's often useful to run it locally first). To do this, you need to make sure that you have a big enough ec2-instance. We recommend the `m5.xlarge` with at least `32GB` of EBS storage, as this machine closely reflects the VMs on Github Actions. Once you have an instance, run `nf-test run tests/main.test.nf`, which will run all workflows of the pipeline and check that they run to completion. If you want to run a specific workflow, you use the following commands: - -``` -nf-test test --tag index # Runs the index workflow -nf-test test --tag run # Runs the run workflow -nf-test test --tag validation # Runs the validation workflow -nf-test test --tag run_output # Runs the run workflow with the test that verifies that the output files are correct -``` - -The intended results for the run workflow can be found in following directory `test-data/gold-standard-results`. Should the `run_output` test fail, you can diff the resulting files of that test, with the files in this folder to find the differences. - -Importantly, make sure to periodically delete docker images to free up space on your instance. 
You can do this by running the following command, although note that this will delete all docker images:
-
-```
-docker kill $(docker ps -q) 2>/dev/null || true
-docker rm $(docker ps -a -q) 2>/dev/null || true
-docker rmi $(docker images -q) -f 2>/dev/null || true
-docker system prune -af --volumes
-```
-
-## Troubleshooting
-
-When attempting to run a released version of the pipeline, the most common sources of errors are AWS permission issues. Before debugging a persistent error in-depth, make sure that you have all the permissions specified in Step 0 of [our Batch workflow guide](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html). Next, make sure Nextflow has access to your AWS credentials, such as by running `eval "$(aws configure export-credentials --format env)"`.
-
-Another common issue is for processes to fail with some variation of the following Docker-related error:
-
-```
-docker: failed to register layer: write /usr/lib/jvm/java-11-openjdk-amd64/lib/modules: **no space left on device**.
-```
-
-This is a fairly well-known problem, which can arise even when there is substantial free storage space accessible to your EC2 instance. Following the steps recommended [here](https://www.baeldung.com/linux/docker-fix-no-space-error) or [here](https://forums.docker.com/t/docker-no-space-left-on-device/69205) typically resolves the issue, either by deleting Docker assets to free up space (e.g. via `docker system prune --all --force`) or by giving Docker more space.
-
-We will add more common failure modes here as they get reported.
+Documentation:
+- [Installation instructions](docs/installation.md)
+- [AWS Batch setup](docs/batch.md)
+- [Usage instructions](docs/usage.md)
+- [Configuration files](docs/config.md)
+- [Run workflow details](docs/run.md)
+- [Index workflow details](docs/index.md)
+- [Pipeline outputs](docs/output.md)
+- [Troubleshooting](docs/troubleshooting.md)
+- [Version Schema](docs/version_schema.md)
+
+## Overview
+
+The pipeline currently consists of three workflows:
+
+- `run`: Performs the main analysis, including viral identification, taxonomic profiling, and optional BLAST validation.
+- `index`: Creates indices and reference files used by the `run` workflow.[^1]
+- `run_validation`: Performs BLAST validation on any set of reads to verify their taxonomic classification.[^2]
+
+[^1]: The `index` workflow is intended to be run first, after which many instantiations of the `run` workflow can use the same index output files.
+[^2]: The `run_validation` workflow is intended to be run after the `run` workflow if the optional BLAST validation was not selected during the `run` workflow. Typically, this workflow is run on a subset of the host viral reads identified in the `run` workflow, to evaluate the sensitivity and specificity of the viral identification process.
+
+The `run` workflow consists of four parts:
+
+- **Viral identification**: Reads from viruses infecting specific host taxa of interest (default: vertebrate viruses) are sensitively and specifically identified using a multi-step pipeline based around k-mer filtering, adapter trimming, read alignment, and taxonomic classification.
+- **Taxonomic profile**: A random subset of reads (default: 1M/sample) undergo adapter trimming, ribosomal classification, and taxonomic classification.
+- **QC**: The total number of reads is recorded, then a subset of reads (default 1M/sample) undergo quality assessment, both before and after adapter trimming.
+- **(Optional) BLAST validation**: Putative host viral reads from part 1 (either all reads or a subset) are checked against `core_nt` to evaluate the sensitivity and specificity of the viral identification process.
+
+The following diagram provides a high-level overview of the `run` workflow (each blue box is a subworkflow):
+
+```mermaid
+---
+title: RUN WORKFLOW
+config:
+  layout: horizontal
+---
+flowchart LR
+A(Raw reads) --> B[LOAD_SAMPLESHEET]
+B --> C[COUNT_TOTAL_READS] & D[EXTRACT_VIRAL_READS] & E[SUBSET_TRIM]
+D -.-> |Optional|F[BLAST_VIRAL]
+E --> I(Subset reads)
+E --> J(Subset trimmed reads)
+J --> G[RUN_QC] & H[PROFILE]
+I --> G[RUN_QC]
+%% Adjust layout by placing subgraphs in specific order
+subgraph "Viral identification"
+D
+end
+subgraph "Taxonomic Profile"
+H
+end
+subgraph "QC"
+C
+G
+end
+subgraph "BLAST Validation"
+F
+end
+```
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..6e8a3447
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,21 @@
+# Index workflow
+
+The index workflow generates static index files from reference data. These reference data and indices don't depend on the reads processed by the run workflow, so a set of indices built by the index workflow can be used by multiple instantiations of the run workflow — no need to rebuild indices for every set of reads. The index workflow should be run once per reasonable length of time, balancing two considerations: Many of these index/reference files are derived from publicly available reference genomes or other resources, and should thus be updated and re-run periodically as new versions of these become available; however, to keep results comparable across datasets analyzed with the run workflow, this should be done relatively rarely.
+
+Key inputs to the index workflow include:
+- A TSV specifying names and taxonomic IDs (taxids) for host taxa for which to search for host-infecting viruses.
+- A URL linking to a suitable Kraken2 database for taxonomic profiling (typically the [latest release](https://benlangmead.github.io/aws-indexes/k2) of the `k2_standard` database).
+- URLs for up-to-date releases of reference genomes for various common contaminant species that can confound the identification of HV reads (currently [human](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9606), [cow](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9913), [pig](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=9823), [carp](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=7962)[^carp], [mouse](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=10090), [*E. coli*](https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=562)).
+- URLs to sequence databases for small and large ribosomal subunits from [SILVA](https://www.arb-silva.de/download/arb-files/).
+- Up-to-date links to [VirusHostDB](https://www.genome.jp/virushostdb) and [NCBI Taxonomy](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_archive/).
+
+[^carp]: The carp genome is included as a last-ditch attempt to [capture any remaining Illumina adapter sequences](https://dgg32.medium.com/carp-in-the-soil-1168818d2191) before moving on to HV identification. I'm not especially confident this is helpful.
+
+Given these inputs, the index workflow:
+- Generates a TSV of viral taxa, incorporating taxonomic information from NCBI, and annotates each taxon with infection status for each host taxon of interest (using Virus-Host-DB).
+- Makes Bowtie2 indices from (1) all host-infecting viral genomes in Genbank[^genbank], (2) the human genome, (3) common non-human contaminants, plus BBMap indices for (2) and (3).
+- Downloads and extracts local copies of (1) the BLAST nt database, (2) the specified Kraken2 DB, (3) the SILVA rRNA reference files.
+
+[^genbank]: Excluding transgenic, contaminated, or erroneous sequences, which are excluded according to a list of sequence ID patterns specified in the config file.
+
+Run the index workflow by setting `mode = "index"` in the relevant config file. For more information, see `workflows/index.nf` and the associated subworkflows and modules.
\ No newline at end of file
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
new file mode 100644
index 00000000..742842c8
--- /dev/null
+++ b/docs/troubleshooting.md
@@ -0,0 +1,53 @@
+# Troubleshooting
+
+We break down the issues that can arise into two categories:
+
+- Nextflow isn't able to run
+- Nextflow runs then fails
+
+We discuss each of these issues below.
+
+## Nextflow isn't able to run
+
+**The most common sources of errors are AWS permission issues.** Whether you're using the `batch` profile or not, you should make sure you have all AWS permissions, as specified in [our Batch tutorial](./batch.md#0-check-your-permissions).
+
+Next, make sure Nextflow has access to your AWS credentials (if not you may get `AccessDenied` errors):
+
+```
+eval "$(aws configure export-credentials --format env)"
+```
+
+## Nextflow runs then fails
+
+
+### None of my jobs are running
+
+The first time a pipeline is run on any dataset, it can take a while for the jobs to start (10-15 minutes at most). If, after this period of time, none of your jobs have started, then this may be another AWS permission issue.
+
+### Docker image failure
+
+Another common issue is for processes to fail with some variation of the following Docker-related error:
+
+```
+docker: failed to register layer: write /usr/lib/jvm/java-11-openjdk-amd64/lib/modules: **no space left on device**.
+```
+
+This is a fairly well-known problem, which can arise even when there is substantial free storage space accessible to your EC2 instance. Following the steps recommended [here](https://www.baeldung.com/linux/docker-fix-no-space-error) or [here](https://forums.docker.com/t/docker-no-space-left-on-device/69205) typically resolves the issue, either by deleting Docker assets to free up space (e.g. via `docker system prune --all --force`) or by giving Docker more space.
+
+### Insufficient resources
+
+This is an issue we're actively working on. Sometimes your jobs may fail due to insufficient memory or CPU resources. To avoid this, you can either change your overall resources or change the resources for a specific process:
+
+- **Change your overall resources**: You can change the resources for your entire pipeline by editing the `configs/resources.config` file. However, note that this will probably also require you to change the reference files and indices generated by the `index` workflow. [See here for more information on compute resource requirements](./usage.md#compute-resource-requirements).
+- **Change the resources for a specific process**: You can change the resources for a specific process by changing the label of that process to another label that is defined in the `configs/resources.config` file, or by defining a new label in `configs/resources.config` (see the sketch below).
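+
+As a rough illustration, a per-process override in the `nextflow.config` of your launch directory might look like the following sketch. This is an assumption-laden example rather than the pipeline's own configuration: the label name `large_mem` and the resource values are hypothetical, so check `configs/resources.config` for the labels the pipeline actually defines before adapting it.
+
+```
+process {
+    // Hypothetical label name -- replace with a label that already exists in
+    // configs/resources.config, or with one you have attached to a process yourself.
+    withLabel: 'large_mem' {
+        cpus   = 16
+        memory = '64 GB'
+    }
+}
+```
+
+Settings applied through a `withLabel:` selector override the defaults only for processes carrying that label, leaving all other processes untouched.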
+
+
+### Rate limit exceeded
+
+Occasionally, you may encounter the following error:
+
+```
+Task failed to start - CannotPullImageManifestError: Error response from daemon: toomanyrequests: Request exceeded pull rate limit for IP XX.XXX.XX.XX
+```
+
+This is an issue with AWS Batch. The solution is to restart your run with the nextflow `-resume` option.
\ No newline at end of file

From a0cd96ce955cc5f6c54f3b318440032afac815c1 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Fri, 24 Jan 2025 15:20:21 +0000
Subject: [PATCH 18/34] Docs

---
 docs/output.md | 107 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 docs/output.md

diff --git a/docs/output.md b/docs/output.md
new file mode 100644
index 00000000..1f4cbe8c
--- /dev/null
+++ b/docs/output.md
@@ -0,0 +1,107 @@
+# Outputs
+
+If the pipeline runs to completion, the following output files are expected. In the future, we will add more specific information about the outputs, including in-depth descriptions of the columns in the output files.
+
+All pipeline output can be found in the `output` directory, which is broken into four subdirectories:
+
+- `input`: Directory containing saved input information (useful for trying to reproduce someone else's results)
+- `logging`: Log files containing meta-level information about the pipeline run itself.
+- `intermediates`: Intermediate files produced by key stages in the run workflow, saved for nonstandard downstream analysis.
+- `results`: Directory containing processed results files for standard downstream analysis.
+
+## Run workflow
+
+Each main heading gives the folder name, and the subheadings describe each file's usage. If a file is not located directly in that folder, its relative path is given.
+
+### `input`
+
+- `adapters.fasta`: FASTA file of adapter sequences used for adapter screening.
+- `run-params.json`: JSON file giving all the parameters passed to the pipeline.
+- `index-params.json`: JSON file giving parameters used to generate index directory (`params.ref_dir`).
+- `samplesheet.csv`: Copy of the samplesheet file used to configure the pipeline (specified by `params.sample_sheet`).
+
+### `logging`
+
+- `pipeline-version.txt`: Version of the pipeline used for the run.
+- `time.txt`: Start time of the run.
+- `trace.txt`: Tab-delimited log of all the information for each task run in the pipeline, including runtime, memory usage, exit status, etc. Can be used to create an execution timeline using the script `bin/plot-timeline-script.R` after the pipeline has finished running. More information regarding the trace file format can be found [here](https://www.nextflow.io/docs/latest/reports.html#trace-file).
+- `pipeline-version-index.txt`: Version of pipeline used to generate index directory (`params.ref_dir`).
+
+### `intermediates`
+
+- `reads/raw_viral/*`: Directory containing raw reads corresponding to those reads that survive initial viral screening with BBDuk.
+
+### `results`
+
+#### `qc`
+- `total_reads_qc.tsv.gz`: Total number of raw reads in each sample.
+- `qc_basic_stats.tsv.gz`: Summary statistics for each subset sample before and after adapter trimming, including:
+    - GC content (`percent GC`);
+    - Average read length (`mean_seq_len`);
+    - Number of read pairs (`n_read pairs`);
+    - Approximate number of base pairs in reads (`n_bases_approx`);
+    - Percent duplicates as measured by FASTQC (`percent_duplicates`);
+    - Pass/fail scores for each test conducted by FASTQC.
+- `qc_adapter_stats.tsv.gz`: Adapter statistics calculated by FASTQC for subset sample before and after adapter trimming, given as a percentage of reads containing adapter content (`pc_adapters`) at each position along the read (`position`) for each adapter detected (`adapter`) for each read in the read pair (`read_pair`). +- `qc_quality_base_stats.tsv.gz`: Per-base read-quality statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the mean Phred score (`mean_phred_score`) at each position along the read (`position`) for each read in the read pair (`read_pair`). +- `qc_quality_sequence_stats.tsv.gz`: Per-sequence read-quality statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the number of reads (`n_sequences`) with a given mean Phred score (`mean_phred_score`) for each read in the read pair (`read_pair`). + +#### `viral identification` +- `virus_hits_db.tsv.gz`: TSV output by the viral identification phase, giving information about each read pair assigned to a host-infecting virus. +- `virus_clade_counts.tsv.gz`: Summary of the previous file giving the number of HV read pairs mapped to each viral taxon. Includes both read pairs mapped directly to that taxon (`n_reads_direct`) and to that taxon plus all descendent taxa (`n_reads_clade`). + +#### `taxonomic identification` +- `kraken_reports_merged.tsv.gz`: Kraken output reports in TSV format, labeled by sample and ribosomal status for subset trimmed samples. +- `bracken_reports_merged.tsv.gz`: Bracken output reports in TSV format, labeled by sample and ribosomal status for subset trimmed samples. + +#### `blast` +- `blast_hits_paired.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: + - The number of reads in the read pair with high-scoring matches to that taxid (`n_reads`). + - The best bitscores of alignments to that taxid for each matching read (`bitscore_max` and `bitscore_min`)[^bitscore]. +- `blast_forward_only.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: +- `blast_reverse_only.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: + +[^bitscore]: If only one read aligns to the target, these two fields will be identical. If not, they will give the higher and lower of the best bitscores for the two reads in the pair.. + + +## Index workflow + +Main heading represents the folder name, and subheadings describes the tool that consumes the file. Files that are consumed by multiple tools or are not consumed by any tools are put in the `General` subheading. If the file is not in the heading folder name, the relative path is given. + +### `input` + +- `index-params.json`: JSON file giving all the parameters passed to the pipeline (useful for trying to reproduce someone else's results). + +### `logging` + +- `pipeline-version.txt`: Version of the pipeline with which index directory was created. +- `time.txt`: Start time of index workflow run. +- `trace.txt`: Nextflow trace file containing logging information for each process performed during the workflow run. + +### `results` + +#### `General` + +- `total-virus-db-annotated.tsv.gz`: Database generated from NCBI taxonomy and Virus-Host-DB giving taxonomy and host-infection information for each viral taxon. +- `taxonomy-nodes.dmp`: Taxonomy dump file from NCBI mapping between taxids and their parents in the NCBI taxonomy tree structure. 
+- `taxonomy-names.dmp`: Taxonomy dump file from NCBI mapping between taxids and taxon names. + +#### `BLAST` + +- `core_nt`: Directory containing extracted BLAST database files for BLASTing against core_nt. + +#### `Bowtie2` + +- `bt2-virus-index`: Directory containing Bowtie2 index for host-infecting viral genomes. +- `bt2-human-index`: Directory containing Bowtie2 index for the human genome. +- `bt2-other-index`: Directory containing Bowtie2 index for other contaminant sequences. +- `virus-genome-metadata-gid.tsv.gz`: Genome metadata file generated during download of HV genomes from viral Genbank, annotated additionally with Genome IDs used by Bowtie2 (allowing mapping between genome ID and taxid). + +#### `Kraken2` + +- `kraken_db`: Directory containing Kraken2 reference database (default: Most recent version of Standard). + +#### `BBduk` + +- `virus-genomes-filtered.fasta.gz`: FASTA file containing host-infecting viral genomes downloaded from viral Genbank (filtered to remove transgenic, contaminated, or erroneous sequences). +- `ribo-ref-concat.fasta.gz`: Reference database of ribosomal LSU and SSU sequences from SILVA. From e946b8d363908ad0bc139463805c6813bd3a811c Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 24 Jan 2025 15:40:38 +0000 Subject: [PATCH 19/34] removing weird spacing. --- docs/installation.md | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 38be49d9..c29a708e 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -72,7 +72,7 @@ This instance type matches the GitHub Actions virtual machines where CI/CD tests To run the tests: ```bash cd /path/to/repo -nf-test test tests/ +nf-test test ``` ## 5. Prepare compute resources for running the pipeline on real data @@ -149,19 +149,3 @@ nextflow run -resume -profile REPO_DIR ``` Once the pipeline is complete, output and logging files will be available in the `output` subdirectory of the base directory specified in the config file. - - - - - - - - - - - - - - -### Compute resource requirements - From febbdb610af51f684025cb33e6c312e7b5c850df Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 24 Jan 2025 16:25:02 +0000 Subject: [PATCH 20/34] Fixed spelling mistakes --- docs/config.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/config.md b/docs/config.md index 48081824..9140a687 100644 --- a/docs/config.md +++ b/docs/config.md @@ -1,10 +1,10 @@ # Configurations -Configuartions are specified by `.config` files. These files are used to specify parameters and other configuration options used by Nextflow in executing the pipeline. +Configurations are specified by `.config` files. These files are used to specify parameters and other configuration options used by Nextflow in executing the pipeline. All configurations are stored in the `configs` directory. -# Workflow specific configurations +# Workflow-specific configurations ## Run workflow (`configs/run.config`) From e9eea6e55300a040299b3cebb99d7afe9f0dc97f Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 24 Jan 2025 16:56:34 +0000 Subject: [PATCH 21/34] Remove stale param. 
--- CHANGELOG.md | 1 + configs/run.config | 1 - configs/run_dev_se.config | 1 - subworkflows/local/extractViralReads/main.nf | 1 - test-data/nextflow.config | 1 - tests/run.config | 1 - tests/run_dev_ont.config | 1 - tests/run_dev_se.config | 1 - workflows/run.nf | 2 +- 9 files changed, 2 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b85ee27..4f718635 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - Implement masking of viral genome reference in index workflow with MASK_GENOME_FASTA to remove adapter, low-entropy and repeat sequences. - Remove TRIMMOMATIC and BBMAP from the pipeline. - Fixed bug in extractUnconcReadID that would cause the pipeline to fail if it contained the string 'YT' in the read id. +- Remove `params.quality_encoding` as it was used only by TRIMMOMATIC # v2.6.0.0 - Updated version to reflect the new versioning scheme, which is described in `docs/version_schema.md`. diff --git a/configs/run.config b/configs/run.config index 467e3b12..d0441572 100644 --- a/configs/run.config +++ b/configs/run.config @@ -21,7 +21,6 @@ params { n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling bt2_score_threshold = 20 // Normalized score threshold for calling a host-infecting virus read (typically 15 or 20) blast_viral_fraction = 0 // Fraction of putative host-infecting virus reads to BLAST vs nt (0 = don't run BLAST) - quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64) fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2) host_taxon = "vertebrate" diff --git a/configs/run_dev_se.config b/configs/run_dev_se.config index b5d05880..0407b7d7 100644 --- a/configs/run_dev_se.config +++ b/configs/run_dev_se.config @@ -22,7 +22,6 @@ params { bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20) blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST) kraken_memory = "128 GB" // Memory needed to safely load Kraken DB - quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64) fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2) host_taxon = "vertebrate" } diff --git a/subworkflows/local/extractViralReads/main.nf b/subworkflows/local/extractViralReads/main.nf index d9a1d1cb..3280f187 100644 --- a/subworkflows/local/extractViralReads/main.nf +++ b/subworkflows/local/extractViralReads/main.nf @@ -48,7 +48,6 @@ workflow EXTRACT_VIRAL_READS { min_kmer_hits k bbduk_suffix - encoding fuzzy_match grouping single_end diff --git a/test-data/nextflow.config b/test-data/nextflow.config index 467e3b12..d0441572 100644 --- a/test-data/nextflow.config +++ b/test-data/nextflow.config @@ -21,7 +21,6 @@ params { n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling bt2_score_threshold = 20 // Normalized score threshold for calling a host-infecting virus read (typically 15 or 20) blast_viral_fraction = 0 // Fraction of putative host-infecting virus reads to BLAST vs nt (0 = don't run BLAST) - quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64) fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through 
alignment (0 = exact matching; options are 0, 1, or 2) host_taxon = "vertebrate" diff --git a/tests/run.config b/tests/run.config index dc1ef93d..c6d68ffb 100644 --- a/tests/run.config +++ b/tests/run.config @@ -23,7 +23,6 @@ params { n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling bt2_score_threshold = 20 // Normalized score threshold for calling a host-infecting virus read (typically 15 or 20) blast_viral_fraction = 1 // Fraction of putative host-infecting virus reads to BLAST vs nt (0 = don't run BLAST) - quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64) fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2) host_taxon = "vertebrate" blast_db_prefix = "nt_others" diff --git a/tests/run_dev_ont.config b/tests/run_dev_ont.config index e36229cc..8b32ab6d 100644 --- a/tests/run_dev_ont.config +++ b/tests/run_dev_ont.config @@ -24,7 +24,6 @@ params { bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20) blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST) kraken_memory = "128 GB" // Memory needed to safely load Kraken DB - quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64) fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2) host_taxon = "vertebrate" diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config index 7174f7b5..1846c3c8 100644 --- a/tests/run_dev_se.config +++ b/tests/run_dev_se.config @@ -22,7 +22,6 @@ params { bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20) blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST) kraken_memory = "128 GB" // Memory needed to safely load Kraken DB - quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64) fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2) host_taxon = "vertebrate" diff --git a/workflows/run.nf b/workflows/run.nf index fb7a0f0a..1471166d 100644 --- a/workflows/run.nf +++ b/workflows/run.nf @@ -39,7 +39,7 @@ workflow RUN { COUNT_TOTAL_READS(samplesheet_ch) // Extract and count human-viral reads - EXTRACT_VIRAL_READS(samplesheet_ch, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "1", "24", "viral", "${params.quality_encoding}", "${params.fuzzy_match_alignment_duplicates}", params.grouping, params.single_end) + EXTRACT_VIRAL_READS(samplesheet_ch, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "1", "24", "viral", "${params.fuzzy_match_alignment_duplicates}", params.grouping, params.single_end) // Process intermediate output for chimera detection raw_processed_ch = EXTRACT_VIRAL_READS.out.bbduk_match.join(samplesheet_ch, by: 0) From 3bb7daa42e2ffe5f0b5c91e7151f5925a6593291 Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Fri, 24 Jan 2025 15:44:16 -0500 Subject: [PATCH 22/34] Edited config.md --- docs/config.md | 78 ++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 38 deletions(-) diff --git 
a/docs/config.md b/docs/config.md index 48081824..7887929e 100644 --- a/docs/config.md +++ b/docs/config.md @@ -1,46 +1,48 @@ -# Configurations +# Configuration files -Configuartions are specified by `.config` files. These files are used to specify parameters and other configuration options used by Nextflow in executing the pipeline. +Nextflow configuration is controlled by `.config` files, which specify parameters and other options used in executing the pipeline. -All configurations are stored in the `configs` directory. +All configuration files used in the pipeline are stored in the `configs` directory. To configure a specific pipeline run, copy the appropriate config file for that pipeline mode (e.g. `run.config`) into the launch directory, rename it to `nextflow.config`, and edit it as appropriate. That config file will in turn call other, standard config files included in the `configs` directory. -# Workflow specific configurations +The rest of this page describes the specific options present in each config file, with a focus on those intended to be copied and edited by users. -## Run workflow (`configs/run.config`) +## Run workflow configuration (`configs/run.config`) -- `params.mode = "run"`: This instructs the pipeline to execute the [core run workflow](../workflows/run.nf). -- `params.base_dir`: The parent directory for the pipeline working and output directories. -- `params.ref_dir`: The directory containing the outputs of the `index` workflow. -- `params.sample_sheet`: The path to the sample sheet. -- `params.adapters`: The path to the adapter file for adapter trimming. -- `params.grouping`: Whether to group samples by the `group` column in the sample sheet. -- `params.n_reads_profile`: The number of reads per sample to run through taxonomic profiling. -- `params.bt2_score_threshold`: The normalized score threshold for calling a host-infecting virus read (typically 15 or 20). -- `params.blast_viral_fraction`: The fraction of putative host-infecting virus reads to BLAST vs nt (0 = don't run BLAST). -- `params.quality_encoding`: The FASTQ quality encoding (probably phred33, maybe phred64). -- `params.fuzzy_match_alignment_duplicates`: Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2). -- `params.host_taxon`: The taxon to use for host-infecting virus identification. -- `params.blast_db_prefix`: The prefix for the BLAST database to use for host-infecting virus identification. +This configuration file controls the pipeline's main RUN workflow. Its options are as follows: + +- `params.mode = "run"` [str]: This instructs the pipeline to execute the [core run workflow](./workflows/run.nf). +- `params.base_dir` [str]: Path to the parent directory for the pipeline working and output directories. +- `params.ref_dir` [str]: Path to the directory containing the outputs of the [`index` workflow](./docs/index.md). +- `params.sample_sheet` [str]: Path to the [sample sheet](./docs/usage.md#11-the-sample-sheet) used for the pipeline run. +- `params.adapters` [str]: Path to the adapter file for adapter trimming (default [`ref/adapters.fasta`](./ref/adapters.fasta). +- `params.grouping` [bool]: Whether to group samples by the `group` column in the sample sheet. +- `params.n_reads_profile` [int]: The number of reads per sample to run through taxonomic profiling (default 1 million). 
+- `params.bt2_score_threshold` [float]: The length-normalized Bowtie2 score threshold above which a read is considered a valid hit for a host-infecting virus (typically 15 or 20). +- `params.blast_viral_fraction` [float]: The fraction of putative host-infecting virus reads to validate with BLASTN (0 = don't run BLAST). +- `params.fuzzy_match_alignment_duplicates` [int]: Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2). +- `params.host_taxon` [str]: The taxon to use for host-infecting virus identification with Kraken2. +- `params.blast_db_prefix` [str]: The prefix for the BLAST database to use for host-infecting virus identification (should match the index workflow's `params.blast_db_name`). +- `process.queue` [str]: The [AWS Batch job queue](./docs/batch.md) to use for this pipeline run. ## Index workflow (`configs/index.config`) -- `params.mode = "index"`: This instructs the pipeline to execute the [index workflow](../workflows/index.nf). -- `params.base_dir`: The parent directory for the pipeline working and output directories. -- `params.taxonomy_url`: The URL for the NCBI taxonomy dump. -- `params.virus_host_db_url`: The URL for the viral host database. -- `params.human_url`: The URL used for the human genome, which is used to build the human index which we use for screening out human reads. -- `params.genome_urls`: The URLs for the genomes that are common contaminants, which is used to build out the other contaminants index which we use for screening out common contaminants. -- `params.ssu_url`: The URL for the SILVA SSU reference database, which we use for building the ribosomal index to classify reads as ribosomal or not. -- `params.lsu_url`: The URL for the SILVA LSU reference database, which we use for building the ribosomal index to classify reads as ribosomal or not. -- `params.host_taxon_db`: The path to the host taxon database, which we use to establish a host-taxon relationship for the viral genome database. -- `params.contaminants`: The path to the reads that are common contaminants, which we screen out for. -- `params.adapters`: The path to the common adapters which we use to clean the viral reference genomes that we screen for. -- `params.genome_patterns_exclude`: The path to the genome patterns to exclude. -- `params.kraken_db`: The path to the Kraken reference database which we use to taxonomically classify all reads. -- `params.blast_db_name`: The name of the BLAST database to use which is used during BLAST validation. -- `params.ncbi_viral_params`: The parameters to use for the NCBI viral genome database which we use to build out our viral indices for BBDuk and Bowtie2. -- `params.virus_taxid`: The taxid for the virus to use for building the viral genome database. -- `params.viral_taxids_exclude`: The taxids to exclude from the viral genome database. -- `params.host_taxa_screen`: The host taxa to screen for when building the viral genome database. -- `params.kraken_memory`: Initalizes the `run` workflow params to avoid warnings (DO NOT CHANGE) -- `params.classify_dedup_subset`: Initalizes the `run` workflow params to avoid warnings (DO NOT CHANGE) +- `params.mode = "index"` [str]: This instructs the pipeline to execute the [index workflow](./workflows/index.nf). +- `params.base_dir` [str]: Path to the parent directory for the pipeline working and output directories. +- `params.taxonomy_url` [str]: URL for the NCBI taxonomy dump to be used in index generation. 
+- `params.virus_host_db_url` [str]: URL for Virus-Host DB. +- `params.human_url` [str]: URL for downloading the human genome in FASTA format, which is used in index construction for contaminant screening. +- `params.genome_urls` [list(str)]: URLs for downloading other common contaminant genomes. +- `params.ssu_url` [str]: URL for the SILVA SSU reference database, used in ribosomal classification. +- `params.lsu_url` [str]: URL for the SILVA LSU reference database, used in ribosomal classification. +- `params.host_taxon_db` [str]: Path to a TSV mapping host taxon names to taxids (default: [`ref/host-taxa.tsv`](./ref/host-taxa.tsv). +- `params.contaminants` [str]: Path to a local file containing other contaminant genomes to exclude during contaminant filtering (default [`ref/contaminants.fasta.gz`](./ref/contaminants.fasta.gz). +- `params.adapters` [str]: Path to the adapter file for adapter masking during reference DB generation (default [`ref/adapters.fasta`](./ref/adapters.fasta). +- `params.genome_patterns_exclude` [str]: Path to a text file specifying string patterns to hard-exclude genomes during viral genome DB generation (e.g. transgenic sequences) (default [`ref/hv_patterns_exclude.txt`](./ref/hv_patterns_exclude.txt). +- `params.kraken_db` [str]: Path to pre-generated Kraken2 reference database (we use the Standard database by default) +- `params.blast_db_name` [str]: The name of the BLAST database to use for optional validation of taxonomic assignments (should match the run workflow's `params.blast_db_prefix`). +- `params.ncbi_viral_params` [str]: Parameters to pass to ncbi-genome-download when generating viral genome DB. Must at a minimum specify `--section genbank` or `--section refseq`. +- `params.virus_taxid` [int]: The NCBI taxid for the Viruses taxon (currently 10239). +- `params.viral_taxids_exclude` [str]: Space-separated string of taxids to hard-exclude from the list of host-infecting viruses. Currently includes phage taxa that Virus-Host DB erroneously classifies as human-infecting. +- `params.host_taxa_screen`: Space-separated list of host taxon names to screen for when building the viral genome database. Should correspond to taxa included in `params.host_taxon_db`. +- `params.kraken_memory`: Placeholder to initialize `run` workflow params to avoid warnings +- `params.classify_dedup_subset`: Placeholder to initialize `run` workflow params to avoid warnings From 9f9e979e36abfafeea8f4a084faabf9a8ae283e9 Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Fri, 24 Jan 2025 16:05:39 -0500 Subject: [PATCH 23/34] Edited installation docs --- docs/installation.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index c29a708e..42f02062 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -58,20 +58,19 @@ docker run hello-world Clone this repo into a new directory as normal. -## 4. Run the pipeline locally via our test suite +## 4. Run tests -To verify the pipeline works correctly, run the test suite locally. You'll need: +If possible, we recommend validating the pipeline's basic functionality in your hands by running our test suite. To do this, you'll need sufficient resources on your machine to run our tests locally. 
You'll need: +- 4 CPU cores +- 14GB RAM +- At least 32GB storage sapce -- **Recommended EC2 Instance:** `m5.xlarge` - - 4 CPU cores - - 14GB RAM - - At least 32GB EBS storage +> [!TIP] +> We recommend running the test suite on an `m5.xlarge`, to most closely match the conditions under which our CI/CD tests run with Github Actions. However, any Linux machine with sufficient resources should work. -This instance type matches the GitHub Actions virtual machines where CI/CD tests run. +To run the tests, clone this repository onto your machine, navigate to the repo directory, and run -To run the tests: ```bash -cd /path/to/repo nf-test test ``` From c9838e67d07db6fa1e1d1970c6e225f5ef27bd52 Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Fri, 24 Jan 2025 16:18:18 -0500 Subject: [PATCH 24/34] Edited troubleshooting.md --- docs/troubleshooting.md | 43 +++++++++-------------------------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 742842c8..ce7d7b6e 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -1,30 +1,16 @@ # Troubleshooting -We break down the issues that can arise into two categories: +## Permission issues -- Nextflow isn't able to run -- Nextflow runs then fails +When attempting to run a released version of the pipeline, the most common sources of errors are AWS permission issues. Before debugging a persistent error in-depth, make sure you have all the permissions required. When running the pipeline on AWS Batch, the necessary permissions are specified in [our Batch tutorial](./batch.md#step-0-set-up-your-aws-credentials). -We discuss each of these issues below. - -## Nextflow isn't able to run - -**The most common sources of errors are AWS permission issues.** Whether you're using the `batch` profile or not, you should make sure you have all AWS permissions, as specified in [our Batch tutorial](./batch.md#step-0-set-up-your-aws-credentials). - -Next, make sure Nextflow has access to your AWS credentials (if not you may get `AccessDenied` errors): +Once you've verified you have the required permissions, make sure Nextflow has access to your AWS credentials (if not you may get `AccessDenied` errors): ``` eval "$(aws configure export-credentials --format env)" ``` -## Nextflow runs then fails - - -### None of my jobs are running - -The first time a pipeline is run on any dataset, it can take a while for the jobs to start (10-15 minutes at most). If after this period of time, none of your jobs have started, then this may be another AWS permission issue. - -### Docker image failure +## Docker image failures Another common issue is for processes to fail with some variation of the following Docker-related error: @@ -34,20 +20,9 @@ docker: failed to register layer: write /usr/lib/jvm/java-11-openjdk-amd64/lib/m This is a fairly well-known problem, which can arise even when there is substantial free storage space accessible to your EC2 instance. Following the steps recommended [here](https://www.baeldung.com/linux/docker-fix-no-space-error) or [here](https://forums.docker.com/t/docker-no-space-left-on-device/69205) typically resolves the issue, either by deleting Docker assets to free up space (e.g. via `docker system prune --all --force`) or by giving Docker more space. -### Insufficient resources - -This an issue we're actively working on. Sometimes your jobs may fail due to insufficient memory or CPU resources. 
To avoid this, you can either change your overall resources or change the resources for a specific process: - -- **Change your overall resources**: You can change the resources for your entire pipeline by editing the `configs/resources.config` file. However, note that this will probably also require you to change the reference files and indices generated by the `index` workflow. [See here for more information on compute resource requirements](./usage#compute-resource-requirements). -- **Change the resources for a specific process**: You can change the resources for a specific process by changing the label of that process to another label that is in the `configs/resources.config` file or you can define a new label in the `configs/resources.config` file. - - -### Rate limit exceeded - -Occassionally, you may encounter the following error: - -``` -Task failed to start - CannotPullImageManifestError: Error response from daemon: toomanyrequests: Request exceeded pull rate limit for IP XX.XXX.XX.XX -``` +## Resource constraint errors -This is an issue with AWS Batch. The solution is to restart your run with the nextflow `-resume` option. \ No newline at end of file +Jobs may sometimes fail due to insufficient memory or CPU availability, especially on very large datasets or small instances. To fix this, you can: +- **Increase resource allocations in `configs/resources.config`.** This will alter the resources available to all processes with a given tag (e.g. "small"). +- **Increase resource allocation to a specific process.** You can do this by editing the process in the relevant Nextflow file, most likely found at `modules/local/MODULE_NAME/main.nf`. +Note that in some cases it may not be possible to allocate enough resources to meet the needs of a given process, especially on a resource-constrained machine. In this case, you will need to use a smaller reference file (e.g. a smaller Kraken reference DB) or obtain a larger machine. From 2f65912a8c39018b7113dea722d34f64c907e119 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 24 Jan 2025 21:37:22 +0000 Subject: [PATCH 25/34] removed a folder that was used by FASTP that I forgot to remove. --- main.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/main.nf b/main.nf index 2225cf1a..5544ab3f 100644 --- a/main.nf +++ b/main.nf @@ -28,10 +28,6 @@ output { path "results" tags nextflow_file_class: "publish", "nextflow.io/temporary": "false" } - reads_cleaned { - path "intermediates/reads/cleaned" - tags nextflow_file_class: "intermediate", "nextflow.io/temporary": "false" - } reads_raw_viral { path "intermediates/reads/raw_viral" tags nextflow_file_class: "intermediate", "nextflow.io/temporary": "false" From 2bfab2b79676907ccecacd71d6ea412fb9ffd2ad Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 24 Jan 2025 21:37:54 +0000 Subject: [PATCH 26/34] Moved compute resources to usage. Added the one additional file from the new pipeline to output folder. --- docs/installation.md | 25 ++----------------------- docs/output.md | 21 +++++++++++---------- docs/usage.md | 22 ++++++++++++++++++++++ 3 files changed, 35 insertions(+), 33 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 42f02062..8cc51c3e 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -74,28 +74,7 @@ To run the tests, clone this repository onto your machine, navigate to the repo nf-test test ``` -## 5. 
Prepare compute resources for running the pipeline on real data - -The pipeline has significant compute requirements: - -- **Base Requirements:** - - 128GB RAM - - 64 CPU cores - - Required for running the standard pipeline with the full Kraken database (128GB) - -- **Additional Requirements for BLAST:** - - 256GB RAM when running either the `run` or `run_validation` workflows with BLAST enabled - -Ensure your computing environment meets these requirements: -- For `ec2_local` or `ec2_s3` profiles: Select an EC2 instance with sufficient resources -- For `batch` profile: Configure AWS Batch to scale to these specifications - -> [!NOTE] -> The following recommendations are based on using the default reference files produced as a result of the `index` workflow. If you do not have access to this much compute, you can modify the `index` workflow to create smaller reference files, however this will reduce the accuracy of the pipeline. -> -> In the situation that you do use smaller reference files, you can modify the required resources by changing the resource specifications in the `config/resources.config` file. - -## 6. Run index/reference workflow +## 5. Run index/reference workflow > [!TIP] > If someone else in your organization already uses this pipeline, it's likely they've already run the index workflow and generated an output directory. If this is the case, you can reduce costs and increase reproducibility by using theirs instead of generating your own. If you want to do this, skip this step, and edit `configs/run.config` such that `params.ref_dir` points to `INDEX_DIR/output`. @@ -121,7 +100,7 @@ nextflow run PATH_TO_REPO_DIR -resume Wait for the workflow to run to completion; this is likely to take several hours at least. -## 7. Run the pipeline on test data +## 6. Run the pipeline on test data To confirm that the pipeline works in your hands, we recommend running it on a small test dataset, such as the one provided at `s3://nao-testing/gold-standard-test/raw/`, before running it on larger input data. To do this with our test dataset, follow the instructions below, or do it yourself according to the directions given [here](./docs/usage.md). diff --git a/docs/output.md b/docs/output.md index 1f4cbe8c..2e58c009 100644 --- a/docs/output.md +++ b/docs/output.md @@ -29,11 +29,11 @@ Main heading represents the folder name, and subheadings represent a description ### `intermediates` -- `reads/raw_viral/*`: Directory containing raw reads corresponding to those reads that survive initial viral screening with BBDuk. +- `/reads/raw_viral/*`: Directory containing raw reads corresponding to those reads that survive initial viral screening with BBDuk. ### `results` -#### `qc` +#### QC - `total_reads_qc.tsv.gz`: Total number of raw reads in each sample. - `qc_basic_stats.tsv.gz`: Summary statistics for each subset sample before and after adapter trimming, including: - GC content (`percent GC`); @@ -45,16 +45,17 @@ Main heading represents the folder name, and subheadings represent a description - `qc_adapter_stats.tsv.gz`: Adapter statistics calculated by FASTQC for subset sample before and after adapter trimming, given as a percentage of reads containing adapter content (`pc_adapters`) at each position along the read (`position`) for each adapter detected (`adapter`) for each read in the read pair (`read_pair`). 
- `qc_quality_base_stats.tsv.gz`: Per-base read-quality statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the mean Phred score (`mean_phred_score`) at each position along the read (`position`) for each read in the read pair (`read_pair`). - `qc_quality_sequence_stats.tsv.gz`: Per-sequence read-quality statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the number of reads (`n_sequences`) with a given mean Phred score (`mean_phred_score`) for each read in the read pair (`read_pair`). +- `qc_length_stats.tsv.gz`: Per-read length statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the number of reads (`n_sequences`) with a given read length (`read_length`) for each read in the read pair (`read_pair`). -#### `viral identification` +#### Viral identification - `virus_hits_db.tsv.gz`: TSV output by the viral identification phase, giving information about each read pair assigned to a host-infecting virus. - `virus_clade_counts.tsv.gz`: Summary of the previous file giving the number of HV read pairs mapped to each viral taxon. Includes both read pairs mapped directly to that taxon (`n_reads_direct`) and to that taxon plus all descendent taxa (`n_reads_clade`). -#### `taxonomic identification` +#### Taxonomic identification - `kraken_reports_merged.tsv.gz`: Kraken output reports in TSV format, labeled by sample and ribosomal status for subset trimmed samples. - `bracken_reports_merged.tsv.gz`: Bracken output reports in TSV format, labeled by sample and ribosomal status for subset trimmed samples. -#### `blast` +#### BLAST - `blast_hits_paired.tsv.gz`: Summarized BLASTN output for putative HV read pairs, giving, for each read pair and subject taxid: - The number of reads in the read pair with high-scoring matches to that taxid (`n_reads`). - The best bitscores of alignments to that taxid for each matching read (`bitscore_max` and `bitscore_min`)[^bitscore]. @@ -80,28 +81,28 @@ Main heading represents the folder name, and subheadings describes the tool that ### `results` -#### `General` +#### General - `total-virus-db-annotated.tsv.gz`: Database generated from NCBI taxonomy and Virus-Host-DB giving taxonomy and host-infection information for each viral taxon. - `taxonomy-nodes.dmp`: Taxonomy dump file from NCBI mapping between taxids and their parents in the NCBI taxonomy tree structure. - `taxonomy-names.dmp`: Taxonomy dump file from NCBI mapping between taxids and taxon names. -#### `BLAST` +#### BLAST - `core_nt`: Directory containing extracted BLAST database files for BLASTing against core_nt. -#### `Bowtie2` +#### Bowtie2 - `bt2-virus-index`: Directory containing Bowtie2 index for host-infecting viral genomes. - `bt2-human-index`: Directory containing Bowtie2 index for the human genome. - `bt2-other-index`: Directory containing Bowtie2 index for other contaminant sequences. - `virus-genome-metadata-gid.tsv.gz`: Genome metadata file generated during download of HV genomes from viral Genbank, annotated additionally with Genome IDs used by Bowtie2 (allowing mapping between genome ID and taxid). -#### `Kraken2` +#### Kraken2 - `kraken_db`: Directory containing Kraken2 reference database (default: Most recent version of Standard). -#### `BBduk` +#### BBduk - `virus-genomes-filtered.fasta.gz`: FASTA file containing host-infecting viral genomes downloaded from viral Genbank (filtered to remove transgenic, contaminated, or erroneous sequences). 
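Concretely, switching profiles only changes the `-profile` argument to the launch command. Using the `PATH_TO_REPO_DIR` placeholder from above, the three options look like this:

```bash
# Default: submit jobs via AWS Batch (equivalent to omitting -profile)
nextflow run PATH_TO_REPO_DIR -profile batch -resume

# Run everything on the local EC2 instance, using instance-linked storage
nextflow run PATH_TO_REPO_DIR -profile ec2_local -resume

# Run locally, but read and write working files to the configured S3 directory
nextflow run PATH_TO_REPO_DIR -profile ec2_s3 -resume
```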
- `ribo-ref-concat.fasta.gz`: Reference database of ribosomal LSU and SSU sequences from SILVA. diff --git a/docs/usage.md b/docs/usage.md index f51ec3bf..27b2963d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -76,6 +76,28 @@ The pipeline can be run in multiple ways by modifying various configuration vari To run the pipeline with a specified profile, run `nextflow run PATH_TO_REPO_DIR -profile PROFILE_NAME -resume`. Calling the pipeline without specifying a profile will run the `batch` profile by default. Future example commands in this README will assume you are using Batch; if you want to instead use a different profile, you'll need to modify the commands accordingly. +### Prepare compute resources for running the pipeline on real data + +The pipeline has significant compute requirements: + +- **Base Requirements:** + - 128GB RAM + - 64 CPU cores + - Required for running the standard pipeline with the full Kraken database (128GB) + +- **Additional Requirements for BLAST:** + - 256GB RAM when running either the `run` or `run_validation` workflows with BLAST enabled + +Ensure your computing environment meets these requirements: +- For `ec2_local` or `ec2_s3` profiles: Select an EC2 instance with sufficient resources +- For `batch` profile: Configure AWS Batch to scale to these specifications + +> [!NOTE] +> The following recommendations are based on using the default reference files produced as a result of the `index` workflow. If you do not have access to this much compute, you can modify the `index` workflow to create smaller reference files, however this will reduce the accuracy of the pipeline. +> +> In the situation that you do use smaller reference files, you can modify the required resources by changing the resource specifications in the `config/resources.config` file. + + ## 3. Running the pipeline After creating your sample sheet and config files and choosing a profile, navigate to the launch directory containing your config file. You can then run the pipeline as follows: From bb92515b3e41ec392b73e7e495e0eebe80c81ae7 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 24 Jan 2025 21:38:58 +0000 Subject: [PATCH 27/34] Added additional qc file to run workflow doc. --- docs/run.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/run.md b/docs/run.md index 0ca44e51..7c4606ac 100644 --- a/docs/run.md +++ b/docs/run.md @@ -137,12 +137,13 @@ A(Reads) --> B[FASTQ] B --> C[MULTIQC] C --> D[SUMMARIZE_MULTIQC_PAIR] D --> E[Process & merge output] -E --> F(QC basic stats) & G(QC adapter stats) & H(QC quality base stats) & I(QC quality sequence stats) +E --> F(QC basic stats) & G(QC adapter stats) & H(QC quality base stats) & I(QC quality sequence stats) & J(QC length stats) style A fill:#fff,stroke:#000 style F fill:#000,color:#fff,stroke:#000 style G fill:#000,color:#fff,stroke:#000 style H fill:#000,color:#fff,stroke:#000 style I fill:#000,color:#fff,stroke:#000 +style J fill:#000,color:#fff,stroke:#000 ``` 1. 
Run FASTQC on each pair of read files @@ -293,13 +294,15 @@ E --> F(Combined QC basic stats) E --> G(Combined QC adapter stats) E --> H(Combined QC quality base stats) E --> I(Combined QC quality sequence stats) +E --> J(Combined QC length stats) style A fill:#fff,stroke:#000 style C fill:#fff,stroke:#000 style F fill:#000,color:#fff,stroke:#000 style G fill:#000,color:#fff,stroke:#000 style H fill:#000,color:#fff,stroke:#000 style I fill:#000,color:#fff,stroke:#000 +style J fill:#000,color:#fff,stroke:#000 ``` -[^tsvs]: Specifically, `qc_basic_stats.tsv.gz`, `qc_adapter_stats.tsv.gz`, `qc_quality_base_stats.tsv.gz` and `qc_quality_sequence_stats.tsv.gz`. +[^tsvs]: Specifically, `qc_basic_stats.tsv.gz`, `qc_adapter_stats.tsv.gz`, `qc_quality_base_stats.tsv.gz`, `qc_quality_sequence_stats.tsv.gz` and `qc_length_stats.tsv.gz`. From 0d1d3946c0925da69ee3e98d3c2204a7b8201960 Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Fri, 24 Jan 2025 18:40:38 -0500 Subject: [PATCH 28/34] Edited usage.md --- docs/usage.md | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 27b2963d..f47df05c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -70,34 +70,13 @@ The pipeline can be run in multiple ways by modifying various configuration vari - This profile is the default and attempts to run the pipeline with AWS Batch. This is the most reliable and convenient way to run the pipeline, but requires significant additional setup (described [here](./batch.md)). Before running the pipeline using this profile, make sure `process.queue` in your config file is pointing to the correct Batch job queue. - `ec2_local`: **Requires the least setup, but is bottlenecked by your instance's compute, memory and storage.** - This profile attempts to run the whole pipeline locally on your EC2 instance, storing all files on instance-linked block storage. - - This is simple and can be relatively fast, but requires large CPU, memory and storage allocations. In particular, if you don't use an instance with very high memory, the pipeline is likely to fail when loading the Kraken2 reference DB. + - This is simple and can be relatively fast, but requires large CPU, memory and storage allocations: at least 128GB RAM, 64 CPU cores, and 256GB local storage are recommended, though the latter in particular is highly dependent on the size of your dataset. + - If running optional BLAST validation, at least 256GB RAM is needed to store the BLAST DB. - `ec2_s3`: **Avoids storage issues on your EC2 instance, but is still constrained by local compute and memory.** - This profile runs the pipeline on your EC2 instance, but attempts to read and write files to a specified S3 directory. This avoids problems arising from insufficient local storage, but (a) is significantly slower and (b) is still constrained by local compute and memory allocations. To run the pipeline with a specified profile, run `nextflow run PATH_TO_REPO_DIR -profile PROFILE_NAME -resume`. Calling the pipeline without specifying a profile will run the `batch` profile by default. Future example commands in this README will assume you are using Batch; if you want to instead use a different profile, you'll need to modify the commands accordingly. 
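If you're not sure whether a machine meets the `ec2_local` recommendations above, a quick check with standard Linux tools is usually enough (the thresholds are the ones quoted above; they can be relaxed if you use smaller reference files):

```bash
# Rough sanity check against the recommended minimums for the ec2_local profile
nproc     # CPU cores: >= 64 recommended
free -h   # memory: >= 128GB recommended (>= 256GB if running BLAST validation)
df -h .   # free storage in the launch directory: >= 256GB recommended
```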
-### Prepare compute resources for running the pipeline on real data - -The pipeline has significant compute requirements: - -- **Base Requirements:** - - 128GB RAM - - 64 CPU cores - - Required for running the standard pipeline with the full Kraken database (128GB) - -- **Additional Requirements for BLAST:** - - 256GB RAM when running either the `run` or `run_validation` workflows with BLAST enabled - -Ensure your computing environment meets these requirements: -- For `ec2_local` or `ec2_s3` profiles: Select an EC2 instance with sufficient resources -- For `batch` profile: Configure AWS Batch to scale to these specifications - -> [!NOTE] -> The following recommendations are based on using the default reference files produced as a result of the `index` workflow. If you do not have access to this much compute, you can modify the `index` workflow to create smaller reference files, however this will reduce the accuracy of the pipeline. -> -> In the situation that you do use smaller reference files, you can modify the required resources by changing the resource specifications in the `config/resources.config` file. - - ## 3. Running the pipeline After creating your sample sheet and config files and choosing a profile, navigate to the launch directory containing your config file. You can then run the pipeline as follows: @@ -125,4 +104,4 @@ Once the pipeline has finished, output and logging files will be available in th Running nextflow pipelines will create a large number of files in the working directory. To get rid of these files, you can remove them manually or by running the `nextflow clean` command in the launch directory. -If you are running the pipeline using `ec2_local` or `ec2_s3` profiles, you will also want to clean up the docker images and containers created by the pipeline as these can take up a lot of space. This can be done by running `docker system prune -a` which will remove all unused docker images and containers on your system. \ No newline at end of file +If you are running the pipeline using `ec2_local` or `ec2_s3` profiles, you will also want to clean up the docker images and containers created by the pipeline as these can take up a lot of space. This can be done by running `docker system prune -a` which will remove all unused docker images and containers on your system. From 9ac307236456f9d61bae59f4663120bc7b045c50 Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Fri, 24 Jan 2025 18:45:56 -0500 Subject: [PATCH 29/34] Edited README --- README.md | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 9b0dd18d..9f665681 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,29 @@ This Nextflow pipeline is designed to process metagenomic sequencing data, characterize overall taxonomic composition, and identify and quantify reads mapping to viruses infecting certain host taxa of interest. It was developed as part of the [Nucleic Acid Observatory](https://naobservatory.org/) project. 
-Documentation: -- [Installation instructions](docs/installation.md) -- [AWS Batch setup](docs/batch.md) -- [Usage instructions](docs/usage.md) -- [Configuration files](docs/configs.md) -- [Run workflow details](docs/run.md) -- [Index workflow details](docs/index.md) -- [Pipeline outputs](docs/output.md) -- [Troubleshooting](docs/troubleshooting.md) -- [Version Schema](docs/version_schema.md) +## Documentation + +- **Installation and usage:** + - [Installation instructions](docs/installation.md) + - [AWS Batch setup](docs/batch.md) + - [Usage instructions](docs/usage.md) + - [Troubleshooting](docs/troubleshooting.md) +- **Workflow details:** + - [Index workflow](docs/index.md) + - [Run workflow](docs/run.md) +- **Configuration and output:** + - [Configuration files](docs/configs.md) + - [Pipeline outputs](docs/output.md) +- **Other:** + - [Version Schema](docs/version_schema.md) ## Overview The pipeline currently consists of three workflows: +- `index`: Creates indices and reference files used by the `run` workflow. Intended to be run first. - `run`: Performs the main analysis, including viral identification, taxonomic profiling, and optional BLAST validation. -- `index`: Creates indices and reference files used by the `run` workflow.[^1] -- `run_validation`: Performs BLAST validation on any set of reads to verify their taxonomic classification.[^2] +- `run_validation`: Performs part of the `run` workflow dedicated to validation of taxonomic classification with BLAST.[^2] [^1]: The `index` workflow is intended to be run first, after which many instantiations of the `run` workflow can use the same index output files. [^2]: The `run_validation` workflow is intended to be run after the `run` workflow if the optional BLAST validation was not selected during the `run` workflow. Typically, this workflow is run on a subset of the host viral reads identified in the `run` workflow, to evaluate the sensitivity and specificity of the viral identification process. @@ -31,7 +36,7 @@ The `run` workflow consists of four parts: - **QC**: The total number of reads are recorded, then a subset of reads (default 1M/sample) undergo quality assessment, both before and after adapter trimming. - **(Optional) BLAST validation**: Putative host viral reads from part 1 (either all reads or a subset) are checked aginst `core_nt` to evaluate the sensitivity and specificity of the viral identification process. -The following diagram provides a high-level overview of the `run` workflow (each blue box is a subworkflow): +The following diagram provides a high-level overview of the `run` workflow (each blue box is a subworkflow). See the documentation links above for more information, including detailed [installation](./docs/installation.md) and [usage](./docs/usage.md) instructions. ```mermaid --- @@ -61,4 +66,4 @@ end subgraph "BLAST Validation" F end -``` \ No newline at end of file +``` From 429fac4c846e2df2b28f5a5f37af1c29f8c6bbff Mon Sep 17 00:00:00 2001 From: Will Bradshaw Date: Fri, 24 Jan 2025 18:47:22 -0500 Subject: [PATCH 30/34] Minor README edits --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9f665681..d940b50d 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,8 @@ This Nextflow pipeline is designed to process metagenomic sequencing data, chara The pipeline currently consists of three workflows: -- `index`: Creates indices and reference files used by the `run` workflow. Intended to be run first. 
-- `run`: Performs the main analysis, including viral identification, taxonomic profiling, and optional BLAST validation. +- [`index`](./docs/index.md): Creates indices and reference files used by the `run` workflow. Intended to be run first. +- [`run`](./docs/run.md): Performs the main analysis, including viral identification, taxonomic profiling, and optional BLAST validation. - `run_validation`: Performs part of the `run` workflow dedicated to validation of taxonomic classification with BLAST.[^2] [^1]: The `index` workflow is intended to be run first, after which many instantiations of the `run` workflow can use the same index output files. From 7d424458bb9ef2bca27b834cee71a5617ad2203d Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 27 Jan 2025 14:14:27 +0000 Subject: [PATCH 31/34] Added subset to front of file. --- subworkflows/local/runQc/main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/runQc/main.nf b/subworkflows/local/runQc/main.nf index 3db2f274..fd3124fb 100644 --- a/subworkflows/local/runQc/main.nf +++ b/subworkflows/local/runQc/main.nf @@ -32,10 +32,10 @@ workflow RUN_QC { multiqc_qbase_ch = qc_ch.map{ it[2] }.collect().ifEmpty([]) multiqc_qseqs_ch = qc_ch.map{ it[3] }.collect().ifEmpty([]) // 4. Merge MultiQC outputs - basic_out_ch = MERGE_MULTIQC_BASIC(multiqc_basic_ch, "qc_basic_stats") - adapt_out_ch = MERGE_MULTIQC_ADAPT(multiqc_adapt_ch, "qc_adapter_stats") - qbase_out_ch = MERGE_MULTIQC_QBASE(multiqc_qbase_ch, "qc_quality_base_stats") - qseqs_out_ch = MERGE_MULTIQC_QSEQS(multiqc_qseqs_ch, "qc_quality_sequence_stats") + basic_out_ch = MERGE_MULTIQC_BASIC(multiqc_basic_ch, "subset_qc_basic_stats") + adapt_out_ch = MERGE_MULTIQC_ADAPT(multiqc_adapt_ch, "subset_qc_adapter_stats") + qbase_out_ch = MERGE_MULTIQC_QBASE(multiqc_qbase_ch, "subset_qc_quality_base_stats") + qseqs_out_ch = MERGE_MULTIQC_QSEQS(multiqc_qseqs_ch, "subset_qc_quality_sequence_stats") emit: qc_basic = basic_out_ch qc_adapt = adapt_out_ch From 29208a4512ca6d658156249cb61f8177a777a4b0 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 27 Jan 2025 14:19:55 +0000 Subject: [PATCH 32/34] Updating documentation to add subset to the qc filenames, and fixing some output syntax. --- docs/output.md | 26 +++++++++++++------------- docs/run.md | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/output.md b/docs/output.md index 2e58c009..04886837 100644 --- a/docs/output.md +++ b/docs/output.md @@ -13,39 +13,39 @@ All pipeline output can be found in the `output` directory, which is broken into Main heading represents the folder name, and subheadings represent a description of the file's usage. If the file is not in the heading folder name, the relative path is given. -### `input` +### `input/` - `adapters.fasta`: FASTA file of adapter sequences used for adapter screening. - `run-params.json`: JSON file giving all the parameters passed to the pipeline. - `index-params.json`: JSON file giving parameters used to generate index directory (`params.ref_dir`). - `samplesheet.csv`: Copy of the samplesheet file used to configure the pipeline (specified by `params.sample_sheet`). -### `logging` +### `logging/` - `pipeline-version.txt`: Version of the pipeline used for the run. - `time.txt`: Start time of the run. - `trace.txt`: Tab delimited log of all the information for each task run in the pipeline including runtime, memory usage, exit status, etc. 
Can be used to create an execution timeline using the the script `bin/plot-timeline-script.R` after the pipeline has finished running. More information regarding the trace file format can be found [here](https://www.nextflow.io/docs/latest/reports.html#trace-file). - `pipeline-version-index.txt`: Version of pipeline used to generate index directory (`params.ref_dir`). -### `intermediates` +### `intermediates/` -- `/reads/raw_viral/*`: Directory containing raw reads corresponding to those reads that survive initial viral screening with BBDuk. +- `reads/raw_viral/*`: Directory containing raw reads corresponding to those reads that survive initial viral screening with BBDuk. -### `results` +### `results/` #### QC - `total_reads_qc.tsv.gz`: Total number of raw reads in each sample. -- `qc_basic_stats.tsv.gz`: Summary statistics for each subset sample before and after adapter trimming, including: +- `subset_qc_basic_stats.tsv.gz`: Summary statistics for each subset sample before and after adapter trimming, including: - GC content (`percent GC`); - Average read length (`mean_seq_len`); - Number of read pairs (`n_read pairs`); - Approximate number of base pairs in reads (`n_bases_approx`); - Percent duplicates as measured by FASTQC (`percent_duplicates`); - Pass/fail scores for each test conducted by FASTQC. -- `qc_adapter_stats.tsv.gz`: Adapter statistics calculated by FASTQC for subset sample before and after adapter trimming, given as a percentage of reads containing adapter content (`pc_adapters`) at each position along the read (`position`) for each adapter detected (`adapter`) for each read in the read pair (`read_pair`). -- `qc_quality_base_stats.tsv.gz`: Per-base read-quality statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the mean Phred score (`mean_phred_score`) at each position along the read (`position`) for each read in the read pair (`read_pair`). -- `qc_quality_sequence_stats.tsv.gz`: Per-sequence read-quality statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the number of reads (`n_sequences`) with a given mean Phred score (`mean_phred_score`) for each read in the read pair (`read_pair`). -- `qc_length_stats.tsv.gz`: Per-read length statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the number of reads (`n_sequences`) with a given read length (`read_length`) for each read in the read pair (`read_pair`). +- `subset_qc_adapter_stats.tsv.gz`: Adapter statistics calculated by FASTQC for subset sample before and after adapter trimming, given as a percentage of reads containing adapter content (`pc_adapters`) at each position along the read (`position`) for each adapter detected (`adapter`) for each read in the read pair (`read_pair`). +- `subset_qc_quality_base_stats.tsv.gz`: Per-base read-quality statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the mean Phred score (`mean_phred_score`) at each position along the read (`position`) for each read in the read pair (`read_pair`). +- `subset_qc_quality_sequence_stats.tsv.gz`: Per-sequence read-quality statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the number of reads (`n_sequences`) with a given mean Phred score (`mean_phred_score`) for each read in the read pair (`read_pair`). 
+- `subset_qc_length_stats.tsv.gz`: Per-read length statistics calculated by FASTQC for subset sample before and after adapter trimming, given as the number of reads (`n_sequences`) with a given read length (`read_length`) for each read in the read pair (`read_pair`).
#### Viral identification
- `virus_hits_db.tsv.gz`: TSV output by the viral identification phase, giving information about each read pair assigned to a host-infecting virus.
@@ -69,17 +69,17 @@ Main heading represents the folder name, and subheadings describe the tool that consumes the file. Files that are consumed by multiple tools or are not consumed by any tools are put in the `General` subheading. If the file is not in the heading folder name, the relative path is given.
-### `input`
+### `input/`
- `index-params.json`: JSON file giving all the parameters passed to the pipeline (useful for trying to reproduce someone else's results).
-### `logging`
+### `logging/`
- `pipeline-version.txt`: Version of the pipeline with which index directory was created.
- `time.txt`: Start time of index workflow run.
- `trace.txt`: Nextflow trace file containing logging information for each process performed during the workflow run.
-### `results`
+### `results/`
#### General
diff --git a/docs/run.md b/docs/run.md
index 7c4606ac..053b8e15 100644
--- a/docs/run.md
+++ b/docs/run.md
@@ -304,5 +304,5 @@ style I fill:#000,color:#fff,stroke:#000
style J fill:#000,color:#fff,stroke:#000
```
-[^tsvs]: Specifically, `qc_basic_stats.tsv.gz`, `qc_adapter_stats.tsv.gz`, `qc_quality_base_stats.tsv.gz`, `qc_quality_sequence_stats.tsv.gz` and `qc_length_stats.tsv.gz`.
+[^tsvs]: Specifically, `subset_qc_basic_stats.tsv.gz`, `subset_qc_adapter_stats.tsv.gz`, `subset_qc_quality_base_stats.tsv.gz`, `subset_qc_quality_sequence_stats.tsv.gz` and `subset_qc_length_stats.tsv.gz`.
From 6a9123ef935731af546f6ede4a56fd9f76544d2a Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Mon, 27 Jan 2025 14:35:23 +0000
Subject: [PATCH 33/34] Updating tests
---
 tests/main.nf.test | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/tests/main.nf.test b/tests/main.nf.test
index 12ec80f6..fa2e81ba 100644
--- a/tests/main.nf.test
+++ b/tests/main.nf.test
@@ -45,10 +45,11 @@ nextflow_pipeline {
assert snapshot(
path("${launchDir}/output/results/bracken_reports_merged.tsv.gz"),
path("${launchDir}/output/results/kraken_reports_merged.tsv.gz"),
- path("${launchDir}/output/results/qc_adapter_stats.tsv.gz"),
- path("${launchDir}/output/results/qc_basic_stats.tsv.gz"),
- path("${launchDir}/output/results/qc_quality_base_stats.tsv.gz"),
- path("${launchDir}/output/results/qc_quality_sequence_stats.tsv.gz"),
+ path("${launchDir}/output/results/subset_qc_adapter_stats.tsv.gz"),
+ path("${launchDir}/output/results/subset_qc_basic_stats.tsv.gz"),
+ path("${launchDir}/output/results/subset_qc_quality_base_stats.tsv.gz"),
+ path("${launchDir}/output/results/subset_qc_quality_sequence_stats.tsv.gz"),
+ path("${launchDir}/output/results/subset_qc_length_stats.tsv.gz"),
path("${launchDir}/output/results/virus_clade_counts.tsv.gz"),
path("${launchDir}/output/results/virus_hits_db.tsv.gz"),
path("${launchDir}/output/results/blast_hits_paired.tsv.gz"),
From 23d1f26e8169db081e6220858d1ae92cb39fb0d8 Mon Sep 17 00:00:00 2001
From: EC2 Default User
Date: Mon, 27 Jan 2025 14:48:54 +0000
Subject: [PATCH 34/34] Updated snapshot to have the length stats.
--- ...tats.tsv.gz => subset_qc_adapter_stats.tsv.gz} | Bin ..._stats.tsv.gz => subset_qc_basic_stats.tsv.gz} | Bin .../subset_qc_length_stats.tsv.gz | Bin 0 -> 407 bytes ...tsv.gz => subset_qc_quality_base_stats.tsv.gz} | Bin ...gz => subset_qc_quality_sequence_stats.tsv.gz} | Bin tests/main.nf.test.snap | 11 ++++++----- 6 files changed, 6 insertions(+), 5 deletions(-) rename test-data/gold-standard-results/{qc_adapter_stats.tsv.gz => subset_qc_adapter_stats.tsv.gz} (100%) rename test-data/gold-standard-results/{qc_basic_stats.tsv.gz => subset_qc_basic_stats.tsv.gz} (100%) create mode 100644 test-data/gold-standard-results/subset_qc_length_stats.tsv.gz rename test-data/gold-standard-results/{qc_quality_base_stats.tsv.gz => subset_qc_quality_base_stats.tsv.gz} (100%) rename test-data/gold-standard-results/{qc_quality_sequence_stats.tsv.gz => subset_qc_quality_sequence_stats.tsv.gz} (100%) diff --git a/test-data/gold-standard-results/qc_adapter_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_adapter_stats.tsv.gz similarity index 100% rename from test-data/gold-standard-results/qc_adapter_stats.tsv.gz rename to test-data/gold-standard-results/subset_qc_adapter_stats.tsv.gz diff --git a/test-data/gold-standard-results/qc_basic_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_basic_stats.tsv.gz similarity index 100% rename from test-data/gold-standard-results/qc_basic_stats.tsv.gz rename to test-data/gold-standard-results/subset_qc_basic_stats.tsv.gz diff --git a/test-data/gold-standard-results/subset_qc_length_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_length_stats.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..e105663aafffcbe57c11e6b827d2f39ff1644414 GIT binary patch literal 407 zcmb2|=3oE==DE`j<~17#xSYSe{-YkBX?%0W<0TV07B>E=Ke2Je=8JDw6&I=-KW5#( z(eL?J_hoe}_xD@XUU&7IzVXy1zUObPVxDiTd!g8Q%kkXyk2muDmpv}HpC=dZ9a}y1 zXWZ|vbvysn?hDkiyf|U=6xV4@0k^x}O}M;a&F9xo^G^TL4@%j8%KC}%$BvI4&n8ZD z|M8=>KORop|M=0`uI~8p_dkBf%Et@Oj~9+_P}|4oTf;hwNj75vvuS3A+U5VoSHpVJ zm}DasFms=2WPNsksW_qa{(QBMzb~^`WiSE-C)}I=_Pr#xL8IjfAZmU8 zE7hVimh)8C$4yOw8rpnKYF8XDJhR}qCC&cuv2$dhfSuhGQ84o$2b2L3>q&_GD6x-0 zM_cZ2cth&Z#06_AcqWPVJGM_bt`PZgh68&WUsgZo8OAB15VAd!shA=4XflMHu*QN1 iC}ViiWHK|)duIK}h5P6GhFNWEQTILbP9{W%fdK%rcEWuC literal 0 HcmV?d00001 diff --git a/test-data/gold-standard-results/qc_quality_base_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_quality_base_stats.tsv.gz similarity index 100% rename from test-data/gold-standard-results/qc_quality_base_stats.tsv.gz rename to test-data/gold-standard-results/subset_qc_quality_base_stats.tsv.gz diff --git a/test-data/gold-standard-results/qc_quality_sequence_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_quality_sequence_stats.tsv.gz similarity index 100% rename from test-data/gold-standard-results/qc_quality_sequence_stats.tsv.gz rename to test-data/gold-standard-results/subset_qc_quality_sequence_stats.tsv.gz diff --git a/tests/main.nf.test.snap b/tests/main.nf.test.snap index 10b94795..bdbbb0e9 100644 --- a/tests/main.nf.test.snap +++ b/tests/main.nf.test.snap @@ -3,10 +3,11 @@ "content": [ "bracken_reports_merged.tsv.gz:md5,d116fc06ea955409463686df223ec413", "kraken_reports_merged.tsv.gz:md5,5d04e8270bfddc7f435bcc0a76bafbcb", - "qc_adapter_stats.tsv.gz:md5,fa3f30db6ccfcc22952c5a2981c750b8", - "qc_basic_stats.tsv.gz:md5,8df07b1d5d1c85bc4931ae4ce81edb17", - "qc_quality_base_stats.tsv.gz:md5,13d5395a6039daa23cb0f059c7dd14b2", - "qc_quality_sequence_stats.tsv.gz:md5,e4b5747bcb058cab5f48ec116c5e2a03", + 
"subset_qc_adapter_stats.tsv.gz:md5,fa3f30db6ccfcc22952c5a2981c750b8", + "subset_qc_basic_stats.tsv.gz:md5,8df07b1d5d1c85bc4931ae4ce81edb17", + "subset_qc_quality_base_stats.tsv.gz:md5,13d5395a6039daa23cb0f059c7dd14b2", + "subset_qc_quality_sequence_stats.tsv.gz:md5,e4b5747bcb058cab5f48ec116c5e2a03", + "subset_qc_length_stats.tsv.gz:md5,032d70cbd5bfa40c163c13885447b92a", "virus_clade_counts.tsv.gz:md5,1c5712b6d4726908058cd1b5f91ce9f1", "virus_hits_db.tsv.gz:md5,5f1f53ac7aba3c241b06e71ce0abf46d", "blast_hits_paired.tsv.gz:md5,5b6eb0055bc60be196b2a1b09006f9af", @@ -18,6 +19,6 @@ "nf-test": "0.9.2", "nextflow": "24.10.1" }, - "timestamp": "2025-01-17T21:26:45.129553694" + "timestamp": "2025-01-27T14:42:02.13541549" } } \ No newline at end of file