diff --git a/.nf-core.yml b/.nf-core.yml index 8e7cf81..59723d8 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -6,7 +6,11 @@ lint: - .github/workflows/awsfulltest.yml files_unchanged: - CODE_OF_CONDUCT.md + pipeline_name_conventions: False + actions_awsfulltest: False nextflow_config: - - manifest.name -repository_type: pipeline + - custom_config + - manifest.name + - manifest.homePage +repository_type: pipeline diff --git a/CHANGELOG.md b/CHANGELOG.md index c8a4a8b..7d7e267 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0.0 - [date] +## v1.0.0 - [2024-03-22] Initial release of `phac-nml/viralassembly`, created from combining the [nf-core](https://nf-co.re/) template with the artic steps. ### `Added` - -### `Fixed` - -### `Dependencies` - -### `Deprecated` +- All initial pipeline features and logic +- All initial docs and images diff --git a/README.md b/README.md index cf2f94a..6c44a5a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # viralassembly A generic viral assembly and QC pipeline which utilises a re-implementation of the [artic pipeline](https://github.com/artic-network/fieldbioinformatics/tree/master/artic) to separate out the individual steps allowing greater control on tool versions along with how data is run through the processes. This pipeline can be used as a starting point for analyses on viruses without dedicated workflows already available. +This pipeline is intended to be run on either Nanopore Amplicon Sequencing data or Basic Nanopore NGS Sequencing data that can utilize a reference genome for mapping, variant calling, and other downstream analyses. It generates variant calls, consensus sequences, and quality control information based on the reference. 
To do this, there are three different variant callers that can be utilized which includes: `clair3`, `medaka`, and `nanopolish` (For R9.4.1 flowcells and below only). + Some of the goals of this pipeline are: 1. Rework the artic nanopore pipeline steps as nextflow modules to deal with specific bugs and version incompatibilities - Example: BCFtools consensus error seen in artic pipeline sometimes @@ -14,20 +16,10 @@ Some of the goals of this pipeline are: ## Index - [Installation](#installation) -- [Profiles](#profiles) - [Base Run Commands](#running-commands) - - [Nanopore - Nanopolish](#nanopore---nanopolish) - - [Nanopore - Medaka](#nanopore---medaka) - [Nanopore - Clair3](#nanopore---clair3) -- [Inputs](#inputs) - - [Input Data Options](#input-data-options) - - [Nanopolish Pipeline Required Parameters](#nanopolish-pipeline-required-parameters) - - [Medaka Pipeline Required Parameters](#medaka-pipeline-required-parameters) - - [Clair3 Pipeline Required Parameters](#clair3-pipeline-required-parameters) - - [Optional Inputs](#optional-inputs) - - [Metadata TSV](#metadata) - - [Primer Schemes](#schemes) - - [SnpEff](#snpeff-1) + - [Nanopore - Medaka](#nanopore---medaka) + - [Nanopore - Nanopolish](#nanopore---nanopolish) - [Outputs](#outputs) - [Limitations](#limitations) - [Citations](#citations) @@ -46,17 +38,6 @@ Some of the goals of this pipeline are: - `singularity` - `docker` -## Profiles -Profiles are used to specify dependency installation, resources, and how to handle pipeline jobs. You can specify more than one profile but avoid passing in more than one dependency managment profiles. 
- -Available: -- `conda`: Utilize conda to install dependencies and environment management -- `mamba`: Utilize mamba to install dependencies and environment management -- `singularity`: Utilize singularity for dependencies and environment management -- `docker`: Utilize docker to for dependencies and environment management - -Custom configs can be created and specified to be used as an additional config file with `-c `. More info [available here](https://www.nextflow.io/docs/latest/config.html) or [here](https://training.nextflow.io/basic_training/config/#config-syntax) - ## Running Commands Simple commands to run input data. Input data can be done in three different ways: 1. Passing `--fastq_pass ` where `fastq_pass` is a directory containing `barcode##` subdirectories with fastq files @@ -65,27 +46,31 @@ Simple commands to run input data. Input data can be done in three different way 1. `sample` - The name of the sample 2. `reads` - Path to a directory containing reads for the sample in `.fastq*` format -The basic examples will show how to run the pipeline using the `--fastq_pass` input but it could be subbed in for the `--input` CSV file if wanted +The basic examples will show how to run the pipeline using the `--fastq_pass` input but it could be subbed in for the `--input` CSV file if wanted. -### Nanopore - Nanopolish -Running the pipeline with [nanopolish](https://github.com/jts/nanopolish) for variant calls requires fastq files, fast5 files, and the sequencing summary file. When running, the pipeline will look for subdirectories off of the input directory called `barcode##` to be used in the pipeline. +*All detailed running information is available in the [usage docs](./docs/usage.md)* + +### Nanopore - Clair3 +Running the pipeline with [Clair3](https://github.com/HKU-BAL/Clair3) for variant calls requires fastq files and a clair3 model. 
When running, the pipeline will either: +- Look for subdirectories off of the input "--fastq_pass" directory called `barcode##` to be used in the pipeline +- Look for fastq files in the input "--fastq_pass" directory called `*.fastq*` to be used in the pipeline + +This pipeline utilizes the same steps as the artic fieldbioinformatics minion pipeline but with each step run using nextflow to allow clair3 to be easily slotted in. See the [clair3 section](./docs/usage.md#clair3) of the usage docs for more information Basic command: ```bash nextflow run /PATH/TO/artic-generic-nf/main.nf \ -profile \ - --variant_caller 'nanopolish' \ + --variant_caller 'clair3' \ --fastq_pass \ - --fast5_pass \ - --sequencing_summary + --reference \ ``` -[Optional inputs](#inputs) could include: -- Different schemes +[Optional inputs](./docs/usage.md#all-parameters) could include: +- [Amplicon scheme](./docs/usage.md#schemes-and-reference) instead of just a reference fasta file - Metadata - Filtering options -- Using base `artic minion` instead of nextflow implementation - Running SnpEff for variant consequence prediction - Output reporting options @@ -94,18 +79,21 @@ Running the pipeline with [medaka](https://github.com/nanoporetech/medaka) for v - Look for subdirectories off of the input "--fastq_pass" directory called `barcode##` to be used in the pipeline - Look for fastq files in the input "--fastq_pass" directory called `*.fastq*` to be used in the pipeline +See the [medaka section](./docs/usage.md#medaka) of the usage docs for more information + Basic command: ```bash nextflow run /PATH/TO/artic-generic-nf/main.nf \ -profile \ --variant_caller 'medaka' \ --fastq_pass \ - --medaka_model + --medaka_model \ + --reference \ ``` -[Optional inputs](#optional-inputs) could include: -- Different schemes +[Optional inputs](./docs/usage.md#all-parameters) could include: +- [Amplicon scheme](./docs/usage.md#schemes-and-reference) instead of just a reference fasta file - Metadata - Filtering 
options - Using base `artic minion` instead of nextflow implementation @@ -114,241 +102,41 @@ nextflow run /PATH/TO/artic-generic-nf/main.nf \ Medaka model information [can be found here](https://github.com/nanoporetech/medaka#models) -### Nanopore - Clair3 -Running the pipeline with [Clair3](https://github.com/HKU-BAL/Clair3) for variant calls requires fastq files and a clair3 model. When running, the pipeline will either: -- Look for subdirectories off of the input "--fastq_pass" directory called `barcode##` to be used in the pipeline -- Look for fastq files in the input "--fastq_pass" directory called `*.fastq*` to be used in the pipeline +### Nanopore - Nanopolish +Running the pipeline with [nanopolish](https://github.com/jts/nanopolish) for variant calls requires fastq files, fast5 files, and the sequencing summary file. When running, the pipeline will look for subdirectories off of the input directory called `barcode##` to be used in the pipeline. -This pipeline utilizes the same steps as the artic fieldbioinformatics minion pipeline but with each step run using nextflow to allow clair3 to be easily slotted in. +See the [nanopolish section](./docs/usage.md#nanopolish) of the usage docs for more information Basic command: ```bash nextflow run /PATH/TO/artic-generic-nf/main.nf \ -profile \ - --variant_caller 'clair3' \ + --variant_caller 'nanopolish' \ --fastq_pass \ - --clair3_model + --fast5_pass \ + --sequencing_summary \ + --reference ``` -[Optional inputs](#optional-inputs) could include: -- Different schemes +[Optional inputs](./docs/usage.md#all-parameters) could include: +- [Amplicon scheme](./docs/usage.md#schemes-and-reference) instead of just a reference fasta file - Metadata - Filtering options +- Using base `artic minion` instead of nextflow implementation - Running SnpEff for variant consequence prediction - Output reporting options -## Inputs -The required inputs are based off of what variant caller is being used. 
The caller is specified with `--variant_caller [nanopolish, medaka, clair3]` and then the remaining required parameters are based on that although the only real difference is that nanopolish requires additional files. - -Most of the optional parameters are available for all three pipelines - -### Input Data Options -There are two ways to specify how fastq files are input into the pipeline: - -1. `--fastq_dir ` - - Using this you can either provide the path to the barcode directories found when demultiplexing nanopore data or to a flat directory of named fastq files - - If using a barcoded directory, you can also rename the fastq barcode by providing a `--metadata metadata.tsv` file - -2. `--input ` - - Pass in an input CSV file containing 2 columns, `sample`, and `reads` where: - - `sample` is the sample name to use - - `reads` is the path to the barcode directory containing reads (for now it is just directory, will update later to work on flat fastqs too) - - This will find the reads in the given directory and attach the sample name to them - -The below examples all use method 1 but swapping out `--fastq_dir ` for `--input ` would also work for all of them - -### Nanopolish Pipeline Required Parameters - -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --variant_caller 'nanopolish' | Set nanopolish for variant calls | Choice | | | -| --fastq_pass | Path to directory containing `barcode##` subdirectories | Path | null | | -| --fast5_pass | Path to directory containing `barcode##` fast5 subdirectories | Path | null | | -| --sequencing_summary | Path to run `sequencing_summary*.txt` file | Path | null | | - -### Medaka Pipeline Required Parameters - -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --variant_caller 'medaka' | Set medaka for variant calls | Choice | | | -| --fastq_pass | Path to directory containing `barcode##` subdirectories or a directory containing `*.fastq*` files | Path | null | | -| 
--medaka_model | Medaka model to be used in the pipeline | Str | 'r941_min_hac_g507' | Default model will not work the best for all inputs. [See medaka docs](https://github.com/nanoporetech/medaka#models) for additional info | - -### Clair3 Pipeline Required Parameters - -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --variant_caller 'clair3' | Set clair3 for variant calls | Choice | | | -| --fastq_pass | Path to directory containing `barcode##` subdirectories or a directory containing `*.fastq*` files | Path | null | | -| --clair3_model | Clair3 model to be used in the pipeline | Str/Path | 'r941_prom_sup_g5014' | Default model will not work the best for all inputs. [See clair3 docs](https://github.com/HKU-BAL/Clair3#pre-trained-models) for additional info | - -### Optional Inputs -Use `--help` to see all options formatted on the command line - -Use `--version` to see version information - -#### Read Filtering -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --min_length | Minimum read length to be kept | Int | 200 | For artic guppyplex | -| --max_length | Maximum read length to be kept | Int | 3000 | For artic guppyplex | -| --min_reads | Minimum size selected reads to be used in pipeline | Int | 20 | | - -#### Scheme -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --reference_no_scheme | Specify the path to a reference fasta file to run pipeline without a primer scheme | Path | '' | Ignores all scheme inputs | -| --scheme | Name of the primer scheme to use | Str | 'nCoV-2019' | See [schemes](#schemes) for more info | -| --scheme_version | Version name of primer scheme to use | Str | 'freed_V2_nml' | See [schemes](#schemes) for more info | -| --scheme_repo | Github repository URL to download scheme from | Str | 'https://github.com/DarianHole/primer-schemes.git' | See [schemes](#schemes) for more info | -| --local_scheme | Path to directory containing local scheme files 
| Path | null | See [schemes](#schemes) for more info | - -#### Pipeline Options -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --metadata | Path to metadata TSV file with columns 'sample' and 'barcode' | Path | null | See [metadata](#metadata) for more info | -| --use_artic_tool | Run the artic tool itself instead of nextflow implementation | Bool | False | Not available with clair3 | -| --normalise | Artic minion normalise coverage option | Int | 1000 | Entering `0` turns off normalisation | -| --no_frameshift | Use the Artic minion no frameshift vcf filter | Bool | False | Simple `%3 == 0` check for variants | -| --use_bwa | Use BWA instead of minimap2 for read mapping | Bool | False | | -| --skip_longshot | When running with `medaka`, skip running longshot | Bool | False | Medaka only!! | - -#### SnpEff -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --skip_snpeff | Skip running SnpEff | Bool | False | | -| --gff | Path to gff3 formatted file to use in SnpEff database build | Path | False | If not given, the Reference ID will be used to attempt to pull the genbank file | - -#### QC and Reporting -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --skip_qc | Skip running all QC and reporting steps | Bool | false | | -| --custom_report | Run the custom HTML report | Bool | false | Currently requires the use of conda as there is not a singularity container yet | -| --pcr_primer_bed | Path to PCR primer bed file to check for mutations against | Path | null | For output QC checks | -| --neg_control_threshold | Coverage threshold at which to fail negative control samples | Float | 0.10 | | -| --neg_ctrl_substrings | Negative control sample substrings separated by a `,` | Str | 'ntc,neg,blank' | | - -#### Other Generic Options -| Parameter | Description | Type | Default | Notes | -| - | - | - | - | - | -| --outdir | Directory name to output results to | Str | 'results' | | -| 
--cache | Specify a location to store conda/singularity envs/containers for reuse | Path | null | | - -### Metadata -Input metadata is used to rename barcoded fastq files along with adding additional lines to the final overall QC csv file. Note that the metadata input is expected to be of a `TSV` format - -Structure for example `metadata.tsv` file: - -| sample | barcode | \ | -| - | - | - | -| SR-1 | 1 | X | -| SR-2 | 02 | Y | -| NTC-12 | 12 | Z | - -### Schemes -Amplicon schemes are a highly targeted approach to sequencing focusing on a specific target genome. If using an amplicon scheme with this pipeline, either a local directory or a URL that contains the wanted primer scheme formatted according to the below information must be provided. - -If not running with an amplicon scheme, pass the `--reference_no_scheme ` argument with a reference fasta file and the pipeline will run without amplicon specific checks/outputs. - -The primer scheme must contain: -- A reference genome fasta sequence titled `*reference.fasta` -- A primer bed file titled `*primer.bed` - - Minimum of 6 columns - - Primer pairs with names containing `_LEFT` and `_RIGHT` - - Primer pools - -Example Primer file: - -| MN908947.3 | 30 | 54 | nCoV-2019_1_LEFT | 1 | + | -| - | - | - | - | - | - | -| MN908947.3 | 1183 | 1205 | nCoV-2019_1_RIGHT | 1 | - | -| MN908947.3 | 1100 | 1128 | nCoV-2019_2_LEFT | 2 | + | -| MN908947.3 | 2244 | 2266 | nCoV-2019_2_RIGHT | 2 | - | -| ... | ... | ... | ... | ... | ... 
| -| REF ID | Start | Stop | Primer Name | Primer Pool | Direction - - -The directory structure must follow the basic structure as follows: -``` -primer-schemes -└── - └── - ├── reference.fasta - └── scheme.bed -``` - -Example for covid: -``` -primer-schemes -└── nCoV-2019 - ├── midnight - | ├── nCoV-2019.reference.fasta - | └── nCoV-2019.scheme.bed - └── V1 - ├── reference.fasta - └── scheme.bed -``` - -### SnpEff -SnpEff is run by default on all non-segmented viruses by using the reference sequence ID to either: -1. Check if there is a SnpEff database available to download -2. Build a SnpEff database by downloading the sequence genbank file from NCBI - -Instead of relying on the reference ID to build/download a database, you can instead specify a gff3 file with `--gff ` to be used with the reference sequence to create a database - -If building/downloading a database fails, the pipeline will skip over running SnpEff instead of failing out completely. +## Outputs +Outputs are separated based off of their tool or file format and found in the `results/` directory by default. -SnpEff can also be skipped entirely by passing the `--skip_snpeff` parameter +Outputs include: +- Consensus fasta files +- VCF files +- Bam files +- HTML summary files (either custom or MultiQC) -## Outputs -Outputs are separated based off of their tool or file format in the `results/` directory by default. 
- -### Consensus Sequences -Final consensus sequence output by the pipeline found in the `consensus/` directory as: `SAMPLE.consensus.fasta` - -### Bam Files -BAM files in the `bam/` directory include: -- `SAMPLE.primertrimmed.rg.sorted.bam` - - If using a scheme, these are the bam files used for variant calling and downstream QC steps -- `SAMPLE.sorted.bam` - - If not using a scheme, these are the bam files used for all further steps - - Still output if using a scheme though - -### VCF Files -VCF files found in the `vcf/` directory include: -- `SAMPLE.pass.vcf.gz` - - Variants passing variant filtering -- `SAMPLE.pass.norm.vcf.gz` - - BCFtools normalized variants passing variant filtering - -### SnpEff Outputs -Output files from SnpEff found in the `snpeff/` directory including: -- `SAMPLE.ann.vcf` - - SnpEff annotated VCF file -- `SAMPLE.csv` - - SnpEff CSV summary - -### Variation Files -Positional variation information based on the pileups and formatted as a CSV file is available in the `variation_csvs/` directory - -### Sample Summary Files -- `sample_csvs/` - - Directory containing the individual csv file for each sample -- `sample_mqc/` - - Directory containing MultiQC HTML reports for each sample - -### Overall outputs -- `overall.qc.csv` - - Contains all of the calculated per sample metadata from the run along with giving a qc and negative control status -- `Overall-Run-MultiQC-Report_multiqc_report.html` - - MultiQC HTML report containing all sample if running multiQC output -- `reportDashboard.html` - - Custom HTML report containing all samples if running the custom output - - Note that it may take a bit to load if running a lot of samples - ![custom-report](./pictures/custom_report.png) -- `pipeline_info/` - - Directory containing nextflow run information +*More output information on pipeline steps and output files can be found in the [output docs](./docs/output.md)* ## Limitations Current limitations include: diff --git 
a/assets/multiqc_config_overall.yaml b/assets/multiqc_config_overall.yaml index d13564e..e1f6d78 100644 --- a/assets/multiqc_config_overall.yaml +++ b/assets/multiqc_config_overall.yaml @@ -20,10 +20,15 @@ run_modules: - samtools - qualimap - bcftools + - nanostat - snpeff # Order for reports module_order: + - nanostat: + name: Nanostat filtered fastq statistics + path_filters: + - "*.nanostat.txt" - samtools - qualimap - bcftools @@ -178,7 +183,9 @@ custom_data: ] # Extensions to clean from names -# extra_fn_clean_exts: +extra_fn_clean_exts: + - '.nanostat' + - '.ann' # Search Pathes sp: diff --git a/assets/multiqc_config_sample.yaml b/assets/multiqc_config_sample.yaml index f65b332..df77717 100644 --- a/assets/multiqc_config_sample.yaml +++ b/assets/multiqc_config_sample.yaml @@ -16,10 +16,15 @@ skip_generalstats: true # Modules/tools supported that we only want to grab run_modules: - custom_content + - nanostat - qualimap # Order for reports module_order: + - nanostat: + name: Nanostat filtered fastq statistics + path_filters: + - "*.nanostat.txt" - qualimap report_section_order: @@ -250,7 +255,8 @@ custom_data: scale: "Purples" # Extensions to clean from names -# extra_fn_clean_exts: +extra_fn_clean_exts: + - '.nanostat' # Search Pathes sp: diff --git a/assets/rmarkdown-reports/reportDashboard.Rmd b/assets/rmarkdown-reports/reportDashboard.Rmd index 8e098be..01cf761 100755 --- a/assets/rmarkdown-reports/reportDashboard.Rmd +++ b/assets/rmarkdown-reports/reportDashboard.Rmd @@ -1,5 +1,5 @@ --- -title: "NML Sequencing Run Report" +title: "Sequencing Run Report" output: flexdashboard::flex_dashboard: orientation: rows diff --git a/conf/nml.config b/conf/nml.config index af34842..b8a66e5 100644 --- a/conf/nml.config +++ b/conf/nml.config @@ -2,7 +2,6 @@ env { OPENBLAS_NUM_THREADS = 1 } - params { // Config params config_profile_name = "nml" @@ -14,7 +13,6 @@ params { max_retries = 3 max_jobs = 100 } - process { // Base process executor = "slurm" diff --git 
a/conf/test.config b/conf/test.config index 45d6570..cafad4b 100644 --- a/conf/test.config +++ b/conf/test.config @@ -24,5 +24,5 @@ params { // Args variant_caller = "clair3" - reference_no_scheme = "$projectDir/.github/test-data/nanopore/MN908947.3.reference.fasta" + reference = "$projectDir/.github/test-data/nanopore/MN908947.3.reference.fasta" } diff --git a/conf/test_full.config b/conf/test_full.config index f58fe2e..4e070a7 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -26,4 +26,6 @@ params { variant_caller = "medaka" medaka_model = "r1041_e82_400bps_sup_v4.3.0" metadata = "$projectDir/.github/test-data/nanopore/metadata.tsv" + scheme = 'nCoV-2019' + scheme_version = 'V5.3.2' } diff --git a/docs/README.md b/docs/README.md index 961b17c..ebddb6f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,7 +2,9 @@ The phac-nml/viralassembly documentation is split into the following pages: -- [Usage](usage.md) +- [Usage](./usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. -- [Output](output.md) +- [Example Commands](./example_commands.md) + - Different command examples +- [Output](./output.md) - An overview of the different results produced by the pipeline and how to interpret them. 
diff --git a/docs/example_commands.md b/docs/example_commands.md new file mode 100644 index 0000000..90f8d85 --- /dev/null +++ b/docs/example_commands.md @@ -0,0 +1,94 @@ +# phac-nml/viralassembly: Example Commands +A variety of example commands using different parameter options to display how to use each +
+## Amplicon +
+### Clair3 +Clair3 with a local model, local scheme, fastq directory, conda, and the custom report output +
+```bash +nextflow run phac-nml/viralassembly \ + -profile conda \ + --fastq_pass FASTQ_PASS/ \ + --variant_caller 'clair3' \ + --clair3_model ./r1041_e82_400bps_sup_v420 \ + --local_scheme ./primer_schemes \ + --scheme 'hCMV' \ + --scheme_version 'V1' \ + --custom_report \ + --outdir ./results +``` +
+### Medaka +Minimal input medaka with conda, an input csv file for data, and the nCoV-2019 scheme +
+```bash +nextflow run phac-nml/viralassembly \ + -profile conda \ + --input INPUT.csv \ + --variant_caller 'medaka' \ + --scheme 'nCoV-2019' \ + --scheme_version 'V5.3.2' \ + --outdir ./results +``` +
+### Nanopolish +Nanopolish run using singularity and the base artic command line tool (instead of the default nextflow implementation) +
+```bash +nextflow run phac-nml/viralassembly \ + -profile singularity \ + --input INPUT.csv \ + --fast5_pass FAST5_PASS/ \ + --sequencing_summary SEQ_SUM.txt \ + --variant_caller 'nanopolish' \ + --scheme 'nCoV-2019' \ + --scheme_version 'V5.3.2' \ + --use_artic_tool \ + --outdir ./results +``` +
+-------------------------- +
+## Non-Amplicon +
+### Clair3 +Minimal clair3 with docker using a fastq input directory along with a gff3 reference file for SnpEff +
+```bash +nextflow run phac-nml/viralassembly \ + -profile docker \ + --fastq_pass FASTQ_PASS/ \ + --variant_caller 'clair3' \ + --reference ./REFERENCE.fa \ + --gff ./REFERENCE.gff +``` +
+### Medaka +Medaka with conda skipping QC and SnpEff +
+```bash +nextflow run phac-nml/viralassembly \ + -profile conda \ + --input INPUT.csv \ + --variant_caller 
'medaka' \ + --reference ./REFERENCE.fa \ + --skip_qc \ + --skip_snpeff +``` +
+### Nanopolish +Nanopolish running with conda, filtering the read lengths to be shorter, and creating a custom report +
+```bash +nextflow run phac-nml/viralassembly \ + -profile conda \ + --input INPUT.csv \ + --fast5_pass FAST5_PASS/ \ + --sequencing_summary SEQ_SUM.txt \ + --variant_caller 'nanopolish' \ + --reference ./REFERENCE.fa \ + --min_length 100 \ + --max_length 600 \ + --outdir ./results +``` diff --git a/docs/example_files/inputs/example_custom_scheme/virus/V1-custom/custom.reference.fasta b/docs/example_files/inputs/example_custom_scheme/virus/V1-custom/custom.reference.fasta new file mode 100644 index 0000000..59f7442 --- /dev/null +++ b/docs/example_files/inputs/example_custom_scheme/virus/V1-custom/custom.reference.fasta @@ -0,0 +1,9 @@ +>Custom.1 +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCT +GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACT +CACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATC +TTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT +CGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAAC +CACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATC +TTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCT diff --git a/docs/example_files/inputs/example_custom_scheme/virus/V1-custom/custom.scheme.bed b/docs/example_files/inputs/example_custom_scheme/virus/V1-custom/custom.scheme.bed new file mode 100644 index 0000000..ec9d36a --- /dev/null +++ b/docs/example_files/inputs/example_custom_scheme/virus/V1-custom/custom.scheme.bed @@ -0,0 +1,4 @@ +Custom.1 10 34 custom_1_LEFT 1 + +Custom.1 300 320 custom_1_RIGHT 1 - +Custom.1 220 246 custom_2_LEFT 2 + +Custom.1 440 460 custom_2_RIGHT 2 - diff --git a/docs/example_files/inputs/input.csv b/docs/example_files/inputs/input.csv new file mode 100644 index 0000000..605ed2c --- /dev/null +++ 
b/docs/example_files/inputs/input.csv @@ -0,0 +1,11 @@ +sample,reads +negative-ctrl-1,barcode11 +run-ntc-1,barcode94 +pos-ctrl-1,barcode95 +cov-1,barcode5 +cov-2,barcode01 +cov-3,barcode12 +cov-4,barcode19 +cov-5,barcode40 +cov-6,barcode61 +cov-7,barcode75 diff --git a/docs/example_files/inputs/metadata.tsv b/docs/example_files/inputs/metadata.tsv new file mode 100644 index 0000000..8ece826 --- /dev/null +++ b/docs/example_files/inputs/metadata.tsv @@ -0,0 +1,11 @@ +sample barcode run other +negative-ctrl-1 11 example-run additional_info +run-ntc-1 94 example-run additional_info +pos-ctrl-1 95 example-run additional_info +cov-1 5 example-run additional_info +cov-2 1 example-run additional_info +cov-3 12 example-run additional_info +cov-4 19 example-run additional_info +cov-5 40 example-run additional_info +cov-6 61 example-run additional_info +cov-7 75 example-run additional_info diff --git a/docs/images/amplicons_custom.png b/docs/images/amplicons_custom.png new file mode 100644 index 0000000..1cb817e Binary files /dev/null and b/docs/images/amplicons_custom.png differ diff --git a/docs/images/bcftools_mqc.png b/docs/images/bcftools_mqc.png new file mode 100644 index 0000000..0169689 Binary files /dev/null and b/docs/images/bcftools_mqc.png differ diff --git a/docs/images/completeness_mqc.png b/docs/images/completeness_mqc.png new file mode 100644 index 0000000..7feb2e7 Binary files /dev/null and b/docs/images/completeness_mqc.png differ diff --git a/docs/images/nanostat_mqc.png b/docs/images/nanostat_mqc.png new file mode 100644 index 0000000..7e8a712 Binary files /dev/null and b/docs/images/nanostat_mqc.png differ diff --git a/docs/images/qc_mqc.png b/docs/images/qc_mqc.png new file mode 100644 index 0000000..c75826b Binary files /dev/null and b/docs/images/qc_mqc.png differ diff --git a/docs/images/qualimap_mqc.png b/docs/images/qualimap_mqc.png new file mode 100644 index 0000000..346b3dd Binary files /dev/null and b/docs/images/qualimap_mqc.png differ diff 
--git a/docs/images/run_summary_custom.png b/docs/images/run_summary_custom.png new file mode 100644 index 0000000..e3a7a1f Binary files /dev/null and b/docs/images/run_summary_custom.png differ diff --git a/docs/images/sample_custom.png b/docs/images/sample_custom.png new file mode 100644 index 0000000..b07aea8 Binary files /dev/null and b/docs/images/sample_custom.png differ diff --git a/docs/images/sample_mqc_mqc.png b/docs/images/sample_mqc_mqc.png new file mode 100644 index 0000000..742d623 Binary files /dev/null and b/docs/images/sample_mqc_mqc.png differ diff --git a/docs/images/samtools_mqc.png b/docs/images/samtools_mqc.png new file mode 100644 index 0000000..18908ca Binary files /dev/null and b/docs/images/samtools_mqc.png differ diff --git a/docs/images/snpeff_mqc.png b/docs/images/snpeff_mqc.png new file mode 100644 index 0000000..3a19cfd Binary files /dev/null and b/docs/images/snpeff_mqc.png differ diff --git a/docs/images/variation_mqc.png b/docs/images/variation_mqc.png new file mode 100644 index 0000000..7a4051c Binary files /dev/null and b/docs/images/variation_mqc.png differ diff --git a/docs/output.md b/docs/output.md index b497ffd..9cd5f71 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,61 +2,258 @@ ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the output produced by the pipeline. Most of the plots are taken from either the MultiQC report or the custom report, which both summarise the results at the end of the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
- - ## Pipeline overview -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps (where a `*` indicates a final output kept in the top level results directory): + +- [Preprocessing](#preprocessing) + - [Reference Stats](#reference-stats)* - Get reference genome information needed for variant calling and QC + - [Artic Guppyplex](#artic-guppyplex) - Read length filtering + - [Chopper](#chopper) - Additional Read QC + - [Nanostat](#nanostat) - Read statistics + +- [Variant Calling](#variant-calling) + - [Minimap2](#minimap2)* - Read mapping + - [Artic Align Trim](#artic-align_trim)* - Primer trimming and normalisation + - [Clair3](#clair3) - Determine initial variants with clair3 + - [Medaka](#medaka) - Determine initial variants with medaka + - [Nanopolish](#nanopolish) - Determine initial variants with nanopolish + - [Longshot](#longshot)* - Genotype and phase called medaka variants + - [Variant Filter](#variant-filter)* - Filter variants not matching required criteria -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [Consensus Generation](#consensus-generation) + - [Artic Mask](#artic-mask) - Mask failing variants and low depth sites in preparation for consensus generation + - [BCFtools Norm](#bcftools-norm)* - Left-align and normalize indels along with make sure the reference alleles match + - [BCFtools Consensus](#bcftools-consensus)* - Create consensus sequence from VCF variants and Masked sites -### FastQC +- [QC and Reporting](#qc-and-reporting) + - [SnpEff](#snpeff)* - Variant annotation and functional prediction + - [Qualimap BAMQC](#qualimap-bamqc) - Alignment quality and metrics + - [Samtools 
Flagstat](#samtools-flagstat) - Alignment flag stats + - [BCFtools Stats](#bcftools-stats) - Variant quality and statistics + - [Variation CSV](#variation-csv)* - Custom reporting script for finding and calculating variation in the BAM pileups + - [Amplicon Completeness](#amplicon-completeness) - Custom reporting script for calculating amplicon completeness based on bedtools output + - [QC Compilation](#qc-compilation)* - Custom reporting scripts for each sample and the overall run + - [MultiQC](#multiqc)* - Sample and Run HTML visual report + - [Custom Report](#custom-report)* - Custom single HTML report including the run and all individual samples +Additionally [Pipeline information](#pipeline-information) which includes report metrics generated during the workflow execution can also be found + +### Preprocessing +Initial processing steps and statistic gathering. The reference statistics are output to their own final folder while the other statistics are passed to the final multiqc report. + +#### Reference Stats
Output files

-- `fastqc/`
-  - `*_fastqc.html`: FastQC report containing quality metrics.
-  - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+- `reference/`
+  - `genome.bed`: Genomic information in bed format that has the coordinates of the reference genome needed for nanopolish
+  - `refstats.txt`: Genomic information in a format needed for clair3
+  - `*.fai`: Samtools faidx fai file for reference genome
+
+ +The reference files are generated with both `awk` and `samtools` and are needed as different inputs for downstream tools. + +#### Artic Guppyplex +Select reads by size and generate size selected fastq files. + +#### Chopper +[Chopper](https://github.com/wdecoster/chopper) filter and trim fastq reads by quality and length. + +#### Nanostat +[Nanostat](https://github.com/wdecoster/nanostat) generates plots and statistics on trimmed fastq files for the final multiqc reports. + +![nanostats_mqc](./images/nanostat_mqc.png) + +---------- +### Variant Calling +Read mapping and variant calling. Note that only one of `clair3`, `medaka`, and `nanopolish` is used. In the end, final normalized passing and failing variants are output along with the BAM files to their respective folders. + +#### Minimap2 +
+Output files + +- `bam/` + - `*.sorted.bam`: Sorted bam file from minimap2 and samtools
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +The sorted BAM file from minimap2 and samtools. -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +#### Artic Align_Trim +*Amplicon only* +
+Output files -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +- `bam/` + - `*.trimmed.rg.sorted.bam`: Artic align_trim output which normalises coverage and assigns reads to amplicons + - `*.primertrimmed.rg.sorted.bam`: Artic align_trim output which normalises coverage and assigns reads to amplicons along with softmasking the primer sequences + - The primertrimmed file is used for subsequent variant calling +
+ +See [the artic core pipeline](https://artic.readthedocs.io/en/latest/minion/#core-pipeline) for more info on how `align_trim` trims the BAM files. -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +#### Clair3 +Run clair3 variant caller on BAM files to create initial variant calls in VCF format. -:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: +#### Medaka +Run medaka variant caller on BAM files to create initial variant calls in VCF format. -### MultiQC +#### Nanopolish +Run nanopolish variant caller on BAM files, fast5 files, and the sequencing summary file to create initial variant calls in VCF format. +#### Longshot
Output files -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `vcf/` + - `*.longshot.merged.vcf`: Longshot phased VCF file +
+
+Variants from the initial medaka VCF file are genotyped and phased using [Longshot](https://github.com/pjedge/longshot).
+
+#### Variant Filter
+
+Output files + +- `vcf/` + - `*.pass.vcf.gz`: VCF file containing variants passing quality filters + - `*.pass.vcf.gz.tbi`: VCF index file containing variants passing quality filters + - `*.fail.vcf`: VCF file containing variants failing quality filters
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +Pass/Fail variants based on quality for the final consensus sequence generation. -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +---------- -### Pipeline information +### Consensus Generation +Final consensus sequence generation based on passing/failing variants and sequencing depth. + +#### Artic Mask +Mask low depth and failing variants to create a preconsensus sequence for BCFtools consensus. + +#### BCFtools Norm +
+Output files + +- `vcf/` + - `*.pass.norm.vcf.gz`: VCF file containing variants passing quality filters that have their indels normalized and reference positions fixed + - Reference positions may need to be fixed if there are overlapping variants +
+
+BCFtools norm is utilized to fix locations in which two variants overlap, which previously would crash the pipeline during BCFtools consensus. [BCFtools](https://samtools.github.io/bcftools/bcftools.html#norm)
+
+#### BCFtools Consensus
+
+Output files + +- `consensus/` + - `*.consensus.fasta`: Fasta file containing the final output consensus sequence with applied variants and masked sites +
+
+Final output consensus sequence for the sample with variants applied and low coverage/failing variants masked with N's. [BCFtools](https://samtools.github.io/bcftools/bcftools.html#consensus)
+
+----------
+### QC and Reporting
+All QC and reporting is currently only done on non-segmented viruses
+
+#### SnpEff
+
+Output files + +- `snpeff/` + - `*.ann.vcf`: VCF file with variant annotations + - `*.csv`: Variant annotation csv file +
+ +[SnpEff](https://pcingola.github.io/SnpEff/) is a genetic variant annotation and functional effect prediction toolbox. It annotates and predicts the effects of genetic variants on genes and proteins (such as amino acid changes). + +![snpeff_mqc](./images/snpeff_mqc.png) + +#### Qualimap BAMQC +[Qualimap BAMQC](http://qualimap.conesalab.org/) platform-independent application written in Java and R that provides a command-line interface to facilitate the quality control of alignment sequencing data and its derivatives like feature counts. The output is used in the final MultiQC reports. + +![qualimap_mqc](./images/qualimap_mqc.png) + +#### Samtools Flagstat +[Samtools flagstat](http://www.htslib.org/doc/samtools-flagstat.html) counts the number of alignments for each FLAG type. The output is used in the final MultiQC reports. + +![samtools_mqc](./images/samtools_mqc.png) + +#### BCFtools Stats +[BCFtools stats](https://samtools.github.io/bcftools/bcftools.html#stats) produces machine readable variant quality and statistics. The output is used in the final MultiQC reports + +![bcftools_mqc](./images/bcftools_mqc.png) + +#### Variation CSV +
+Output files + +- `variation_csvs/` + - `*_variation.csv`: CSV file displaying positions where there is >= 15% variation from the reference base call +
+ +Custom python script using [pysam](https://pysam.readthedocs.io/en/latest/api.html) to find positions in the pileup which have >= 15% variation from the reference sequence. This gives information on any mixed-sites along with identifying spots in the genome where there may be sequencing artifacts or issues. The CSV file can be viewed or a coloured table can be found in each sample MultiQC report or custom report. + +![variation_mqc](./images/variation_mqc.png) + +#### Amplicon Completeness +Amplicon completeness is calculated using a custom python script along with an amplicon bed file and the final consensus sequence. It reports how many bases were called in each amplicon and gives a final completeness value from `0` - `1.00`. + +![completeness_mqc](./images/completeness_mqc.png) + +#### QC Compilation +
+Output files + +- `sample_csvs/` + - `*.qc.csv`: Individual sample CSV files containing sample stats +- `overall.qc.csv`: Overall sample and run CSV file containing all sample stats +
+ +Final CSV file(s) for both individual samples and the overall run that combines and checks a variety of metrics giving a final QC value for each sample. + +![qc_mqc](./images/qc_mqc.png) + +#### MultiQC +
+Output files + +- `sample_mqc/` + - `*.report.html`: Sample specific MultiQC HTML report containing visuals and tables +- `Overall-Run-MultiQC.report.html`: Final overall MultiQC report containing visuals and tables for all samples combined +
+ +Final output reports generated by [MultiQC](https://multiqc.info/docs/) based on the [overall config](../assets/multiqc_config_overall.yaml) and the [sample config](../assets/multiqc_config_sample.yaml) files which collate all of the outputs of the pipeline + +![sample_mqc_mqc](./images/sample_mqc_mqc.png) + +#### Custom Report +
+Output files + +- `reportDashboard.html`: Custom report dashboard displaying run metrics overall and for each sample +
+ +Custom RMarkdown report that contains sample and run information. Currently it can only be generated when running with `conda` so it is an output that has to be specified. It also still has a few issues relating to load times. + +![run_summary_custom](./images/run_summary_custom.png) +Run summary page + +![sample_custom](./images/sample_custom.png) +Example sample page + +![amplicons_custom](./images/amplicons_custom.png) +Amplicons page + +---------- + +### Pipeline information
Output files @@ -65,7 +262,6 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - Parameters used by the pipeline run: `params.json`. -
[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.

diff --git a/docs/usage.md b/docs/usage.md
index 60ce86d..5c72117 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -1,66 +1,166 @@
 # phac-nml/viralassembly: Usage

-> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._
-
 ## Introduction
+This pipeline is intended to be run on either Nanopore Amplicon Sequencing data or Basic Nanopore NGS Sequencing data that can utilize a reference genome for mapping, variant calling, and other downstream analyses. It generates variant calls, consensus sequences, and quality control information based on the reference. To do this, there are three different variant callers that can be utilized which includes: `clair3`, `medaka`, and `nanopolish` (For R9.4.1 flowcells and below only).
+
+For Amplicon Sequencing data it is at minimum required to:
+1. Specify a path to the reads/input file
+2. Specify the scheme name
+3. Specify the scheme version
+4. Pick a variant caller and caller model
+
+For Basic NGS sequencing data it is required to:
+1. Specify a path to the reads/input file
+2. Specify a path to the reference genome
+3. 
Pick a variant caller and caller model
+
+## Index
+- [Profiles](#profiles)
+- [Data Inputs](#data-inputs)
+  - [Fastq Pass Directory](#fastq-pass-directory---fastq_pass)
+  - [Input CSV](#input-csv---input)
+- [Variant Callers](#variant-callers)
+  - [Clair3](#clair3)
+  - [Medaka](#medaka)
+  - [Nanopolish](#nanopolish)
+- [Running the Pipeline](#running-the-pipeline)
+  - [Amplicon](#amplicon)
+  - [Non-Amplicon](#non-amplicon)
+  - [Other Run Notes](#other-run-notes)
+  - [Updating the Pipeline](#updating-the-pipeline)
+  - [Reproducibility](#reproducibility)
+- [Input Parameters](#input-parameters)
+  - [All Parameters](#all-parameters)
+  - [Schemes and Reference](#schemes-and-reference)
+  - [Metadata](#metadata)
+  - [SnpEff](#snpeff)
+- [Core Nextflow Arguments](#core-nextflow-arguments)
+
+## Profiles
+Profiles are used to specify dependency installation, resources, and how to handle pipeline jobs. You can specify more than one profile but avoid passing in more than one dependency management profile. They can be passed with `-profile `
+
+Available:
+- `conda`: Utilize conda to install dependencies and environment management
+- `mamba`: Utilize mamba to install dependencies and environment management
+- `singularity`: Utilize singularity for dependencies and environment management
+- `docker`: Utilize docker for dependencies and environment management
+
+## Data Inputs
+Two options for fastq data input: `--fastq_pass ` or `--input `
+
+### Fastq Pass Directory (--fastq_pass)
+Specify fastq data to input based on a given directory. The directory can either be barcoded, as would be seen after demultiplexing, or it could be a flat input of fastq files. The barcoded fastq data will be output with the barcode number but can be renamed with a [metadata csv](#metadata) input. The flat fastq files will keep their basename (separated out at the first `.`). 
+ +Barcoded: +``` + +├── barcode01 +| └── FAR41212_pass_barcode01_7d0222ac_0.fastq +├── barcode02 +| ├── FAR41212_pass_barcode02_7d0222ac_0.fastq +| ├── FAR41212_pass_barcode02_7d0222ac_1.fastq +| └── FAR41212_pass_barcode02_7d0222ac_2.fastq +└── barcode03 + └── FAR41212_pass_barcode03_7d0222ac_0.fastq +``` - +Flat: +``` + +├── sample1.fastq +├── sample2.fastq +├── sample3.fastq +├── ntc.fastq +└── pos.fastq +``` -## Samplesheet input +### Input CSV (--input) +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to pass in an input CSV file containing 2 columns, `sample`, and `reads` where: +- `sample` is the sample name to use +- `reads` is the path to the barcode directory containing reads -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +Ex. +| sample | reads | +| - | - | +| sample1 | /path/to/barcode01 | +| ntc | /path/to/barcode02 | +| pos | /path/to/barcode03 | -```bash ---input '[path to samplesheet file]' -``` +This will be expanded upon in future releases to allow more varied inputs for the input sheet. -### Multiple runs of the same sample +## Variant Callers +Three different variant callers are available with slightly different options regarding running with them. For the most accurate results when running with `clair3` or `medaka` pick a model that best matches the input data!! -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. 
Below is an example for the same sample sequenced across 3 lanes: +### [Clair3](https://github.com/HKU-BAL/Clair3) +Clair3 is a germline small variant caller for long-reads. -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` +Running with `clair3` requires the following parameters: +- `--variant_caller clair3`: Sets clair3 as the variant caller -### Full samplesheet +And has the optional parameters of: +- `--clair3_model `: Specify the base clair3 model +- `--clair3_user_variant_model
`: Specify the path to an additionally downloaded model directory
+- `--clair3_no_pool_split`: Do not split inputs into pools

-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+Clair3 comes with some models available and is defaulted to `r941_prom_sup_g5014`. Additional models can be downloaded from [ONT Rerio](https://github.com/nanoporetech/rerio/tree/master) and then specified in the `--clair3_user_variant_model
` parameter shown above. Remember to pick a model that best represents the data!
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+### [Medaka](https://github.com/nanoporetech/medaka)
+Medaka is a tool to create consensus sequences and variant calls from nanopore sequencing data using neural networks and provided by ONT.
-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
-```
+Running with `medaka` requires the following parameters:
+- `--variant_caller medaka`: Sets medaka as the variant caller
+
+And has the optional parameters of:
+- `--medaka_model `: Specify the wanted medaka model
+
+Medaka models come built in with the tool itself with the default set to `r941_min_hac_g507` which can be changed with `--medaka_model ` parameter shown above. More information on models [can be found here](https://github.com/nanoporetech/medaka#models). Remember to pick a model that best represents the data!
+
+### [Nanopolish](https://github.com/jts/nanopolish)
+Nanopolish is a software package for signal-level analysis of Oxford Nanopore sequencing data. It does not presently support the R10.4 flowcells so as a variant caller it should only be used with R9.4 flowcells. It also requires that the fastq data is in barcoded directories to work correctly. 
-| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +Running with `nanopolish` requires the following parameters: +- `--variant_caller nanopolish` +- `--fast5_pass ` +- `--sequencing_summary ` -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +Nanopolish requires the fast5 directory along with the sequencing summary file to be used as input instead of a model. ## Running the pipeline -The typical command for running the pipeline is as follows: +### Amplicon +The typical command for running the pipeline with an amplicon scheme using medaka and a different medaka model is as follows: + +```bash +nextflow run phac-nml/viralassembly \ + -profile docker \ + --fastq_pass FASTQ_PASS/ \ + --variant_caller medaka \ + --medaka_model 'r1041_e82_400bps_sup_v4.3.0' \ + --scheme 'nCoV-2019' \ + --scheme_version 'V5.3.2' \ + --outdir ./results +``` + +This will launch the pipeline with the `docker` configuration profile, the `medaka` variant caller, and the `nCoV-2019` version `V5.3.2` primer scheme from https://github.com/artic-network/primer-schemes/tree/master/nCoV-2019 (default scheme repo to pull). 
Profile information [can be found above](#profiles) + +### Non-Amplicon +The typical command for running the pipeline without an amplicon scheme using medaka and a different medaka model is as follows: ```bash -nextflow run phac-nml/viralassembly --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run phac-nml/viralassembly \ + -profile singularity \ + --fastq_pass FASTQ_PASS/ \ + --variant_caller medaka \ + --medaka_model 'r1041_e82_400bps_sup_v4.3.0' \ + --reference REF.fa \ + --outdir ./results ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `singularity` configuration profile, the `medaka` variant caller, and the specified reference. Profile information [can be found above](#profiles) -Note that the pipeline will create the following files in your working directory: +### Other Run Notes + +Note that both analysis methods of the pipeline will create the following files in your working directory: ```bash work # Directory containing the nextflow working files @@ -86,14 +186,13 @@ nextflow run phac-nml/viralassembly -profile docker -params-file params.yaml with `params.yaml` containing: ```yaml -input: './samplesheet.csv' +fastq_pass: './fastq_pass' +variant_caller: 'medaka' +medaka_model: 'r1041_e82_400bps_sup_v4.3.0' +reference: 'reference.fa' outdir: './results/' -genome: 'GRCh37' -<...> ``` -You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). - ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. 
To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -116,46 +215,117 @@ To further assist in reproducbility, you can use share and re-use [parameter fil If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. ::: -## Core Nextflow arguments +## Input Parameters +Use `--help` to see all options formatted on the command line + +Use `--version` to see version information + +### All Parameters +| Parameter | Description | Type | Default | Notes | +| - | - | - | - | - | +| --variant_caller | Pick from the 3 variant callers: 'clair3', 'medaka', 'nanopolish' | Choice | '' | Details above | +| --clair3_model | Clair3 base model to be used in the pipeline | Str | 'r941_prom_sup_g5014' | Default model will not work the best for all inputs. [See clair3 docs](https://github.com/HKU-BAL/Clair3#pre-trained-models) for additional info | +| --clair3_user_variant_model | Path to clair3 additional model directory to use instead of a base model | Path | '' | Default model will not work the best for all inputs. [See clair3 docs](https://github.com/HKU-BAL/Clair3#pre-trained-models) for additional info | +| --clair3_no_pool_split | Do not split reads into separate pools | Bool | False | Clair3 amplicon sequencing only | +| --medaka_model | Medaka model to be used in the pipeline | Str | 'r941_min_hac_g507' | Default model will not work the best for all inputs. 
[See medaka docs](https://github.com/nanoporetech/medaka#models) for additional info | +| --fastq_pass | Path to directory containing `barcode##` subdirectories | Path | null | | +| --fast5_pass | Path to directory containing `barcode##` fast5 subdirectories | Path | null | Only for nanopolish | +| --sequencing_summary | Path to run `sequencing_summary*.txt` file | Path | null | Only for nanopolish | +| --min_length | Minimum read length to be kept | Int | 200 | For artic guppyplex | +| --max_length | Maximum read length to be kept | Int | 3000 | For artic guppyplex | +| --min_reads | Minimum size selected reads to be used in pipeline | Int | 20 | | +| --reference | Specify the path to a reference fasta file to run pipeline without a primer scheme | Path | '' | Ignores all scheme inputs. See [schemes and reference](#schemes-and-reference) | +| --scheme | Name of the primer scheme to use | Str | '' | See [schemes and reference](#schemes-and-reference) | +| --scheme_version | Version name of primer scheme to use | Str | '' | See [schemes and reference](#schemes-and-reference) | +| --scheme_repo | Github repository URL to download scheme from | Str | 'https://github.com/artic-network/primer-schemes.git' | See [schemes and reference](#schemes-and-reference) | +| --local_scheme | Path to directory containing local scheme files | Path | null | See [schemes and reference](#schemes-and-reference) | +| --metadata | Path to metadata TSV file with columns 'sample' and 'barcode' | Path | null | See [metadata](#metadata) for more info | +| --use_artic_tool | Run the artic tool itself instead of nextflow implementation | Bool | False | Not available with clair3 | +| --normalise | Artic minion normalise coverage option | Int | 1000 | Entering `0` turns off normalisation. 
Only for amplicon sequencing | +| --no_frameshift | Use the Artic minion no frameshift vcf filter | Bool | False | Simple `%3 == 0` check for variants | +| --use_bwa | Use BWA instead of minimap2 for read mapping | Bool | False | | +| --skip_longshot | When running with `medaka`, skip running longshot | Bool | False | Medaka only!! | +| --skip_snpeff | Skip running SnpEff | Bool | False | | +| --gff | Path to gff3 formatted file to use in SnpEff database build | Path | False | Not required to run [SnpEff](#snpeff). See below for details | +| --skip_qc | Skip running all QC and reporting steps | Bool | false | | +| --custom_report | Run the custom HTML report | Bool | false | Currently requires the use of conda as there is not a singularity container yet | +| --pcr_primer_bed | Path to PCR primer bed file to check for mutations against | Path | null | For output QC checks | +| --neg_control_threshold | Coverage threshold at which to fail negative control samples | Float | 0.10 | | +| --neg_ctrl_substrings | Negative control sample substrings separated by a `,` | Str | 'ntc,neg,blank' | | +| --outdir | Directory name to output results to | Str | 'results' | | +| --cache | Specify a location to store conda/singularity envs/containers for reuse | Path | null | | + +### Schemes and Reference +Amplicon schemes are a highly targeted approach to sequencing focusing on a specific target genome. If using an amplicon scheme with this pipeline, either a local directory or a URL that contains the wanted primer scheme formatted according to the below information must be provided. + +If not running with an amplicon scheme, pass the `--reference ` argument with a reference fasta file and the pipeline will run without amplicon specific checks/outputs. 
+ +The primer scheme must contain: +- A reference genome fasta sequence titled `*reference.fasta` +- A primer bed file titled `*primer.bed` + - Minimum of 6 columns + - Primer pairs with names containing `_LEFT` and `_RIGHT` + - Primer pools + +Example Primer file: + +| MN908947.3 | 30 | 54 | nCoV-2019_1_LEFT | 1 | + | +| - | - | - | - | - | - | +| MN908947.3 | 1183 | 1205 | nCoV-2019_1_RIGHT | 1 | - | +| MN908947.3 | 1100 | 1128 | nCoV-2019_2_LEFT | 2 | + | +| MN908947.3 | 2244 | 2266 | nCoV-2019_2_RIGHT | 2 | - | +| ... | ... | ... | ... | ... | ... | +| REF ID | Start | Stop | Primer Name | Primer Pool | Direction + + +The directory structure must follow the basic structure as follows: +``` +primer-schemes +└── + └── + ├── reference.fasta + └── scheme.bed +``` -:::note -These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). -::: +Example for Sars-CoV2: +``` +primer-schemes +└── nCoV-2019 + ├── midnight + | ├── nCoV-2019.reference.fasta + | └── nCoV-2019.scheme.bed + └── V1 + ├── reference.fasta + └── scheme.bed +``` -### `-profile` +### Metadata +Input metadata is used to rename barcoded fastq files along with adding additional lines to the final overall QC csv file. Note that the metadata input is expected to be of a `TSV` format -Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. +Structure for example `metadata.tsv` file: -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. +| sample | barcode | \ | +| - | - | - | +| SR-1 | 1 | X | +| SR-2 | 02 | Y | +| NTC-12 | 12 | Z | -:::info -We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. 
-::: +### SnpEff +SnpEff is run by default on all non-segmented viruses (due to current implementation) by using the reference sequence ID to either: +1. Check if there is a SnpEff database available to download +2. Build a SnpEff database by downloading the sequence genbank file from NCBI + +Instead of relying on the reference ID to build/download a database, you can instead specify a gff3 file with `--gff ` to be used with the reference sequence to create the SnpEff database + +If building/downloading a database fails, the pipeline will skip over running SnpEff instead of failing out completely. + +SnpEff can also be skipped entirely by passing the `--skip_snpeff` parameter -The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). - -Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! -They are loaded in sequence, so later profiles can overwrite earlier profiles. - -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment. 
- -- `test` - - A profile with a complete configuration for automated testing - - Includes links to test data so needs no other parameters -- `docker` - - A generic configuration profile to be used with [Docker](https://docker.com/) -- `singularity` - - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) -- `podman` - - A generic configuration profile to be used with [Podman](https://podman.io/) -- `shifter` - - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) -- `charliecloud` - - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) -- `apptainer` - - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) -- `conda` - - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. +## Core Nextflow Arguments + +:::note +These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +::: ### `-resume` @@ -167,56 +337,3 @@ You can also supply a run name to resume a specific run: `-resume [run-name]`. U Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. -## Custom configuration - -### Resource requests - -Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. 
For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. - -To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. - -### Custom Containers - -In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. - -To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. - -### Custom Tool Arguments - -A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. - -To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. 
- -### nf-core/configs - -In most cases, you will only need to create a custom config as a one-off but if you and others within your organisation are likely to be running nf-core pipelines regularly and need to use the same settings regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter. You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. - -See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more information about creating your own configuration files. - -If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). - -## Azure Resource Requests - -To be used with the `azurebatch` profile by specifying the `-profile azurebatch`. -We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required. - -Note that the choice of VM size depends on your quota and the overall workload during the analysis. -For a thorough list, please refer the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes). - -## Running in the background - -Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. 
- -The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file. - -Alternatively, you can use `screen` / `tmux` or similar tool to create a detached session which you can log back into at a later time. -Some HPC setups also allow you to run nextflow within a cluster job submitted your job scheduler (from where it submits more jobs). - -## Nextflow memory requirements - -In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. -We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): - -```bash -NXF_OPTS='-Xms1g -Xmx4g' -``` diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index eac84b4..a39e317 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -89,12 +89,28 @@ class WorkflowMain { log.error("Please provide an input for --variant_caller with any of ['nanopolish', 'medaka', 'clair3']") System.exit(1) } + + //-- Scheme logic checks + //--- Basic + if ( ! params.reference && ! params.scheme ) { + log.error("Please provide either a reference or a scheme with: '--reference REF.fa' or '--scheme SCHEME'") + System.exit(1) + } else if ( params.reference && params.scheme ) { + log.error("Please provide either a reference or a scheme with: '--reference REF.fa' or '--scheme SCHEME'") + System.exit(1) + } + //--- Scheme info check + if ( params.scheme && ! params.scheme_version ) { + log.error("Please provide a scheme version with: '--scheme_version VERSION'") + System.exit(1) + } + //-- Data inputs if ( ! params.input && ! 
params.fastq_pass ) { - log.error("Please provide input data with either: '--input samplesheet.csv' or '--fastq_pass fastq_dir/'") + log.error("Please provide input data with either: '--input input.csv' or '--fastq_pass fastq_dir/'") System.exit(1) } else if ( params.input && params.fastq_pass ) { - log.error("Please provide input data with either: '--input samplesheet.csv' or '--fastq_pass fastq_dir/'") + log.error("Please provide input data with either: '--input input.csv' or '--fastq_pass fastq_dir/'") System.exit(1) } else if ( params.variant_caller == 'nanopolish' ) { if ( ! params.fast5_pass || ! params.sequencing_summary ) { diff --git a/modules.json b/modules.json index b78b32a..2b92998 100644 --- a/modules.json +++ b/modules.json @@ -1,5 +1,5 @@ { - "name": "viralassembly", + "name": "phac-nml/viralassembly", "homePage": "", "repos": { "https://github.com/nf-core/modules.git": { @@ -8,16 +8,12 @@ "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/flagstat": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, diff --git a/modules/local/multiqc/main.nf b/modules/local/multiqc/main.nf index 5992bc4..19e42de 100644 --- a/modules/local/multiqc/main.nf +++ b/modules/local/multiqc/main.nf @@ -13,7 +13,7 @@ process MULTIQC_SAMPLE { input: path multiqc_config - tuple val(meta), path(sample_csv), path(variation_csv), path(consensus_variant_tsv), path(qualimap_bamqc_data), path(amp_depth_tsv) + tuple val(meta), path(sample_csv), path(variation_csv), path(consensus_variant_tsv), path(qualimap_bamqc_data), path(nanostat_txt), path(amp_depth_tsv) output: path "*.html", emit: html @@ -46,6 +46,7 @@ process MULTIQC_OVERALL { path bcftools_stats path samtools_flagstats path qualimap_bamqc_data + path nanostat_data path snpeff_csvs path qc_csv 
path versions_yml @@ -55,6 +56,11 @@ process MULTIQC_OVERALL { script: """ - multiqc -f -k yaml --config $multiqc_config . + multiqc \\ + -f \\ + -k yaml \\ + --filename Overall-Run-MultiQC.report.html \\ + --config $multiqc_config \\ + . """ } diff --git a/nextflow.config b/nextflow.config index e9aa4bf..ccedf39 100644 --- a/nextflow.config +++ b/nextflow.config @@ -34,10 +34,10 @@ params { max_length = 3000 // Scheme options - reference_no_scheme = '' // Turns off scheme when passed - scheme = 'nCoV-2019' - scheme_version = 'freed_nml_test_V2' - scheme_repo = 'https://github.com/DarianHole/primer-schemes.git' + reference = '' // Turns off scheme when passed + scheme = '' + scheme_version = '' + scheme_repo = 'https://github.com/artic-network/primer-schemes.git' local_scheme = '' // Artic Minion options @@ -78,6 +78,7 @@ params { max_memory = '256.GB' max_cpus = 16 max_time = '120.h' + } // Load base.config by default for all pipelines @@ -226,7 +227,7 @@ dag { } manifest { - name = 'viralassembly' + name = 'phac-nml/viralassembly' author = """Darian Hole""" homePage = 'https://github.com/phac-nml/viralassembly' description = """Assemble and QC viral reads""" @@ -234,6 +235,7 @@ manifest { nextflowVersion = '!>=23.04.0' version = '1.0.0' doi = '' + defaultBranch = 'main' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index a964e78..4c30968 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -95,7 +95,8 @@ }, "clair3_user_variant_model": { "type": "string", - "description": "Path to local clair3 model folder" + "description": "Path to local clair3 model folder", + "format": "directory-path" }, "clair3_no_pool_split": { "type": "boolean", @@ -134,7 +135,7 @@ "description": "Options pertaining to the amplicon sequencing scheme. 
Scheme directories must include '*.scheme.bed' and '*.reference.fasta'", "default": "", "properties": { - "reference_no_scheme": { + "reference": { "type": "string", "description": "Path to local reference file to map to instead of using a primer scheme" }, diff --git a/subworkflows/local/create_custom_report.nf b/subworkflows/local/create_custom_report.nf index aec7db0..f323827 100644 --- a/subworkflows/local/create_custom_report.nf +++ b/subworkflows/local/create_custom_report.nf @@ -67,7 +67,7 @@ workflow WF_CREATE_CUSTOM_REPORT { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // ch_amplicon_coverage = Channel.empty() ch_amplicon_completeness = Channel.empty() - if ( ! params.reference_no_scheme ) { + if ( ! params.reference ) { // Coverage BEDTOOLS_COVERAGE_AMPLICON_BED( ch_bam, diff --git a/subworkflows/local/create_multiqc_reports.nf b/subworkflows/local/create_multiqc_reports.nf index 89d1c95..38575ff 100644 --- a/subworkflows/local/create_multiqc_reports.nf +++ b/subworkflows/local/create_multiqc_reports.nf @@ -47,6 +47,7 @@ workflow WF_CREATE_MULTIQC_REPORTS { ch_bam // channel: [ val(meta), path(bam), path(bai) ] ch_vcf // channel: [ val(meta), path(vcf) ] ch_sample_csv // channel: [ val(meta), path(csv) ] + ch_nanostats_stats // channel: [ val(meta), path(txt) ] ch_snpeff_csv // channel: [ val(meta), path(csv) ] || empty ch_reference // channel: [ path(reference) ] ch_amplicon_bed // channel: [ path(amplicon_bed) ] || empty @@ -68,7 +69,7 @@ workflow WF_CREATE_MULTIQC_REPORTS { // Amplicon analysis ch_amplicon_completeness = Channel.empty() - if ( ! params.reference_no_scheme ) { + if ( ! params.reference ) { // Coverage BEDTOOLS_COVERAGE_AMPLICON_BED( ch_bam, @@ -114,7 +115,7 @@ workflow WF_CREATE_MULTIQC_REPORTS { // If not using a scheme, need to correct the headers for qualimap by removing the empty RG // BAM channel also no longer needs bai file ch_bam = ch_bam.map { it -> tuple(it[0], it[1])} - if ( ! params.reference_no_scheme ) { + if ( ! 
params.reference ) { SAMTOOLS_REHEADER( ch_bam, "-c 'grep -v ^@RG'" @@ -139,6 +140,7 @@ workflow WF_CREATE_MULTIQC_REPORTS { .join(CREATE_READ_VARIATION_CSV.out.csv, by: [0]) .join(CREATE_VARIANT_TSV.out.tsv, by: [0]) .join(QUALIMAP_BAMQC.out.results, by: [0]) + .join(ch_nanostats_stats, by: [0]) .join(ch_sample_amplicon_depth, by: [0]) ) @@ -155,6 +157,9 @@ workflow WF_CREATE_MULTIQC_REPORTS { .collect{ it[1] }, QUALIMAP_BAMQC.out.results .collect{ it[1] }, + ch_nanostats_stats + .collect{ it[1] } + .ifEmpty([]), ch_snpeff_csv .collect{ it[1] } .ifEmpty([]), diff --git a/subworkflows/local/nanopore_shotgun.nf b/subworkflows/local/nanopore_shotgun.nf index 785fd46..dc3ecb9 100644 --- a/subworkflows/local/nanopore_shotgun.nf +++ b/subworkflows/local/nanopore_shotgun.nf @@ -8,8 +8,6 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // Other tools -include { CHOPPER } from '../../modules/local/chopper/main' -include { NANOSTAT } from '../../modules/local/nanostat/main' include { MINIMAP2_ALIGN } from '../../modules/local/minimap2/main' include { LONGSHOT } from '../../modules/local/longshot/main' include { BCFTOOLS_NORM } from '../../modules/local/bcftools/norm/main' @@ -52,23 +50,6 @@ workflow WF_NANOPORE_SHOTGUN { // Version tracking ch_versions = Channel.empty() - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // Preprocessing - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // Chopper may be useless as we already filter based on length earlier - // Or I have this workflow earlier in the main one? 
- // As we are not using amplicons filtering by length isn't as useful depending on if we expect a certain fragment length or not - CHOPPER( - ch_fastqs - ) - ch_versions = ch_versions.mix(CHOPPER.out.versions) - - // Stats - NANOSTAT( - CHOPPER.out.fastq - ) - ch_versions = ch_versions.mix(NANOSTAT.out.versions) - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // // Align // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf index 0f8b307..79fb6fd 100644 --- a/workflows/nanopore.nf +++ b/workflows/nanopore.nf @@ -12,6 +12,10 @@ include { RENAME_FASTQ } from '../modules/local/custom/utils.nf' include { TRACK_FILTERED_SAMPLES as TRACK_INITIAL_FILTERED_SAMPLES } from '../modules/local/custom/filtering.nf' include { TRACK_FILTERED_SAMPLES as TRACK_SIZE_FILTERED_SAMPLES } from '../modules/local/custom/filtering.nf' +// Read QC +include { CHOPPER } from '../modules/local/chopper/main' +include { NANOSTAT } from '../modules/local/nanostat/main' + // Artic related include { ARTIC_GUPPYPLEX } from '../modules/local/artic/guppyplex/main' include { ARTIC_MINION } from '../modules/local/artic/minion/main' @@ -43,7 +47,7 @@ ch_fast5s = params.fast5_pass ? file(params.fast5_pass, type: 'dir', checkIfExis ch_seqSum = params.sequencing_summary ? file(params.sequencing_summary, type: 'file', checkIfExists: true) : [] // Reference for if not using a scheme -ch_reference = params.reference_no_scheme ? Channel.value(file(params.reference_no_scheme, type: 'file', checkIfExists: true)) : [] +ch_reference = params.reference ? Channel.value(file(params.reference, type: 'file', checkIfExists: true)) : [] /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -64,7 +68,7 @@ workflow NANOPORE { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // ch_amplicon_bed = Channel.empty() ch_primer_bed = Channel.value([]) // This has to be a value channel for qc creation to work - if ( ! params.reference_no_scheme ) { + if ( ! 
params.reference ) { if ( ! ch_local_scheme ) { DOWNLOAD_SCHEME() ch_local_scheme = DOWNLOAD_SCHEME.out.scheme @@ -93,7 +97,7 @@ workflow NANOPORE { ch_versions = ch_versions.mix(GET_REF_STATS.out.versions) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // Read length and count filtering + // Read QC and Statistics // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // ARTIC_GUPPYPLEX( ch_fastqs @@ -113,18 +117,33 @@ workflow NANOPORE { .map{ fastq -> [ [id: fastq.baseName.replaceAll(~/\.fastq.*$/, '')], file(fastq) ] } .set{ ch_fastqs } } - // Pass/fail reads based on count after length filtering + + // Chopper may be useless as we already filter based on length earlier + // But it also does add quality filtering + CHOPPER( + ch_fastqs + ) + ch_versions = ch_versions.mix(CHOPPER.out.versions) + ch_fastqs = CHOPPER.out.fastq + + // Pass/fail reads based on count after length and quality filtering ch_fastqs .branch{ pass: it[1].countFastq() >= params.min_reads empty: it[1].countFastq() < params.min_reads }.set{ ch_filtered_fastqs } + // Stats on final reads + NANOSTAT( + ch_filtered_fastqs.pass + ) + ch_versions = ch_versions.mix(NANOSTAT.out.versions) + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // // Chose which pipeline to run based on input params // The "proper" artic minion pipeline or re-implemented nextflow version // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - if ( params.reference_no_scheme ) { + if ( params.reference ) { WF_NANOPORE_SHOTGUN( ch_filtered_fastqs.pass, ch_fast5s, @@ -254,6 +273,7 @@ workflow NANOPORE { ch_bam, ch_vcf, MAKE_SAMPLE_QC_CSV.out.csv, + NANOSTAT.out.stats, ch_snpeff_csv, ch_reference, ch_amplicon_bed,