Merge pull request #133 from naobservatory/simon-ont-raw-clean

Adding ONT functionality to RAW and CLEAN
naobservatory · Jan 22, 2025 · 3c07823 · 3c07823
2 parents 95a3316 + 0eeabe3
commit 3c07823
Show file tree

Hide file tree

Showing 19 changed files with 148 additions and 23 deletions.
diff --git a/.github/workflows/end-to-end-se.yml b/.github/workflows/end-to-end-se.yml
@@ -28,5 +28,8 @@ jobs:
           wget -qO- https://get.nf-test.com | bash
           sudo mv nf-test /usr/local/bin/
 
-      - name: Run run_dev_se workflow
-        run: nf-test test --tag run_dev_se --verbose
+      - name: Run run_dev_se workflow on single-end data
+        run: nf-test test --tag run_dev_se --verbose
+
+      - name: Run run_dev_se workflow on ONT data
+        run: nf-test test --tag run_dev_ont --verbose
diff --git a/configs/containers.config b/configs/containers.config
@@ -55,7 +55,7 @@ process {
         container = "securebio/nao-pypkg"
     }
     withLabel: tidyverse {
-        container = "rocker/tidyverse:4.4.1"
+        container = "rocker/tidyverse:4.4.2"
     }
     withLabel: R {
         container = "securebio/nao-rpkg"
@@ -70,4 +70,10 @@ process {
     withLabel: fastp {
         container = "staphb/fastp:0.23.4"
     }
+    withLabel: filtlong {
+        container = "staphb/filtlong:0.2.1"
+    }
+    withLabel: atria {
+        container = "harmonb/atria:latest"
+    }
 }
diff --git a/configs/read_type.config b/configs/read_type.config
@@ -3,4 +3,4 @@
 params {
     // Whether the underlying data is paired-end or single-end
     single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
-}
+}
diff --git a/configs/run.config b/configs/run.config
@@ -5,6 +5,9 @@
 params {
     mode = "run"
 
+    // Sequencing platform
+    ont = false // Whether the sequencing is ONT (true) or Illumina (false)
+
     // Directories
     base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)

diff --git a/configs/run_dev_se.config b/configs/run_dev_se.config
@@ -5,6 +5,9 @@
 params {
     mode = "run_dev_se"
 
+    // Sequencing platform
+    ont = false // Whether the sequencing is ONT (true) or Illumina (false)
+
     // Directories
     base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)

diff --git a/modules/local/filtlong/main.nf b/modules/local/filtlong/main.nf
@@ -0,0 +1,15 @@
+process FILTLONG {
+    label "small"
+    label "filtlong"
+    input:
+        tuple val(sample), path(reads)
+    output:
+        tuple val(sample), path("${sample}_filtlong.fastq.gz"), emit: reads
+    shell:
+        // Filter reads based on length (min 100 bp) and mean average base quality (min 99%, i.e, a Phred score of 20)
+        '''
+        o=!{sample}_filtlong.fastq.gz
+        i=!{reads[0]}
+        filtlong --min_length 100 --min_mean_q 99 --verbose ${i} | gzip > ${o}
+        '''
+}
diff --git a/modules/local/summarizeMultiqcPair/main.nf → modules/local/summarizeMultiqc/main.nf b/modules/local/summarizeMultiqcPair/main.nf → modules/local/summarizeMultiqc/main.nf
@@ -1,14 +1,14 @@
 // Extract paired MultiQC data into a more usable form
-process SUMMARIZE_MULTIQC_PAIR {
+process SUMMARIZE_MULTIQC {
     label "R"
     label "single"
     input:
         tuple val(stage), val(sample), path(multiqc_data)
         val(single_end)
     output:
-        tuple path("${stage}_${sample}_qc_basic_stats.tsv.gz"), path("${stage}_${sample}_qc_adapter_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_base_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_sequence_stats.tsv.gz")
+        tuple path("${stage}_${sample}_qc_basic_stats.tsv.gz"), path("${stage}_${sample}_qc_adapter_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_base_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_sequence_stats.tsv.gz"), path("${stage}_${sample}_qc_length_stats.tsv.gz")
     shell:
         '''
-        summarize-multiqc-pair.R -i !{multiqc_data} -s !{stage} -S !{sample} -r !{single_end} -o ${PWD}
+        summarize-multiqc.R -i !{multiqc_data} -s !{stage} -S !{sample} -r !{single_end} -o ${PWD}
         '''
 }
diff --git a/...esources/usr/bin/summarize-multiqc-pair.R → ...iqc/resources/usr/bin/summarize-multiqc.R b/...esources/usr/bin/summarize-multiqc-pair.R → ...iqc/resources/usr/bin/summarize-multiqc.R
@@ -41,7 +41,7 @@ out_path_basic <- file.path(opt$output_dir, paste0(id_out, "_qc_basic_stats.tsv.
 out_path_adapters <- file.path(opt$output_dir, paste0(id_out, "_qc_adapter_stats.tsv.gz"))
 out_path_quality_base <- file.path(opt$output_dir, paste0(id_out, "_qc_quality_base_stats.tsv.gz"))
 out_path_quality_sequence <- file.path(opt$output_dir, paste0(id_out, "_qc_quality_sequence_stats.tsv.gz"))
-
+out_path_lengths <- file.path(opt$output_dir, paste0(id_out, "_qc_length_stats.tsv.gz"))
 #=====================#
 # AUXILIARY FUNCTIONS #
 #=====================#
@@ -96,6 +96,18 @@ extract_adapter_data_single <- function(adapter_dataset){
     separate_wider_delim("filename", " - ", names=c("file", "adapter"))
   return(data)
 }
+
+extract_length_data_single <- function(length_dataset){
+  # Convert a single JSON length dataset into a tibble
+  data <- lapply(1:length(length_dataset$name), function(n)
+    length_dataset$data[[n]] %>% as.data.frame %>%
+      mutate(filename=length_dataset$name[n])) %>%
+    bind_rows() %>% as_tibble %>%
+    rename(length=V1, n_sequences=V2) %>%
+    rename(file = filename)
+  return(data)
+}
+
 # NB: Current paired version can't distinguish or annotate forward vs reverse reads in these plots.
 # TODO: Restore this functionality (will require workflow restructuring).
 
@@ -106,6 +118,13 @@ extract_adapter_data <- function(multiqc_json){
   return(data_out)
 }
 
+extract_length_data <- function(multiqc_json){
+  # Extract length data from multiqc JSON
+  datasets <- multiqc_json$report_plot_data$fastqc_sequence_length_distribution_plot$datasets$lines
+  data_out <- lapply(datasets, extract_length_data_single) %>% bind_rows()
+  return(data_out)
+}
+
 extract_per_base_quality_single <- function(per_base_quality_dataset){
   # Convert a single JSON per-base-quality dataset into a tibble
   data <- lapply(1:length(per_base_quality_dataset$name), function(n)
@@ -154,6 +173,7 @@ fastqc_tsv <- readr::read_tsv(fastqc_tsv_path, show_col_types = FALSE)
 add_info <- function(tab) mutate(tab, stage=opt$stage, sample=opt$sample)
 basic_info <- basic_info_fastqc(fastqc_tsv, multiqc_json) %>% add_info
 adapters <- extract_adapter_data(multiqc_json) %>% add_info
+lengths <- extract_length_data(multiqc_json) %>% add_info
 per_base_quality <- extract_per_base_quality(multiqc_json) %>% add_info
 per_sequence_quality <- extract_per_sequence_quality(multiqc_json) %>% add_info
 
@@ -162,3 +182,4 @@ write_tsv(basic_info, out_path_basic)
 write_tsv(adapters, out_path_adapters)
 write_tsv(per_base_quality, out_path_quality_base)
 write_tsv(per_sequence_quality, out_path_quality_sequence)
+write_tsv(lengths, out_path_lengths)
diff --git a/subworkflows/local/qc/main.nf b/subworkflows/local/qc/main.nf
@@ -4,12 +4,12 @@
 
 include { FASTQC_LABELED } from "../../../modules/local/fastqc"
 include { MULTIQC_LABELED } from "../../../modules/local/multiqc"
-include { SUMMARIZE_MULTIQC_PAIR } from "../../../modules/local/summarizeMultiqcPair"
+include { SUMMARIZE_MULTIQC } from "../../../modules/local/summarizeMultiqc"
 include { MERGE_TSVS as MERGE_MULTIQC_BASIC } from "../../../modules/local/mergeTsvs"
 include { MERGE_TSVS as MERGE_MULTIQC_ADAPT } from "../../../modules/local/mergeTsvs"
 include { MERGE_TSVS as MERGE_MULTIQC_QBASE } from "../../../modules/local/mergeTsvs"
 include { MERGE_TSVS as MERGE_MULTIQC_QSEQS } from "../../../modules/local/mergeTsvs"
-
+include { MERGE_TSVS as MERGE_MULTIQC_LENGTHS } from "../../../modules/local/mergeTsvs"
 /***********
 | WORKFLOW |
 ***********/
@@ -27,21 +27,24 @@ workflow QC {
         // 2. Extract data with MultiQC for each read file / pair of read files
         multiqc_ch = MULTIQC_LABELED(stage_label, fastqc_ch.zip)
         // 3. Summarize MultiQC information for each read file / pair of read files
-        process_ch = SUMMARIZE_MULTIQC_PAIR(multiqc_ch.data, single_end)
+        process_ch = SUMMARIZE_MULTIQC(multiqc_ch.data, single_end)
         // 4. Collate MultiQC outputs
         multiqc_basic_ch = process_ch.map{ it[0] }.collect().ifEmpty([])
         multiqc_adapt_ch = process_ch.map{ it[1] }.collect().ifEmpty([])
         multiqc_qbase_ch = process_ch.map{ it[2] }.collect().ifEmpty([])
         multiqc_qseqs_ch = process_ch.map{ it[3] }.collect().ifEmpty([])
+        multiqc_lengths_ch = process_ch.map{ it[4] }.collect().ifEmpty([])
         // 5. Merge MultiQC outputs
-        basic_out_ch = MERGE_MULTIQC_BASIC(multiqc_basic_ch, "${stage_label}_subset_qc_basic_stats")
-        adapt_out_ch = MERGE_MULTIQC_ADAPT(multiqc_adapt_ch, "${stage_label}_subset_qc_adapter_stats")
-        qbase_out_ch = MERGE_MULTIQC_QBASE(multiqc_qbase_ch, "${stage_label}_subset_qc_quality_base_stats")
-        qseqs_out_ch = MERGE_MULTIQC_QSEQS(multiqc_qseqs_ch, "${stage_label}_subset_qc_quality_sequence_stats")
+        basic_out_ch = MERGE_MULTIQC_BASIC(multiqc_basic_ch, "${stage_label}_qc_basic_stats")
+        adapt_out_ch = MERGE_MULTIQC_ADAPT(multiqc_adapt_ch, "${stage_label}_qc_adapter_stats")
+        qbase_out_ch = MERGE_MULTIQC_QBASE(multiqc_qbase_ch, "${stage_label}_qc_quality_base_stats")
+        qseqs_out_ch = MERGE_MULTIQC_QSEQS(multiqc_qseqs_ch, "${stage_label}_qc_quality_sequence_stats")
+        lengths_out_ch = MERGE_MULTIQC_LENGTHS(multiqc_lengths_ch, "${stage_label}_qc_length_stats")
         // 6. Combine outputs into a single output channel
         out_ch = basic_out_ch.combine(adapt_out_ch)
             .combine(qbase_out_ch).combine(qseqs_out_ch)
-            .map({file1, file2, file3, file4 -> tuple(file1, file2, file3, file4)})
+            .combine(lengths_out_ch)
+            .map({file1, file2, file3, file4, file5 -> tuple(file1, file2, file3, file4, file5)})
     emit:
         qc = out_ch
 }
diff --git a/subworkflows/local/runQc/main.nf b/subworkflows/local/runQc/main.nf
@@ -8,6 +8,7 @@ include { MERGE_TSVS as MERGE_MULTIQC_BASIC } from "../../../modules/local/merge
 include { MERGE_TSVS as MERGE_MULTIQC_ADAPT } from "../../../modules/local/mergeTsvs"
 include { MERGE_TSVS as MERGE_MULTIQC_QBASE } from "../../../modules/local/mergeTsvs"
 include { MERGE_TSVS as MERGE_MULTIQC_QSEQS } from "../../../modules/local/mergeTsvs"
+include { MERGE_TSVS as MERGE_MULTIQC_LENGTHS } from "../../../modules/local/mergeTsvs"
 
 /***********
 | WORKFLOW |
@@ -31,14 +32,17 @@ workflow RUN_QC {
       multiqc_adapt_ch = qc_ch.map{ it[1] }.collect().ifEmpty([])
       multiqc_qbase_ch = qc_ch.map{ it[2] }.collect().ifEmpty([])
       multiqc_qseqs_ch = qc_ch.map{ it[3] }.collect().ifEmpty([])
+      multiqc_lengths_ch = qc_ch.map{ it[4] }.collect().ifEmpty([])
       // 4. Merge MultiQC outputs
       basic_out_ch = MERGE_MULTIQC_BASIC(multiqc_basic_ch, "qc_basic_stats")
       adapt_out_ch = MERGE_MULTIQC_ADAPT(multiqc_adapt_ch, "qc_adapter_stats")
       qbase_out_ch = MERGE_MULTIQC_QBASE(multiqc_qbase_ch, "qc_quality_base_stats")
       qseqs_out_ch = MERGE_MULTIQC_QSEQS(multiqc_qseqs_ch, "qc_quality_sequence_stats")
+      lengths_out_ch = MERGE_MULTIQC_LENGTHS(multiqc_lengths_ch, "qc_length_stats")
     emit:
       qc_basic = basic_out_ch
       qc_adapt = adapt_out_ch
       qc_qbase = qbase_out_ch
       qc_qseqs = qseqs_out_ch
+      qc_lengths = lengths_out_ch
 }
diff --git a/subworkflows/local/subsetTrim/main.nf b/subworkflows/local/subsetTrim/main.nf
@@ -6,14 +6,20 @@ if (params.single_end) {
     include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
     include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
     include { SUBSET_READS_SINGLE_TARGET; SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
-    include { FASTP_SINGLE as FASTP } from "../../../modules/local/fastp"
+    if (params.ont) {
+        include { FILTLONG as FILTER_READS } from "../../../modules/local/filtlong"
+    } else {
+        include { FASTP_SINGLE as FILTER_READS } from "../../../modules/local/fastp"
+    }
+
 } else {
     include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
     include { SUBSET_READS_PAIRED_TARGET; SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
     include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup"
-    include { FASTP_PAIRED as FASTP } from "../../../modules/local/fastp"
+    include { FASTP_PAIRED as FILTER_READS } from "../../../modules/local/fastp"
 }
 
+
 /***********
 | WORKFLOW |
 ***********/
@@ -60,8 +66,12 @@ workflow SUBSET_TRIM {
         }
 
         // Call fastp adapter trimming
-        fastp_ch = FASTP(grouped_ch, adapter_path)
+        if (params.ont) {
+            trimmed_ch = FILTER_READS(grouped_ch)
+        } else {
+            trimmed_ch = FILTER_READS(grouped_ch, adapter_path)
+        }
     emit:
         subset_reads = grouped_ch
-        trimmed_subset_reads = fastp_ch.reads
+        trimmed_subset_reads = trimmed_ch.reads
 }
diff --git a/test-data/nextflow.config b/test-data/nextflow.config
@@ -5,12 +5,13 @@
 params {
     mode = "run"
 
+    // Sequencing platform
+    ont = false // Whether the sequencing is ONT (true) or Illumina (false)
+
     // Directories
     base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)
 
-
-
     // Files
     sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
     adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming

diff --git a/test-data/ont-samplesheet.csv b/test-data/ont-samplesheet.csv
@@ -0,0 +1,2 @@
+sample,fastq
+NAO-ONT-20240710-WW-RNA1-div0000,s3://nao-testing/ont-ww-test/NAO-ONT-20240710-WW-RNA1-div0000.fastq.gz
diff --git a/tests/main.nf.test b/tests/main.nf.test
@@ -27,6 +27,16 @@ nextflow_pipeline {
             assert workflow.success
         }
     }
+
+    test("Test Oxford Nanopore run workflow") {
+        config "tests/run_dev_ont.config"
+        tag "run_dev_ont"
+
+        then {
+            assert workflow.success
+        }
+    }
+
     test("Test validation workflow") {
         config "tests/run_validation.config"
         tag "validation"

diff --git a/tests/run.config b/tests/run.config
@@ -7,6 +7,9 @@
 params {
     mode = "run"
 
+    // Sequencing platform
+    ont = false // Whether the sequencing is ONT (true) or Illumina (false)
+
     // Directories
     base_dir = "./" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)

diff --git a/tests/run_dev_ont.config b/tests/run_dev_ont.config
@@ -0,0 +1,37 @@
+/************************************************
+| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW |
+************************************************/
+
+params {
+    mode = "run_dev_se"
+
+    // Sequencing platform
+    ont = true // Whether the sequencing is ONT (true) or Illumina (false)
+
+    // Directories
+    base_dir = "./" // Parent for working and output directories (can be S3)
+    ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
+
+    // Files
+    sample_sheet = "${projectDir}/test-data/ont-samplesheet.csv" // Path to library TSV
+    adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming. Not used for ONT.
+
+    // Numerical
+    human_read_filtering = true // Whether to filter human reads
+    grouping = false // Whether to group samples by 'group' column in samplesheet
+    n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
+    n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
+    bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
+    blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST)
+    kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
+    quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
+    fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
+    host_taxon = "vertebrate"
+
+    blast_db_prefix = "nt_others"
+}
+
+includeConfig "${projectDir}/configs/containers.config"
+includeConfig "${projectDir}/configs/profiles.config"
+includeConfig "${projectDir}/configs/read_type.config"
+includeConfig "${projectDir}/configs/output.config"
diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
@@ -5,6 +5,9 @@
 params {
     mode = "run_dev_se"
 
+    // Sequencing platform
+    ont = false // Whether the sequencing is ONT (true) or Illumina (false)
+
     // Directories
     base_dir = "./" // Parent for working and output directories (can be S3)
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)

diff --git a/workflows/run.nf b/workflows/run.nf
@@ -90,6 +90,7 @@ workflow RUN {
         RUN_QC.out.qc_adapt >> "results"
         RUN_QC.out.qc_qbase >> "results"
         RUN_QC.out.qc_qseqs >> "results"
+        RUN_QC.out.qc_lengths >> "results"
         // Final results
         EXTRACT_VIRAL_READS.out.tsv >> "results"
         EXTRACT_VIRAL_READS.out.counts >> "results"

diff --git a/workflows/run_dev_se.nf b/workflows/run_dev_se.nf
@@ -31,7 +31,6 @@ workflow RUN_DEV_SE {
     // Load kraken db path
     kraken_db_path = "${params.ref_dir}/results/kraken_db"
 
-
     // Count reads in files
     COUNT_TOTAL_READS(samplesheet_ch)
 
@@ -68,6 +67,7 @@ workflow RUN_DEV_SE {
         RUN_QC.out.qc_adapt >> "results"
         RUN_QC.out.qc_qbase >> "results"
         RUN_QC.out.qc_qseqs >> "results"
+        RUN_QC.out.qc_lengths >> "results"
         // Final results
         PROFILE.out.bracken >> "results"
         PROFILE.out.kraken >> "results"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		sample,fastq
		NAO-ONT-20240710-WW-RNA1-div0000,s3://nao-testing/ont-ww-test/NAO-ONT-20240710-WW-RNA1-div0000.fastq.gz