Skip to content

Commit

Permalink
Merge pull request #133 from naobservatory/simon-ont-raw-clean
Browse files Browse the repository at this point in the history
Adding ONT functionality to RAW and CLEAN
  • Loading branch information
willbradshaw authored Jan 22, 2025
2 parents 95a3316 + 0eeabe3 commit 3c07823
Show file tree
Hide file tree
Showing 19 changed files with 148 additions and 23 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/end-to-end-se.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,8 @@ jobs:
wget -qO- https://get.nf-test.com | bash
sudo mv nf-test /usr/local/bin/
- name: Run run_dev_se workflow
run: nf-test test --tag run_dev_se --verbose
- name: Run run_dev_se workflow on single-end data
run: nf-test test --tag run_dev_se --verbose

- name: Run run_dev_se workflow on ONT data
run: nf-test test --tag run_dev_ont --verbose
8 changes: 7 additions & 1 deletion configs/containers.config
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ process {
container = "securebio/nao-pypkg"
}
withLabel: tidyverse {
container = "rocker/tidyverse:4.4.1"
container = "rocker/tidyverse:4.4.2"
}
withLabel: R {
container = "securebio/nao-rpkg"
Expand All @@ -70,4 +70,10 @@ process {
withLabel: fastp {
container = "staphb/fastp:0.23.4"
}
withLabel: filtlong {
container = "staphb/filtlong:0.2.1"
}
withLabel: atria {
container = "harmonb/atria:latest"
}
}
2 changes: 1 addition & 1 deletion configs/read_type.config
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
params {
// Whether the underlying data is paired-end or single-end
single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
}
}
3 changes: 3 additions & 0 deletions configs/run.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
params {
mode = "run"

// Sequencing platform
ont = false // Whether the sequencing is ONT (true) or Illumina (false)

// Directories
base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)
Expand Down
3 changes: 3 additions & 0 deletions configs/run_dev_se.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
params {
mode = "run_dev_se"

// Sequencing platform
ont = false // Whether the sequencing is ONT (true) or Illumina (false)

// Directories
base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)
Expand Down
15 changes: 15 additions & 0 deletions modules/local/filtlong/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
process FILTLONG {
label "small"
label "filtlong"
input:
tuple val(sample), path(reads)
output:
tuple val(sample), path("${sample}_filtlong.fastq.gz"), emit: reads
shell:
// Filter reads based on length (min 100 bp) and mean average base quality (min 99%, i.e, a Phred score of 20)
'''
o=!{sample}_filtlong.fastq.gz
i=!{reads[0]}
filtlong --min_length 100 --min_mean_q 99 --verbose ${i} | gzip > ${o}
'''
}
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
// Extract paired MultiQC data into a more usable form
process SUMMARIZE_MULTIQC_PAIR {
process SUMMARIZE_MULTIQC {
label "R"
label "single"
input:
tuple val(stage), val(sample), path(multiqc_data)
val(single_end)
output:
tuple path("${stage}_${sample}_qc_basic_stats.tsv.gz"), path("${stage}_${sample}_qc_adapter_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_base_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_sequence_stats.tsv.gz")
tuple path("${stage}_${sample}_qc_basic_stats.tsv.gz"), path("${stage}_${sample}_qc_adapter_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_base_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_sequence_stats.tsv.gz"), path("${stage}_${sample}_qc_length_stats.tsv.gz")
shell:
'''
summarize-multiqc-pair.R -i !{multiqc_data} -s !{stage} -S !{sample} -r !{single_end} -o ${PWD}
summarize-multiqc.R -i !{multiqc_data} -s !{stage} -S !{sample} -r !{single_end} -o ${PWD}
'''
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ out_path_basic <- file.path(opt$output_dir, paste0(id_out, "_qc_basic_stats.tsv.
out_path_adapters <- file.path(opt$output_dir, paste0(id_out, "_qc_adapter_stats.tsv.gz"))
out_path_quality_base <- file.path(opt$output_dir, paste0(id_out, "_qc_quality_base_stats.tsv.gz"))
out_path_quality_sequence <- file.path(opt$output_dir, paste0(id_out, "_qc_quality_sequence_stats.tsv.gz"))

out_path_lengths <- file.path(opt$output_dir, paste0(id_out, "_qc_length_stats.tsv.gz"))
#=====================#
# AUXILIARY FUNCTIONS #
#=====================#
Expand Down Expand Up @@ -96,6 +96,18 @@ extract_adapter_data_single <- function(adapter_dataset){
separate_wider_delim("filename", " - ", names=c("file", "adapter"))
return(data)
}

extract_length_data_single <- function(length_dataset){
# Convert a single JSON length dataset into a tibble
data <- lapply(1:length(length_dataset$name), function(n)
length_dataset$data[[n]] %>% as.data.frame %>%
mutate(filename=length_dataset$name[n])) %>%
bind_rows() %>% as_tibble %>%
rename(length=V1, n_sequences=V2) %>%
rename(file = filename)
return(data)
}

# NB: Current paired version can't distinguish or annotate forward vs reverse reads in these plots.
# TODO: Restore this functionality (will require workflow restructuring).

Expand All @@ -106,6 +118,13 @@ extract_adapter_data <- function(multiqc_json){
return(data_out)
}

extract_length_data <- function(multiqc_json){
# Extract length data from multiqc JSON
datasets <- multiqc_json$report_plot_data$fastqc_sequence_length_distribution_plot$datasets$lines
data_out <- lapply(datasets, extract_length_data_single) %>% bind_rows()
return(data_out)
}

extract_per_base_quality_single <- function(per_base_quality_dataset){
# Convert a single JSON per-base-quality dataset into a tibble
data <- lapply(1:length(per_base_quality_dataset$name), function(n)
Expand Down Expand Up @@ -154,6 +173,7 @@ fastqc_tsv <- readr::read_tsv(fastqc_tsv_path, show_col_types = FALSE)
add_info <- function(tab) mutate(tab, stage=opt$stage, sample=opt$sample)
basic_info <- basic_info_fastqc(fastqc_tsv, multiqc_json) %>% add_info
adapters <- extract_adapter_data(multiqc_json) %>% add_info
lengths <- extract_length_data(multiqc_json) %>% add_info
per_base_quality <- extract_per_base_quality(multiqc_json) %>% add_info
per_sequence_quality <- extract_per_sequence_quality(multiqc_json) %>% add_info

Expand All @@ -162,3 +182,4 @@ write_tsv(basic_info, out_path_basic)
write_tsv(adapters, out_path_adapters)
write_tsv(per_base_quality, out_path_quality_base)
write_tsv(per_sequence_quality, out_path_quality_sequence)
write_tsv(lengths, out_path_lengths)
19 changes: 11 additions & 8 deletions subworkflows/local/qc/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

include { FASTQC_LABELED } from "../../../modules/local/fastqc"
include { MULTIQC_LABELED } from "../../../modules/local/multiqc"
include { SUMMARIZE_MULTIQC_PAIR } from "../../../modules/local/summarizeMultiqcPair"
include { SUMMARIZE_MULTIQC } from "../../../modules/local/summarizeMultiqc"
include { MERGE_TSVS as MERGE_MULTIQC_BASIC } from "../../../modules/local/mergeTsvs"
include { MERGE_TSVS as MERGE_MULTIQC_ADAPT } from "../../../modules/local/mergeTsvs"
include { MERGE_TSVS as MERGE_MULTIQC_QBASE } from "../../../modules/local/mergeTsvs"
include { MERGE_TSVS as MERGE_MULTIQC_QSEQS } from "../../../modules/local/mergeTsvs"

include { MERGE_TSVS as MERGE_MULTIQC_LENGTHS } from "../../../modules/local/mergeTsvs"
/***********
| WORKFLOW |
***********/
Expand All @@ -27,21 +27,24 @@ workflow QC {
// 2. Extract data with MultiQC for each read file / pair of read files
multiqc_ch = MULTIQC_LABELED(stage_label, fastqc_ch.zip)
// 3. Summarize MultiQC information for each read file / pair of read files
process_ch = SUMMARIZE_MULTIQC_PAIR(multiqc_ch.data, single_end)
process_ch = SUMMARIZE_MULTIQC(multiqc_ch.data, single_end)
// 4. Collate MultiQC outputs
multiqc_basic_ch = process_ch.map{ it[0] }.collect().ifEmpty([])
multiqc_adapt_ch = process_ch.map{ it[1] }.collect().ifEmpty([])
multiqc_qbase_ch = process_ch.map{ it[2] }.collect().ifEmpty([])
multiqc_qseqs_ch = process_ch.map{ it[3] }.collect().ifEmpty([])
multiqc_lengths_ch = process_ch.map{ it[4] }.collect().ifEmpty([])
// 5. Merge MultiQC outputs
basic_out_ch = MERGE_MULTIQC_BASIC(multiqc_basic_ch, "${stage_label}_subset_qc_basic_stats")
adapt_out_ch = MERGE_MULTIQC_ADAPT(multiqc_adapt_ch, "${stage_label}_subset_qc_adapter_stats")
qbase_out_ch = MERGE_MULTIQC_QBASE(multiqc_qbase_ch, "${stage_label}_subset_qc_quality_base_stats")
qseqs_out_ch = MERGE_MULTIQC_QSEQS(multiqc_qseqs_ch, "${stage_label}_subset_qc_quality_sequence_stats")
basic_out_ch = MERGE_MULTIQC_BASIC(multiqc_basic_ch, "${stage_label}_qc_basic_stats")
adapt_out_ch = MERGE_MULTIQC_ADAPT(multiqc_adapt_ch, "${stage_label}_qc_adapter_stats")
qbase_out_ch = MERGE_MULTIQC_QBASE(multiqc_qbase_ch, "${stage_label}_qc_quality_base_stats")
qseqs_out_ch = MERGE_MULTIQC_QSEQS(multiqc_qseqs_ch, "${stage_label}_qc_quality_sequence_stats")
lengths_out_ch = MERGE_MULTIQC_LENGTHS(multiqc_lengths_ch, "${stage_label}_qc_length_stats")
// 6. Combine outputs into a single output channel
out_ch = basic_out_ch.combine(adapt_out_ch)
.combine(qbase_out_ch).combine(qseqs_out_ch)
.map({file1, file2, file3, file4 -> tuple(file1, file2, file3, file4)})
.combine(lengths_out_ch)
.map({file1, file2, file3, file4, file5 -> tuple(file1, file2, file3, file4, file5)})
emit:
qc = out_ch
}
4 changes: 4 additions & 0 deletions subworkflows/local/runQc/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ include { MERGE_TSVS as MERGE_MULTIQC_BASIC } from "../../../modules/local/merge
include { MERGE_TSVS as MERGE_MULTIQC_ADAPT } from "../../../modules/local/mergeTsvs"
include { MERGE_TSVS as MERGE_MULTIQC_QBASE } from "../../../modules/local/mergeTsvs"
include { MERGE_TSVS as MERGE_MULTIQC_QSEQS } from "../../../modules/local/mergeTsvs"
include { MERGE_TSVS as MERGE_MULTIQC_LENGTHS } from "../../../modules/local/mergeTsvs"

/***********
| WORKFLOW |
Expand All @@ -31,14 +32,17 @@ workflow RUN_QC {
multiqc_adapt_ch = qc_ch.map{ it[1] }.collect().ifEmpty([])
multiqc_qbase_ch = qc_ch.map{ it[2] }.collect().ifEmpty([])
multiqc_qseqs_ch = qc_ch.map{ it[3] }.collect().ifEmpty([])
multiqc_lengths_ch = qc_ch.map{ it[4] }.collect().ifEmpty([])
// 4. Merge MultiQC outputs
basic_out_ch = MERGE_MULTIQC_BASIC(multiqc_basic_ch, "qc_basic_stats")
adapt_out_ch = MERGE_MULTIQC_ADAPT(multiqc_adapt_ch, "qc_adapter_stats")
qbase_out_ch = MERGE_MULTIQC_QBASE(multiqc_qbase_ch, "qc_quality_base_stats")
qseqs_out_ch = MERGE_MULTIQC_QSEQS(multiqc_qseqs_ch, "qc_quality_sequence_stats")
lengths_out_ch = MERGE_MULTIQC_LENGTHS(multiqc_lengths_ch, "qc_length_stats")
emit:
qc_basic = basic_out_ch
qc_adapt = adapt_out_ch
qc_qbase = qbase_out_ch
qc_qseqs = qseqs_out_ch
qc_lengths = lengths_out_ch
}
18 changes: 14 additions & 4 deletions subworkflows/local/subsetTrim/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,20 @@ if (params.single_end) {
include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
include { SUBSET_READS_SINGLE_TARGET; SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
include { FASTP_SINGLE as FASTP } from "../../../modules/local/fastp"
if (params.ont) {
include { FILTLONG as FILTER_READS } from "../../../modules/local/filtlong"
} else {
include { FASTP_SINGLE as FILTER_READS } from "../../../modules/local/fastp"
}

} else {
include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
include { SUBSET_READS_PAIRED_TARGET; SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup"
include { FASTP_PAIRED as FASTP } from "../../../modules/local/fastp"
include { FASTP_PAIRED as FILTER_READS } from "../../../modules/local/fastp"
}


/***********
| WORKFLOW |
***********/
Expand Down Expand Up @@ -60,8 +66,12 @@ workflow SUBSET_TRIM {
}

// Call fastp adapter trimming
fastp_ch = FASTP(grouped_ch, adapter_path)
if (params.ont) {
trimmed_ch = FILTER_READS(grouped_ch)
} else {
trimmed_ch = FILTER_READS(grouped_ch, adapter_path)
}
emit:
subset_reads = grouped_ch
trimmed_subset_reads = fastp_ch.reads
trimmed_subset_reads = trimmed_ch.reads
}
5 changes: 3 additions & 2 deletions test-data/nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
params {
mode = "run"

// Sequencing platform
ont = false // Whether the sequencing is ONT (true) or Illumina (false)

// Directories
base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)



// Files
sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming
Expand Down
2 changes: 2 additions & 0 deletions test-data/ont-samplesheet.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sample,fastq
NAO-ONT-20240710-WW-RNA1-div0000,s3://nao-testing/ont-ww-test/NAO-ONT-20240710-WW-RNA1-div0000.fastq.gz
10 changes: 10 additions & 0 deletions tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,16 @@ nextflow_pipeline {
assert workflow.success
}
}

test("Test Oxford Nanopore run workflow") {
config "tests/run_dev_ont.config"
tag "run_dev_ont"

then {
assert workflow.success
}
}

test("Test validation workflow") {
config "tests/run_validation.config"
tag "validation"
Expand Down
3 changes: 3 additions & 0 deletions tests/run.config
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
params {
mode = "run"

// Sequencing platform
ont = false // Whether the sequencing is ONT (true) or Illumina (false)

// Directories
base_dir = "./" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
Expand Down
37 changes: 37 additions & 0 deletions tests/run_dev_ont.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/************************************************
| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW |
************************************************/

params {
mode = "run_dev_se"

// Sequencing platform
ont = true // Whether the sequencing is ONT (true) or Illumina (false)

// Directories
base_dir = "./" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)

// Files
sample_sheet = "${projectDir}/test-data/ont-samplesheet.csv" // Path to library TSV
adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming. Not used for ONT.

// Numerical
human_read_filtering = true // Whether to filter human reads
grouping = false // Whether to group samples by 'group' column in samplesheet
n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST)
kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
host_taxon = "vertebrate"

blast_db_prefix = "nt_others"
}

includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/read_type.config"
includeConfig "${projectDir}/configs/output.config"
3 changes: 3 additions & 0 deletions tests/run_dev_se.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
params {
mode = "run_dev_se"

// Sequencing platform
ont = false // Whether the sequencing is ONT (true) or Illumina (false)

// Directories
base_dir = "./" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
Expand Down
1 change: 1 addition & 0 deletions workflows/run.nf
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ workflow RUN {
RUN_QC.out.qc_adapt >> "results"
RUN_QC.out.qc_qbase >> "results"
RUN_QC.out.qc_qseqs >> "results"
RUN_QC.out.qc_lengths >> "results"
// Final results
EXTRACT_VIRAL_READS.out.tsv >> "results"
EXTRACT_VIRAL_READS.out.counts >> "results"
Expand Down
2 changes: 1 addition & 1 deletion workflows/run_dev_se.nf
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ workflow RUN_DEV_SE {
// Load kraken db path
kraken_db_path = "${params.ref_dir}/results/kraken_db"


// Count reads in files
COUNT_TOTAL_READS(samplesheet_ch)

Expand Down Expand Up @@ -68,6 +67,7 @@ workflow RUN_DEV_SE {
RUN_QC.out.qc_adapt >> "results"
RUN_QC.out.qc_qbase >> "results"
RUN_QC.out.qc_qseqs >> "results"
RUN_QC.out.qc_lengths >> "results"
// Final results
PROFILE.out.bracken >> "results"
PROFILE.out.kraken >> "results"
Expand Down

0 comments on commit 3c07823

Please sign in to comment.