Adding single-read functionality to EXTRACT_VIRAL_READS #126

Closed
wants to merge 4 commits into from
31 changes: 30 additions & 1 deletion modules/local/bbduk/main.nf
@@ -61,7 +61,7 @@ process BBDUK_SINGLE {
}

// Detection and removal of contaminant reads (use minkmerhits instead of minkmerfraction)
process BBDUK_HITS {
process BBDUK_HITS_PAIRED {
label "large"
label "BBTools"
input:
@@ -93,6 +93,35 @@ process BBDUK_HITS {
'''
}

process BBDUK_HITS_SINGLE {
label "large"
label "BBTools"
input:
tuple val(sample), path(reads)
path(contaminant_ref)
val(min_kmer_hits)
val(k)
val(suffix)
output:
tuple val(sample), path("${sample}_${suffix}_bbduk_pass.fastq.gz"), emit: reads
tuple val(sample), path("${sample}_${suffix}_bbduk_fail.fastq.gz"), emit: fail
tuple val(sample), path("${sample}_${suffix}_bbduk.stats.txt"), emit: log
shell:
'''
# Define input/output
in=!{reads}
op=!{sample}_!{suffix}_bbduk_pass.fastq.gz
of=!{sample}_!{suffix}_bbduk_fail.fastq.gz
stats=!{sample}_!{suffix}_bbduk.stats.txt
ref=!{contaminant_ref}
io="in=${in} ref=${ref} out=${op} outm=${of} stats=${stats}"
# Define parameters
par="minkmerhits=!{min_kmer_hits} k=!{k} t=!{task.cpus} -Xmx!{task.memory.toGiga()}g"
# Execute
bbduk.sh ${io} ${par}
'''
}
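For orientation, this is roughly the bbduk.sh call the shell block above assembles once the variables are interpolated; the sample name, suffix, reference file, k-mer settings, and resource values below are hypothetical placeholders rather than values fixed by this PR:

bbduk.sh in=sample1.fastq.gz ref=contaminants.fasta out=sample1_viral_bbduk_pass.fastq.gz outm=sample1_viral_bbduk_fail.fastq.gz stats=sample1_viral_bbduk.stats.txt minkmerhits=1 k=24 t=8 -Xmx16g

Compared with BBDUK_HITS_PAIRED, presumably only a single in= read file is supplied per sample; the minkmerhits/k filtering parameters are passed through unchanged.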

// Masking contaminant kmers in a sequence database
process BBDUK_MASK {
label "large"
27 changes: 26 additions & 1 deletion modules/local/bowtie2/main.nf
@@ -1,5 +1,5 @@
// Run Bowtie2 and return mapped and unmapped reads
process BOWTIE2 {
process BOWTIE2_PAIRED {
label "Bowtie2"
label "large"
input:
@@ -25,6 +25,31 @@ process BOWTIE2 {
'''
}

process BOWTIE2_SINGLE {
label "Bowtie2"
label "large"
input:
tuple val(sample), path(reads)
path(index_dir)
val(par_string)
val(suffix)
output:
tuple val(sample), path("${sample}_${suffix}_bowtie2_mapped.sam"), emit: sam
tuple val(sample), path("${sample}_${suffix}_bowtie2_aligned.fastq.gz"), emit: reads_conc
tuple val(sample), path("${sample}_${suffix}_bowtie2_unaligned.fastq.gz"), emit: reads_unconc
shell:
'''
in=!{reads}
idx="!{index_dir}/bt2_index"
sam="!{sample}_!{suffix}_bowtie2_mapped.sam"
alc="!{sample}_!{suffix}_bowtie2_aligned.fastq.gz"
unc="!{sample}_!{suffix}_bowtie2_unaligned.fastq.gz"
io="-U ${in} -x ${idx} -S ${sam} --al-gz ${alc} --un-gz ${unc}"
par="--threads !{task.cpus} --local --very-sensitive-local !{par_string}"
bowtie2 ${par} ${io}
'''
}
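As a sketch, the assembled command looks roughly like the following, with hypothetical sample and index names and thread count, and with any extra options supplied through par_string appended after --very-sensitive-local:

bowtie2 --threads 8 --local --very-sensitive-local -U sample1.fastq.gz -x index_dir/bt2_index -S sample1_viral_bowtie2_mapped.sam --al-gz sample1_viral_bowtie2_aligned.fastq.gz --un-gz sample1_viral_bowtie2_unaligned.fastq.gz

Note that -U, --al-gz and --un-gz are Bowtie2's single-read counterparts of the paired -1/-2, --al-conc-gz and --un-conc-gz options; the reads_conc/reads_unconc emit names appear to be kept so the channel wiring matches the paired process.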

// Generate a Bowtie2 index from an input file
process BOWTIE2_INDEX {
label "Bowtie2"
31 changes: 29 additions & 2 deletions modules/local/cutadapt/main.nf
@@ -1,4 +1,4 @@
process CUTADAPT {
process CUTADAPT_PAIRED {
label "cutadapt"
label "large"
input:
@@ -25,6 +25,33 @@ process CUTADAPT {
'''
}

process CUTADAPT_SINGLE {
label "cutadapt"
label "large"
input:
// single read file
tuple val(sample), path(read)
path(adapters)
output:
tuple val(sample), path("${sample}_cutadapt.fastq.gz"), emit: reads
tuple val(sample), path("${sample}_cutadapt_log.txt"), emit: log
shell:
/* Explanation of cutadapt parameters:
-b to trim any adapter in the adapters file from either end of the read
-j number of cpu cores
-m to drop reads that are <20 bp after trimming
-e 0.33 to allow up to a 33% error rate in the matching region between an adapter
and the read
--action=trim to trim adapters and up/downstream sequence
*/
'''
par="-b file:!{adapters} -j !{task.cpus} -m 20 -e 0.33 --action=trim"
out="-o !{sample}_cutadapt.fastq.gz"
log="!{sample}_cutadapt_log.txt"
cutadapt ${par} ${out} !{read} > ${log}
'''
}
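With the parameters explained in the comment above filled in for a hypothetical sample and adapter file, the single-end invocation is roughly:

cutadapt -b file:adapters.fasta -j 8 -m 20 -e 0.33 --action=trim -o sample1_cutadapt.fastq.gz sample1.fastq.gz > sample1_cutadapt_log.txt

Using -b (rather than -g/-a) lets cutadapt match each adapter at either end of the read, which is the "either end" behaviour described in the comment.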

process CUTADAPT_MASK {
label "cutadapt"
label "large"
@@ -36,7 +63,7 @@ process CUTADAPT_MASK {
path("${label}_cutadapt_masked.fasta.gz"), emit: masked
path("${label}_cutadapt_log.txt"), emit: log
shell:
'''
'''
out=!{label}_cutadapt_masked.fasta.gz
log=!{label}_cutadapt_log.txt
cutadapt --action=mask -b file:!{adapters} -j !{task.cpus} -e 0.2 -o ${out} !{seq_db} > ${log}
@@ -285,6 +285,16 @@ def process_paired_sam(sam_path, out_path, genbank_metadata, viral_taxids):

# TODO: Process unpaired SAM

def process_single_sam(sam_path, out_path, genbank_metadata, viral_taxids):
    """Process an unpaired SAM file into a TSV."""
    with open_by_suffix(sam_path) as inf, open_by_suffix(out_path, "w") as outf:
        head = write_sam_headers_paired(outf) # Write headers (reuses the paired-header helper)
        # Process file & generate content
        for line in inf:
            fwd_dict = process_sam_alignment(line, genbank_metadata, viral_taxids, False)
            # Write one TSV row per alignment; this assumes process_sam_alignment
            # returns a dict keyed by the header fields written above
            outf.write("\t".join(str(fwd_dict[h]) for h in head) + "\n")


# Main function
def main():
# Parse arguments
@@ -327,7 +337,7 @@ def main():
if paired:
process_paired_sam(sam_path, out_path, gid_taxid_dict, virus_taxa)
else:
process_unpaired_sam(sam_path, out_path, gid_taxid_dict, virus_taxa)
process_single_sam(sam_path, out_path, gid_taxid_dict, virus_taxa)
print_log("File processed.")
# Finish time tracking
end_time = time.time()
37 changes: 36 additions & 1 deletion modules/local/trimmomatic/main.nf
@@ -1,4 +1,4 @@
process TRIMMOMATIC {
process TRIMMOMATIC_PAIRED {
label "trimmomatic"
label "large"
input:
@@ -36,3 +36,38 @@ process TRIMMOMATIC {
${cmd}
'''
}


process TRIMMOMATIC_SINGLE {
label "trimmomatic"
label "large"
input:
// single read file
tuple val(sample), path(read)
path(adapters)
val(encoding)
output:
tuple val(sample), path("${sample}_trimmomatic.fastq.gz"), emit: reads
tuple val(sample), path("${sample}_trimmomatic_{trimlog,summary}.txt"), emit: log
shell:
/* Explanation of trimmomatic filters:
ILLUMINACLIP: cut Illumina adapters
* max 2 mismatches in a match
* min match quality of 8 between adapter and read
* adapters must be >= 5 bases
LEADING/TRAILING:10 remove leading/trailing bases with quality < 10
SLIDINGWINDOW:4:15 cut the read after the average quality of a 4-base
window falls below 15
MINLEN:20 drop reads below 20 bases
*/
'''
out=!{sample}_trimmomatic.fastq.gz
log=!{sample}_trimmomatic_trimlog.txt
sum=!{sample}_trimmomatic_summary.txt
io="-trimlog ${log} -summary ${sum} !{read} ${out}"
par="ILLUMINACLIP:!{adapters}:2:8:5 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:15 MINLEN:20"
cmd="trimmomatic SE -!{encoding} -threads !{task.cpus} ${io} ${par}"
echo ${cmd}
${cmd}
'''
}
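Expanded with a hypothetical phred33-encoded sample, the command echoed and executed by the block above is roughly:

trimmomatic SE -phred33 -threads 8 -trimlog sample1_trimmomatic_trimlog.txt -summary sample1_trimmomatic_summary.txt sample1.fastq.gz sample1_trimmomatic.fastq.gz ILLUMINACLIP:adapters.fa:2:8:5 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:15 MINLEN:20

In SE mode Trimmomatic takes one input and one output file, so the four paired/unpaired outputs of PE mode collapse into the single _trimmomatic.fastq.gz emitted here.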
20 changes: 13 additions & 7 deletions subworkflows/local/extractViralReads/main.nf
@@ -2,12 +2,6 @@
| MODULES AND SUBWORKFLOWS |
***************************/

include { BBDUK_HITS } from "../../../modules/local/bbduk"
include { CUTADAPT } from "../../../modules/local/cutadapt"
include { TRIMMOMATIC } from "../../../modules/local/trimmomatic"
include { BOWTIE2 as BOWTIE2_VIRUS } from "../../../modules/local/bowtie2"
include { BOWTIE2 as BOWTIE2_HUMAN } from "../../../modules/local/bowtie2"
include { BOWTIE2 as BOWTIE2_OTHER } from "../../../modules/local/bowtie2"
include { PROCESS_VIRAL_BOWTIE2_SAM } from "../../../modules/local/processViralBowtie2Sam"
include { COUNT_ALIGNMENT_DUPLICATES } from "../../../modules/local/countAlignmentDuplicates"
include { EXTRACT_UNCONC_READ_IDS } from "../../../modules/local/extractUnconcReadIDs"
@@ -29,8 +23,20 @@ include { COUNT_VIRUS_CLADES } from "../../../modules/local/countVirusClades"
include { COUNT_VIRUS_CLADES } from "../../../modules/local/countVirusClades"
if (params.single_end) {
include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
include { BBDUK_HITS_SINGLE as BBDUK_HITS } from "../../../modules/local/bbduk"
include { CUTADAPT_SINGLE as CUTADAPT } from "../../../modules/local/cutadapt"
include { TRIMMOMATIC_SINGLE as TRIMMOMATIC } from "../../../modules/local/trimmomatic"
include { BOWTIE2_SINGLE as BOWTIE2_VIRUS } from "../../../modules/local/bowtie2"
include { BOWTIE2_SINGLE as BOWTIE2_HUMAN } from "../../../modules/local/bowtie2"
include { BOWTIE2_SINGLE as BOWTIE2_OTHER } from "../../../modules/local/bowtie2"
} else {
include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup"
include { BBDUK_HITS_PAIRED as BBDUK_HITS } from "../../../modules/local/bbduk"
include { CUTADAPT_PAIRED as CUTADAPT } from "../../../modules/local/cutadapt"
include { TRIMMOMATIC_PAIRED as TRIMMOMATIC } from "../../../modules/local/trimmomatic"
include { BOWTIE2_PAIRED as BOWTIE2_VIRUS } from "../../../modules/local/bowtie2"
include { BOWTIE2_PAIRED as BOWTIE2_HUMAN } from "../../../modules/local/bowtie2"
include { BOWTIE2_PAIRED as BOWTIE2_OTHER } from "../../../modules/local/bowtie2"
}
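Both branches alias the selected modules to the same names (BBDUK_HITS, CUTADAPT, TRIMMOMATIC, BOWTIE2_VIRUS/HUMAN/OTHER), so the workflow body below is unchanged. A minimal usage sketch for selecting the single-end path at launch (only the --single_end parameter comes from this PR; the run target is illustrative):

nextflow run . --single_end true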

/***********
@@ -100,7 +106,7 @@ workflow EXTRACT_VIRAL_READS {
kraken_output_ch = PROCESS_KRAKEN_VIRAL(tax_ch.kraken_output, virus_db_path, host_taxon)
bowtie2_kraken_merged_ch = MERGE_SAM_KRAKEN(kraken_output_ch.combine(bowtie2_sam_ch, by: 0))
merged_ch = MERGE_TSVS_BOWTIE2_KRAKEN(bowtie2_kraken_merged_ch.collect().ifEmpty([]), "bowtie2_kraken_merged")
merged_bbmerge_results = MERGE_TSVS_BBMERGE(tax_ch.bbmerge_summary.collect().ifEmpty([]), "bbmerge")
merged_bbmerge_results = MERGE_TSVS_BBMERGE(tax_ch.bbmerge_summary, "bbmerge") // was: tax_ch.bbmerge_summary.collect().ifEmpty([])
merged_dedup_results = MERGE_TSVS_DEDUP(tax_ch.dedup_summary.collect().ifEmpty([]), "dedup")
merged_alignment_dup_results = MERGE_TSVS_ALIGNMENT_DUPLICATES(alignment_dup_summary.collect().ifEmpty([]), "alignment_duplicates")
// Filter and process putative hit TSV
3 changes: 2 additions & 1 deletion subworkflows/local/profile/main.nf
@@ -10,16 +10,17 @@
if (params.single_end) {
include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk"
include { BBDUK_HITS_SINGLE as BBDUK_HITS } from "../../../modules/local/bbduk"
include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
include { SUBSET_READS_SINGLE_TARGET; SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
} else {
include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
include { SUBSET_READS_PAIRED_TARGET; SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk"
include { BBDUK_HITS_PAIRED as BBDUK_HITS } from "../../../modules/local/bbduk"
include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup"
}

include { BBDUK_HITS } from "../../../modules/local/bbduk"
include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy"
include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy"
include { MERGE_TAXONOMY_RIBO } from "../../../modules/local/mergeTaxonomyRibo"
12 changes: 12 additions & 0 deletions workflows/run_dev_se.nf
@@ -14,6 +14,8 @@ include { CLEAN } from "../subworkflows/local/clean"
include { PROCESS_OUTPUT } from "../subworkflows/local/processOutput"
include { PROFILE } from "../subworkflows/local/profile"
include { LOAD_SAMPLESHET } from "../subworkflows/local/loadSampleSheet"
include { EXTRACT_VIRAL_READS } from "../subworkflows/local/extractViralReads"
include { EXTRACT_RAW_READS_FROM_PROCESSED } from "../modules/local/extractRawReadsFromProcessed"
nextflow.preview.output = true

/*****************
@@ -46,6 +48,13 @@ workflow RUN_DEV_SE {
RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end)

// Extract and count human-viral reads
EXTRACT_VIRAL_READS(CLEAN.out.reads, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "1", "24", "viral", "${params.quality_encoding}", "${params.fuzzy_match_alignment_duplicates}", params.grouping, params.single_end)

// Process intermediate output for chimera detection
raw_processed_ch = EXTRACT_VIRAL_READS.out.bbduk_match.join(RAW.out.reads, by: 0)
EXTRACT_RAW_READS_FROM_PROCESSED(raw_processed_ch, "raw_viral_subset")

// Taxonomic profiling
PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", params.grouping, params.single_end)

@@ -79,6 +88,9 @@ workflow RUN_DEV_SE {
PROCESS_OUTPUT.out.qbase >> "results"
PROCESS_OUTPUT.out.qseqs >> "results"
// Final results
EXTRACT_VIRAL_READS.out.tsv >> "results"
EXTRACT_VIRAL_READS.out.counts >> "results"
PROFILE.out.bracken >> "results"
PROFILE.out.kraken >> "results"
}
Loading