Adding single-read functionality to EXTRACT_VIRAL_READS #126

Closed
wants to merge 4 commits into from
31 changes: 30 additions & 1 deletion modules/local/bbduk/main.nf
@@ -61,7 +61,7 @@ process BBDUK_SINGLE {
}

// Detection and removal of contaminant reads (use minkmerhits instead of minkmerfraction)
process BBDUK_HITS {
process BBDUK_HITS_PAIRED {
label "large"
label "BBTools"
input:
@@ -93,6 +93,35 @@ process BBDUK_HITS {
'''
}

process BBDUK_HITS_SINGLE {
label "large"
label "BBTools"
input:
tuple val(sample), path(reads)
path(contaminant_ref)
val(min_kmer_hits)
val(k)
val(suffix)
output:
tuple val(sample), path("${sample}_${suffix}_bbduk_pass.fastq.gz"), emit: reads
tuple val(sample), path("${sample}_${suffix}_bbduk_fail.fastq.gz"), emit: fail
tuple val(sample), path("${sample}_${suffix}_bbduk.stats.txt"), emit: log
shell:
'''
# Define input/output
in=!{reads}
op=!{sample}_!{suffix}_bbduk_pass.fastq.gz
of=!{sample}_!{suffix}_bbduk_fail.fastq.gz
stats=!{sample}_!{suffix}_bbduk.stats.txt
ref=!{contaminant_ref}
io="in=${in} ref=${ref} out=${op} outm=${of} stats=${stats}"
# Define parameters
par="minkmerhits=!{min_kmer_hits} k=!{k} t=!{task.cpus} -Xmx!{task.memory.toGiga()}g"
# Execute
bbduk.sh ${io} ${par}
'''
}
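For orientation, this is roughly the bbduk.sh call the shell block above assembles once the variables are interpolated; the sample name, suffix, reference file, k-mer settings, and resource values below are hypothetical placeholders rather than values fixed by this PR:

bbduk.sh in=sample1.fastq.gz ref=contaminants.fasta out=sample1_viral_bbduk_pass.fastq.gz outm=sample1_viral_bbduk_fail.fastq.gz stats=sample1_viral_bbduk.stats.txt minkmerhits=1 k=24 t=8 -Xmx16g

Compared with BBDUK_HITS_PAIRED, presumably only a single in= read file is supplied per sample; the minkmerhits/k filtering parameters are passed through unchanged.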

// Masking contaminant kmers in a sequence database
process BBDUK_MASK {
label "large"
27 changes: 26 additions & 1 deletion modules/local/bowtie2/main.nf
@@ -1,5 +1,5 @@
// Run Bowtie2 and return mapped and unmapped reads
process BOWTIE2 {
process BOWTIE2_PAIRED {
label "Bowtie2"
label "large"
input:
@@ -25,6 +25,31 @@ process BOWTIE2 {
'''
}

process BOWTIE2_SINGLE {
label "Bowtie2"
label "large"
input:
tuple val(sample), path(reads)
path(index_dir)
val(par_string)
val(suffix)
output:
tuple val(sample), path("${sample}_${suffix}_bowtie2_mapped.sam"), emit: sam
tuple val(sample), path("${sample}_${suffix}_bowtie2_aligned.fastq.gz"), emit: reads_conc
tuple val(sample), path("${sample}_${suffix}_bowtie2_unaligned.fastq.gz"), emit: reads_unconc
shell:
'''
in=!{reads}
idx="!{index_dir}/bt2_index"
sam="!{sample}_!{suffix}_bowtie2_mapped.sam"
alc="!{sample}_!{suffix}_bowtie2_aligned.fastq.gz"
unc="!{sample}_!{suffix}_bowtie2_unaligned.fastq.gz"
io="-U ${in} -x ${idx} -S ${sam} --al-gz ${alc} --un-gz ${unc}"
par="--threads !{task.cpus} --local --very-sensitive-local !{par_string}"
bowtie2 ${par} ${io}
'''
}
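As a sketch, the assembled command looks roughly like the following, with hypothetical sample and index names and thread count, and with any extra options supplied through par_string appended after --very-sensitive-local:

bowtie2 --threads 8 --local --very-sensitive-local -U sample1.fastq.gz -x index_dir/bt2_index -S sample1_viral_bowtie2_mapped.sam --al-gz sample1_viral_bowtie2_aligned.fastq.gz --un-gz sample1_viral_bowtie2_unaligned.fastq.gz

Note that -U, --al-gz and --un-gz are Bowtie2's single-read counterparts of the paired -1/-2, --al-conc-gz and --un-conc-gz options; the reads_conc/reads_unconc emit names appear to be kept so the channel wiring matches the paired process.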

// Generate a Bowtie2 index from an input file
process BOWTIE2_INDEX {
label "Bowtie2"
31 changes: 29 additions & 2 deletions modules/local/cutadapt/main.nf
@@ -1,4 +1,4 @@
process CUTADAPT {
process CUTADAPT_PAIRED {
label "cutadapt"
label "large"
input:
@@ -25,6 +25,33 @@ process CUTADAPT {
'''
}

process CUTADAPT_SINGLE {
label "cutadapt"
label "large"
input:
// single read file
tuple val(sample), path(read)
path(adapters)
output:
tuple val(sample), path("${sample}_cutadapt.fastq.gz"), emit: reads
tuple val(sample), path("${sample}_cutadapt_log.txt"), emit: log
shell:
/* Explanation of cutadapt parameters:
-b to trim any adapter in the adapters file from either end of the read
-j number of cpu cores
-m to drop reads that are <20 bp after trimming
-e 0.33 to allow up to a 33% error rate in the matching region between an adapter
and the read
--action=trim to trim adapters and up/downstream sequence
*/
'''
par="-b file:!{adapters} -j !{task.cpus} -m 20 -e 0.33 --action=trim"
out="-o !{sample}_cutadapt.fastq.gz"
log="!{sample}_cutadapt_log.txt"
cutadapt ${par} ${out} !{read} > ${log}
'''
}
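With the parameters explained in the comment above filled in for a hypothetical sample and adapter file, the single-end invocation is roughly:

cutadapt -b file:adapters.fasta -j 8 -m 20 -e 0.33 --action=trim -o sample1_cutadapt.fastq.gz sample1.fastq.gz > sample1_cutadapt_log.txt

Using -b (rather than -g/-a) lets cutadapt match each adapter at either end of the read, which is the "either end" behaviour described in the comment.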

process CUTADAPT_MASK {
label "cutadapt"
label "large"
@@ -36,7 +63,7 @@ process CUTADAPT_MASK {
path("${label}_cutadapt_masked.fasta.gz"), emit: masked
path("${label}_cutadapt_log.txt"), emit: log
shell:
'''
'''
out=!{label}_cutadapt_masked.fasta.gz
log=!{label}_cutadapt_log.txt
cutadapt --action=mask -b file:!{adapters} -j !{task.cpus} -e 0.2 -o ${out} !{seq_db} > ${log}
@@ -285,6 +285,16 @@ def process_paired_sam(sam_path, out_path, genbank_metadata, viral_taxids):

# TODO: Process unpaired SAM

def process_single_sam(sam_path, out_path, genbank_metadata, viral_taxids):
    """Process an unpaired SAM file into a TSV."""
    with open_by_suffix(sam_path) as inf, open_by_suffix(out_path, "w") as outf:
        head = write_sam_headers_paired(outf) # Write headers (reuses the paired-header helper)
        # Process file & generate content
        for line in inf:
            fwd_dict = process_sam_alignment(line, genbank_metadata, viral_taxids, False)
            # Write one TSV row per alignment; this assumes process_sam_alignment
            # returns a dict keyed by the header fields written above
            outf.write("\t".join(str(fwd_dict[h]) for h in head) + "\n")


# Main function
def main():
# Parse arguments
@@ -327,7 +337,7 @@ def main():
if paired:
process_paired_sam(sam_path, out_path, gid_taxid_dict, virus_taxa)
else:
process_unpaired_sam(sam_path, out_path, gid_taxid_dict, virus_taxa)
process_single_sam(sam_path, out_path, gid_taxid_dict, virus_taxa)
print_log("File processed.")
# Finish time tracking
end_time = time.time()
37 changes: 36 additions & 1 deletion modules/local/trimmomatic/main.nf
@@ -1,4 +1,4 @@
process TRIMMOMATIC {
process TRIMMOMATIC_PAIRED {
label "trimmomatic"
label "large"
input:
@@ -36,3 +36,38 @@ process TRIMMOMATIC {
${cmd}
'''
}


process TRIMMOMATIC_SINGLE {
label "trimmomatic"
label "large"
input:
// single read file
tuple val(sample), path(read)
path(adapters)
val(encoding)
output:
tuple val(sample), path("${sample}_trimmomatic.fastq.gz"), emit: reads
tuple val(sample), path("${sample}_trimmomatic_{trimlog,summary}.txt"), emit: log
shell:
/* Explanation of trimmomatic filters:
ILLUMINACLIP: cut Illumina adapters
* max 2 mismatches in a match
* min match quality of 8 between adapter and read
* adapters must be >= 5 bases
LEADING/TRAILING:10 remove leading/trailing bases with quality < 10
SLIDINGWINDOW:4:15 cut the read after the average quality of a 4-base
window falls below 15
MINLEN:20 drop reads below 20 bases
*/
'''
out=!{sample}_trimmomatic.fastq.gz
log=!{sample}_trimmomatic_trimlog.txt
sum=!{sample}_trimmomatic_summary.txt
io="-trimlog ${log} -summary ${sum} !{read} ${out}"
par="ILLUMINACLIP:!{adapters}:2:8:5 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:15 MINLEN:20"
cmd="trimmomatic SE -!{encoding} -threads !{task.cpus} ${io} ${par}"
echo ${cmd}
${cmd}
'''
}
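Expanded with a hypothetical phred33-encoded sample, the command echoed and executed by the block above is roughly:

trimmomatic SE -phred33 -threads 8 -trimlog sample1_trimmomatic_trimlog.txt -summary sample1_trimmomatic_summary.txt sample1.fastq.gz sample1_trimmomatic.fastq.gz ILLUMINACLIP:adapters.fa:2:8:5 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:15 MINLEN:20

In SE mode Trimmomatic takes one input and one output file, so the four paired/unpaired outputs of PE mode collapse into the single _trimmomatic.fastq.gz emitted here.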
20 changes: 13 additions & 7 deletions subworkflows/local/extractViralReads/main.nf
@@ -2,12 +2,6 @@
| MODULES AND SUBWORKFLOWS |
***************************/

include { BBDUK_HITS } from "../../../modules/local/bbduk"
include { CUTADAPT } from "../../../modules/local/cutadapt"
include { TRIMMOMATIC } from "../../../modules/local/trimmomatic"
include { BOWTIE2 as BOWTIE2_VIRUS } from "../../../modules/local/bowtie2"
include { BOWTIE2 as BOWTIE2_HUMAN } from "../../../modules/local/bowtie2"
include { BOWTIE2 as BOWTIE2_OTHER } from "../../../modules/local/bowtie2"
include { PROCESS_VIRAL_BOWTIE2_SAM } from "../../../modules/local/processViralBowtie2Sam"
include { COUNT_ALIGNMENT_DUPLICATES } from "../../../modules/local/countAlignmentDuplicates"
include { EXTRACT_UNCONC_READ_IDS } from "../../../modules/local/extractUnconcReadIDs"
@@ -29,8 +23,20 @@ include { COUNT_VIRUS_CLADES } from "../../../modules/local/countVirusClades"
include { COUNT_VIRUS_CLADES } from "../../../modules/local/countVirusClades"
if (params.single_end) {
include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
include { BBDUK_HITS_SINGLE as BBDUK_HITS } from "../../../modules/local/bbduk"
include { CUTADAPT_SINGLE as CUTADAPT } from "../../../modules/local/cutadapt"
include { TRIMMOMATIC_SINGLE as TRIMMOMATIC } from "../../../modules/local/trimmomatic"
include { BOWTIE2_SINGLE as BOWTIE2_VIRUS } from "../../../modules/local/bowtie2"
include { BOWTIE2_SINGLE as BOWTIE2_HUMAN } from "../../../modules/local/bowtie2"
include { BOWTIE2_SINGLE as BOWTIE2_OTHER } from "../../../modules/local/bowtie2"
} else {
include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup"
include { BBDUK_HITS_PAIRED as BBDUK_HITS } from "../../../modules/local/bbduk"
include { CUTADAPT_PAIRED as CUTADAPT } from "../../../modules/local/cutadapt"
include { TRIMMOMATIC_PAIRED as TRIMMOMATIC } from "../../../modules/local/trimmomatic"
include { BOWTIE2_PAIRED as BOWTIE2_VIRUS } from "../../../modules/local/bowtie2"
include { BOWTIE2_PAIRED as BOWTIE2_HUMAN } from "../../../modules/local/bowtie2"
include { BOWTIE2_PAIRED as BOWTIE2_OTHER } from "../../../modules/local/bowtie2"
}
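Both branches alias the selected modules to the same names (BBDUK_HITS, CUTADAPT, TRIMMOMATIC, BOWTIE2_VIRUS/HUMAN/OTHER), so the workflow body below is unchanged. A minimal usage sketch for selecting the single-end path at launch (only the --single_end parameter comes from this PR; the run target is illustrative):

nextflow run . --single_end true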

/***********
@@ -100,7 +106,7 @@ workflow EXTRACT_VIRAL_READS {
kraken_output_ch = PROCESS_KRAKEN_VIRAL(tax_ch.kraken_output, virus_db_path, host_taxon)
bowtie2_kraken_merged_ch = MERGE_SAM_KRAKEN(kraken_output_ch.combine(bowtie2_sam_ch, by: 0))
merged_ch = MERGE_TSVS_BOWTIE2_KRAKEN(bowtie2_kraken_merged_ch.collect().ifEmpty([]), "bowtie2_kraken_merged")
merged_bbmerge_results = MERGE_TSVS_BBMERGE(tax_ch.bbmerge_summary.collect().ifEmpty([]), "bbmerge")
merged_bbmerge_results = MERGE_TSVS_BBMERGE(tax_ch.bbmerge_summary, "bbmerge") // was: tax_ch.bbmerge_summary.collect().ifEmpty([])
merged_dedup_results = MERGE_TSVS_DEDUP(tax_ch.dedup_summary.collect().ifEmpty([]), "dedup")
merged_alignment_dup_results = MERGE_TSVS_ALIGNMENT_DUPLICATES(alignment_dup_summary.collect().ifEmpty([]), "alignment_duplicates")
// Filter and process putative hit TSV
3 changes: 2 additions & 1 deletion subworkflows/local/profile/main.nf
@@ -10,16 +10,17 @@
if (params.single_end) {
include { SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
include { BBDUK_SINGLE as BBDUK } from "../../../modules/local/bbduk"
include { BBDUK_HITS_SINGLE as BBDUK_HITS } from "../../../modules/local/bbduk"
include { CONCAT_GROUP_SINGLE as CONCAT_GROUP } from "../../../modules/local/concatGroup"
include { SUBSET_READS_SINGLE_TARGET; SUBSET_READS_SINGLE_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
} else {
include { SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET } from "../../../modules/local/subsetReads"
include { SUBSET_READS_PAIRED_TARGET; SUBSET_READS_PAIRED_TARGET as SUBSET_READS_TARGET_GROUP } from "../../../modules/local/subsetReads"
include { BBDUK_PAIRED as BBDUK } from "../../../modules/local/bbduk"
include { BBDUK_HITS_PAIRED as BBDUK_HITS } from "../../../modules/local/bbduk"
include { CONCAT_GROUP_PAIRED as CONCAT_GROUP } from "../../../modules/local/concatGroup"
}

include { BBDUK_HITS } from "../../../modules/local/bbduk"
include { TAXONOMY as TAXONOMY_RIBO } from "../../../subworkflows/local/taxonomy"
include { TAXONOMY as TAXONOMY_NORIBO } from "../../../subworkflows/local/taxonomy"
include { MERGE_TAXONOMY_RIBO } from "../../../modules/local/mergeTaxonomyRibo"
12 changes: 12 additions & 0 deletions workflows/run_dev_se.nf
@@ -14,6 +14,8 @@ include { CLEAN } from "../subworkflows/local/clean"
include { PROCESS_OUTPUT } from "../subworkflows/local/processOutput"
include { PROFILE } from "../subworkflows/local/profile"
include { LOAD_SAMPLESHET } from "../subworkflows/local/loadSampleSheet"
include { EXTRACT_VIRAL_READS } from "../subworkflows/local/extractViralReads"
include { EXTRACT_RAW_READS_FROM_PROCESSED } from "../modules/local/extractRawReadsFromProcessed"
nextflow.preview.output = true

/*****************
@@ -46,6 +48,13 @@ workflow RUN_DEV_SE {
RAW(samplesheet_ch, params.n_reads_trunc, "2", "4 GB", "raw_concat", params.single_end)
CLEAN(RAW.out.reads, params.adapters, "2", "4 GB", "cleaned", params.single_end)

// Extract and count human-viral reads
EXTRACT_VIRAL_READS(CLEAN.out.reads, group_ch, params.ref_dir, kraken_db_path, params.bt2_score_threshold, params.adapters, params.host_taxon, "1", "24", "viral", "${params.quality_encoding}", "${params.fuzzy_match_alignment_duplicates}", params.grouping, params.single_end)

// Process intermediate output for chimera detection
raw_processed_ch = EXTRACT_VIRAL_READS.out.bbduk_match.join(RAW.out.reads, by: 0)
EXTRACT_RAW_READS_FROM_PROCESSED(raw_processed_ch, "raw_viral_subset")

// Taxonomic profiling
PROFILE(CLEAN.out.reads, group_ch, kraken_db_path, params.n_reads_profile, params.ref_dir, "0.4", "27", "ribo", params.grouping, params.single_end)

@@ -79,6 +88,9 @@ workflow RUN_DEV_SE {
PROCESS_OUTPUT.out.qbase >> "results"
PROCESS_OUTPUT.out.qseqs >> "results"
// Final results
EXTRACT_VIRAL_READS.out.tsv >> "results"
EXTRACT_VIRAL_READS.out.counts >> "results"
PROFILE.out.bracken >> "results"
PROFILE.out.kraken >> "results"
}
Loading