naobservatory · harmonbhasin · Jan 16, 2025 · Jan 11, 2025 · Jan 11, 2025 · Jan 11, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,6 @@
 # v2.6.1.0 (in progress)
-- Replace Trimmomatic with Atria
+- Implement masking of viral genome reference in index workflow with MASK_GENOME_FASTA to remove adapter, low-entropy and repeat sequences
+- Replace Trimmomatic with Atria in EXTRACT_VIRAL_READS
 
 # v2.6.0.0
 - Updated version to reflect the new versioning scheme, which is described in `docs/version_schema.md`.

diff --git a/configs/index.config b/configs/index.config
@@ -27,6 +27,7 @@ params {
     // Other reference files
     host_taxon_db = "${projectDir}/ref/host-taxa.tsv"
     contaminants = "${projectDir}/ref/contaminants.fasta.gz"
+    adapters = "${projectDir}/ref/adapters.fasta"
     genome_patterns_exclude =  "${projectDir}/ref/hv_patterns_exclude.txt"
     kraken_db = "s3://genome-idx/kraken/k2_standard_20240605.tar.gz" // Path to tarball containing Kraken reference DB
     blast_db_name = "core_nt"

diff --git a/modules/local/bbduk/main.nf b/modules/local/bbduk/main.nf
@@ -93,27 +93,3 @@ process BBDUK_HITS {
         '''
 }
 
-// Masking contaminant kmers in a sequence database
-process BBDUK_MASK {
-    label "large"
-    label "BBTools"
-    input:
-        path(seq_db)
-        path(contaminant_ref)
-        val(k)
-        val(label)
-    output:
-        path("${label}_masked.fasta.gz"), emit: masked
-        path("${label}_mask.stats.txt"), emit: log
-    shell:
-        '''
-        # Define input/output
-        in=!{seq_db}
-        out=!{label}_masked.fasta.gz
-        stats=!{label}_mask.stats.txt
-        ref=!{contaminant_ref}
-        par="k=!{k} hdist=1 mink=8 mm=f rcomp=t maskmiddle=t mask=N t=!{task.cpus} -Xmx!{task.memory.toGiga()}g"
-        # Execute
-        bbduk.sh in=${in} out=${out} ref=${ref} stats=${stats} ${par}
-        '''
-}
diff --git a/modules/local/maskGenomeFasta/main.nf b/modules/local/maskGenomeFasta/main.nf
@@ -0,0 +1,35 @@
+// Mask adapter, low-entropy and polyX sequences in a genome FASTA with BBDuk
+process MASK_GENOME_FASTA {
+    label "large"
+    label "BBTools"
+    input:
+        path(filtered_genomes) // Genome FASTA to mask (BBDuk in=)
+        path(adapters) // Adapter FASTA used as the BBDuk masking reference (ref=)
+        val(k) // Kmer length for adapter masking (BBDuk k=)
+        val(hdist) // Mismatches allowed in adapter kmer matches (BBDuk hdist=)
+        val(entropy) // Entropy cutoff for low-complexity masking (BBDuk entropy=)
+        val(polyx_len) // Length of the homopolymer literals masked in the second pass
+        val(name_pattern) // Prefix for the output file names
+    output:
+        path("${name_pattern}-masked.fasta.gz"), emit: masked
+        path("${name_pattern}-mask-adapters-entropy.stats.txt"), emit: log1
+        path("${name_pattern}-mask-polyx.stats.txt"), emit: log2
+    shell:
+        // Simplest way to mask polyX regions is just to pass them as literals,
+        // e.g. "AAAAA,CCCCC,GGGGG,TTTTT" for polyx_len=5
+        polyx = ['A', 'C', 'G', 'T'].collect { it * (polyx_len as int) }.join(',')
+        '''
+        # Define input/output
+        in=!{filtered_genomes}
+        out1=intermediate-masking.fasta.gz
+        out2=!{name_pattern}-masked.fasta.gz
+        ref=!{adapters}
+        stats1=!{name_pattern}-mask-adapters-entropy.stats.txt
+        stats2=!{name_pattern}-mask-polyx.stats.txt
+        par1="k=!{k} hdist=!{hdist} mm=f mask=N rcomp=t entropy=!{entropy} entropymask=t mink=8 hdist2=1 t=!{task.cpus} -Xmx!{task.memory.toGiga()}g"
+        par2="k=!{polyx_len} hdist=0 mm=f mask=N rcomp=F t=!{task.cpus} -Xmx!{task.memory.toGiga()}g"
+        # Execute masking in sequence: first adapter/entropy masking, then polyX masking
+        bbduk.sh in=${in} out=${out1} ref=${ref} stats=${stats1} ${par1}
+        bbduk.sh in=${out1} out=${out2} literal=!{polyx} stats=${stats2} ${par2}
+        '''
+}
diff --git a/subworkflows/local/makeVirusGenomeDB/main.nf b/subworkflows/local/makeVirusGenomeDB/main.nf
@@ -7,6 +7,7 @@ include { FILTER_VIRAL_GENBANK_METADATA } from "../../../modules/local/filterVir
 include { ADD_GENBANK_GENOME_IDS } from "../../../modules/local/addGenbankGenomeIDs"
 include { CONCATENATE_GENOME_FASTA } from "../../../modules/local/concatenateGenomeFasta"
 include { FILTER_GENOME_FASTA } from "../../../modules/local/filterGenomeFasta"
+include { MASK_GENOME_FASTA } from "../../../modules/local/maskGenomeFasta"
 
 /***********
 | WORKFLOW |
@@ -18,6 +19,11 @@ workflow MAKE_VIRUS_GENOME_DB {
         virus_db // TSV giving taxonomic structure and host infection status of virus taxids
         patterns_exclude // File of sequence header patterns to exclude from genome DB
         host_taxa // Tuple of host taxa to include
+	adapters // FASTA file of adapters to mask
+	k // kmer length to use for bbduk adapter masking in reference
+	hdist // hdist (allowed mismatches) to use for bbduk adapter masking
+	entropy // entropy cutoff for bbduk filtering of low-complexity regions
+	polyx_len // minimum length of polyX runs to filter out with bbduk
     main:
         // 1. Download viral Genbank
         dl_ch = DOWNLOAD_VIRAL_NCBI(ncbi_viral_params)
@@ -29,7 +35,9 @@ workflow MAKE_VIRUS_GENOME_DB {
         concat_ch = CONCATENATE_GENOME_FASTA(dl_ch.genomes, meta_ch.path)
         // 5. Filter to remove undesired/contaminated genomes
         filter_ch = FILTER_GENOME_FASTA(concat_ch, patterns_exclude, "virus-genomes-filtered")
+	// 6. Mask to remove adapters, low-entropy regions, and polyX
+	mask_ch = MASK_GENOME_FASTA(filter_ch, adapters, k, hdist, entropy, polyx_len, "virus-genomes")
     emit:
-        fasta = filter_ch
+        fasta = mask_ch.masked
         metadata = gid_ch
 }
diff --git a/tests/index.config b/tests/index.config
@@ -28,6 +28,7 @@ params {
     // Other reference files
     host_taxon_db = "${projectDir}/ref/host-taxa.tsv"
     contaminants = "${projectDir}/ref/contaminants.fasta.gz"
+    adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter masking of the viral genome reference
     genome_patterns_exclude =  "${projectDir}/ref/hv_patterns_exclude.txt"
 
     // Kraken DB - https://benlangmead.github.io/aws-indexes/k2
@@ -50,4 +51,4 @@ params {
 
 includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/profiles.config"
-includeConfig "${projectDir}/configs/output.config"
+includeConfig "${projectDir}/configs/output.config"
diff --git a/workflows/index.nf b/workflows/index.nf
@@ -29,7 +29,7 @@ workflow INDEX {
     // Build viral taxonomy and infection DB
     MAKE_VIRUS_TAXONOMY_DB(params.taxonomy_url, params.virus_host_db_url, params.host_taxon_db, params.virus_taxid, params.viral_taxids_exclude)
     // Get reference DB of viral genomes of interest
-    MAKE_VIRUS_GENOME_DB(params.ncbi_viral_params, MAKE_VIRUS_TAXONOMY_DB.out.db, params.genome_patterns_exclude, params.host_taxa_screen)
+    MAKE_VIRUS_GENOME_DB(params.ncbi_viral_params, MAKE_VIRUS_TAXONOMY_DB.out.db, params.genome_patterns_exclude, params.host_taxa_screen, params.adapters, "20", "3", "0.5", "10")
     // Build viral alignment index
     MAKE_VIRUS_INDEX(MAKE_VIRUS_GENOME_DB.out.fasta)
     // Build other alignment indices