Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kstansifer mask index #142

Merged
merged 6 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# v2.6.1.0 (in progress)
- Replace Trimmomatic with Atria
- Implement masking of viral genome reference in index workflow with MASK_GENOME_FASTA to remove adapter, low-entropy and repeat sequences
- Replace Trimmomatic with Atria in EXTRACT_VIRAL_READS

# v2.6.0.0
- Updated version to reflect the new versioning scheme, which is described in `docs/version_schema.md`.
Expand Down
1 change: 1 addition & 0 deletions configs/index.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ params {
// Other reference files
host_taxon_db = "${projectDir}/ref/host-taxa.tsv"
contaminants = "${projectDir}/ref/contaminants.fasta.gz"
adapters = "${projectDir}/ref/adapters.fasta"
genome_patterns_exclude = "${projectDir}/ref/hv_patterns_exclude.txt"
kraken_db = "s3://genome-idx/kraken/k2_standard_20240605.tar.gz" // Path to tarball containing Kraken reference DB
blast_db_name = "core_nt"
Expand Down
24 changes: 0 additions & 24 deletions modules/local/bbduk/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -93,27 +93,3 @@ process BBDUK_HITS {
'''
}

// Mask kmers from a contaminant reference within a sequence database.
// Inputs:
//   seq_db          - sequence database handed to BBDuk as in=
//   contaminant_ref - reference handed to BBDuk as ref=
//   k               - value used for BBDuk's k= option
//   label           - prefix for the two output file names
// Outputs:
//   masked - ${label}_masked.fasta.gz
//   log    - ${label}_mask.stats.txt (BBDuk stats= file)
process BBDUK_MASK {
    label "large"
    label "BBTools"
    input:
        path(seq_db)
        path(contaminant_ref)
        val(k)
        val(label)
    output:
        path("${label}_masked.fasta.gz"), emit: masked
        path("${label}_mask.stats.txt"), emit: log
    shell:
        '''
        # Masking options, kept in their original order.
        # NOTE(review): both mm=f and maskmiddle=t are passed; BBDuk documents mm as the
        # short form of maskmiddle, so confirm which setting is intended.
        mask_opts="k=!{k} hdist=1 mink=8 mm=f rcomp=t maskmiddle=t mask=N"
        # Thread count and Java heap size come from the Nextflow task directives
        resource_opts="t=!{task.cpus} -Xmx!{task.memory.toGiga()}g"
        # Single BBDuk pass writing the masked database and its stats file
        bbduk.sh in=!{seq_db} out=!{label}_masked.fasta.gz ref=!{contaminant_ref} stats=!{label}_mask.stats.txt ${mask_opts} ${resource_opts}
        '''
}
35 changes: 35 additions & 0 deletions modules/local/maskGenomeFasta/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Mask adapter sequences, low-entropy regions and polyX runs in a genome FASTA
// using two sequential BBDuk passes.
// Inputs:
//   filtered_genomes - FASTA file to mask
//   adapters         - adapter FASTA used as the BBDuk reference in the first pass
//   k                - value for BBDuk's k= option in the first pass
//   hdist            - value for BBDuk's hdist= option in the first pass
//   entropy          - value for BBDuk's entropy= option in the first pass (with entropymask=t)
//   polyx_len        - length of the homopolymer literals, also used as k= in the second pass
//   name_pattern     - prefix for all output file names
// Outputs:
//   masked - ${name_pattern}-masked.fasta.gz (result of the second pass)
//   log1   - BBDuk stats file from the adapter/entropy pass
//   log2   - BBDuk stats file from the polyX pass
process MASK_GENOME_FASTA {
    label "large"
    label "BBTools"
    input:
        path(filtered_genomes)
        path(adapters)
        val(k)
        val(hdist)
        val(entropy)
        val(polyx_len)
        val(name_pattern)
    output:
        path("${name_pattern}-masked.fasta.gz"), emit: masked
        path("${name_pattern}-mask-adapters-entropy.stats.txt"), emit: log1
        path("${name_pattern}-mask-polyx.stats.txt"), emit: log2
    shell:
        // Simplest way to mask polyX regions is just to pass them as literals,
        // e.g. "AAAAA,CCCCC,GGGGG,TTTTT" for polyx_len=5
        polyx = ['A', 'C', 'G', 'T'].collect { it * (polyx_len as int) }.join(',')
        '''
        # Define input/output
        in=!{filtered_genomes}
        out1=intermediate-masking.fasta.gz
        out2=!{name_pattern}-masked.fasta.gz
        ref=!{adapters}
        stats1=!{name_pattern}-mask-adapters-entropy.stats.txt
        stats2=!{name_pattern}-mask-polyx.stats.txt
        # Pass 1: mask kmers matching the adapter reference plus low-entropy regions
        par1="k=!{k} hdist=!{hdist} mm=f mask=N rcomp=t entropy=!{entropy} entropymask=t mink=8 hdist2=1"
        # Pass 2: mask exact homopolymer runs given as literal kmers
        par2="k=!{polyx_len} hdist=0 mm=f mask=N rcomp=F"
        # Pin BBDuk's thread count and Java heap to the resources Nextflow allocated
        # to this task instead of relying on BBDuk's auto-detection
        res="t=!{task.cpus} -Xmx!{task.memory.toGiga()}g"
        # Execute masking in sequence: first adapter/entropy masking, then polyX masking
        bbduk.sh in=${in} out=${out1} ref=${ref} stats=${stats1} ${par1} ${res}
        bbduk.sh in=${out1} out=${out2} literal=!{polyx} stats=${stats2} ${par2} ${res}
        '''
}
10 changes: 9 additions & 1 deletion subworkflows/local/makeVirusGenomeDB/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ include { FILTER_VIRAL_GENBANK_METADATA } from "../../../modules/local/filterVir
include { ADD_GENBANK_GENOME_IDS } from "../../../modules/local/addGenbankGenomeIDs"
include { CONCATENATE_GENOME_FASTA } from "../../../modules/local/concatenateGenomeFasta"
include { FILTER_GENOME_FASTA } from "../../../modules/local/filterGenomeFasta"
include { MASK_GENOME_FASTA } from "../../../modules/local/maskGenomeFasta"

/***********
| WORKFLOW |
Expand All @@ -18,6 +19,11 @@ workflow MAKE_VIRUS_GENOME_DB {
virus_db // TSV giving taxonomic structure and host infection status of virus taxids
patterns_exclude // File of sequence header patterns to exclude from genome DB
host_taxa // Tuple of host taxa to include
adapters // FASTA file of adapters to mask
k // kmer length to use for bbduk adapter masking in reference
hdist // hdist (allowed mismatches) to use for bbduk adapter masking
entropy // entropy cutoff for bbduk filtering of low-complexity regions
polyx_len // minimum length of polyX runs to filter out with bbduk
main:
// 1. Download viral Genbank
dl_ch = DOWNLOAD_VIRAL_NCBI(ncbi_viral_params)
Expand All @@ -29,7 +35,9 @@ workflow MAKE_VIRUS_GENOME_DB {
concat_ch = CONCATENATE_GENOME_FASTA(dl_ch.genomes, meta_ch.path)
// 5. Filter to remove undesired/contaminated genomes
filter_ch = FILTER_GENOME_FASTA(concat_ch, patterns_exclude, "virus-genomes-filtered")
// 6. Mask to remove adapters, low-entropy regions, and polyX
mask_ch = MASK_GENOME_FASTA(filter_ch, adapters, k, hdist, entropy, polyx_len, "virus-genomes")
emit:
fasta = filter_ch
fasta = mask_ch.masked
metadata = gid_ch
}
3 changes: 2 additions & 1 deletion tests/index.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ params {
// Other reference files
host_taxon_db = "${projectDir}/ref/host-taxa.tsv"
contaminants = "${projectDir}/ref/contaminants.fasta.gz"
adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter FASTA used for masking the viral genome reference
genome_patterns_exclude = "${projectDir}/ref/hv_patterns_exclude.txt"

// Kraken DB - https://benlangmead.github.io/aws-indexes/k2
Expand All @@ -50,4 +51,4 @@ params {

includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
includeConfig "${projectDir}/configs/output.config"
2 changes: 1 addition & 1 deletion workflows/index.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ workflow INDEX {
// Build viral taxonomy and infection DB
MAKE_VIRUS_TAXONOMY_DB(params.taxonomy_url, params.virus_host_db_url, params.host_taxon_db, params.virus_taxid, params.viral_taxids_exclude)
// Get reference DB of viral genomes of interest
MAKE_VIRUS_GENOME_DB(params.ncbi_viral_params, MAKE_VIRUS_TAXONOMY_DB.out.db, params.genome_patterns_exclude, params.host_taxa_screen)
MAKE_VIRUS_GENOME_DB(params.ncbi_viral_params, MAKE_VIRUS_TAXONOMY_DB.out.db, params.genome_patterns_exclude, params.host_taxa_screen, params.adapters, "20", "3", "0.5", "10")
// Build viral alignment index
MAKE_VIRUS_INDEX(MAKE_VIRUS_GENOME_DB.out.fasta)
// Build other alignment indices
Expand Down
Loading