Skip to content

Commit

Permalink
Merge branch 'dev' into harmon_api_overload
Browse files Browse the repository at this point in the history
  • Loading branch information
harmonbhasin authored Feb 4, 2025
2 parents 67e4e72 + fce94af commit a2fd32e
Show file tree
Hide file tree
Showing 27 changed files with 35 additions and 30 deletions.
16 changes: 12 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
# v2.8.0.1 (in-progress)
- Added instructions for what to do should you run out of API requests for containers

# v2.8.0.0
# v2.8.0.0 (in development)
- Major changes to many parts of the pipeline as part of a general performance overhaul
- Modified most processes in the RUN and RUN_VALIDATION workflows to stream data in and out rather than reading whole files
- As part of the previous change, modified most processes in the RUN and RUN_VALIDATION workflows to work with interleaved rather than paired sequence data
Expand All @@ -19,6 +16,17 @@
- Added new intermediate outputs, including unfiltered viral hits and interleaved FASTQ from EXTRACT_VIRAL_READS
- Viral hits TSV moved from `virus_hits_db.tsv.gz` to `virus_hits_filtered.tsv.gz`
- Numerous changes to column names in viral hits TSV, mainly to improve clarity
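- Fixed mislabeled processes: replaced the `core_utils` label with `coreutils` (FILTER_TSV, SORT_FILE, SORT_TSV) or `python` (HEAD_TSV, REHEAD_TSV)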
- Added instructions for what to do should you run out of API requests for containers

# v2.7.0.2
- Updated `pipeline-version.txt`

# v2.7.0.1
- Fixed index-related issues from v2.7.0.0:
    - Updated `EXTRACT_VIRAL_READS` to expect updated path to viral genome DB
    - Added `adapters` param to the index config file used to run our tests
    - Updated `RUN` and `RUN_VALIDATION` tests to use up-to-date test index (location: `s3://nao-testing/index/20250130`)

# v2.7.0.0
- Implemented masking of viral genome reference in index workflow with MASK_GENOME_FASTA to remove adapter, low-entropy and repeat sequences.
Expand Down
4 changes: 4 additions & 0 deletions configs/containers.config
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,8 @@ process {
// - conda-forge::file=5.46
// - conda-forge::gzip=1.13
}
withLabel: python {
container = "community.wave.seqera.io/library/python:3.13.1--d00663700fcc8bcf"

}
}
8 changes: 2 additions & 6 deletions configs/index-for-run-test.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,10 @@ params {
// Other reference files
host_taxon_db = "${projectDir}/ref/host-taxa.tsv"
contaminants = "${projectDir}/ref/contaminants.fasta.gz"
adapters = "${projectDir}/ref/adapters.fasta"
genome_patterns_exclude = "${projectDir}/ref/hv_patterns_exclude.txt"

// Kraken viral DB
kraken_db = "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20240904.tar.gz"
// Smallest possible BLAST DB
blast_db_name = "nt_others"

// Pull information from GenBank or Ref Seq
ncbi_viral_params = "--section refseq --assembly-level complete"

// Other input values
Expand All @@ -52,4 +48,4 @@ includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/resources.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
process.queue = "harmon-queue" // AWS Batch job queue
process.queue = "will-batch-queue" // AWS Batch job queue
2 changes: 1 addition & 1 deletion modules/local/filterTsv/main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Filter a gzipped TSV to keep the first line for each combination of values in the specified columns
process FILTER_TSV {
label "core_utils"
label "coreutils"
label "single"
input:
tuple val(sample), path(tsv)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/headTsv/main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Add a header line to a headerless TSV file
process HEAD_TSV {
label "core_utils"
label "python"
label "single"
input:
tuple val(sample), path(tsv)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -341,13 +341,10 @@ def process_paired_sam(inf, outf, genbank_metadata, viral_taxids):
fwd_line = rev_line
rev_line = get_next_alignment(inf)
continue
# Check that pair statuses match and are valid
# Check that pair statuses match
if rev_dict["pair_status"] != fwd_dict["pair_status"]:
msg = f"Pair status mismatch: {fwd_dict['query_name']}, {fwd_dict['pair_status']}, {rev_dict['pair_status']}"
raise ValueError(msg)
if fwd_dict["pair_status"] == "UP" or rev_dict["pair_status"] == "UP":
msg = f"Both mates align but alignment is unpaired: {fwd_dict['query_name']}"
raise ValueError(msg)
# Process pair together
line = line_from_pair(fwd_dict, rev_dict)
outf.write(line)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/reheadTsv/main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Rename fields in a TSV header
process REHEAD_TSV {
label "core_utils"
label "python"
label "single"
input:
tuple val(sample), path(tsv)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/sortFile/main.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Sort a gzipped file by a user-specified key string
// TODO: Expand to handle plaintext files
process SORT_FILE {
label "core_utils"
label "coreutils"
label "single"
input:
tuple val(sample), path(file)
Expand Down
2 changes: 1 addition & 1 deletion modules/local/sortTsv/main.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Sort a gzipped TSV by a specified column header
// TODO: Expand to handle plaintext TSVs
process SORT_TSV {
label "core_utils"
label "coreutils"
label "single"
input:
tuple val(sample), path(tsv)
Expand Down
2 changes: 1 addition & 1 deletion pipeline-version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.5.2
2.7.0.2
4 changes: 2 additions & 2 deletions subworkflows/local/extractViralReads/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ workflow EXTRACT_VIRAL_READS {
bbduk_suffix
bracken_threshold
main:
// 0. Get reference paths
viral_genome_path = "${ref_dir}/results/virus-genomes-filtered.fasta.gz"
// Get reference paths
viral_genome_path = "${ref_dir}/results/virus-genomes-masked.fasta.gz"
genome_meta_path = "${ref_dir}/results/virus-genome-metadata-gid.tsv.gz"
bt2_virus_index_path = "${ref_dir}/results/bt2-virus-index"
bt2_human_index_path = "${ref_dir}/results/bt2-human-index"
Expand Down
Binary file modified test-data/gold-standard-results/bracken_reports_merged.tsv.gz
Binary file not shown.
Binary file modified test-data/gold-standard-results/kraken_reports_merged.tsv.gz
Binary file not shown.
Binary file modified test-data/gold-standard-results/merged_blast_filtered.tsv.gz
Binary file not shown.
Binary file modified test-data/gold-standard-results/read_counts.tsv.gz
Binary file not shown.
Binary file modified test-data/gold-standard-results/subset_qc_adapter_stats.tsv.gz
Binary file not shown.
Binary file modified test-data/gold-standard-results/subset_qc_basic_stats.tsv.gz
Binary file not shown.
Binary file modified test-data/gold-standard-results/subset_qc_length_stats.tsv.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified test-data/gold-standard-results/virus_hits_filtered.tsv.gz
Binary file not shown.
4 changes: 2 additions & 2 deletions tests/modules/local/bbduk/bbduk.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ nextflow_process {
process {
'''
input[0] = INTERLEAVE_FASTQ.out.output
input[1] = "${params.ref_dir}/results/virus-genomes-filtered.fasta.gz"
input[1] = "${params.ref_dir}/results/virus-genomes-masked.fasta.gz"
input[2] = "0.4"
input[3] = "27"
input[4] = "ribo"
Expand Down Expand Up @@ -89,7 +89,7 @@ nextflow_process {
process {
'''
input[0] = LOAD_SAMPLESHEET.out.samplesheet
input[1] = "${params.ref_dir}/results/virus-genomes-filtered.fasta.gz"
input[1] = "${params.ref_dir}/results/virus-genomes-masked.fasta.gz"
input[2] = "0.4"
input[3] = "27"
input[4] = "ribo"
Expand Down
2 changes: 1 addition & 1 deletion tests/modules/local/bbduk/bbduk_hits.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ nextflow_process {
process {
'''
input[0] = LOAD_SAMPLESHEET.out.samplesheet
input[1] = "${params.ref_dir}/results/virus-genomes-filtered.fasta.gz"
input[1] = "${params.ref_dir}/results/virus-genomes-masked.fasta.gz"
input[2] = "1"
input[3] = "24"
input[4] = "viral"
Expand Down
2 changes: 1 addition & 1 deletion tests/run.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ params {

// Directories
base_dir = "./" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
ref_dir = "s3://nao-testing/index/20250130/output/" // Reference/index directory (generated by index workflow)

// Files
sample_sheet = "${projectDir}/test-data/samplesheet.csv" // Path to sample sheet CSV
Expand Down
2 changes: 1 addition & 1 deletion tests/run_dev_se.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ params {

// Directories
base_dir = "./" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
ref_dir = "s3://nao-testing/index/20250130/output/" // Reference/index directory (generated by index workflow)

// Files
sample_sheet = "${projectDir}/test-data/single-end-samplesheet.csv" // Path to sample sheet CSV
Expand Down
2 changes: 1 addition & 1 deletion tests/run_validation.config
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ params {

// Directories
base_dir = "./" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
ref_dir = "s3://nao-testing/index/20250130/output/" // Reference/index directory (generated by index workflow)

// Files
viral_tsv = "${projectDir}/test-data/gold-standard-results/virus_hits_filtered.tsv.gz"
Expand Down
4 changes: 2 additions & 2 deletions tests/workflows/run.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"content": [
"bracken_reports_merged.tsv.gz:md5,6c504fa837ef97ef2096f2569d8c6902",
"kraken_reports_merged.tsv.gz:md5,84f070b42b948d36ae38eaee4a61982e",
"merged_blast_filtered.tsv.gz:md5,be7002de8c1878da615ba4379b84feab",
"merged_blast_filtered.tsv.gz:md5,b26a764f7b7271256c0d58a89b5517eb",
"read_counts.tsv.gz:md5,8dc2e3ad82f42202262a5e67a9d91e1b",
"subset_qc_adapter_stats.tsv.gz:md5,43a90fc81f11a57e191f10176d3b7caf",
"subset_qc_basic_stats.tsv.gz:md5,98699e1e92085c89771f0a46fa54df0d",
Expand All @@ -16,6 +16,6 @@
"nf-test": "0.9.2",
"nextflow": "24.10.4"
},
"timestamp": "2025-01-30T14:46:04.796716034"
"timestamp": "2025-01-31T16:27:43.310277911"
}
}

0 comments on commit a2fd32e

Please sign in to comment.