Merge branch 'dev' into harmon_api_overload

naobservatory · Feb 4, 2025 · a2fd32e · a2fd32e
2 parents 67e4e72 + fce94af
commit a2fd32e
Show file tree

Hide file tree

Showing 27 changed files with 35 additions and 30 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,4 @@
-# v2.8.0.1 (in-progress)
-- Added instructions for what to do should you run out of API requests for containers
-
-# v2.8.0.0
+# v2.8.0.0 (in development)
 - Major changes to many parts of the pipeline as part of a general performance overhaul
     - Modified most processes in the RUN and RUN_VALIDATION workflows to stream data in and out rather than reading whole files
     - As part of the previous change, modified most processes in the RUN and RUN_VALIDATION workflows to work with interleaved rather than paired sequence data
@@ -19,6 +16,17 @@
     - Added new intermediate outputs, including unfiltered viral hits and interleaved FASTQ from EXTRACT_VIRAL_READS
     - Viral hits TSV moved from `virus_hits_db.tsv.gz` to `virus_hits_filtered.tsv.gz`
     - Numerous changes to column names in viral hits TSV, mainly to improve clarity
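+- Fixed mislabeled processes: replaced the `core_utils` label with `coreutils` (`FILTER_TSV`, `SORT_FILE`, `SORT_TSV`) or `python` (`HEAD_TSV`, `REHEAD_TSV`)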
+- Added instructions for what to do should you run out of API requests for containers
+
+# v2.7.0.2
+- Updated `pipeline-version.txt`
+
+# v2.7.0.1
+- Fixed index-related issues from v2.7.0.0:
+    - Updated `EXTRACT_VIRAL_READS` to expect updated path to viral genome DB
+    - Added `adapters` param to the index config file used to run our tests
+    - Updated `RUN` and `RUN_VALIDATION` tests to use the up-to-date test index (location: `s3://nao-testing/index/20250130`)
 
 # v2.7.0.0
 - Implemented masking of viral genome reference in index workflow with MASK_GENOME_FASTA to remove adapter, low-entropy and repeat sequences.

diff --git a/configs/containers.config b/configs/containers.config
@@ -107,4 +107,8 @@ process {
         // - conda-forge::file=5.46
         // - conda-forge::gzip=1.13
    }
+   withLabel: python {
+      container = "community.wave.seqera.io/library/python:3.13.1--d00663700fcc8bcf"
+
+   }
 }
diff --git a/configs/index-for-run-test.config b/configs/index-for-run-test.config
@@ -27,14 +27,10 @@ params {
     // Other reference files
     host_taxon_db = "${projectDir}/ref/host-taxa.tsv"
     contaminants = "${projectDir}/ref/contaminants.fasta.gz"
+    adapters = "${projectDir}/ref/adapters.fasta"
     genome_patterns_exclude =  "${projectDir}/ref/hv_patterns_exclude.txt"
-
-    // Kraken viral DB
     kraken_db = "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20240904.tar.gz"
-    // Smallest possible BLAST DB
     blast_db_name = "nt_others"
-
-    // Pull information from GenBank or Ref Seq
     ncbi_viral_params = "--section refseq --assembly-level complete"
 
     // Other input values
@@ -52,4 +48,4 @@ includeConfig "${projectDir}/configs/containers.config"
 includeConfig "${projectDir}/configs/resources.config"
 includeConfig "${projectDir}/configs/profiles.config"
 includeConfig "${projectDir}/configs/output.config"
-process.queue = "harmon-queue" // AWS Batch job queue
+process.queue = "will-batch-queue" // AWS Batch job queue
diff --git a/modules/local/filterTsv/main.nf b/modules/local/filterTsv/main.nf
@@ -1,6 +1,6 @@
 // Filter a gzipped TSV to keep the first line for each combination of values in the specified columns
 process FILTER_TSV {
-    label "core_utils"
+    label "coreutils"
     label "single"
     input:
         tuple val(sample), path(tsv)

diff --git a/modules/local/headTsv/main.nf b/modules/local/headTsv/main.nf
@@ -1,6 +1,6 @@
 // Add a header line to an unheaded TSV file
 process HEAD_TSV {
-    label "core_utils"
+    label "python"
     label "single"
     input:
         tuple val(sample), path(tsv)

diff --git a/modules/local/processViralBowtie2Sam/resources/usr/bin/process_viral_bowtie2_sam.py b/modules/local/processViralBowtie2Sam/resources/usr/bin/process_viral_bowtie2_sam.py
@@ -341,13 +341,10 @@ def process_paired_sam(inf, outf, genbank_metadata, viral_taxids):
             fwd_line = rev_line
             rev_line = get_next_alignment(inf)
             continue
-        # Check that pair statuses match and are valid
+        # Check that pair statuses match
         if rev_dict["pair_status"] != fwd_dict["pair_status"]:
             msg = f"Pair status mismatch: {fwd_dict['query_name']}, {fwd_dict['pair_status']}, {rev_dict['pair_status']}"
             raise ValueError(msg)
-        if fwd_dict["pair_status"] == "UP" or rev_dict["pair_status"] == "UP":
-            msg = f"Both mates align but alignment is unpaired: {fwd_dict['query_name']}"
-            raise ValueError(msg)
         # Process pair together
         line = line_from_pair(fwd_dict, rev_dict)
         outf.write(line)

diff --git a/modules/local/reheadTsv/main.nf b/modules/local/reheadTsv/main.nf
@@ -1,6 +1,6 @@
 // Rename fields in a TSV header
 process REHEAD_TSV {
-    label "core_utils"
+    label "python"
     label "single"
     input:
         tuple val(sample), path(tsv)

diff --git a/modules/local/sortFile/main.nf b/modules/local/sortFile/main.nf
@@ -1,7 +1,7 @@
 // Sort a gzipped file by a user-specified key string
 // TODO: Expand to handle plaintext files
 process SORT_FILE {
-    label "core_utils"
+    label "coreutils"
     label "single"
     input:
         tuple val(sample), path(file)

diff --git a/modules/local/sortTsv/main.nf b/modules/local/sortTsv/main.nf
@@ -1,7 +1,7 @@
 // Sort a gzipped TSV by a specified column header
 // TODO: Expand to handle plaintext TSVs
 process SORT_TSV {
-    label "core_utils"
+    label "coreutils"
     label "single"
     input:
         tuple val(sample), path(tsv)

diff --git a/pipeline-version.txt b/pipeline-version.txt
@@ -1 +1 @@
-2.5.2
+2.7.0.2
diff --git a/subworkflows/local/extractViralReads/main.nf b/subworkflows/local/extractViralReads/main.nf
@@ -42,8 +42,8 @@ workflow EXTRACT_VIRAL_READS {
         bbduk_suffix
         bracken_threshold
     main:
-        // 0. Get reference paths
-        viral_genome_path = "${ref_dir}/results/virus-genomes-filtered.fasta.gz"
+        // Get reference paths
+        viral_genome_path = "${ref_dir}/results/virus-genomes-masked.fasta.gz"
         genome_meta_path  = "${ref_dir}/results/virus-genome-metadata-gid.tsv.gz"
         bt2_virus_index_path = "${ref_dir}/results/bt2-virus-index"
         bt2_human_index_path = "${ref_dir}/results/bt2-human-index"

diff --git a/test-data/gold-standard-results/bracken_reports_merged.tsv.gz b/test-data/gold-standard-results/bracken_reports_merged.tsv.gz
diff --git a/test-data/gold-standard-results/kraken_reports_merged.tsv.gz b/test-data/gold-standard-results/kraken_reports_merged.tsv.gz
diff --git a/test-data/gold-standard-results/merged_blast_filtered.tsv.gz b/test-data/gold-standard-results/merged_blast_filtered.tsv.gz
diff --git a/test-data/gold-standard-results/read_counts.tsv.gz b/test-data/gold-standard-results/read_counts.tsv.gz
diff --git a/test-data/gold-standard-results/subset_qc_adapter_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_adapter_stats.tsv.gz
diff --git a/test-data/gold-standard-results/subset_qc_basic_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_basic_stats.tsv.gz
diff --git a/test-data/gold-standard-results/subset_qc_length_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_length_stats.tsv.gz
diff --git a/test-data/gold-standard-results/subset_qc_quality_base_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_quality_base_stats.tsv.gz
diff --git a/test-data/gold-standard-results/subset_qc_quality_sequence_stats.tsv.gz b/test-data/gold-standard-results/subset_qc_quality_sequence_stats.tsv.gz
diff --git a/test-data/gold-standard-results/virus_hits_filtered.tsv.gz b/test-data/gold-standard-results/virus_hits_filtered.tsv.gz
diff --git a/tests/modules/local/bbduk/bbduk.nf.test b/tests/modules/local/bbduk/bbduk.nf.test
@@ -35,7 +35,7 @@ nextflow_process {
             process {
                 '''
                 input[0] = INTERLEAVE_FASTQ.out.output
-                input[1] = "${params.ref_dir}/results/virus-genomes-filtered.fasta.gz"
+                input[1] = "${params.ref_dir}/results/virus-genomes-masked.fasta.gz"
                 input[2] = "0.4"
                 input[3] = "27"
                 input[4] = "ribo"
@@ -89,7 +89,7 @@ nextflow_process {
             process {
                 '''
                 input[0] = LOAD_SAMPLESHEET.out.samplesheet
-                input[1] = "${params.ref_dir}/results/virus-genomes-filtered.fasta.gz"
+                input[1] = "${params.ref_dir}/results/virus-genomes-masked.fasta.gz"
                 input[2] = "0.4"
                 input[3] = "27"
                 input[4] = "ribo"

diff --git a/tests/modules/local/bbduk/bbduk_hits.nf.test b/tests/modules/local/bbduk/bbduk_hits.nf.test
@@ -28,7 +28,7 @@ nextflow_process {
             process {
                 '''
                 input[0] = LOAD_SAMPLESHEET.out.samplesheet
-                input[1] = "${params.ref_dir}/results/virus-genomes-filtered.fasta.gz"
+                input[1] = "${params.ref_dir}/results/virus-genomes-masked.fasta.gz"
                 input[2] = "1"
                 input[3] = "24"
                 input[4] = "viral"

diff --git a/tests/run.config b/tests/run.config
@@ -12,7 +12,7 @@ params {
 
     // Directories
     base_dir = "./" // Parent for working and output directories (can be S3)
-    ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
+    ref_dir = "s3://nao-testing/index/20250130/output/" // Reference/index directory (generated by index workflow)
 
     // Files
     sample_sheet = "${projectDir}/test-data/samplesheet.csv" // Path to library TSV

diff --git a/tests/run_dev_se.config b/tests/run_dev_se.config
@@ -10,7 +10,7 @@ params {
 
     // Directories
     base_dir = "./" // Parent for working and output directories (can be S3)
-    ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
+    ref_dir = "s3://nao-testing/index/20250130/output/" // Reference/index directory (generated by index workflow)
 
     // Files
     sample_sheet = "${projectDir}/test-data/single-end-samplesheet.csv" // Path to library TSV

diff --git a/tests/run_validation.config b/tests/run_validation.config
@@ -7,7 +7,7 @@ params {
 
     // Directories
     base_dir = "./" // Parent for working and output directories (can be S3)
-    ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
+    ref_dir = "s3://nao-testing/index/20250130/output/" // Reference/index directory (generated by index workflow)
 
     // Files
     viral_tsv = "${projectDir}/test-data/gold-standard-results/virus_hits_filtered.tsv.gz"

diff --git a/tests/workflows/run.nf.test.snap b/tests/workflows/run.nf.test.snap
@@ -3,7 +3,7 @@
         "content": [
             "bracken_reports_merged.tsv.gz:md5,6c504fa837ef97ef2096f2569d8c6902",
             "kraken_reports_merged.tsv.gz:md5,84f070b42b948d36ae38eaee4a61982e",
-            "merged_blast_filtered.tsv.gz:md5,be7002de8c1878da615ba4379b84feab",
+            "merged_blast_filtered.tsv.gz:md5,b26a764f7b7271256c0d58a89b5517eb",
             "read_counts.tsv.gz:md5,8dc2e3ad82f42202262a5e67a9d91e1b",
             "subset_qc_adapter_stats.tsv.gz:md5,43a90fc81f11a57e191f10176d3b7caf",
             "subset_qc_basic_stats.tsv.gz:md5,98699e1e92085c89771f0a46fa54df0d",
@@ -16,6 +16,6 @@
             "nf-test": "0.9.2",
             "nextflow": "24.10.4"
         },
-        "timestamp": "2025-01-30T14:46:04.796716034"
+        "timestamp": "2025-01-31T16:27:43.310277911"
     }
 }