From f7b23d2af719fa0ebd45b551f335d68e6681c296 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Tue, 4 Jun 2024 17:53:15 -0700 Subject: [PATCH 01/24] Work on testing pipeline --- .github/scripts/download_fluviewer_db.sh | 7 + .github/scripts/run_pipeline.sh | 1 + .github/scripts/simulate_reads.sh | 2 +- .gitignore | 3 + ReadMe.md | 10 +- main.nf | 157 ++++++++++++----------- modules/genoflu.nf | 11 +- modules/provenance.nf | 26 ++-- nextflow.config | 78 +++++------ 9 files changed, 166 insertions(+), 129 deletions(-) create mode 100755 .github/scripts/download_fluviewer_db.sh diff --git a/.github/scripts/download_fluviewer_db.sh b/.github/scripts/download_fluviewer_db.sh new file mode 100755 index 0000000..d49cc48 --- /dev/null +++ b/.github/scripts/download_fluviewer_db.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +mkdir -p .github/data/fluviewer_db + +wget -O .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa.gz https://raw.githubusercontent.com/KevinKuchinski/FluViewer/main/FluViewer_db_v_0_1_8.fa.gz + +gunzip .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa.gz diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 5b8d7df..65ecc6f 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -9,4 +9,5 @@ nextflow run main.nf \ -profile conda \ --cache ${HOME}/.conda/envs \ --fastq_input .github/data/fastq \ + --db .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa \ --outdir .github/data/test_output diff --git a/.github/scripts/simulate_reads.sh b/.github/scripts/simulate_reads.sh index 7001cce..41b9482 100755 --- a/.github/scripts/simulate_reads.sh +++ b/.github/scripts/simulate_reads.sh @@ -13,7 +13,7 @@ while IFS=',' read -r sample_id assembly; do art_illumina \ --paired \ --in ${assembly} \ - --fcov 12 \ + --fcov 100 \ --len 150 \ --mflen 400 \ --sdev 100 \ diff --git a/.gitignore b/.gitignore index 0715472..84620e3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +.github/data/assemblies +.github/data/fastq +.github/data/fluviewer_db .nextflow* work test* diff --git a/ReadMe.md b/ReadMe.md index b26b45c..4914a67 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -1,6 +1,6 @@ [![Tests](https://github.com/BCCDC-PHL/fluviewer-nf/actions/workflows/tests.yml/badge.svg)](https://github.com/BCCDC-PHL/fluviewer-nf/actions/workflows/tests.yml) -# FluViewer-nf +# fluviewer-nf This is a Nextflow pipeline for running the FluViewer analysis tool (https://github.com/KevinKuchinski/FluViewer) and other custom modules to obtain consensus sequences, HA and NA subtypes, clade calls, and amino acid mutations for Influenza A WGS. 
@@ -67,7 +67,13 @@ For a full list of optional arguments, see: https://github.com/KevinKuchinski/Fl **Example command:** ``` -nextflow run FluViewer_installation/main.nf -r 0.1.0 -profile --cache ~/.conda/envs/ --fastq_input flu_A_reference_collection/ --db ref/FluViewer_db_full_20220915.fasta --outdir [outdir] +nextflow run BCCDC-PHL/fluviewer-nf \ + -r v0.1.0 \ + -profile conda \ + --cache ~/.conda/envs \ + --fastq_input /path/to/your_fastqs \ + --db /path/to/FluViewer_db.fa \ + --outdir /path/to/output_dir ``` ## Output diff --git a/main.nf b/main.nf index ce9515c..87bb499 100644 --- a/main.nf +++ b/main.nf @@ -18,95 +18,106 @@ Future versions will add in: nextflow.enable.dsl = 2 - include { hash_files } from './modules/hash_files.nf' - include { pipeline_provenance } from './modules/provenance.nf' - include { collect_provenance } from './modules/provenance.nf' - include { fastp } from './modules/fastp.nf' - include { cutadapt} from './modules/cutadapt.nf' - include { FluViewer } from './modules/FluViewer.nf' - include { multiqc } from './modules/multiqc.nf' - include { FASTQC } from './modules/fastqc.nf' - include { CLADE_CALLING } from './modules/clade_calling.nf' - include { SNP_CALLING } from './modules/snp_calling.nf' - include { PULL_GENOFLU ; CHECKOUT_GENOFLU ; GENOFLU } from './modules/genoflu.nf' +include { hash_files } from './modules/hash_files.nf' +include { pipeline_provenance } from './modules/provenance.nf' +include { collect_provenance } from './modules/provenance.nf' +include { fastp } from './modules/fastp.nf' +include { cutadapt} from './modules/cutadapt.nf' +include { FluViewer } from './modules/FluViewer.nf' +include { multiqc } from './modules/multiqc.nf' +include { FASTQC } from './modules/fastqc.nf' +include { CLADE_CALLING } from './modules/clade_calling.nf' +include { SNP_CALLING } from './modules/snp_calling.nf' +include { PULL_GENOFLU } from './modules/genoflu.nf' +include { CHECKOUT_GENOFLU } from './modules/genoflu.nf' +include { GENOFLU } from './modules/genoflu.nf' // prints to the screen and to the log - log.info """ - - FluViewer Pipeline - =================================== - projectDir : ${projectDir} - launchDir : ${launchDir} - database : ${params.db} - primers : ${params.primers} - fastqInputDir : ${params.fastq_input} - outdir : ${params.outdir} - pipeline run : ${params.pipeline_short_name} - pipeline version : ${params.pipeline_minor_version} - run_name : ${params.run_name} - user : $workflow.userName - Git repository : $workflow.repository - git commit id : $workflow.commitId - branch : $workflow.revision - """ - .stripIndent() +log.info """ + FluViewer Pipeline + =================================== + projectDir : ${projectDir} + launchDir : ${launchDir} + database : ${params.db} + primers : ${params.primers} + fastqInputDir : ${params.fastq_input} + outdir : ${params.outdir} + pipeline run name : ${workflow.runName} + pipeline version : ${workflow.manifest.version} + run_name : ${params.run_name} + user : ${workflow.userName} + Git repository : ${workflow.repository} + git commit id : ${workflow.commitId} + branch : ${workflow.revision} +""".stripIndent() workflow { - ch_start_time = Channel.of(LocalDateTime.now()) - ch_pipeline_name = Channel.of(workflow.manifest.name) - ch_pipeline_version = Channel.of(workflow.manifest.version) + ch_workflow_metadata = Channel.value([ + workflow.sessionId, + workflow.runName, + workflow.manifest.name, + workflow.manifest.version, + workflow.start, + ]) + + ch_pipeline_provenance = 
pipeline_provenance(ch_workflow_metadata) - ch_pipeline_provenance = pipeline_provenance(ch_pipeline_name.combine(ch_pipeline_version).combine(ch_start_time)) + ch_primers = Channel.fromPath(params.primer_path) - ch_primers = Channel.fromPath(params.primer_path) - ch_db = Channel.fromPath(params.db) - ch_fastq_input = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] } + ch_db = Channel.fromPath(params.db) - ch_reference_db = Channel.of([file(params.blastx_subtype_db).parent, file(params.blastx_subtype_db).name]).first() + ch_fastq_input = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] } - main: - // Generate hashes for input files - hash_files(ch_fastq_input.map{ it -> [it[0], [it[1], it[2]]] }.combine(Channel.of("fastq_input"))) + ch_reference_db = Channel.of([file(params.blastx_subtype_db).parent, file(params.blastx_subtype_db).name]).first() - // Clean up reads - remove adapters (fastp) and primers (cutadapt) - fastp( ch_fastq_input ) - cutadapt(fastp.out.trimmed_reads.combine(ch_primers)) - FASTQC(cutadapt.out.primer_trimmed_reads) - - // Run FluViewer - FluViewer(cutadapt.out.primer_trimmed_reads.combine(ch_db)) - - //Collect al the relevant filesfor MULTIQC - ch_fastqc_collected = FASTQC.out.zip.map{ it -> [it[1], it[2]]}.collect() - multiqc(fastp.out.json.mix( cutadapt.out.log, ch_fastqc_collected ).collect().ifEmpty([]) ) - - //Call clades for H1 and H3 samples - CLADE_CALLING(FluViewer.out.consensus_seqs) - - SNP_CALLING(FluViewer.out.consensus_main, ch_reference_db) - - - PULL_GENOFLU(params.genoflu_github_url) - - CHECKOUT_GENOFLU(PULL_GENOFLU.out.repo, params.genoflu_version) - - GENOFLU(FluViewer.out.consensus_main.combine(PULL_GENOFLU.out.repo)) + main: + // Provenance channel starts with just the sample IDs + // These will be joined to various provenance files as they are generated + ch_provenance = ch_fastq_input.map{ it -> it[0] } + // Generate hashes for input files + hash_files(ch_fastq_input.map{ it -> [it[0], [it[1], it[2]]] }.combine(Channel.of("fastq_input"))) - //Pool Provenance data - ch_provenance = FluViewer.out.provenance - ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], [it[1]] << it[2]] } - ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(ch_fastq_input.map{ it -> it[0] }.combine(ch_pipeline_provenance)).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - collect_provenance(ch_provenance) + // Clean up reads - remove adapters (fastp) and primers (cutadapt) + fastp(ch_fastq_input) + cutadapt(fastp.out.trimmed_reads.combine(ch_primers)) + FASTQC(cutadapt.out.primer_trimmed_reads) + // Run FluViewer + FluViewer(cutadapt.out.primer_trimmed_reads.combine(ch_db)) + //Collect al the relevant filesfor MULTIQC + ch_fastqc_collected = FASTQC.out.zip.map{ it -> [it[1], it[2]]}.collect() + multiqc(fastp.out.json.mix( cutadapt.out.log, ch_fastqc_collected ).collect().ifEmpty([]) ) + + //Call clades for H1 and H3 samples + CLADE_CALLING(FluViewer.out.consensus_seqs) + + SNP_CALLING(FluViewer.out.consensus_main, 
ch_reference_db) + + PULL_GENOFLU(params.genoflu_github_url) + + CHECKOUT_GENOFLU(PULL_GENOFLU.out.repo, params.genoflu_version) + + GENOFLU(FluViewer.out.consensus_main.combine(PULL_GENOFLU.out.repo)) + + + // + // Provenance collection processes + // The basic idea is to build up a channel with the following structure: + // [sample_id, [provenance_file_1.yml, provenance_file_2.yml, provenance_file_3.yml...]] + // ...and then concatenate them all together in the 'collect_provenance' process. + ch_provenance = ch_provenance.combine(ch_pipeline_provenance).map{ it -> [it[0], [it[1]]] } + ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(FluViewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + collect_provenance(ch_provenance) } diff --git a/modules/genoflu.nf b/modules/genoflu.nf index 8a14e7f..2fa31e6 100644 --- a/modules/genoflu.nf +++ b/modules/genoflu.nf @@ -14,10 +14,12 @@ process GENOFLU { script: """ - printf -- "- process_name: genoflu\\n" > ${sample_id}_genoflu_provenance.yml - printf -- " tool_name: genoflu\\n tool_version: \$(genoflu.py --version | cut -d' ' -f3)\\n" >> ${sample_id}_genoflu_provenance.yml + printf -- "- process_name: genoflu\\n" >> ${sample_id}_genoflu_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_genoflu_provenance.yml + printf -- " - tool_name: genoflu\\n" >> ${sample_id}_genoflu_provenance.yml + printf -- " tool_version: \$(genoflu.py --version | cut -d' ' -f3)\\n" >> ${sample_id}_genoflu_provenance.yml - genoflu.py \ + genoflu.py \ -f ${consensus_seqs} \ -i ${genoflu_path}/dependencies/fastas/ \ -c ${genoflu_path}/dependencies/genotype_key.xlsx \ @@ -28,7 +30,8 @@ process GENOFLU { } process PULL_GENOFLU { - + + executor 'local' storeDir "${params.genoflu_cache}" input: diff --git a/modules/provenance.nf b/modules/provenance.nf index 36ab158..0a8d115 100644 --- a/modules/provenance.nf +++ b/modules/provenance.nf @@ -4,7 +4,7 @@ process collect_provenance { executor 'local' - publishDir params.versioned_outdir ? 
"${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/Provenance_files" : "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/Provenance_files", pattern: "${sample_id}_*_provenance.yml", mode: 'copy' + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/Provenance_files", pattern: "${sample_id}_*_provenance.yml", mode: 'copy' input: tuple val(sample_id), path(provenance_files) @@ -20,18 +20,22 @@ process collect_provenance { process pipeline_provenance { - tag { pipeline_name + " / " + pipeline_version } + tag { pipeline_name + " / " + pipeline_version } - executor 'local' + executor 'local' - input: - tuple val(pipeline_name), val(pipeline_version), val(analysis_start) + input: + tuple val(session_id), val(run_name), val(pipeline_name), val(pipeline_version), val(timestamp_analysis_start) - output: - file("pipeline_provenance.yml") + output: + file("pipeline_provenance.yml") - script: - """ - printf -- "- pipeline_name: ${pipeline_name}\\n pipeline_version: ${pipeline_version}\\n- timestamp_analysis_start: ${analysis_start}\\n" > pipeline_provenance.yml - """ + script: + """ + printf -- "- pipeline_name: ${pipeline_name}\\n" >> pipeline_provenance.yml + printf -- " pipeline_version: ${pipeline_version}\\n" >> pipeline_provenance.yml + printf -- " nextflow_session_id: ${session_id}\\n" >> pipeline_provenance.yml + printf -- " nextflow_run_name: ${run_name}\\n" >> pipeline_provenance.yml + printf -- " timestamp_analysis_start: ${timestamp_analysis_start}\\n" >> pipeline_provenance.yml + """ } diff --git a/nextflow.config b/nextflow.config index c825b9a..e93b216 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,39 +1,40 @@ manifest { author = 'James Zlosnik (nextflow pipeline)/Kevin Kuchinski (FluViewer)' - name = 'BCCDC-PHL/FluViewer-nf' - version = '0.2.0' + name = 'BCCDC-PHL/fluviewer-nf' + version = '0.2.2' description = 'BCCDC-PHL FluViewer' mainScript = 'main.nf' nextflowVersion = '>=20.01.0' } params { - profile = false - cache = '' - outdir = 'results' - fastq_input = '' - illumina_suffixes = ['*_R{1,2}_001', '*_R{1,2}', '*_{1,2}' ] - fastq_exts = ['.fastq.gz', '.fq.gz', '.fastq', '.fq'] - fastq_search_path = makeFastqSearchPath( illumina_suffixes, fastq_exts ) - primer_path = "${baseDir}/assets/" - primers = "${baseDir}/assets/primers.fa" - rev_primers = "${baseDir}/assets/primers_rev_comp.fa" - pipeline_short_name = parsePipelineName(manifest.toMap().get('name')) - pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) - run_name = parseRunName( fastq_input ) - versioned_outdir = '' - min_depth = '10' - min_q = '30' - min_cov = '25' - min_ident = '95' - keep_interfiles = false - h1_dataset = '' - h3_dataset = '' - h5_dataset = '' - blastx_subtype_db = "${projectDir}/assets/blastx/blastx_subtype_db.fasta" - genoflu_cache = "${projectDir}/assets/genoflu" - genoflu_github_url = 'https://github.com/USDA-VS/GenoFLU/' - genoflu_version = "LATEST" + profile = false + cache = '' + outdir = 'results' + fastq_input = '' + illumina_suffixes = ['*_R{1,2}_001', '*_R{1,2}', '*_{1,2}' ] + fastq_exts = ['.fastq.gz', '.fq.gz', '.fastq', '.fq'] + fastq_search_path = makeFastqSearchPath( illumina_suffixes, fastq_exts ) + primer_path = "${baseDir}/assets/" + primers = "${baseDir}/assets/primers.fa" + rev_primers = "${baseDir}/assets/primers_rev_comp.fa" + pipeline_short_name = 
parsePipelineName(manifest.toMap().get('name')) + pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) + run_name = parseRunName( fastq_input ) + versioned_outdir = '' + min_depth = '10' + min_q = '30' + min_cov = '25' + min_ident = '95' + keep_interfiles = false + h1_dataset = '' + h3_dataset = '' + h5_dataset = '' + db = 'NO_FILE' + blastx_subtype_db = "${projectDir}/assets/blastx/blastx_subtype_db.fasta" + genoflu_cache = "${projectDir}/assets/genoflu" + genoflu_github_url = 'https://github.com/USDA-VS/GenoFLU/' + genoflu_version = "LATEST" } def makeFastqSearchPath ( illumina_suffixes, fastq_exts ) { @@ -67,23 +68,24 @@ def parsePipelineName(name) { } profiles { - conda { - process.conda = "$baseDir/environments/main.yml" - if (params.cache) { - conda.cacheDir = params.cache + conda { + conda.enabled = true + process.conda = "$baseDir/environments/main.yml" + if (params.cache) { + conda.cacheDir = params.cache + } + conda.useMamba = true } - conda.useMamba = true - } } process { withName: FluViewer { - cpus = 8 - memory = '32 GB' + cpus = 4 + memory = '2 GB' } withName: cutadapt { - cpus = 8 + cpus = 4 } withName: CLADE_CALLING { @@ -105,4 +107,4 @@ timeline { trace { enabled = true file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_trace.txt" -} \ No newline at end of file +} From 1385c310c9582a8e1368de5171062d13a676d83a Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 11:54:51 -0700 Subject: [PATCH 02/24] troubleshooting provenance generation --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 87bb499..fa65f4c 100644 --- a/main.nf +++ b/main.nf @@ -115,9 +115,9 @@ workflow { ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(FluViewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + // ch_provenance = ch_provenance.join(FluViewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + // ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + // ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } collect_provenance(ch_provenance) } From e048c1d2eaeb45210e3b88c4cd387ad47548fec0 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 11:56:26 -0700 Subject: [PATCH 03/24] Add testing workflow --- .github/workflows/tests.yml | 48 +++++++++++++++++++++++++++++++++++++ .gitignore | 4 +++- 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..e0ba675 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,48 @@ +on: + pull_request: + branches: + - main + push: + branches: + - main + workflow_dispatch: +name: Tests +jobs: + test: + strategy: + fail-fast: false + matrix: + nextflow_version: ["21.04.3", "23.10.1"] + name: Run tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@master + - 
name: Create Artifacts Directory + run: mkdir artifacts + - name: Install Miniconda + run: bash .github/scripts/install_conda.sh + - name: Install Nextflow + env: + NXF_VER: ${{ matrix.nextflow_version }} + run: bash .github/scripts/install_nextflow.sh + - name: Create ART Read-Simulation Environment + run: bash .github/scripts/create_art_environment.sh + - name: Download Assemblies + run: bash .github/scripts/download_assemblies.sh + - name: Simulate Reads + run: bash .github/scripts/simulate_reads.sh + - name: Run Pipeline + run: bash .github/scripts/run_pipeline.sh + - name: Create Output Checking Environment + run: bash .github/scripts/create_output_checking_environment.sh + - name: Check Outputs + run: bash .github/scripts/check_outputs.sh + - name: Prepare Artifacts + if: always() + run: bash .github/scripts/prepare_artifacts.sh + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: artifacts-BCCDC-PHL-fluviewer-nf-nextflow-v${{ matrix.nextflow_version }}-${{ github.run_id }}.${{ github.run_attempt }} + path: artifacts diff --git a/.gitignore b/.gitignore index 84620e3..fa42e0c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,8 +3,10 @@ .github/data/fluviewer_db .nextflow* work -test* +test_input +test_output test_data/ +./test* ref/ input_test/ output_test/ From 953b2558f271bdb82916f307c4b3bf8228009605 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 12:01:33 -0700 Subject: [PATCH 04/24] make conda (and mamba?) available when running the pipeline --- .github/scripts/run_pipeline.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 65ecc6f..82708a2 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -2,6 +2,10 @@ set -eo pipefail +source /home/analysis/.bashrc + +eval "$(conda shell.bash hook)" + sed -i 's/cpus = 8/cpus = 4/g' nextflow.config sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config From ecb4da128b5396184ef9288821c8bd88876a317d Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 12:03:29 -0700 Subject: [PATCH 05/24] fix home dir path --- .github/scripts/run_pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 82708a2..76d0abe 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -2,7 +2,7 @@ set -eo pipefail -source /home/analysis/.bashrc +source ${HOME}/.bashrc eval "$(conda shell.bash hook)" From 11ed458293aa2730828220cbb59c8b095717f660 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 12:31:24 -0700 Subject: [PATCH 06/24] add another sample to simulate, this one directly from FluViewer db --- .github/data/reads_to_simulate.csv | 1 + .github/scripts/download_assemblies.sh | 1 + .github/scripts/run_pipeline.sh | 2 ++ 3 files changed, 4 insertions(+) diff --git a/.github/data/reads_to_simulate.csv b/.github/data/reads_to_simulate.csv index 283cbd2..75255d7 100644 --- a/.github/data/reads_to_simulate.csv +++ b/.github/data/reads_to_simulate.csv @@ -1,2 +1,3 @@ +HQ011408.1,.github/data/assemblies/HQ011408.1.fa NC026423.1,.github/data/assemblies/NC_026423.1.fa NC026431.1,.github/data/assemblies/NC_026431.1.fa diff --git a/.github/scripts/download_assemblies.sh b/.github/scripts/download_assemblies.sh index b9b618a..8db2f56 100755 --- a/.github/scripts/download_assemblies.sh +++ b/.github/scripts/download_assemblies.sh @@ -2,5 +2,6 @@ mkdir -p 
.github/data/assemblies +curl -o .github/data/assemblies/HQ011408.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=HQ011408.1&db=nucleotide&rettype=fasta" curl -o .github/data/assemblies/NC_026423.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026423.1&db=nucleotide&rettype=fasta" curl -o .github/data/assemblies/NC_026431.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026431.1&db=nucleotide&rettype=fasta" diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 76d0abe..0bd5c23 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -6,6 +6,8 @@ source ${HOME}/.bashrc eval "$(conda shell.bash hook)" +conda activate base + sed -i 's/cpus = 8/cpus = 4/g' nextflow.config sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config From ac72d6d756b9ed65b2367594d1f25b34b930dc0b Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 12:33:49 -0700 Subject: [PATCH 07/24] Add another sample to simulate from FluViewer db --- .github/data/reads_to_simulate.csv | 1 + .github/scripts/download_assemblies.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/data/reads_to_simulate.csv b/.github/data/reads_to_simulate.csv index 75255d7..3737436 100644 --- a/.github/data/reads_to_simulate.csv +++ b/.github/data/reads_to_simulate.csv @@ -1,3 +1,4 @@ HQ011408.1,.github/data/assemblies/HQ011408.1.fa +CY014984.1,.github/data/assemblies/CY014984.1.fa NC026423.1,.github/data/assemblies/NC_026423.1.fa NC026431.1,.github/data/assemblies/NC_026431.1.fa diff --git a/.github/scripts/download_assemblies.sh b/.github/scripts/download_assemblies.sh index 8db2f56..d149c3d 100755 --- a/.github/scripts/download_assemblies.sh +++ b/.github/scripts/download_assemblies.sh @@ -3,5 +3,6 @@ mkdir -p .github/data/assemblies curl -o .github/data/assemblies/HQ011408.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=HQ011408.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/CY014984.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=CY014984.1&db=nucleotide&rettype=fasta" curl -o .github/data/assemblies/NC_026423.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026423.1&db=nucleotide&rettype=fasta" curl -o .github/data/assemblies/NC_026431.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026431.1&db=nucleotide&rettype=fasta" From f862e72c55eca43ad5d88b77fae4766d4e9a3ed8 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 13:13:20 -0700 Subject: [PATCH 08/24] Build multi-segment ref to simulate reads from --- .github/data/reads_to_simulate.csv | 5 +---- .github/scripts/download_assemblies.sh | 14 ++++++++++---- .github/scripts/simulate_reads.sh | 2 +- .gitignore | 2 +- bin/__pycache__/tools.cpython-310.pyc | Bin 3119 -> 0 bytes 5 files changed, 13 insertions(+), 10 deletions(-) delete mode 100644 bin/__pycache__/tools.cpython-310.pyc diff --git a/.github/data/reads_to_simulate.csv b/.github/data/reads_to_simulate.csv index 3737436..7854e7b 100644 --- a/.github/data/reads_to_simulate.csv +++ b/.github/data/reads_to_simulate.csv @@ -1,4 +1 @@ -HQ011408.1,.github/data/assemblies/HQ011408.1.fa -CY014984.1,.github/data/assemblies/CY014984.1.fa -NC026423.1,.github/data/assemblies/NC_026423.1.fa -NC026431.1,.github/data/assemblies/NC_026431.1.fa +MK58361X-H3N2,.github/data/assemblies/MK58361X-H3N2.fa diff --git 
a/.github/scripts/download_assemblies.sh b/.github/scripts/download_assemblies.sh index d149c3d..232d43a 100755 --- a/.github/scripts/download_assemblies.sh +++ b/.github/scripts/download_assemblies.sh @@ -2,7 +2,13 @@ mkdir -p .github/data/assemblies -curl -o .github/data/assemblies/HQ011408.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=HQ011408.1&db=nucleotide&rettype=fasta" -curl -o .github/data/assemblies/CY014984.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=CY014984.1&db=nucleotide&rettype=fasta" -curl -o .github/data/assemblies/NC_026423.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026423.1&db=nucleotide&rettype=fasta" -curl -o .github/data/assemblies/NC_026431.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026431.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583610.1_segment_1_PB2_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583610.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583611.1_segment_2_PB1_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583611.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583612.1_segment_3_PA_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583612.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583613.1_segment_4_HA_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583613.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583614.1_segment_5_NP_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583614.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583615.1_segment_6_NA_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583615.1&db=nucleotide&rettype=fasta" + +cat .github/data/assemblies/MK58361*.fa > .github/data/assemblies/MK58361X-H3N2.fa + +rm .github/data/assemblies/MK58361*.1_segment_*.fa diff --git a/.github/scripts/simulate_reads.sh b/.github/scripts/simulate_reads.sh index 41b9482..0689e4e 100755 --- a/.github/scripts/simulate_reads.sh +++ b/.github/scripts/simulate_reads.sh @@ -13,7 +13,7 @@ while IFS=',' read -r sample_id assembly; do art_illumina \ --paired \ --in ${assembly} \ - --fcov 100 \ + --fcov 500 \ --len 150 \ --mflen 400 \ --sdev 100 \ diff --git a/.gitignore b/.gitignore index fa42e0c..2fb2cb1 100644 --- a/.gitignore +++ b/.gitignore @@ -12,5 +12,5 @@ input_test/ output_test/ Validation_notes.md .Rproj.user -__pycache__/ +*/__pycache__/*.pyc assets/genoflu/GenoFLU \ No newline at end of file diff --git a/bin/__pycache__/tools.cpython-310.pyc b/bin/__pycache__/tools.cpython-310.pyc deleted file mode 100644 index a9c443bef6bed7562cfce5cbcff9c07b61da1d07..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3119 zcmb7G&5s;M74PbgneO&@y$}mI1fiBzmiw4 z#ty!nzdy{{D~|II`f>5|@Z$qC{Rs$nI4hjtvS&kv-)`Z~y`jgJbN(=}dpg4ozs7wY zJZHl$-{ldHF$(z=-h1v0BYu_dppQ9wJ7;~DaaldfXohDkiO}Oxy&cZ*w);LF<#v66echPszhv?B7zi~4U zdUw;mGRKLd!@8W9T$YKENtTy9F*A|Or8Y@gRZ3PWPmM^@d0xsSo#a?oX{n3U@{CUt zBecoOX_6KNSd|co5-F#dOr>+7_v7fk&@hYutHk0&sA{9^M z_()6)aeORHtxCNTC~H&IMkgu$6H`}(crT8kz2x3&yvanCCe2Gap6)S*a^I%=Qbd{L zr4du1(DQtPYh7gBx(XzwCOLr?wZ>&KskUr!>!-JqpWRL{d#C>$vdelt#)&dZw2>8* z<`S39;e%POmr7r>@+5_79BzoC2QUAHZvxM4J_>PakI?jO5aTom9L|2=ELg+1`-P{j 
zbBt~{2oZ;SjeFhsrL({;gn|ET4rL$JRV9_Zdv+lnDkYwX(p+%XS}k#_pKNw!O3rbK zc~z`TpN`?Oq?F~}ye>=*hv_|;?fo1ly(Co?>EAL+8m|y#-)lWBj$3ao`7kgt<@_&> zim>OK-wyOFRbnRRVsIp9<$jeGbD;+NrFd$xLY@p#t%cEp4-XFRAM8E+^+$ulqW&}& zCqj+NY%tEtfswM%`_*adRVjZ1QZ;2|&ktG1ddz3f-pUH`bhNqJ{@p@O(n7zF!?^OT zs_*ur)-9{n`%Rg(0Wx4J+GxYk1`}DArVWrS=-iyQeu?03uW+F!Dz7M6`th(+nh{+z z;9^`)+XzWDvYf+^B9+#Lu^rjmQK)DzQ*Qapyc%}W)b0tcj0wW@(VN9O{%8CjJ9XrIIe{{-Jb0G>c=>07_Maw zgQp~@?WR<>g}Q9a9_ol0triDh)jJ>|>ummfrfy$^WE^Xl83tq+&QQ0R9l02} z+%rhUGd6W$Fm?{3VHTBnH3}ME-NX*y17Sepaqq6v1nSO`3p7}++<>vUk2NxD@ZeeS z`_J}1|H-p}=uMktFDcdfe(S<|UCPnC5Q=ncyHxE_wfh}ppHah@%#4iB@~6wOt}~2V zPYK=ng~$-!Dxc2GWjpPV{gLsIrgEWw1>yQSNQ{u`xkP=oi*^G|?O|ankh+*4eXPAc z!hp)xIl~>%X~n^}bnt6hERDdi@NL@s z&mvya46(C-+EtODrZ9u$Bp5z=D zyz%7Pix!Ihn;-X+I3n%f5rQTd=9MbIUa*$Q1mMNxiG32uvcMb1GEC`}gfO881K1H4 z^d68;geuZXCnvd?UAo2Ey|?McH-RXKi(K^t>)`a`cIYtLgGU7 zWP)%UwRAZ>r1}An2t~6EtR&S(f+YA|)GsjCXNP@H{g6(6i^z{??n>)Qjp}m>g49_; zL4em$b*iq?u6KwKD%m<~b+*zjC_>K!Wx+4J3(Tks@jjJ2y$iDI(D{HiH|~c~Z|C3m C5{SqE From aebf1ca66df28f4f5349004a963cb3627695f863 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:15:08 -0700 Subject: [PATCH 09/24] touching up provenance --- modules/cutadapt.nf | 9 ++++++--- modules/fastp.nf | 12 ++++++------ modules/fastqc.nf | 12 +++++++----- nextflow.config | 31 +++++++++++++++---------------- 4 files changed, 34 insertions(+), 30 deletions(-) diff --git a/modules/cutadapt.nf b/modules/cutadapt.nf index fe10ccb..90666e6 100644 --- a/modules/cutadapt.nf +++ b/modules/cutadapt.nf @@ -12,8 +12,11 @@ process cutadapt { script: """ - printf -- "- process_name: cutadapt\\n" > ${sample_id}_cutadapt_provenance.yml - printf -- " tool_name: cutadapt\\n tool_version: \$(cutadapt --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_cutadapt_provenance.yml + printf -- "- process_name: cutadapt\\n" >> ${sample_id}_cutadapt_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_cutadapt_provenance.yml + printf -- " - tool_name: cutadapt\\n" >> ${sample_id}_cutadapt_provenance.yml + printf -- " tool_version: \$(cutadapt --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_cutadapt_provenance.yml + cutadapt \ -j ${task.cpus} \ -a file:${params.primers} \ @@ -26,4 +29,4 @@ process cutadapt { ${sample_id}_R2.trim.fastq.gz \ > ${sample_id}.cutadapt.log """ -} \ No newline at end of file +} diff --git a/modules/fastp.nf b/modules/fastp.nf index 7d522df..999e6ee 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -15,8 +15,11 @@ process fastp { script: """ - printf -- "- process_name: fastp\\n" > ${sample_id}_fastp_provenance.yml - printf -- " tool_name: fastp\\n tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_fastp_provenance.yml + printf -- "- process_name: fastp\\n" >> ${sample_id}_fastp_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_fastp_provenance.yml + printf -- " - tool_name: fastp\\n" >> ${sample_id}_fastp_provenance.yml + printf -- " tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_fastp_provenance.yml + fastp \ -t ${task.cpus} \ -i ${reads_1} \ @@ -32,16 +35,13 @@ process fastp { } -//printf -- "- process_name: fastp\\n" > ${sample_id}_fastp_provenance.yml -//printf -- " tool_name: fastp\\n tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_fastp_provenance.yml - process fastp_json_to_csv { tag { sample_id } executor 'local' - publishDir params.versioned_outdir ? 
"${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}" : "${params.outdir}/${sample_id}", pattern: "${sample_id}_fastp.csv", mode: 'copy' + publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}_fastp.csv", mode: 'copy' input: tuple val(sample_id), path(fastp_json) diff --git a/modules/fastqc.nf b/modules/fastqc.nf index 82f2ab6..3413c90 100644 --- a/modules/fastqc.nf +++ b/modules/fastqc.nf @@ -1,4 +1,4 @@ -process FASTQC { +process fastqc { tag { sample_id } @@ -13,9 +13,13 @@ process FASTQC { script: """ + printf -- "- process_name: fastqc\\n" >> ${sample_id}_fastqc_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_fastqc_provenance.yml + printf -- " - tool_name: fastqc\\n" >> + printf -- " tool_version: \$(fastqc --version 2>&1 | sed -n '1 p')\\n" >> ${sample_id}_fastqc_provenance.yml + mkdir -p ./tmp - printf -- "- process_name: fastqc\\n" > ${sample_id}_fastqc_provenance.yml - printf -- " tool_name: fastqc\\n tool_version: \$(fastqc --version 2>&1 | sed -n '1 p')\\n" >> ${sample_id}_fastqc_provenance.yml + fastqc \ --threads ${task.cpus} \ --dir ./tmp \ @@ -24,5 +28,3 @@ process FASTQC { """ } - //removed from cutadapt script - //--json=${sample_id}.cutadapt.json diff --git a/nextflow.config b/nextflow.config index e93b216..56c58de 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,7 +21,6 @@ params { pipeline_short_name = parsePipelineName(manifest.toMap().get('name')) pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) run_name = parseRunName( fastq_input ) - versioned_outdir = '' min_depth = '10' min_q = '30' min_cov = '25' @@ -79,19 +78,19 @@ profiles { } process { - withName: FluViewer { - cpus = 4 - memory = '2 GB' - } + withName: FluViewer { + cpus = 8 + memory = '32 GB' + } - withName: cutadapt { - cpus = 4 - } + withName: cutadapt { + cpus = 8 + } - withName: CLADE_CALLING { - conda = "$baseDir/environments/nextclade.yml" - cpus = 4 - } + withName: CLADE_CALLING { + conda = "$baseDir/environments/nextclade.yml" + cpus = 4 + } } report { @@ -100,11 +99,11 @@ report { } timeline { - enabled = true - file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_timeline.html" + enabled = true + file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_timeline.html" } trace { - enabled = true - file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_trace.txt" + enabled = true + file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_trace.txt" } From 5b895e20f43e0d592af31dd005ff8892adce6919 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:29:56 -0700 Subject: [PATCH 10/24] tidying up --- main.nf | 41 +++++------ modules/FluViewer.nf | 151 +++++++++++++++++++-------------------- modules/clade_calling.nf | 4 +- modules/genoflu.nf | 7 +- modules/multiqc.nf | 1 - nextflow.config | 4 +- 6 files changed, 98 insertions(+), 110 deletions(-) diff --git a/main.nf b/main.nf index fa65f4c..7554581 100644 --- a/main.nf +++ b/main.nf @@ -3,15 +3,6 @@ /* * A nextflow wrapper for running FluViewer * ----------------------------------------- - - == V1 == -This pipeline will run FluViewer on a set of fastq files in a baseDir. -Each output will be its own directory. 
-Future versions will add in: -- fastp to remove adapters and produce QC -- multqc to read the results of this -- a script to scrape together results and produce an output csv - */ import java.time.LocalDateTime @@ -23,14 +14,14 @@ include { pipeline_provenance } from './modules/provenance.nf' include { collect_provenance } from './modules/provenance.nf' include { fastp } from './modules/fastp.nf' include { cutadapt} from './modules/cutadapt.nf' -include { FluViewer } from './modules/FluViewer.nf' +include { fluviewer } from './modules/FluViewer.nf' include { multiqc } from './modules/multiqc.nf' -include { FASTQC } from './modules/fastqc.nf' -include { CLADE_CALLING } from './modules/clade_calling.nf' -include { SNP_CALLING } from './modules/snp_calling.nf' -include { PULL_GENOFLU } from './modules/genoflu.nf' -include { CHECKOUT_GENOFLU } from './modules/genoflu.nf' -include { GENOFLU } from './modules/genoflu.nf' +include { fastqc } from './modules/fastqc.nf' +include { clade_calling } from './modules/clade_calling.nf' +include { snp_calling } from './modules/snp_calling.nf' +include { pull_genoflu } from './modules/genoflu.nf' +include { checkout_genoflu } from './modules/genoflu.nf' +include { genoflu } from './modules/genoflu.nf' // prints to the screen and to the log @@ -85,25 +76,25 @@ workflow { // Clean up reads - remove adapters (fastp) and primers (cutadapt) fastp(ch_fastq_input) cutadapt(fastp.out.trimmed_reads.combine(ch_primers)) - FASTQC(cutadapt.out.primer_trimmed_reads) + fastqc(cutadapt.out.primer_trimmed_reads) // Run FluViewer - FluViewer(cutadapt.out.primer_trimmed_reads.combine(ch_db)) + fluviewer(cutadapt.out.primer_trimmed_reads.combine(ch_db)) - //Collect al the relevant filesfor MULTIQC - ch_fastqc_collected = FASTQC.out.zip.map{ it -> [it[1], it[2]]}.collect() + //Collect al the relevant filesfor multiqc + ch_fastqc_collected = fastqc.out.zip.map{ it -> [it[1], it[2]]}.collect() multiqc(fastp.out.json.mix( cutadapt.out.log, ch_fastqc_collected ).collect().ifEmpty([]) ) //Call clades for H1 and H3 samples - CLADE_CALLING(FluViewer.out.consensus_seqs) + clade_calling(fluviewer.out.consensus_seqs) - SNP_CALLING(FluViewer.out.consensus_main, ch_reference_db) + snp_calling(fluviewer.out.consensus_main, ch_reference_db) - PULL_GENOFLU(params.genoflu_github_url) + pull_genoflu(params.genoflu_github_url) - CHECKOUT_GENOFLU(PULL_GENOFLU.out.repo, params.genoflu_version) + checkout_genoflu(pull_geoflu.out.repo, params.genoflu_version) - GENOFLU(FluViewer.out.consensus_main.combine(PULL_GENOFLU.out.repo)) + genoflu(fluviewer.out.consensus_main.combine(pull_genoflu.out.repo)) // diff --git a/modules/FluViewer.nf b/modules/FluViewer.nf index 49f7cc5..27eac7a 100644 --- a/modules/FluViewer.nf +++ b/modules/FluViewer.nf @@ -1,83 +1,83 @@ -process FluViewer { +process fluviewer { - tag { sample_id } + tag { sample_id } - memory { 50.GB * task.attempt } - errorStrategy { (task.exitStatus == 2 && task.attempt <= maxRetries) ? 'retry' : 'ignore' } - maxRetries 5 + memory { 50.GB * task.attempt } + errorStrategy { (task.exitStatus == 2 && task.attempt <= maxRetries) ? 
'retry' : 'ignore' } + maxRetries 5 - conda "${projectDir}/environments/fluviewer.yml" - - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/${sample_id}*", mode:'copy', saveAs: { filename -> filename.split("/").last() } - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/*tsv", mode:'copy', saveAs: { filename -> filename.split("/").last() } - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/spades_output", mode:'copy', saveAs: { filename -> "spades_output" } - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".*", mode:'copy' - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/logs", mode:'copy', saveAs: { filename -> "fluviewer_logs" } - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".exitcode", mode:'copy' - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".command.*", mode:'copy' + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/${sample_id}*", mode:'copy', saveAs: { filename -> filename.split("/").last() } + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/*tsv", mode:'copy', saveAs: { filename -> filename.split("/").last() } + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/spades_output", mode:'copy', saveAs: { filename -> "spades_output" } + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".*", mode:'copy' + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/logs", mode:'copy', saveAs: { filename -> "fluviewer_logs" } + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".exitcode", mode:'copy' + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".command.*", mode:'copy' - input: - tuple val(sample_id), path(reads_1), path(reads_2), path(db) - - output: - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.bam"), emit: alignment - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.bam.bai"), emit: alignmentindex, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*report.tsv"), emit: reports, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_consensus.fa"), emit: consensus_seqs, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa"), emit: consensus_main - tuple 
val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_HPAI.tsv"), emit: HPAI, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_cov.png"), emit: coverage_plot, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_variants.vcf"), emit: vcf, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/logs"), emit: fluviewer_logs - tuple val(sample_id), path("${sample_id}_FluViewer_provenance.yml"), emit: provenance - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_mapping_refs.fa"), emit: ref_seqs_for_mapping, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/contigs_blast.tsv"), emit: contig_blast_results, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/spades_output"), emit: spades_results, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.png"), emit: depth_cov_plot, optional: true - - script: - garbage_collection = params.keep_interfiles ? '-g' : '' - - """ - printf -- "- process_name: FluViewer\\n" > ${sample_id}_FluViewer_provenance.yml - printf -- " tool_name: FluViewer\\n tool_version: \$(FluViewer | sed -n '4 p')\\n" >> ${sample_id}_FluViewer_provenance.yml - printf -- " database used: ${db}\\n" >> ${sample_id}_FluViewer_provenance.yml - printf -- " database_path: \$(readlink -f ${db})\\n" >> ${sample_id}_FluViewer_provenance.yml - printf -- " database sha256: \$(shasum -a 256 ${db}|awk '{print \$1}')\\n" >> ${sample_id}_FluViewer_provenance.yml + input: + tuple val(sample_id), path(reads_1), path(reads_2), path(db) + + output: + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.bam"), emit: alignment + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.bam.bai"), emit: alignmentindex, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*report.tsv"), emit: reports, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_consensus.fa"), emit: consensus_seqs, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa"), emit: consensus_main + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_HPAI.tsv"), emit: HPAI, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_cov.png"), emit: coverage_plot, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_variants.vcf"), emit: vcf, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/logs"), emit: fluviewer_logs + tuple val(sample_id), path("${sample_id}_FluViewer_provenance.yml"), emit: provenance + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_mapping_refs.fa"), emit: ref_seqs_for_mapping, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/contigs_blast.tsv"), emit: contig_blast_results, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/spades_output"), emit: spades_results, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.png"), emit: depth_cov_plot, optional: true + + script: + garbage_collection = params.keep_interfiles ? 
'-g' : '' + """ + printf -- "- process_name: fluviewer\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " - tool_name: FluViewer\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " tool_version: \$(FluViewer | sed -n '4 p')\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " databases:\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " - database_name: ${db}\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " database_path: \$(readlink -f ${db})\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " database_sha256: \$(shasum -a 256 ${db}|awk '{print \$1}')\\n" >> ${sample_id}_FluViewer_provenance.yml - EXITCODE=0 - (FluViewer \ - ${garbage_collection} \ - -T ${task.cpus} \ - -f ${reads_1} -r ${reads_2} \ - -n ${sample_id}_fluviewer \ - -d ${db} \ - -D ${params.min_depth} \ - -q ${params.min_q} \ - -i ${params.min_ident} \ - -M 40 && EXITCODE=\$?) || EXITCODE=\$? - - - echo "Extracting NA and HA consensus sequences..." - - if [ `grep "|HA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa` ]; then - grep -A1 "|HA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa > ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa - else - echo "No HA consensus sequence generated." - fi - - if [ `grep "|NA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa` ]; then - grep -A1 "|NA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa > ${sample_id}_fluviewer/${sample_id}_NA_consensus.fa - else - echo "No NA consensus sequence generated." - fi - - if [[ ! -f ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa ]]; then - echo "HA segment consensus not generated. Skipping FindCleave.py..." - else - python ${projectDir}/bin/FindCleave.py -i ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa -o ${sample_id}_fluviewer/${sample_id}_HPAI.tsv - echo "Finished running FindCleave.py." - fi + EXITCODE=0 + (FluViewer \ + ${garbage_collection} \ + -T ${task.cpus} \ + -f ${reads_1} -r ${reads_2} \ + -n ${sample_id}_fluviewer \ + -d ${db} \ + -D ${params.min_depth} \ + -q ${params.min_q} \ + -i ${params.min_ident} \ + -M 40 && EXITCODE=\$?) \ + || EXITCODE=\$? + + echo "Extracting NA and HA consensus sequences..." + + if [ `grep "|HA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa` ]; then + grep -A1 "|HA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa > ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa + else + echo "No HA consensus sequence generated." + fi + + if [ `grep "|NA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa` ]; then + grep -A1 "|NA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa > ${sample_id}_fluviewer/${sample_id}_NA_consensus.fa + else + echo "No NA consensus sequence generated." + fi + + if [[ ! -f ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa ]]; then + echo "HA segment consensus not generated. Skipping FindCleave.py..." + else + python ${projectDir}/bin/FindCleave.py -i ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa -o ${sample_id}_fluviewer/${sample_id}_HPAI.tsv + echo "Finished running FindCleave.py." 
+ fi echo \$EXITCODE > .exitcode @@ -89,6 +89,5 @@ process FluViewer { cp .command.* \$OUTPATH cp .exitcode \$OUTPATH exit \$EXITCODE - """ } diff --git a/modules/clade_calling.nf b/modules/clade_calling.nf index c4fab0d..8c49b17 100644 --- a/modules/clade_calling.nf +++ b/modules/clade_calling.nf @@ -1,4 +1,4 @@ -process CLADE_CALLING { +process clade_calling { conda "${projectDir}/environments/nextclade.yml" @@ -59,4 +59,4 @@ process CLADE_CALLING { printf -- " Dataset location: \$LOCATION" >> ${sample_id}_clade_provenance.yml printf -- " Dataset version: \$VERSION" >> ${sample_id}_clade_provenance.yml """ -} \ No newline at end of file +} diff --git a/modules/genoflu.nf b/modules/genoflu.nf index 2fa31e6..d768c08 100644 --- a/modules/genoflu.nf +++ b/modules/genoflu.nf @@ -1,4 +1,4 @@ -process GENOFLU { +process genoflu { tag { sample_id } @@ -29,7 +29,7 @@ process GENOFLU { """ } -process PULL_GENOFLU { +process pull_genoflu { executor 'local' storeDir "${params.genoflu_cache}" @@ -46,8 +46,7 @@ process PULL_GENOFLU { """ } -process CHECKOUT_GENOFLU { - +process checkout_genoflu { input: path(genoflu_path) diff --git a/modules/multiqc.nf b/modules/multiqc.nf index 12cfd90..9f85c0a 100644 --- a/modules/multiqc.nf +++ b/modules/multiqc.nf @@ -11,6 +11,5 @@ process multiqc { script: """ multiqc . -n ${params.run_name}_multiqc_report.html - """ } diff --git a/nextflow.config b/nextflow.config index 56c58de..f5aa97f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -78,7 +78,7 @@ profiles { } process { - withName: FluViewer { + withName: fluviewer { cpus = 8 memory = '32 GB' } @@ -87,7 +87,7 @@ process { cpus = 8 } - withName: CLADE_CALLING { + withName: clade_calling { conda = "$baseDir/environments/nextclade.yml" cpus = 4 } From da113f7f94104b4888531db908616a5eca29034d Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:31:17 -0700 Subject: [PATCH 11/24] rename snp_calling module --- modules/snp_calling.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/snp_calling.nf b/modules/snp_calling.nf index f61e241..211362a 100644 --- a/modules/snp_calling.nf +++ b/modules/snp_calling.nf @@ -1,4 +1,4 @@ -process SNP_CALLING { +process snp_calling { errorStrategy 'ignore' From e69f3395bb90caadd99edb2aa02eaf0e81f6bddc Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:37:03 -0700 Subject: [PATCH 12/24] typo geoflu -> genoflu --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7554581..0a3b0a8 100644 --- a/main.nf +++ b/main.nf @@ -92,7 +92,7 @@ workflow { pull_genoflu(params.genoflu_github_url) - checkout_genoflu(pull_geoflu.out.repo, params.genoflu_version) + checkout_genoflu(pull_genoflu.out.repo, params.genoflu_version) genoflu(fluviewer.out.consensus_main.combine(pull_genoflu.out.repo)) From 9e42cb647e4042d729504f61615db80758d4dc01 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:44:51 -0700 Subject: [PATCH 13/24] small fixes --- .gitignore | 2 +- ReadMe.md | 33 +++++++++++++++++++-------------- modules/fastqc.nf | 4 ++-- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 2fb2cb1..8867849 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ output_test/ Validation_notes.md .Rproj.user */__pycache__/*.pyc -assets/genoflu/GenoFLU \ No newline at end of file +assets/genoflu/GenoFLU diff --git a/ReadMe.md b/ReadMe.md index 4914a67..4bb726d 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -68,7 +68,7 @@ For a full list of 
optional arguments, see: https://github.com/KevinKuchinski/Fl **Example command:** ``` nextflow run BCCDC-PHL/fluviewer-nf \ - -r v0.1.0 \ + -r v0.2.2 \ -profile conda \ --cache ~/.conda/envs \ --fastq_input /path/to/your_fastqs \ @@ -127,12 +127,9 @@ Output for each run includes: For each pipeline invocation, each sample will produce a `provenance.yml` file with the following contents. Note the below is a contrived example. ```yml -- process_name: FluViewer - tool_name: FluViewer - tool_version: FluViewer v0.0.2 - database used: FluViewer_db_full_20220915.fasta - database_path: /home/{USER}/Flu/ref/FluViewer_db_full_20220915.fasta - database sha256: 55b33afa21ad44ed1e6db896cf420fae6b1524c0ad205775a1ce9dd11595905d +- pipeline_name: BCCDC-PHL/FluViewer-nf + pipeline_version: 0.2.2 + timestamp_analysis_start: 2023-11-21T05:43:25.541743 - input_filename: {Sample}_R1.fastq.gz input_path: /home/{USER{}}/Flu/test_data/test_production_run/{Sample}_R1.fastq.gz sha256: 47380e49f10374660a2061d3571efe5339401484e646c2b47896fa701dbcf0a8 @@ -140,14 +137,22 @@ For each pipeline invocation, each sample will produce a `provenance.yml` file w input_path: /home/{USER}/Flu/test_data/test_production_run/{Sample}.fastq.gz sha256: 39c95fd26af111ee9a6caeb840a7aced444b657550efea3ab7f74add0b30f69d - process_name: fastp - tool_name: fastp - tool_version: 0.23.1 + tools: + - tool_name: fastp + tool_version: 0.23.1 - process_name: cutadapt - tool_name: cutadapt - tool_version: 4.1 -- pipeline_name: BCCDC-PHL/FluViewer-nf - pipeline_version: 0.2.0 -- timestamp_analysis_start: 2023-11-21T05:43:25.541743 + tools: + - tool_name: cutadapt + tool_version: 4.1 +- process_name: fluviewer + tools: + - tool_name: FluViewer + tool_version: FluViewer v0.0.2 + databases: + - database_name: FluViewer_db_full_20220915.fasta + database_path: /home/{USER}/Flu/ref/FluViewer_db_full_20220915.fasta + database_sha256: 55b33afa21ad44ed1e6db896cf420fae6b1524c0ad205775a1ce9dd11595905d + - process_name: nextclade tool_name: nextclade tool_version: 2.9.1 diff --git a/modules/fastqc.nf b/modules/fastqc.nf index 3413c90..8fcff27 100644 --- a/modules/fastqc.nf +++ b/modules/fastqc.nf @@ -13,9 +13,9 @@ process fastqc { script: """ - printf -- "- process_name: fastqc\\n" >> ${sample_id}_fastqc_provenance.yml + printf -- "- process_name: fastqc\\n" >> ${sample_id}_fastqc_provenance.yml printf -- " tools:\\n" >> ${sample_id}_fastqc_provenance.yml - printf -- " - tool_name: fastqc\\n" >> + printf -- " - tool_name: fastqc\\n" >> ${sample_id}_fastqc_provenance.yml printf -- " tool_version: \$(fastqc --version 2>&1 | sed -n '1 p')\\n" >> ${sample_id}_fastqc_provenance.yml mkdir -p ./tmp From ff724b86ad2cf249b61f70122ce0336290755869 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:48:36 -0700 Subject: [PATCH 14/24] Disable nextflow v23 test temporarily --- .github/workflows/tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e0ba675..f306e56 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,9 @@ jobs: strategy: fail-fast: false matrix: - nextflow_version: ["21.04.3", "23.10.1"] + nextflow_version: + - "21.04.3" + # - "23.10.1" <- Failing due to 'conda.useMamba = true'. Issue is in test environment. 
Revisit name: Run tests runs-on: ubuntu-latest steps: From fc671b7cb3ffeff05a1851ef5f69aa2faae5b2f9 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:55:13 -0700 Subject: [PATCH 15/24] Restore fluviewer conda env directive --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index f5aa97f..4c9184d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -79,8 +79,8 @@ profiles { process { withName: fluviewer { + conda = "$baseDir/environments/fluviewer.yml" cpus = 8 - memory = '32 GB' } withName: cutadapt { From 479ed7d31133f7ff69b0027866d6c6486b91f1b0 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 15:07:08 -0700 Subject: [PATCH 16/24] updates --- .github/scripts/check_outputs.py | 6 ++++++ environments/fluviewer.yml | 2 +- main.nf | 2 +- modules/hash_files.nf | 4 +++- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/scripts/check_outputs.py b/.github/scripts/check_outputs.py index 84a7166..09a8d8b 100755 --- a/.github/scripts/check_outputs.py +++ b/.github/scripts/check_outputs.py @@ -38,6 +38,12 @@ def check_expected_files_exist(output_dir, sample_ids): """ for sample_id in sample_ids: expected_files = [ + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_alignment.bam", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_alignment.bam.bai", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_depth_of_cov.png", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_mapping_refs.fa", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_report.tsv", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_genoflu.tsv", ] for expected_file in expected_files: diff --git a/environments/fluviewer.yml b/environments/fluviewer.yml index ff33eb0..a7ea83f 100644 --- a/environments/fluviewer.yml +++ b/environments/fluviewer.yml @@ -209,4 +209,4 @@ dependencies: - zlib=1.2.13=hd590300_5 - zstd=1.5.5=hfc55251_0 - pip: - - fluviewer + - FluViewer==0.1.11 diff --git a/main.nf b/main.nf index 0a3b0a8..c5f0961 100644 --- a/main.nf +++ b/main.nf @@ -106,7 +106,7 @@ workflow { ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - // ch_provenance = ch_provenance.join(FluViewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(fluviewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } // ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } // ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } collect_provenance(ch_provenance) diff --git a/modules/hash_files.nf b/modules/hash_files.nf index 6662097..e9865a0 100644 --- a/modules/hash_files.nf +++ b/modules/hash_files.nf @@ -13,7 +13,9 @@ process hash_files { """ shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > ${sample_id}_${file_type}.sha256.csv while IFS=',' read -r hash filename; do - printf -- "- input_filename: \$filename\\n input_path: \$(realpath \$filename)\\n sha256: \$hash\\n" >> ${sample_id}_${file_type}_provenance.yml + printf -- "- input_filename: \$filename\\n" >> ${sample_id}_${file_type}_provenance.yml; + printf -- " file_type: ${file_type}\\n" >> ${sample_id}_${file_type}_provenance.yml; + printf -- " sha256: 
\$hash\\n" >> ${sample_id}_${file_type}_provenance.yml; done < ${sample_id}_${file_type}.sha256.csv """ } From 232476c421321ef0e3af98ecbb82f0924802d22a Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 15:28:41 -0700 Subject: [PATCH 17/24] organize environments --- .github/scripts/run_pipeline.sh | 3 ++- environments/{main.yml => environment.yml} | 2 +- environments/fluviewer.yml | 2 +- environments/nextclade.yml | 4 ++-- main.nf | 5 +++-- modules/clade_calling.nf | 2 +- modules/snp_calling.nf | 12 ++++++++++-- 7 files changed, 20 insertions(+), 10 deletions(-) rename environments/{main.yml => environment.yml} (99%) diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 0bd5c23..811ea05 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -9,7 +9,8 @@ eval "$(conda shell.bash hook)" conda activate base sed -i 's/cpus = 8/cpus = 4/g' nextflow.config -sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config +sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config +sed -i 'memory { 50.GB * task.attempt }//g' modules/FluViewer.nf nextflow run main.nf \ -profile conda \ diff --git a/environments/main.yml b/environments/environment.yml similarity index 99% rename from environments/main.yml rename to environments/environment.yml index 769a02b..c9df87d 100644 --- a/environments/main.yml +++ b/environments/environment.yml @@ -1,4 +1,4 @@ -name: FluViewer-nf +name: fluviewer-nf channels: - conda-forge - bioconda diff --git a/environments/fluviewer.yml b/environments/fluviewer.yml index a7ea83f..7b52e30 100644 --- a/environments/fluviewer.yml +++ b/environments/fluviewer.yml @@ -1,4 +1,4 @@ -name: FluViewer +name: fluviewer-nf-FluViewer channels: - conda-forge - bioconda diff --git a/environments/nextclade.yml b/environments/nextclade.yml index d9e51da..ac031d5 100644 --- a/environments/nextclade.yml +++ b/environments/nextclade.yml @@ -1,7 +1,7 @@ -name: FluViewer-nf +name: fluviewer-nf-nextclade channels: - conda-forge - bioconda - defaults dependencies: -- nextclade=2.9.1 \ No newline at end of file +- nextclade=2.9.1 diff --git a/main.nf b/main.nf index c5f0961..fe85487 100644 --- a/main.nf +++ b/main.nf @@ -107,8 +107,9 @@ workflow { ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(fluviewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - // ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - // ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(clade_calling.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(snp_calling.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(genoflu.out.provenance).map{ it -> [it[0], it[1] << it[2]] } collect_provenance(ch_provenance) } diff --git a/modules/clade_calling.nf b/modules/clade_calling.nf index 8c49b17..9ce1cef 100644 --- a/modules/clade_calling.nf +++ b/modules/clade_calling.nf @@ -14,7 +14,7 @@ process clade_calling { output: tuple val(sample_id), path("*nextclade*"), emit: nextclade, optional: true - tuple val(sample_id), path("${sample_id}_clade_provenance.yml"), emit: provenance + tuple val(sample_id), path("${sample_id}_clade_provenance.yml"), emit: provenance, optional: 
From f9769059ff5d21b866078d656f44fdc09b76ba42 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 15:29:46 -0700
Subject: [PATCH 18/24] fix sed cmd

---
 .github/scripts/run_pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index 811ea05..e3b2a59 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -10,7 +10,7 @@ conda activate base
 
 sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
 sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config
-sed -i 'memory { 50.GB * task.attempt }//g' modules/FluViewer.nf
+sed -i 's/memory { 50.GB * task.attempt }//g' modules/FluViewer.nf
 
 nextflow run main.nf \
     -profile conda \

From 332c74595ef4b87aa75528740e4791d2d6dbe6ec Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 15:31:08 -0700
Subject: [PATCH 19/24] fix env path

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 4c9184d..af4b043 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -69,7 +69,7 @@ def parsePipelineName(name) {
 profiles {
     conda {
         conda.enabled = true
-        process.conda = "$baseDir/environments/main.yml"
+        process.conda = "$baseDir/environments/environment.yml"
         if (params.cache) {
             conda.cacheDir = params.cache
         }

From fd10afb818135919ba8042dc2897dd4cc21b736e Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 15:55:11 -0700
Subject: [PATCH 20/24] updates

---
 .github/scripts/run_pipeline.sh | 10 ++++++----
 modules/FluViewer.nf            |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index e3b2a59..49f0cf7 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -8,10 +8,12 @@ eval "$(conda shell.bash hook)"
 
 conda activate base
 
-sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
-sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config
-sed -i 's/memory { 50.GB * task.attempt }//g' modules/FluViewer.nf
-
+# Check for a sign that we're in the GitHub Actions environment.
+# Prevents these settings from being applied in other environments.
+if [ -n "${GITHUB_ACTIONS}" ]; then
+    sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
+    sed -i '/memory/d' modules/FluViewer.nf
+fi
 nextflow run main.nf \
     -profile conda \
     --cache ${HOME}/.conda/envs \

diff --git a/modules/FluViewer.nf b/modules/FluViewer.nf
index 27eac7a..2c95f38 100644
--- a/modules/FluViewer.nf
+++ b/modules/FluViewer.nf
@@ -2,7 +2,7 @@ process fluviewer {
 
     tag { sample_id }
 
-    memory { 50.GB * task.attempt }
+    memory { 50.GB * task.attempt }
     errorStrategy { (task.exitStatus == 2 && task.attempt <= maxRetries) ? 'retry' : 'ignore' }
     maxRetries 5
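The guard introduced in patch 20 keys off `GITHUB_ACTIONS`, which GitHub Actions sets to `true` in every runner step, so a non-empty test is a reasonable CI detector; a minimal standalone sketch (echo text illustrative):

```bash
#!/bin/bash

# GITHUB_ACTIONS is "true" on GitHub Actions runners and normally
# unset elsewhere, so -n distinguishes CI runs from local runs.
if [ -n "${GITHUB_ACTIONS}" ]; then
    echo "Running under GitHub Actions; applying reduced CI resource limits."
fi
```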
From dc0b34133418529d91a203d844908d56edc361bb Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 16:49:48 -0700
Subject: [PATCH 21/24] update

---
 .github/scripts/run_pipeline.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index 49f0cf7..3fd8d12 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -10,10 +10,14 @@ conda activate base
 
 # Check for a sign that we're in the GitHub Actions environment.
 # Prevents these settings from being applied in other environments.
-if [ -n "${GITHUB_ACTIONS}" ]; then
+if [ -z "${GITHUB_ACTIONS}" ]; then
+    echo "Not in GitHub Actions environment. Will not modify nextflow.config or FluViewer.nf."
+else
+    echo "In GitHub Actions environment. Modifying nextflow.config and FluViewer.nf."
     sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
     sed -i '/memory/d' modules/FluViewer.nf
 fi
+
 nextflow run main.nf \
     -profile conda \
     --cache ${HOME}/.conda/envs \

From ea26a15cc7b0a8c69bf6773a10c7982a251bdcb0 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 17:22:54 -0700
Subject: [PATCH 22/24] collect nextflow log

---
 .github/scripts/run_pipeline.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index 3fd8d12..e66fdbf 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -23,4 +23,5 @@ nextflow run main.nf \
     --cache ${HOME}/.conda/envs \
     --fastq_input .github/data/fastq \
     --db .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa \
-    --outdir .github/data/test_output
+    --outdir .github/data/test_output \
+    -with-log artifacts/nextflow.log

From ea6bf486f1d14dc9a6fd32940bb611f4e3208ad3 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 17:25:51 -0700
Subject: [PATCH 23/24] collect nextflow log

---
 .github/scripts/run_pipeline.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index e66fdbf..37e1360 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -18,10 +18,10 @@ else
     sed -i '/memory/d' modules/FluViewer.nf
 fi
 
-nextflow run main.nf \
+nextflow -log artifacts/nextflow.log \
+    run main.nf \
     -profile conda \
     --cache ${HOME}/.conda/envs \
     --fastq_input .github/data/fastq \
     --db .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa \
-    --outdir .github/data/test_output \
-    -with-log artifacts/nextflow.log
+    --outdir .github/data/test_output
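Patches 22 and 23 together relocate the log destination: `-with-log` is apparently not a recognized `nextflow run` option (hence the follow-up fix), whereas `-log` is a top-level CLI option and must precede the subcommand. The general shape, with paths illustrative:

```bash
# Top-level options such as -log go before the subcommand;
# run-scoped options such as -profile go after it.
nextflow -log artifacts/nextflow.log \
    run main.nf \
    -profile conda
```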
From 458dd7baa2636febcd80b91c8a32d37bafe259a6 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 17:53:09 -0700
Subject: [PATCH 24/24] tidy up nextclade provenance

---
 modules/clade_calling.nf | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/modules/clade_calling.nf b/modules/clade_calling.nf
index 9ce1cef..d6be48d 100644
--- a/modules/clade_calling.nf
+++ b/modules/clade_calling.nf
@@ -14,11 +14,16 @@ process clade_calling {
 
     output:
     tuple val(sample_id), path("*nextclade*"), emit: nextclade, optional: true
-    tuple val(sample_id), path("${sample_id}_clade_provenance.yml"), emit: provenance, optional: true
+    tuple val(sample_id), path("${sample_id}_clade_calling_provenance.yml"), emit: provenance, optional: true
 
     script:
-    """
+    """
+    printf -- "- process_name: nextclade\\n" >> ${sample_id}_clade_calling_provenance.yml
+    printf -- "  tools:\\n" >> ${sample_id}_clade_calling_provenance.yml
+    printf -- "    - tool_name: nextclade\\n" >> ${sample_id}_clade_calling_provenance.yml
+    printf -- "      tool_version: \$(nextclade --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_clade_calling_provenance.yml
+    printf -- "      subcommand: run\\n" >> ${sample_id}_clade_calling_provenance.yml
+
     [ ! -f ${sample_id}_HA_consensus.fa ] && ln -sf *HA_consensus.fa ${sample_id}_HA_consensus.fa
 
     FOUND=true
@@ -54,9 +59,5 @@ process clade_calling {
         VERSION="NONE_INVALID_HA_TYPE\\n"
     fi
 
-    printf -- "- process_name: nextclade\\n" > ${sample_id}_clade_provenance.yml
-    printf -- "  tool_name: nextclade\\n  tool_version: \$(nextclade --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_clade_provenance.yml
-    printf -- "  Dataset location: \$LOCATION" >> ${sample_id}_clade_provenance.yml
-    printf -- "  Dataset version: \$VERSION" >> ${sample_id}_clade_provenance.yml
+
     """
 }
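With the output declaration and the `printf` block agreeing on `_clade_calling_provenance.yml`, and assuming nextclade 2.9.1 as pinned in `environments/nextclade.yml`, the tidied provenance fragment should come out as:

```yml
- process_name: nextclade
  tools:
    - tool_name: nextclade
      tool_version: 2.9.1
      subcommand: run
```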