From f7b23d2af719fa0ebd45b551f335d68e6681c296 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Tue, 4 Jun 2024 17:53:15 -0700 Subject: [PATCH 01/24] Work on testing pipeline --- .github/scripts/download_fluviewer_db.sh | 7 + .github/scripts/run_pipeline.sh | 1 + .github/scripts/simulate_reads.sh | 2 +- .gitignore | 3 + ReadMe.md | 10 +- main.nf | 157 ++++++++++++----------- modules/genoflu.nf | 11 +- modules/provenance.nf | 26 ++-- nextflow.config | 78 +++++------ 9 files changed, 166 insertions(+), 129 deletions(-) create mode 100755 .github/scripts/download_fluviewer_db.sh diff --git a/.github/scripts/download_fluviewer_db.sh b/.github/scripts/download_fluviewer_db.sh new file mode 100755 index 0000000..d49cc48 --- /dev/null +++ b/.github/scripts/download_fluviewer_db.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +mkdir -p .github/data/fluviewer_db + +wget -O .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa.gz https://raw.githubusercontent.com/KevinKuchinski/FluViewer/main/FluViewer_db_v_0_1_8.fa.gz + +gunzip .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa.gz diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 5b8d7df..65ecc6f 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -9,4 +9,5 @@ nextflow run main.nf \ -profile conda \ --cache ${HOME}/.conda/envs \ --fastq_input .github/data/fastq \ + --db .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa \ --outdir .github/data/test_output diff --git a/.github/scripts/simulate_reads.sh b/.github/scripts/simulate_reads.sh index 7001cce..41b9482 100755 --- a/.github/scripts/simulate_reads.sh +++ b/.github/scripts/simulate_reads.sh @@ -13,7 +13,7 @@ while IFS=',' read -r sample_id assembly; do art_illumina \ --paired \ --in ${assembly} \ - --fcov 12 \ + --fcov 100 \ --len 150 \ --mflen 400 \ --sdev 100 \ diff --git a/.gitignore b/.gitignore index 0715472..84620e3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +.github/data/assemblies +.github/data/fastq +.github/data/fluviewer_db .nextflow* work test* diff --git a/ReadMe.md b/ReadMe.md index b26b45c..4914a67 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -1,6 +1,6 @@ [![Tests](https://github.com/BCCDC-PHL/fluviewer-nf/actions/workflows/tests.yml/badge.svg)](https://github.com/BCCDC-PHL/fluviewer-nf/actions/workflows/tests.yml) -# FluViewer-nf +# fluviewer-nf This is a Nextflow pipeline for running the FluViewer analysis tool (https://github.com/KevinKuchinski/FluViewer) and other custom modules to obtain consensus sequences, HA and NA subtypes, clade calls, and amino acid mutations for Influenza A WGS. 
@@ -67,7 +67,13 @@ For a full list of optional arguments, see: https://github.com/KevinKuchinski/Fl **Example command:** ``` -nextflow run FluViewer_installation/main.nf -r 0.1.0 -profile --cache ~/.conda/envs/ --fastq_input flu_A_reference_collection/ --db ref/FluViewer_db_full_20220915.fasta --outdir [outdir] +nextflow run BCCDC-PHL/fluviewer-nf \ + -r v0.1.0 \ + -profile conda \ + --cache ~/.conda/envs \ + --fastq_input /path/to/your_fastqs \ + --db /path/to/FluViewer_db.fa \ + --outdir /path/to/output_dir ``` ## Output diff --git a/main.nf b/main.nf index ce9515c..87bb499 100644 --- a/main.nf +++ b/main.nf @@ -18,95 +18,106 @@ Future versions will add in: nextflow.enable.dsl = 2 - include { hash_files } from './modules/hash_files.nf' - include { pipeline_provenance } from './modules/provenance.nf' - include { collect_provenance } from './modules/provenance.nf' - include { fastp } from './modules/fastp.nf' - include { cutadapt} from './modules/cutadapt.nf' - include { FluViewer } from './modules/FluViewer.nf' - include { multiqc } from './modules/multiqc.nf' - include { FASTQC } from './modules/fastqc.nf' - include { CLADE_CALLING } from './modules/clade_calling.nf' - include { SNP_CALLING } from './modules/snp_calling.nf' - include { PULL_GENOFLU ; CHECKOUT_GENOFLU ; GENOFLU } from './modules/genoflu.nf' +include { hash_files } from './modules/hash_files.nf' +include { pipeline_provenance } from './modules/provenance.nf' +include { collect_provenance } from './modules/provenance.nf' +include { fastp } from './modules/fastp.nf' +include { cutadapt} from './modules/cutadapt.nf' +include { FluViewer } from './modules/FluViewer.nf' +include { multiqc } from './modules/multiqc.nf' +include { FASTQC } from './modules/fastqc.nf' +include { CLADE_CALLING } from './modules/clade_calling.nf' +include { SNP_CALLING } from './modules/snp_calling.nf' +include { PULL_GENOFLU } from './modules/genoflu.nf' +include { CHECKOUT_GENOFLU } from './modules/genoflu.nf' +include { GENOFLU } from './modules/genoflu.nf' // prints to the screen and to the log - log.info """ - - FluViewer Pipeline - =================================== - projectDir : ${projectDir} - launchDir : ${launchDir} - database : ${params.db} - primers : ${params.primers} - fastqInputDir : ${params.fastq_input} - outdir : ${params.outdir} - pipeline run : ${params.pipeline_short_name} - pipeline version : ${params.pipeline_minor_version} - run_name : ${params.run_name} - user : $workflow.userName - Git repository : $workflow.repository - git commit id : $workflow.commitId - branch : $workflow.revision - """ - .stripIndent() +log.info """ + FluViewer Pipeline + =================================== + projectDir : ${projectDir} + launchDir : ${launchDir} + database : ${params.db} + primers : ${params.primers} + fastqInputDir : ${params.fastq_input} + outdir : ${params.outdir} + pipeline run name : ${workflow.runName} + pipeline version : ${workflow.manifest.version} + run_name : ${params.run_name} + user : ${workflow.userName} + Git repository : ${workflow.repository} + git commit id : ${workflow.commitId} + branch : ${workflow.revision} +""".stripIndent() workflow { - ch_start_time = Channel.of(LocalDateTime.now()) - ch_pipeline_name = Channel.of(workflow.manifest.name) - ch_pipeline_version = Channel.of(workflow.manifest.version) + ch_workflow_metadata = Channel.value([ + workflow.sessionId, + workflow.runName, + workflow.manifest.name, + workflow.manifest.version, + workflow.start, + ]) + + ch_pipeline_provenance = 
pipeline_provenance(ch_workflow_metadata) - ch_pipeline_provenance = pipeline_provenance(ch_pipeline_name.combine(ch_pipeline_version).combine(ch_start_time)) + ch_primers = Channel.fromPath(params.primer_path) - ch_primers = Channel.fromPath(params.primer_path) - ch_db = Channel.fromPath(params.db) - ch_fastq_input = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] } + ch_db = Channel.fromPath(params.db) - ch_reference_db = Channel.of([file(params.blastx_subtype_db).parent, file(params.blastx_subtype_db).name]).first() + ch_fastq_input = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] } - main: - // Generate hashes for input files - hash_files(ch_fastq_input.map{ it -> [it[0], [it[1], it[2]]] }.combine(Channel.of("fastq_input"))) + ch_reference_db = Channel.of([file(params.blastx_subtype_db).parent, file(params.blastx_subtype_db).name]).first() - // Clean up reads - remove adapters (fastp) and primers (cutadapt) - fastp( ch_fastq_input ) - cutadapt(fastp.out.trimmed_reads.combine(ch_primers)) - FASTQC(cutadapt.out.primer_trimmed_reads) - - // Run FluViewer - FluViewer(cutadapt.out.primer_trimmed_reads.combine(ch_db)) - - //Collect al the relevant filesfor MULTIQC - ch_fastqc_collected = FASTQC.out.zip.map{ it -> [it[1], it[2]]}.collect() - multiqc(fastp.out.json.mix( cutadapt.out.log, ch_fastqc_collected ).collect().ifEmpty([]) ) - - //Call clades for H1 and H3 samples - CLADE_CALLING(FluViewer.out.consensus_seqs) - - SNP_CALLING(FluViewer.out.consensus_main, ch_reference_db) - - - PULL_GENOFLU(params.genoflu_github_url) - - CHECKOUT_GENOFLU(PULL_GENOFLU.out.repo, params.genoflu_version) - - GENOFLU(FluViewer.out.consensus_main.combine(PULL_GENOFLU.out.repo)) + main: + // Provenance channel starts with just the sample IDs + // These will be joined to various provenance files as they are generated + ch_provenance = ch_fastq_input.map{ it -> it[0] } + // Generate hashes for input files + hash_files(ch_fastq_input.map{ it -> [it[0], [it[1], it[2]]] }.combine(Channel.of("fastq_input"))) - //Pool Provenance data - ch_provenance = FluViewer.out.provenance - ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], [it[1]] << it[2]] } - ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(ch_fastq_input.map{ it -> it[0] }.combine(ch_pipeline_provenance)).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - collect_provenance(ch_provenance) + // Clean up reads - remove adapters (fastp) and primers (cutadapt) + fastp(ch_fastq_input) + cutadapt(fastp.out.trimmed_reads.combine(ch_primers)) + FASTQC(cutadapt.out.primer_trimmed_reads) + // Run FluViewer + FluViewer(cutadapt.out.primer_trimmed_reads.combine(ch_db)) + //Collect al the relevant filesfor MULTIQC + ch_fastqc_collected = FASTQC.out.zip.map{ it -> [it[1], it[2]]}.collect() + multiqc(fastp.out.json.mix( cutadapt.out.log, ch_fastqc_collected ).collect().ifEmpty([]) ) + + //Call clades for H1 and H3 samples + CLADE_CALLING(FluViewer.out.consensus_seqs) + + SNP_CALLING(FluViewer.out.consensus_main, 
ch_reference_db) + + PULL_GENOFLU(params.genoflu_github_url) + + CHECKOUT_GENOFLU(PULL_GENOFLU.out.repo, params.genoflu_version) + + GENOFLU(FluViewer.out.consensus_main.combine(PULL_GENOFLU.out.repo)) + + + // + // Provenance collection processes + // The basic idea is to build up a channel with the following structure: + // [sample_id, [provenance_file_1.yml, provenance_file_2.yml, provenance_file_3.yml...]] + // ...and then concatenate them all together in the 'collect_provenance' process. + ch_provenance = ch_provenance.combine(ch_pipeline_provenance).map{ it -> [it[0], [it[1]]] } + ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(FluViewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + collect_provenance(ch_provenance) } diff --git a/modules/genoflu.nf b/modules/genoflu.nf index 8a14e7f..2fa31e6 100644 --- a/modules/genoflu.nf +++ b/modules/genoflu.nf @@ -14,10 +14,12 @@ process GENOFLU { script: """ - printf -- "- process_name: genoflu\\n" > ${sample_id}_genoflu_provenance.yml - printf -- " tool_name: genoflu\\n tool_version: \$(genoflu.py --version | cut -d' ' -f3)\\n" >> ${sample_id}_genoflu_provenance.yml + printf -- "- process_name: genoflu\\n" >> ${sample_id}_genoflu_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_genoflu_provenance.yml + printf -- " - tool_name: genoflu\\n" >> ${sample_id}_genoflu_provenance.yml + printf -- " tool_version: \$(genoflu.py --version | cut -d' ' -f3)\\n" >> ${sample_id}_genoflu_provenance.yml - genoflu.py \ + genoflu.py \ -f ${consensus_seqs} \ -i ${genoflu_path}/dependencies/fastas/ \ -c ${genoflu_path}/dependencies/genotype_key.xlsx \ @@ -28,7 +30,8 @@ process GENOFLU { } process PULL_GENOFLU { - + + executor 'local' storeDir "${params.genoflu_cache}" input: diff --git a/modules/provenance.nf b/modules/provenance.nf index 36ab158..0a8d115 100644 --- a/modules/provenance.nf +++ b/modules/provenance.nf @@ -4,7 +4,7 @@ process collect_provenance { executor 'local' - publishDir params.versioned_outdir ? 
"${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/Provenance_files" : "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/Provenance_files", pattern: "${sample_id}_*_provenance.yml", mode: 'copy' + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/Provenance_files", pattern: "${sample_id}_*_provenance.yml", mode: 'copy' input: tuple val(sample_id), path(provenance_files) @@ -20,18 +20,22 @@ process collect_provenance { process pipeline_provenance { - tag { pipeline_name + " / " + pipeline_version } + tag { pipeline_name + " / " + pipeline_version } - executor 'local' + executor 'local' - input: - tuple val(pipeline_name), val(pipeline_version), val(analysis_start) + input: + tuple val(session_id), val(run_name), val(pipeline_name), val(pipeline_version), val(timestamp_analysis_start) - output: - file("pipeline_provenance.yml") + output: + file("pipeline_provenance.yml") - script: - """ - printf -- "- pipeline_name: ${pipeline_name}\\n pipeline_version: ${pipeline_version}\\n- timestamp_analysis_start: ${analysis_start}\\n" > pipeline_provenance.yml - """ + script: + """ + printf -- "- pipeline_name: ${pipeline_name}\\n" >> pipeline_provenance.yml + printf -- " pipeline_version: ${pipeline_version}\\n" >> pipeline_provenance.yml + printf -- " nextflow_session_id: ${session_id}\\n" >> pipeline_provenance.yml + printf -- " nextflow_run_name: ${run_name}\\n" >> pipeline_provenance.yml + printf -- " timestamp_analysis_start: ${timestamp_analysis_start}\\n" >> pipeline_provenance.yml + """ } diff --git a/nextflow.config b/nextflow.config index c825b9a..e93b216 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,39 +1,40 @@ manifest { author = 'James Zlosnik (nextflow pipeline)/Kevin Kuchinski (FluViewer)' - name = 'BCCDC-PHL/FluViewer-nf' - version = '0.2.0' + name = 'BCCDC-PHL/fluviewer-nf' + version = '0.2.2' description = 'BCCDC-PHL FluViewer' mainScript = 'main.nf' nextflowVersion = '>=20.01.0' } params { - profile = false - cache = '' - outdir = 'results' - fastq_input = '' - illumina_suffixes = ['*_R{1,2}_001', '*_R{1,2}', '*_{1,2}' ] - fastq_exts = ['.fastq.gz', '.fq.gz', '.fastq', '.fq'] - fastq_search_path = makeFastqSearchPath( illumina_suffixes, fastq_exts ) - primer_path = "${baseDir}/assets/" - primers = "${baseDir}/assets/primers.fa" - rev_primers = "${baseDir}/assets/primers_rev_comp.fa" - pipeline_short_name = parsePipelineName(manifest.toMap().get('name')) - pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) - run_name = parseRunName( fastq_input ) - versioned_outdir = '' - min_depth = '10' - min_q = '30' - min_cov = '25' - min_ident = '95' - keep_interfiles = false - h1_dataset = '' - h3_dataset = '' - h5_dataset = '' - blastx_subtype_db = "${projectDir}/assets/blastx/blastx_subtype_db.fasta" - genoflu_cache = "${projectDir}/assets/genoflu" - genoflu_github_url = 'https://github.com/USDA-VS/GenoFLU/' - genoflu_version = "LATEST" + profile = false + cache = '' + outdir = 'results' + fastq_input = '' + illumina_suffixes = ['*_R{1,2}_001', '*_R{1,2}', '*_{1,2}' ] + fastq_exts = ['.fastq.gz', '.fq.gz', '.fastq', '.fq'] + fastq_search_path = makeFastqSearchPath( illumina_suffixes, fastq_exts ) + primer_path = "${baseDir}/assets/" + primers = "${baseDir}/assets/primers.fa" + rev_primers = "${baseDir}/assets/primers_rev_comp.fa" + pipeline_short_name = 
parsePipelineName(manifest.toMap().get('name')) + pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) + run_name = parseRunName( fastq_input ) + versioned_outdir = '' + min_depth = '10' + min_q = '30' + min_cov = '25' + min_ident = '95' + keep_interfiles = false + h1_dataset = '' + h3_dataset = '' + h5_dataset = '' + db = 'NO_FILE' + blastx_subtype_db = "${projectDir}/assets/blastx/blastx_subtype_db.fasta" + genoflu_cache = "${projectDir}/assets/genoflu" + genoflu_github_url = 'https://github.com/USDA-VS/GenoFLU/' + genoflu_version = "LATEST" } def makeFastqSearchPath ( illumina_suffixes, fastq_exts ) { @@ -67,23 +68,24 @@ def parsePipelineName(name) { } profiles { - conda { - process.conda = "$baseDir/environments/main.yml" - if (params.cache) { - conda.cacheDir = params.cache + conda { + conda.enabled = true + process.conda = "$baseDir/environments/main.yml" + if (params.cache) { + conda.cacheDir = params.cache + } + conda.useMamba = true } - conda.useMamba = true - } } process { withName: FluViewer { - cpus = 8 - memory = '32 GB' + cpus = 4 + memory = '2 GB' } withName: cutadapt { - cpus = 8 + cpus = 4 } withName: CLADE_CALLING { @@ -105,4 +107,4 @@ timeline { trace { enabled = true file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_trace.txt" -} \ No newline at end of file +} From 1385c310c9582a8e1368de5171062d13a676d83a Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 11:54:51 -0700 Subject: [PATCH 02/24] troubleshooting provenance generation --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 87bb499..fa65f4c 100644 --- a/main.nf +++ b/main.nf @@ -115,9 +115,9 @@ workflow { ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(FluViewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + // ch_provenance = ch_provenance.join(FluViewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + // ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + // ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } collect_provenance(ch_provenance) } From e048c1d2eaeb45210e3b88c4cd387ad47548fec0 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 11:56:26 -0700 Subject: [PATCH 03/24] Add testing workflow --- .github/workflows/tests.yml | 48 +++++++++++++++++++++++++++++++++++++ .gitignore | 4 +++- 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..e0ba675 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,48 @@ +on: + pull_request: + branches: + - main + push: + branches: + - main + workflow_dispatch: +name: Tests +jobs: + test: + strategy: + fail-fast: false + matrix: + nextflow_version: ["21.04.3", "23.10.1"] + name: Run tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@master + - 
name: Create Artifacts Directory + run: mkdir artifacts + - name: Install Miniconda + run: bash .github/scripts/install_conda.sh + - name: Install Nextflow + env: + NXF_VER: ${{ matrix.nextflow_version }} + run: bash .github/scripts/install_nextflow.sh + - name: Create ART Read-Simulation Environment + run: bash .github/scripts/create_art_environment.sh + - name: Download Assemblies + run: bash .github/scripts/download_assemblies.sh + - name: Simulate Reads + run: bash .github/scripts/simulate_reads.sh + - name: Run Pipeline + run: bash .github/scripts/run_pipeline.sh + - name: Create Output Checking Environment + run: bash .github/scripts/create_output_checking_environment.sh + - name: Check Outputs + run: bash .github/scripts/check_outputs.sh + - name: Prepare Artifacts + if: always() + run: bash .github/scripts/prepare_artifacts.sh + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: artifacts-BCCDC-PHL-fluviewer-nf-nextflow-v${{ matrix.nextflow_version }}-${{ github.run_id }}.${{ github.run_attempt }} + path: artifacts diff --git a/.gitignore b/.gitignore index 84620e3..fa42e0c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,8 +3,10 @@ .github/data/fluviewer_db .nextflow* work -test* +test_input +test_output test_data/ +./test* ref/ input_test/ output_test/ From 953b2558f271bdb82916f307c4b3bf8228009605 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 12:01:33 -0700 Subject: [PATCH 04/24] make conda (and mamba?) available when running the pipeline --- .github/scripts/run_pipeline.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 65ecc6f..82708a2 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -2,6 +2,10 @@ set -eo pipefail +source /home/analysis/.bashrc + +eval "$(conda shell.bash hook)" + sed -i 's/cpus = 8/cpus = 4/g' nextflow.config sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config From ecb4da128b5396184ef9288821c8bd88876a317d Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 12:03:29 -0700 Subject: [PATCH 05/24] fix home dir path --- .github/scripts/run_pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 82708a2..76d0abe 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -2,7 +2,7 @@ set -eo pipefail -source /home/analysis/.bashrc +source ${HOME}/.bashrc eval "$(conda shell.bash hook)" From 11ed458293aa2730828220cbb59c8b095717f660 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 12:31:24 -0700 Subject: [PATCH 06/24] add another sample to simulate, this one directly from FluViewer db --- .github/data/reads_to_simulate.csv | 1 + .github/scripts/download_assemblies.sh | 1 + .github/scripts/run_pipeline.sh | 2 ++ 3 files changed, 4 insertions(+) diff --git a/.github/data/reads_to_simulate.csv b/.github/data/reads_to_simulate.csv index 283cbd2..75255d7 100644 --- a/.github/data/reads_to_simulate.csv +++ b/.github/data/reads_to_simulate.csv @@ -1,2 +1,3 @@ +HQ011408.1,.github/data/assemblies/HQ011408.1.fa NC026423.1,.github/data/assemblies/NC_026423.1.fa NC026431.1,.github/data/assemblies/NC_026431.1.fa diff --git a/.github/scripts/download_assemblies.sh b/.github/scripts/download_assemblies.sh index b9b618a..8db2f56 100755 --- a/.github/scripts/download_assemblies.sh +++ b/.github/scripts/download_assemblies.sh @@ -2,5 +2,6 @@ mkdir -p 
.github/data/assemblies +curl -o .github/data/assemblies/HQ011408.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=HQ011408.1&db=nucleotide&rettype=fasta" curl -o .github/data/assemblies/NC_026423.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026423.1&db=nucleotide&rettype=fasta" curl -o .github/data/assemblies/NC_026431.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026431.1&db=nucleotide&rettype=fasta" diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 76d0abe..0bd5c23 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -6,6 +6,8 @@ source ${HOME}/.bashrc eval "$(conda shell.bash hook)" +conda activate base + sed -i 's/cpus = 8/cpus = 4/g' nextflow.config sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config From ac72d6d756b9ed65b2367594d1f25b34b930dc0b Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 12:33:49 -0700 Subject: [PATCH 07/24] Add another sample to simulate from FluViewer db --- .github/data/reads_to_simulate.csv | 1 + .github/scripts/download_assemblies.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/data/reads_to_simulate.csv b/.github/data/reads_to_simulate.csv index 75255d7..3737436 100644 --- a/.github/data/reads_to_simulate.csv +++ b/.github/data/reads_to_simulate.csv @@ -1,3 +1,4 @@ HQ011408.1,.github/data/assemblies/HQ011408.1.fa +CY014984.1,.github/data/assemblies/CY014984.1.fa NC026423.1,.github/data/assemblies/NC_026423.1.fa NC026431.1,.github/data/assemblies/NC_026431.1.fa diff --git a/.github/scripts/download_assemblies.sh b/.github/scripts/download_assemblies.sh index 8db2f56..d149c3d 100755 --- a/.github/scripts/download_assemblies.sh +++ b/.github/scripts/download_assemblies.sh @@ -3,5 +3,6 @@ mkdir -p .github/data/assemblies curl -o .github/data/assemblies/HQ011408.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=HQ011408.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/CY014984.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=CY014984.1&db=nucleotide&rettype=fasta" curl -o .github/data/assemblies/NC_026423.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026423.1&db=nucleotide&rettype=fasta" curl -o .github/data/assemblies/NC_026431.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026431.1&db=nucleotide&rettype=fasta" From f862e72c55eca43ad5d88b77fae4766d4e9a3ed8 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 13:13:20 -0700 Subject: [PATCH 08/24] Build multi-segment ref to simulate reads from --- .github/data/reads_to_simulate.csv | 5 +---- .github/scripts/download_assemblies.sh | 14 ++++++++++---- .github/scripts/simulate_reads.sh | 2 +- .gitignore | 2 +- bin/__pycache__/tools.cpython-310.pyc | Bin 3119 -> 0 bytes 5 files changed, 13 insertions(+), 10 deletions(-) delete mode 100644 bin/__pycache__/tools.cpython-310.pyc diff --git a/.github/data/reads_to_simulate.csv b/.github/data/reads_to_simulate.csv index 3737436..7854e7b 100644 --- a/.github/data/reads_to_simulate.csv +++ b/.github/data/reads_to_simulate.csv @@ -1,4 +1 @@ -HQ011408.1,.github/data/assemblies/HQ011408.1.fa -CY014984.1,.github/data/assemblies/CY014984.1.fa -NC026423.1,.github/data/assemblies/NC_026423.1.fa -NC026431.1,.github/data/assemblies/NC_026431.1.fa +MK58361X-H3N2,.github/data/assemblies/MK58361X-H3N2.fa diff --git 
a/.github/scripts/download_assemblies.sh b/.github/scripts/download_assemblies.sh index d149c3d..232d43a 100755 --- a/.github/scripts/download_assemblies.sh +++ b/.github/scripts/download_assemblies.sh @@ -2,7 +2,13 @@ mkdir -p .github/data/assemblies -curl -o .github/data/assemblies/HQ011408.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=HQ011408.1&db=nucleotide&rettype=fasta" -curl -o .github/data/assemblies/CY014984.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=CY014984.1&db=nucleotide&rettype=fasta" -curl -o .github/data/assemblies/NC_026423.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026423.1&db=nucleotide&rettype=fasta" -curl -o .github/data/assemblies/NC_026431.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_026431.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583610.1_segment_1_PB2_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583610.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583611.1_segment_2_PB1_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583611.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583612.1_segment_3_PA_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583612.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583613.1_segment_4_HA_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583613.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583614.1_segment_5_NP_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583614.1&db=nucleotide&rettype=fasta" +curl -o .github/data/assemblies/MK583615.1_segment_6_NA_H3N2.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=MK583615.1&db=nucleotide&rettype=fasta" + +cat .github/data/assemblies/MK58361*.fa > .github/data/assemblies/MK58361X-H3N2.fa + +rm .github/data/assemblies/MK58361*.1_segment_*.fa diff --git a/.github/scripts/simulate_reads.sh b/.github/scripts/simulate_reads.sh index 41b9482..0689e4e 100755 --- a/.github/scripts/simulate_reads.sh +++ b/.github/scripts/simulate_reads.sh @@ -13,7 +13,7 @@ while IFS=',' read -r sample_id assembly; do art_illumina \ --paired \ --in ${assembly} \ - --fcov 100 \ + --fcov 500 \ --len 150 \ --mflen 400 \ --sdev 100 \ diff --git a/.gitignore b/.gitignore index fa42e0c..2fb2cb1 100644 --- a/.gitignore +++ b/.gitignore @@ -12,5 +12,5 @@ input_test/ output_test/ Validation_notes.md .Rproj.user -__pycache__/ +*/__pycache__/*.pyc assets/genoflu/GenoFLU \ No newline at end of file diff --git a/bin/__pycache__/tools.cpython-310.pyc b/bin/__pycache__/tools.cpython-310.pyc deleted file mode 100644 index a9c443bef6bed7562cfce5cbcff9c07b61da1d07..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3119 zcmb7G&5s;M74PbgneO&@y$}mI1fiBzmiw4 z#ty!nzdy{{D~|II`f>5|@Z$qC{Rs$nI4hjtvS&kv-)`Z~y`jgJbN(=}dpg4ozs7wY zJZHl$-{ldHF$(z=-h1v0BYu_dppQ9wJ7;~DaaldfXohDkiO}Oxy&cZ*w);LF<#v66echPszhv?B7zi~4U zdUw;mGRKLd!@8W9T$YKENtTy9F*A|Or8Y@gRZ3PWPmM^@d0xsSo#a?oX{n3U@{CUt zBecoOX_6KNSd|co5-F#dOr>+7_v7fk&@hYutHk0&sA{9^M z_()6)aeORHtxCNTC~H&IMkgu$6H`}(crT8kz2x3&yvanCCe2Gap6)S*a^I%=Qbd{L zr4du1(DQtPYh7gBx(XzwCOLr?wZ>&KskUr!>!-JqpWRL{d#C>$vdelt#)&dZw2>8* z<`S39;e%POmr7r>@+5_79BzoC2QUAHZvxM4J_>PakI?jO5aTom9L|2=ELg+1`-P{j 
zbBt~{2oZ;SjeFhsrL({;gn|ET4rL$JRV9_Zdv+lnDkYwX(p+%XS}k#_pKNw!O3rbK zc~z`TpN`?Oq?F~}ye>=*hv_|;?fo1ly(Co?>EAL+8m|y#-)lWBj$3ao`7kgt<@_&> zim>OK-wyOFRbnRRVsIp9<$jeGbD;+NrFd$xLY@p#t%cEp4-XFRAM8E+^+$ulqW&}& zCqj+NY%tEtfswM%`_*adRVjZ1QZ;2|&ktG1ddz3f-pUH`bhNqJ{@p@O(n7zF!?^OT zs_*ur)-9{n`%Rg(0Wx4J+GxYk1`}DArVWrS=-iyQeu?03uW+F!Dz7M6`th(+nh{+z z;9^`)+XzWDvYf+^B9+#Lu^rjmQK)DzQ*Qapyc%}W)b0tcj0wW@(VN9O{%8CjJ9XrIIe{{-Jb0G>c=>07_Maw zgQp~@?WR<>g}Q9a9_ol0triDh)jJ>|>ummfrfy$^WE^Xl83tq+&QQ0R9l02} z+%rhUGd6W$Fm?{3VHTBnH3}ME-NX*y17Sepaqq6v1nSO`3p7}++<>vUk2NxD@ZeeS z`_J}1|H-p}=uMktFDcdfe(S<|UCPnC5Q=ncyHxE_wfh}ppHah@%#4iB@~6wOt}~2V zPYK=ng~$-!Dxc2GWjpPV{gLsIrgEWw1>yQSNQ{u`xkP=oi*^G|?O|ankh+*4eXPAc z!hp)xIl~>%X~n^}bnt6hERDdi@NL@s z&mvya46(C-+EtODrZ9u$Bp5z=D zyz%7Pix!Ihn;-X+I3n%f5rQTd=9MbIUa*$Q1mMNxiG32uvcMb1GEC`}gfO881K1H4 z^d68;geuZXCnvd?UAo2Ey|?McH-RXKi(K^t>)`a`cIYtLgGU7 zWP)%UwRAZ>r1}An2t~6EtR&S(f+YA|)GsjCXNP@H{g6(6i^z{??n>)Qjp}m>g49_; zL4em$b*iq?u6KwKD%m<~b+*zjC_>K!Wx+4J3(Tks@jjJ2y$iDI(D{HiH|~c~Z|C3m C5{SqE From aebf1ca66df28f4f5349004a963cb3627695f863 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:15:08 -0700 Subject: [PATCH 09/24] touching up provenance --- modules/cutadapt.nf | 9 ++++++--- modules/fastp.nf | 12 ++++++------ modules/fastqc.nf | 12 +++++++----- nextflow.config | 31 +++++++++++++++---------------- 4 files changed, 34 insertions(+), 30 deletions(-) diff --git a/modules/cutadapt.nf b/modules/cutadapt.nf index fe10ccb..90666e6 100644 --- a/modules/cutadapt.nf +++ b/modules/cutadapt.nf @@ -12,8 +12,11 @@ process cutadapt { script: """ - printf -- "- process_name: cutadapt\\n" > ${sample_id}_cutadapt_provenance.yml - printf -- " tool_name: cutadapt\\n tool_version: \$(cutadapt --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_cutadapt_provenance.yml + printf -- "- process_name: cutadapt\\n" >> ${sample_id}_cutadapt_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_cutadapt_provenance.yml + printf -- " - tool_name: cutadapt\\n" >> ${sample_id}_cutadapt_provenance.yml + printf -- " tool_version: \$(cutadapt --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_cutadapt_provenance.yml + cutadapt \ -j ${task.cpus} \ -a file:${params.primers} \ @@ -26,4 +29,4 @@ process cutadapt { ${sample_id}_R2.trim.fastq.gz \ > ${sample_id}.cutadapt.log """ -} \ No newline at end of file +} diff --git a/modules/fastp.nf b/modules/fastp.nf index 7d522df..999e6ee 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -15,8 +15,11 @@ process fastp { script: """ - printf -- "- process_name: fastp\\n" > ${sample_id}_fastp_provenance.yml - printf -- " tool_name: fastp\\n tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_fastp_provenance.yml + printf -- "- process_name: fastp\\n" >> ${sample_id}_fastp_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_fastp_provenance.yml + printf -- " - tool_name: fastp\\n" >> ${sample_id}_fastp_provenance.yml + printf -- " tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_fastp_provenance.yml + fastp \ -t ${task.cpus} \ -i ${reads_1} \ @@ -32,16 +35,13 @@ process fastp { } -//printf -- "- process_name: fastp\\n" > ${sample_id}_fastp_provenance.yml -//printf -- " tool_name: fastp\\n tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_fastp_provenance.yml - process fastp_json_to_csv { tag { sample_id } executor 'local' - publishDir params.versioned_outdir ? 
"${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}" : "${params.outdir}/${sample_id}", pattern: "${sample_id}_fastp.csv", mode: 'copy' + publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}_fastp.csv", mode: 'copy' input: tuple val(sample_id), path(fastp_json) diff --git a/modules/fastqc.nf b/modules/fastqc.nf index 82f2ab6..3413c90 100644 --- a/modules/fastqc.nf +++ b/modules/fastqc.nf @@ -1,4 +1,4 @@ -process FASTQC { +process fastqc { tag { sample_id } @@ -13,9 +13,13 @@ process FASTQC { script: """ + printf -- "- process_name: fastqc\\n" >> ${sample_id}_fastqc_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_fastqc_provenance.yml + printf -- " - tool_name: fastqc\\n" >> + printf -- " tool_version: \$(fastqc --version 2>&1 | sed -n '1 p')\\n" >> ${sample_id}_fastqc_provenance.yml + mkdir -p ./tmp - printf -- "- process_name: fastqc\\n" > ${sample_id}_fastqc_provenance.yml - printf -- " tool_name: fastqc\\n tool_version: \$(fastqc --version 2>&1 | sed -n '1 p')\\n" >> ${sample_id}_fastqc_provenance.yml + fastqc \ --threads ${task.cpus} \ --dir ./tmp \ @@ -24,5 +28,3 @@ process FASTQC { """ } - //removed from cutadapt script - //--json=${sample_id}.cutadapt.json diff --git a/nextflow.config b/nextflow.config index e93b216..56c58de 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,7 +21,6 @@ params { pipeline_short_name = parsePipelineName(manifest.toMap().get('name')) pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) run_name = parseRunName( fastq_input ) - versioned_outdir = '' min_depth = '10' min_q = '30' min_cov = '25' @@ -79,19 +78,19 @@ profiles { } process { - withName: FluViewer { - cpus = 4 - memory = '2 GB' - } + withName: FluViewer { + cpus = 8 + memory = '32 GB' + } - withName: cutadapt { - cpus = 4 - } + withName: cutadapt { + cpus = 8 + } - withName: CLADE_CALLING { - conda = "$baseDir/environments/nextclade.yml" - cpus = 4 - } + withName: CLADE_CALLING { + conda = "$baseDir/environments/nextclade.yml" + cpus = 4 + } } report { @@ -100,11 +99,11 @@ report { } timeline { - enabled = true - file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_timeline.html" + enabled = true + file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_timeline.html" } trace { - enabled = true - file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_trace.txt" + enabled = true + file = "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${params.run_name}_trace.txt" } From 5b895e20f43e0d592af31dd005ff8892adce6919 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:29:56 -0700 Subject: [PATCH 10/24] tidying up --- main.nf | 41 +++++------ modules/FluViewer.nf | 151 +++++++++++++++++++-------------------- modules/clade_calling.nf | 4 +- modules/genoflu.nf | 7 +- modules/multiqc.nf | 1 - nextflow.config | 4 +- 6 files changed, 98 insertions(+), 110 deletions(-) diff --git a/main.nf b/main.nf index fa65f4c..7554581 100644 --- a/main.nf +++ b/main.nf @@ -3,15 +3,6 @@ /* * A nextflow wrapper for running FluViewer * ----------------------------------------- - - == V1 == -This pipeline will run FluViewer on a set of fastq files in a baseDir. -Each output will be its own directory. 
-Future versions will add in: -- fastp to remove adapters and produce QC -- multqc to read the results of this -- a script to scrape together results and produce an output csv - */ import java.time.LocalDateTime @@ -23,14 +14,14 @@ include { pipeline_provenance } from './modules/provenance.nf' include { collect_provenance } from './modules/provenance.nf' include { fastp } from './modules/fastp.nf' include { cutadapt} from './modules/cutadapt.nf' -include { FluViewer } from './modules/FluViewer.nf' +include { fluviewer } from './modules/FluViewer.nf' include { multiqc } from './modules/multiqc.nf' -include { FASTQC } from './modules/fastqc.nf' -include { CLADE_CALLING } from './modules/clade_calling.nf' -include { SNP_CALLING } from './modules/snp_calling.nf' -include { PULL_GENOFLU } from './modules/genoflu.nf' -include { CHECKOUT_GENOFLU } from './modules/genoflu.nf' -include { GENOFLU } from './modules/genoflu.nf' +include { fastqc } from './modules/fastqc.nf' +include { clade_calling } from './modules/clade_calling.nf' +include { snp_calling } from './modules/snp_calling.nf' +include { pull_genoflu } from './modules/genoflu.nf' +include { checkout_genoflu } from './modules/genoflu.nf' +include { genoflu } from './modules/genoflu.nf' // prints to the screen and to the log @@ -85,25 +76,25 @@ workflow { // Clean up reads - remove adapters (fastp) and primers (cutadapt) fastp(ch_fastq_input) cutadapt(fastp.out.trimmed_reads.combine(ch_primers)) - FASTQC(cutadapt.out.primer_trimmed_reads) + fastqc(cutadapt.out.primer_trimmed_reads) // Run FluViewer - FluViewer(cutadapt.out.primer_trimmed_reads.combine(ch_db)) + fluviewer(cutadapt.out.primer_trimmed_reads.combine(ch_db)) - //Collect al the relevant filesfor MULTIQC - ch_fastqc_collected = FASTQC.out.zip.map{ it -> [it[1], it[2]]}.collect() + //Collect al the relevant filesfor multiqc + ch_fastqc_collected = fastqc.out.zip.map{ it -> [it[1], it[2]]}.collect() multiqc(fastp.out.json.mix( cutadapt.out.log, ch_fastqc_collected ).collect().ifEmpty([]) ) //Call clades for H1 and H3 samples - CLADE_CALLING(FluViewer.out.consensus_seqs) + clade_calling(fluviewer.out.consensus_seqs) - SNP_CALLING(FluViewer.out.consensus_main, ch_reference_db) + snp_calling(fluviewer.out.consensus_main, ch_reference_db) - PULL_GENOFLU(params.genoflu_github_url) + pull_genoflu(params.genoflu_github_url) - CHECKOUT_GENOFLU(PULL_GENOFLU.out.repo, params.genoflu_version) + checkout_genoflu(pull_geoflu.out.repo, params.genoflu_version) - GENOFLU(FluViewer.out.consensus_main.combine(PULL_GENOFLU.out.repo)) + genoflu(fluviewer.out.consensus_main.combine(pull_genoflu.out.repo)) // diff --git a/modules/FluViewer.nf b/modules/FluViewer.nf index 49f7cc5..27eac7a 100644 --- a/modules/FluViewer.nf +++ b/modules/FluViewer.nf @@ -1,83 +1,83 @@ -process FluViewer { +process fluviewer { - tag { sample_id } + tag { sample_id } - memory { 50.GB * task.attempt } - errorStrategy { (task.exitStatus == 2 && task.attempt <= maxRetries) ? 'retry' : 'ignore' } - maxRetries 5 + memory { 50.GB * task.attempt } + errorStrategy { (task.exitStatus == 2 && task.attempt <= maxRetries) ? 
'retry' : 'ignore' } + maxRetries 5 - conda "${projectDir}/environments/fluviewer.yml" - - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/${sample_id}*", mode:'copy', saveAs: { filename -> filename.split("/").last() } - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/*tsv", mode:'copy', saveAs: { filename -> filename.split("/").last() } - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/spades_output", mode:'copy', saveAs: { filename -> "spades_output" } - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".*", mode:'copy' - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/logs", mode:'copy', saveAs: { filename -> "fluviewer_logs" } - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".exitcode", mode:'copy' - publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".command.*", mode:'copy' + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/${sample_id}*", mode:'copy', saveAs: { filename -> filename.split("/").last() } + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/*tsv", mode:'copy', saveAs: { filename -> filename.split("/").last() } + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/spades_output", mode:'copy', saveAs: { filename -> "spades_output" } + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".*", mode:'copy' + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: "${sample_id}_fluviewer/logs", mode:'copy', saveAs: { filename -> "fluviewer_logs" } + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".exitcode", mode:'copy' + publishDir "${params.outdir}/${params.run_name}/${params.pipeline_short_name}-v${params.pipeline_minor_version}/${sample_id}", pattern: ".command.*", mode:'copy' - input: - tuple val(sample_id), path(reads_1), path(reads_2), path(db) - - output: - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.bam"), emit: alignment - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.bam.bai"), emit: alignmentindex, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*report.tsv"), emit: reports, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_consensus.fa"), emit: consensus_seqs, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa"), emit: consensus_main - tuple 
val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_HPAI.tsv"), emit: HPAI, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_cov.png"), emit: coverage_plot, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_variants.vcf"), emit: vcf, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/logs"), emit: fluviewer_logs - tuple val(sample_id), path("${sample_id}_FluViewer_provenance.yml"), emit: provenance - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_mapping_refs.fa"), emit: ref_seqs_for_mapping, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/contigs_blast.tsv"), emit: contig_blast_results, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/spades_output"), emit: spades_results, optional: true - tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.png"), emit: depth_cov_plot, optional: true - - script: - garbage_collection = params.keep_interfiles ? '-g' : '' - - """ - printf -- "- process_name: FluViewer\\n" > ${sample_id}_FluViewer_provenance.yml - printf -- " tool_name: FluViewer\\n tool_version: \$(FluViewer | sed -n '4 p')\\n" >> ${sample_id}_FluViewer_provenance.yml - printf -- " database used: ${db}\\n" >> ${sample_id}_FluViewer_provenance.yml - printf -- " database_path: \$(readlink -f ${db})\\n" >> ${sample_id}_FluViewer_provenance.yml - printf -- " database sha256: \$(shasum -a 256 ${db}|awk '{print \$1}')\\n" >> ${sample_id}_FluViewer_provenance.yml + input: + tuple val(sample_id), path(reads_1), path(reads_2), path(db) + + output: + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.bam"), emit: alignment + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.bam.bai"), emit: alignmentindex, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*report.tsv"), emit: reports, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_consensus.fa"), emit: consensus_seqs, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa"), emit: consensus_main + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_HPAI.tsv"), emit: HPAI, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_cov.png"), emit: coverage_plot, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_variants.vcf"), emit: vcf, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/logs"), emit: fluviewer_logs + tuple val(sample_id), path("${sample_id}_FluViewer_provenance.yml"), emit: provenance + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*_mapping_refs.fa"), emit: ref_seqs_for_mapping, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/contigs_blast.tsv"), emit: contig_blast_results, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/spades_output"), emit: spades_results, optional: true + tuple val(sample_id), path("${sample_id}_fluviewer/${sample_id}*.png"), emit: depth_cov_plot, optional: true + + script: + garbage_collection = params.keep_interfiles ? 
'-g' : '' + """ + printf -- "- process_name: fluviewer\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " - tool_name: FluViewer\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " tool_version: \$(FluViewer | sed -n '4 p')\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " databases:\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " - database_name: ${db}\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " database_path: \$(readlink -f ${db})\\n" >> ${sample_id}_FluViewer_provenance.yml + printf -- " database_sha256: \$(shasum -a 256 ${db}|awk '{print \$1}')\\n" >> ${sample_id}_FluViewer_provenance.yml - EXITCODE=0 - (FluViewer \ - ${garbage_collection} \ - -T ${task.cpus} \ - -f ${reads_1} -r ${reads_2} \ - -n ${sample_id}_fluviewer \ - -d ${db} \ - -D ${params.min_depth} \ - -q ${params.min_q} \ - -i ${params.min_ident} \ - -M 40 && EXITCODE=\$?) || EXITCODE=\$? - - - echo "Extracting NA and HA consensus sequences..." - - if [ `grep "|HA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa` ]; then - grep -A1 "|HA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa > ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa - else - echo "No HA consensus sequence generated." - fi - - if [ `grep "|NA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa` ]; then - grep -A1 "|NA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa > ${sample_id}_fluviewer/${sample_id}_NA_consensus.fa - else - echo "No NA consensus sequence generated." - fi - - if [[ ! -f ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa ]]; then - echo "HA segment consensus not generated. Skipping FindCleave.py..." - else - python ${projectDir}/bin/FindCleave.py -i ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa -o ${sample_id}_fluviewer/${sample_id}_HPAI.tsv - echo "Finished running FindCleave.py." - fi + EXITCODE=0 + (FluViewer \ + ${garbage_collection} \ + -T ${task.cpus} \ + -f ${reads_1} -r ${reads_2} \ + -n ${sample_id}_fluviewer \ + -d ${db} \ + -D ${params.min_depth} \ + -q ${params.min_q} \ + -i ${params.min_ident} \ + -M 40 && EXITCODE=\$?) \ + || EXITCODE=\$? + + echo "Extracting NA and HA consensus sequences..." + + if [ `grep "|HA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa` ]; then + grep -A1 "|HA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa > ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa + else + echo "No HA consensus sequence generated." + fi + + if [ `grep "|NA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa` ]; then + grep -A1 "|NA|" ${sample_id}_fluviewer/${sample_id}*consensus_seqs.fa > ${sample_id}_fluviewer/${sample_id}_NA_consensus.fa + else + echo "No NA consensus sequence generated." + fi + + if [[ ! -f ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa ]]; then + echo "HA segment consensus not generated. Skipping FindCleave.py..." + else + python ${projectDir}/bin/FindCleave.py -i ${sample_id}_fluviewer/${sample_id}_HA_consensus.fa -o ${sample_id}_fluviewer/${sample_id}_HPAI.tsv + echo "Finished running FindCleave.py." 
+ fi echo \$EXITCODE > .exitcode @@ -89,6 +89,5 @@ process FluViewer { cp .command.* \$OUTPATH cp .exitcode \$OUTPATH exit \$EXITCODE - """ } diff --git a/modules/clade_calling.nf b/modules/clade_calling.nf index c4fab0d..8c49b17 100644 --- a/modules/clade_calling.nf +++ b/modules/clade_calling.nf @@ -1,4 +1,4 @@ -process CLADE_CALLING { +process clade_calling { conda "${projectDir}/environments/nextclade.yml" @@ -59,4 +59,4 @@ process CLADE_CALLING { printf -- " Dataset location: \$LOCATION" >> ${sample_id}_clade_provenance.yml printf -- " Dataset version: \$VERSION" >> ${sample_id}_clade_provenance.yml """ -} \ No newline at end of file +} diff --git a/modules/genoflu.nf b/modules/genoflu.nf index 2fa31e6..d768c08 100644 --- a/modules/genoflu.nf +++ b/modules/genoflu.nf @@ -1,4 +1,4 @@ -process GENOFLU { +process genoflu { tag { sample_id } @@ -29,7 +29,7 @@ process GENOFLU { """ } -process PULL_GENOFLU { +process pull_genoflu { executor 'local' storeDir "${params.genoflu_cache}" @@ -46,8 +46,7 @@ process PULL_GENOFLU { """ } -process CHECKOUT_GENOFLU { - +process checkout_genoflu { input: path(genoflu_path) diff --git a/modules/multiqc.nf b/modules/multiqc.nf index 12cfd90..9f85c0a 100644 --- a/modules/multiqc.nf +++ b/modules/multiqc.nf @@ -11,6 +11,5 @@ process multiqc { script: """ multiqc . -n ${params.run_name}_multiqc_report.html - """ } diff --git a/nextflow.config b/nextflow.config index 56c58de..f5aa97f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -78,7 +78,7 @@ profiles { } process { - withName: FluViewer { + withName: fluviewer { cpus = 8 memory = '32 GB' } @@ -87,7 +87,7 @@ process { cpus = 8 } - withName: CLADE_CALLING { + withName: clade_calling { conda = "$baseDir/environments/nextclade.yml" cpus = 4 } From da113f7f94104b4888531db908616a5eca29034d Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:31:17 -0700 Subject: [PATCH 11/24] rename snp_calling module --- modules/snp_calling.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/snp_calling.nf b/modules/snp_calling.nf index f61e241..211362a 100644 --- a/modules/snp_calling.nf +++ b/modules/snp_calling.nf @@ -1,4 +1,4 @@ -process SNP_CALLING { +process snp_calling { errorStrategy 'ignore' From e69f3395bb90caadd99edb2aa02eaf0e81f6bddc Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:37:03 -0700 Subject: [PATCH 12/24] typo geoflu -> genoflu --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7554581..0a3b0a8 100644 --- a/main.nf +++ b/main.nf @@ -92,7 +92,7 @@ workflow { pull_genoflu(params.genoflu_github_url) - checkout_genoflu(pull_geoflu.out.repo, params.genoflu_version) + checkout_genoflu(pull_genoflu.out.repo, params.genoflu_version) genoflu(fluviewer.out.consensus_main.combine(pull_genoflu.out.repo)) From 9e42cb647e4042d729504f61615db80758d4dc01 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:44:51 -0700 Subject: [PATCH 13/24] small fixes --- .gitignore | 2 +- ReadMe.md | 33 +++++++++++++++++++-------------- modules/fastqc.nf | 4 ++-- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 2fb2cb1..8867849 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ output_test/ Validation_notes.md .Rproj.user */__pycache__/*.pyc -assets/genoflu/GenoFLU \ No newline at end of file +assets/genoflu/GenoFLU diff --git a/ReadMe.md b/ReadMe.md index 4914a67..4bb726d 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -68,7 +68,7 @@ For a full list of 
optional arguments, see: https://github.com/KevinKuchinski/Fl **Example command:** ``` nextflow run BCCDC-PHL/fluviewer-nf \ - -r v0.1.0 \ + -r v0.2.2 \ -profile conda \ --cache ~/.conda/envs \ --fastq_input /path/to/your_fastqs \ @@ -127,12 +127,9 @@ Output for each run includes: For each pipeline invocation, each sample will produce a `provenance.yml` file with the following contents. Note the below is a contrived example. ```yml -- process_name: FluViewer - tool_name: FluViewer - tool_version: FluViewer v0.0.2 - database used: FluViewer_db_full_20220915.fasta - database_path: /home/{USER}/Flu/ref/FluViewer_db_full_20220915.fasta - database sha256: 55b33afa21ad44ed1e6db896cf420fae6b1524c0ad205775a1ce9dd11595905d +- pipeline_name: BCCDC-PHL/FluViewer-nf + pipeline_version: 0.2.2 + timestamp_analysis_start: 2023-11-21T05:43:25.541743 - input_filename: {Sample}_R1.fastq.gz input_path: /home/{USER{}}/Flu/test_data/test_production_run/{Sample}_R1.fastq.gz sha256: 47380e49f10374660a2061d3571efe5339401484e646c2b47896fa701dbcf0a8 @@ -140,14 +137,22 @@ For each pipeline invocation, each sample will produce a `provenance.yml` file w input_path: /home/{USER}/Flu/test_data/test_production_run/{Sample}.fastq.gz sha256: 39c95fd26af111ee9a6caeb840a7aced444b657550efea3ab7f74add0b30f69d - process_name: fastp - tool_name: fastp - tool_version: 0.23.1 + tools: + - tool_name: fastp + tool_version: 0.23.1 - process_name: cutadapt - tool_name: cutadapt - tool_version: 4.1 -- pipeline_name: BCCDC-PHL/FluViewer-nf - pipeline_version: 0.2.0 -- timestamp_analysis_start: 2023-11-21T05:43:25.541743 + tools: + - tool_name: cutadapt + tool_version: 4.1 +- process_name: fluviewer + tools: + - tool_name: FluViewer + tool_version: FluViewer v0.0.2 + databases: + - database_name: FluViewer_db_full_20220915.fasta + database_path: /home/{USER}/Flu/ref/FluViewer_db_full_20220915.fasta + database_sha256: 55b33afa21ad44ed1e6db896cf420fae6b1524c0ad205775a1ce9dd11595905d + - process_name: nextclade tool_name: nextclade tool_version: 2.9.1 diff --git a/modules/fastqc.nf b/modules/fastqc.nf index 3413c90..8fcff27 100644 --- a/modules/fastqc.nf +++ b/modules/fastqc.nf @@ -13,9 +13,9 @@ process fastqc { script: """ - printf -- "- process_name: fastqc\\n" >> ${sample_id}_fastqc_provenance.yml + printf -- "- process_name: fastqc\\n" >> ${sample_id}_fastqc_provenance.yml printf -- " tools:\\n" >> ${sample_id}_fastqc_provenance.yml - printf -- " - tool_name: fastqc\\n" >> + printf -- " - tool_name: fastqc\\n" >> ${sample_id}_fastqc_provenance.yml printf -- " tool_version: \$(fastqc --version 2>&1 | sed -n '1 p')\\n" >> ${sample_id}_fastqc_provenance.yml mkdir -p ./tmp From ff724b86ad2cf249b61f70122ce0336290755869 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:48:36 -0700 Subject: [PATCH 14/24] Disable nextflow v23 test temporarily --- .github/workflows/tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e0ba675..f306e56 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,9 @@ jobs: strategy: fail-fast: false matrix: - nextflow_version: ["21.04.3", "23.10.1"] + nextflow_version: + - "21.04.3" + # - "23.10.1" <- Failing due to 'conda.useMamba = true'. Issue is in test environment. 
Revisit name: Run tests runs-on: ubuntu-latest steps: From fc671b7cb3ffeff05a1851ef5f69aa2faae5b2f9 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 14:55:13 -0700 Subject: [PATCH 15/24] Restore fluviewer conda env directive --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index f5aa97f..4c9184d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -79,8 +79,8 @@ profiles { process { withName: fluviewer { + conda = "$baseDir/environments/fluviewer.yml" cpus = 8 - memory = '32 GB' } withName: cutadapt { From 479ed7d31133f7ff69b0027866d6c6486b91f1b0 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 15:07:08 -0700 Subject: [PATCH 16/24] updates --- .github/scripts/check_outputs.py | 6 ++++++ environments/fluviewer.yml | 2 +- main.nf | 2 +- modules/hash_files.nf | 4 +++- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/scripts/check_outputs.py b/.github/scripts/check_outputs.py index 84a7166..09a8d8b 100755 --- a/.github/scripts/check_outputs.py +++ b/.github/scripts/check_outputs.py @@ -38,6 +38,12 @@ def check_expected_files_exist(output_dir, sample_ids): """ for sample_id in sample_ids: expected_files = [ + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_alignment.bam", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_alignment.bam.bai", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_depth_of_cov.png", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_mapping_refs.fa", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_fluviewer_report.tsv", + f"fastq/fluviewer-nf-v0.2/{sample_id}/{sample_id}_genoflu.tsv", ] for expected_file in expected_files: diff --git a/environments/fluviewer.yml b/environments/fluviewer.yml index ff33eb0..a7ea83f 100644 --- a/environments/fluviewer.yml +++ b/environments/fluviewer.yml @@ -209,4 +209,4 @@ dependencies: - zlib=1.2.13=hd590300_5 - zstd=1.5.5=hfc55251_0 - pip: - - fluviewer + - FluViewer==0.1.11 diff --git a/main.nf b/main.nf index 0a3b0a8..c5f0961 100644 --- a/main.nf +++ b/main.nf @@ -106,7 +106,7 @@ workflow { ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - // ch_provenance = ch_provenance.join(FluViewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(fluviewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } // ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } // ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } collect_provenance(ch_provenance) diff --git a/modules/hash_files.nf b/modules/hash_files.nf index 6662097..e9865a0 100644 --- a/modules/hash_files.nf +++ b/modules/hash_files.nf @@ -13,7 +13,9 @@ process hash_files { """ shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > ${sample_id}_${file_type}.sha256.csv while IFS=',' read -r hash filename; do - printf -- "- input_filename: \$filename\\n input_path: \$(realpath \$filename)\\n sha256: \$hash\\n" >> ${sample_id}_${file_type}_provenance.yml + printf -- "- input_filename: \$filename\\n" >> ${sample_id}_${file_type}_provenance.yml; + printf -- " file_type: ${file_type}\\n" >> ${sample_id}_${file_type}_provenance.yml; + printf -- " sha256: 
\$hash\\n" >> ${sample_id}_${file_type}_provenance.yml; done < ${sample_id}_${file_type}.sha256.csv """ } From 232476c421321ef0e3af98ecbb82f0924802d22a Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 5 Jun 2024 15:28:41 -0700 Subject: [PATCH 17/24] organize environments --- .github/scripts/run_pipeline.sh | 3 ++- environments/{main.yml => environment.yml} | 2 +- environments/fluviewer.yml | 2 +- environments/nextclade.yml | 4 ++-- main.nf | 5 +++-- modules/clade_calling.nf | 2 +- modules/snp_calling.nf | 12 ++++++++++-- 7 files changed, 20 insertions(+), 10 deletions(-) rename environments/{main.yml => environment.yml} (99%) diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 0bd5c23..811ea05 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -9,7 +9,8 @@ eval "$(conda shell.bash hook)" conda activate base sed -i 's/cpus = 8/cpus = 4/g' nextflow.config -sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config +sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config +sed -i 'memory { 50.GB * task.attempt }//g' modules/FluViewer.nf nextflow run main.nf \ -profile conda \ diff --git a/environments/main.yml b/environments/environment.yml similarity index 99% rename from environments/main.yml rename to environments/environment.yml index 769a02b..c9df87d 100644 --- a/environments/main.yml +++ b/environments/environment.yml @@ -1,4 +1,4 @@ -name: FluViewer-nf +name: fluviewer-nf channels: - conda-forge - bioconda diff --git a/environments/fluviewer.yml b/environments/fluviewer.yml index a7ea83f..7b52e30 100644 --- a/environments/fluviewer.yml +++ b/environments/fluviewer.yml @@ -1,4 +1,4 @@ -name: FluViewer +name: fluviewer-nf-FluViewer channels: - conda-forge - bioconda diff --git a/environments/nextclade.yml b/environments/nextclade.yml index d9e51da..ac031d5 100644 --- a/environments/nextclade.yml +++ b/environments/nextclade.yml @@ -1,7 +1,7 @@ -name: FluViewer-nf +name: fluviewer-nf-nextclade channels: - conda-forge - bioconda - defaults dependencies: -- nextclade=2.9.1 \ No newline at end of file +- nextclade=2.9.1 diff --git a/main.nf b/main.nf index c5f0961..fe85487 100644 --- a/main.nf +++ b/main.nf @@ -107,8 +107,9 @@ workflow { ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(cutadapt.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(fluviewer.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - // ch_provenance = ch_provenance.join(CLADE_CALLING.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - // ch_provenance = ch_provenance.join(GENOFLU.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(clade_calling.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(snp_calling.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(genoflu.out.provenance).map{ it -> [it[0], it[1] << it[2]] } collect_provenance(ch_provenance) } diff --git a/modules/clade_calling.nf b/modules/clade_calling.nf index 8c49b17..9ce1cef 100644 --- a/modules/clade_calling.nf +++ b/modules/clade_calling.nf @@ -14,7 +14,7 @@ process clade_calling { output: tuple val(sample_id), path("*nextclade*"), emit: nextclade, optional: true - tuple val(sample_id), path("${sample_id}_clade_provenance.yml"), emit: provenance + tuple val(sample_id), path("${sample_id}_clade_provenance.yml"), emit: provenance, optional: 
From f9769059ff5d21b866078d656f44fdc09b76ba42 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 15:29:46 -0700
Subject: [PATCH 18/24] fix sed cmd

---
 .github/scripts/run_pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index 811ea05..e3b2a59 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -10,7 +10,7 @@ conda activate base
 
 sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
 sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config
-sed -i 'memory { 50.GB * task.attempt }//g' modules/FluViewer.nf
+sed -i 's/memory { 50.GB * task.attempt }//g' modules/FluViewer.nf
 
 nextflow run main.nf \
     -profile conda \

From 332c74595ef4b87aa75528740e4791d2d6dbe6ec Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 15:31:08 -0700
Subject: [PATCH 19/24] fix env path

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 4c9184d..af4b043 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -69,7 +69,7 @@ def parsePipelineName(name) {
 profiles {
     conda {
         conda.enabled = true
-        process.conda = "$baseDir/environments/main.yml"
+        process.conda = "$baseDir/environments/environment.yml"
         if (params.cache) {
             conda.cacheDir = params.cache
         }

From fd10afb818135919ba8042dc2897dd4cc21b736e Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 15:55:11 -0700
Subject: [PATCH 20/24] updates

---
 .github/scripts/run_pipeline.sh | 10 ++++++----
 modules/FluViewer.nf            |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index e3b2a59..49f0cf7 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -8,10 +8,12 @@ eval "$(conda shell.bash hook)"
 
 conda activate base
 
-sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
-sed -i "s/memory = '32 GB'/memory = '2 GB'/g" nextflow.config
-sed -i 's/memory { 50.GB * task.attempt }//g' modules/FluViewer.nf
-
+# Check for a sign that we're in the GitHub Actions environment.
+# Prevents these settings from being applied in other environments.
+if [ -n "${GITHUB_ACTIONS}" ]; then
+    sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
+    sed -i '/memory/d' modules/FluViewer.nf
+fi
 nextflow run main.nf \
     -profile conda \
     --cache ${HOME}/.conda/envs \

diff --git a/modules/FluViewer.nf b/modules/FluViewer.nf
index 27eac7a..2c95f38 100644
--- a/modules/FluViewer.nf
+++ b/modules/FluViewer.nf
@@ -2,7 +2,7 @@ process fluviewer {
 
     tag { sample_id }
 
-    memory { 50.GB * task.attempt }
+    memory { 50.GB * task.attempt }
     errorStrategy { (task.exitStatus == 2 && task.attempt <= maxRetries) ? 'retry' : 'ignore' }
     maxRetries 5
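The guard introduced in patch 20 keys off `GITHUB_ACTIONS`, which GitHub Actions sets to `true` in every runner step, so a non-empty test is a reasonable CI detector; a minimal standalone sketch (echo text illustrative):

```bash
#!/bin/bash

# GITHUB_ACTIONS is "true" on GitHub Actions runners and normally
# unset elsewhere, so -n distinguishes CI runs from local runs.
if [ -n "${GITHUB_ACTIONS}" ]; then
    echo "Running under GitHub Actions; applying reduced CI resource limits."
fi
```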
From dc0b34133418529d91a203d844908d56edc361bb Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 16:49:48 -0700
Subject: [PATCH 21/24] update

---
 .github/scripts/run_pipeline.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index 49f0cf7..3fd8d12 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -10,10 +10,14 @@ conda activate base
 
 # Check for a sign that we're in the GitHub Actions environment.
 # Prevents these settings from being applied in other environments.
-if [ -n "${GITHUB_ACTIONS}" ]; then
+if [ -z "${GITHUB_ACTIONS}" ]; then
+    echo "Not in GitHub Actions environment. Will not modify nextflow.config or FluViewer.nf."
+else
+    echo "In GitHub Actions environment. Modifying nextflow.config and FluViewer.nf."
     sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
     sed -i '/memory/d' modules/FluViewer.nf
 fi
+
 nextflow run main.nf \
     -profile conda \
     --cache ${HOME}/.conda/envs \

From ea26a15cc7b0a8c69bf6773a10c7982a251bdcb0 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 17:22:54 -0700
Subject: [PATCH 22/24] collect nextflow log

---
 .github/scripts/run_pipeline.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index 3fd8d12..e66fdbf 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -23,4 +23,5 @@ nextflow run main.nf \
     --cache ${HOME}/.conda/envs \
     --fastq_input .github/data/fastq \
     --db .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa \
-    --outdir .github/data/test_output
+    --outdir .github/data/test_output \
+    -with-log artifacts/nextflow.log

From ea6bf486f1d14dc9a6fd32940bb611f4e3208ad3 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 17:25:51 -0700
Subject: [PATCH 23/24] collect nextflow log

---
 .github/scripts/run_pipeline.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
index e66fdbf..37e1360 100755
--- a/.github/scripts/run_pipeline.sh
+++ b/.github/scripts/run_pipeline.sh
@@ -18,10 +18,10 @@ else
     sed -i '/memory/d' modules/FluViewer.nf
 fi
 
-nextflow run main.nf \
+nextflow -log artifacts/nextflow.log \
+    run main.nf \
     -profile conda \
     --cache ${HOME}/.conda/envs \
     --fastq_input .github/data/fastq \
     --db .github/data/fluviewer_db/FluViewer_db_v_0_1_8.fa \
-    --outdir .github/data/test_output \
-    -with-log artifacts/nextflow.log
+    --outdir .github/data/test_output
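Patches 22 and 23 together relocate the log destination: `-with-log` is apparently not a recognized `nextflow run` option (hence the follow-up fix), whereas `-log` is a top-level CLI option and must precede the subcommand. The general shape, with paths illustrative:

```bash
# Top-level options such as -log go before the subcommand;
# run-scoped options such as -profile go after it.
nextflow -log artifacts/nextflow.log \
    run main.nf \
    -profile conda
```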
From 458dd7baa2636febcd80b91c8a32d37bafe259a6 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Wed, 5 Jun 2024 17:53:09 -0700
Subject: [PATCH 24/24] tidy up nextclade provenance

---
 modules/clade_calling.nf | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/modules/clade_calling.nf b/modules/clade_calling.nf
index 9ce1cef..d6be48d 100644
--- a/modules/clade_calling.nf
+++ b/modules/clade_calling.nf
@@ -14,11 +14,16 @@ process clade_calling {
 
     output:
     tuple val(sample_id), path("*nextclade*"), emit: nextclade, optional: true
-    tuple val(sample_id), path("${sample_id}_clade_provenance.yml"), emit: provenance, optional: true
+    tuple val(sample_id), path("${sample_id}_clade_calling_provenance.yml"), emit: provenance, optional: true
 
     script:
-    """
+    """
+    printf -- "- process_name: nextclade\\n" >> ${sample_id}_clade_calling_provenance.yml
+    printf -- "  tools:\\n" >> ${sample_id}_clade_calling_provenance.yml
+    printf -- "    - tool_name: nextclade\\n" >> ${sample_id}_clade_calling_provenance.yml
+    printf -- "      tool_version: \$(nextclade --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_clade_calling_provenance.yml
+    printf -- "      subcommand: run\\n" >> ${sample_id}_clade_calling_provenance.yml
+
     [ ! -f ${sample_id}_HA_consensus.fa ] && ln -sf *HA_consensus.fa ${sample_id}_HA_consensus.fa
 
     FOUND=true
@@ -54,9 +59,5 @@ process clade_calling {
         VERSION="NONE_INVALID_HA_TYPE\\n"
     fi
 
-    printf -- "- process_name: nextclade\\n" > ${sample_id}_clade_provenance.yml
-    printf -- "  tool_name: nextclade\\n  tool_version: \$(nextclade --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_clade_provenance.yml
-    printf -- "  Dataset location: \$LOCATION" >> ${sample_id}_clade_provenance.yml
-    printf -- "  Dataset version: \$VERSION" >> ${sample_id}_clade_provenance.yml
+
     """
 }
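With the output declaration and the `printf` block agreeing on `_clade_calling_provenance.yml`, and assuming nextclade 2.9.1 as pinned in `environments/nextclade.yml`, the tidied provenance fragment should come out as:

```yml
- process_name: nextclade
  tools:
    - tool_name: nextclade
      tool_version: 2.9.1
      subcommand: run
```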