nf-core · yuukiiwa · Jun 30, 2023 · Sep 25, 2023 · Oct 17, 2023 · Oct 17, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -49,10 +49,12 @@ jobs:
     strategy:
       matrix:
         profiles:
-          - "test_nodx_vc"
-          - "test_nodx_stringtie"
-          - "test_nodx_noaln"
-          - "test_nodx_rnamod"
+          - "test_bc_nodx"
+          - "test_nobc_dx"
+          - "test_nobc_nodx_vc"
+          - "test_nobc_nodx_stringtie"
+          - "test_nobc_nodx_noaln"
+          - "test_nobc_nodx_rnamod"
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v3

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -101,7 +101,7 @@ def check_samplesheet(file_in, updated_path, file_out):
                     barcode = "barcode%s" % (barcode.zfill(2))
 
             ## Check input file extension
-            nanopolish_fast5 = ""
+            fast5 = ""
             if input_file:
                 if input_file.find(" ") != -1:
                     print_error("Input file contains spaces!", "Line", line)
@@ -115,12 +115,12 @@ def check_samplesheet(file_in, updated_path, file_out):
                     if updated_path != "not_changed":
                         input_file = "/".join([updated_path, input_file.split("/")[-1]])
                     list_dir = os.listdir(input_file)
-                    nanopolish_fast5 = input_file
+                    fast5 = input_file
                     if not (all(fname.endswith(".fast5") for fname in list_dir)):
                         if "fast5" in list_dir and "fastq" in list_dir:
-                            nanopolish_fast5 = input_file + "/fast5"
+                            fast5 = input_file + "/fast5"
                             ## CHECK FAST5 DIRECTORY
-                            if not (all(fname.endswith(".fast5") for fname in os.listdir(nanopolish_fast5))):
+                            if not (all(fname.endswith(".fast5") for fname in os.listdir(fast5))):
                                 print_error("fast5 directory contains non-fast5 files.")
                             ## CHECK PROVIDED BASECALLED FASTQ
                             fastq_path = input_file + "/fastq"
@@ -139,8 +139,8 @@ def check_samplesheet(file_in, updated_path, file_out):
                                 '{input_file} path does not end with ".fastq.gz", ".fq.gz", or ".bam" and is not an existing directory with correct fast5 and/or fastq inputs.'
                             )
 
-            ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, nanopolish_fast5 ]}}
-            sample_info = [barcode, input_file, nanopolish_fast5]
+            ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, fast5 ]}}
+            sample_info = [barcode, input_file, fast5]
             if group not in sample_info_dict:
                 sample_info_dict[group] = {}
             if replicate not in sample_info_dict[group]:
@@ -161,7 +161,7 @@ def check_samplesheet(file_in, updated_path, file_out):
         out_dir = os.path.dirname(file_out)
         make_dir(out_dir)
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "barcode", "reads", "nanopolish_fast5"]) + "\n")
+            fout.write(",".join(["sample", "barcode", "reads", "fast5"]) + "\n")
             for sample in sorted(sample_info_dict.keys()):
                 ## Check that replicate ids are in format 1..<NUM_REPS>
                 uniq_rep_ids = set(sample_info_dict[sample].keys())

diff --git a/conf/test.config b/conf/test.config
@@ -1,33 +1,40 @@
 /*
- * -------------------------------------------------
- *  Nextflow config file for running tests
- * -------------------------------------------------
- * Defines bundled input files and everything required
- * to run a fast and simple test. Use as follows:
- *   nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
- */
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
 
 params {
     config_profile_name        = 'Test profile'
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
-    // Limit resources
-    max_cpus            = 2
-    max_memory          = 6.GB
-    max_time            = 12.h
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '12.h'
 
-    // Input data to perform demultipexing
-    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
-    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
-    gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
-    run_nanolyse        = true
-    protocol            = 'DNA'
+    // Input data to perform both basecalling and demultiplexing
+    input               = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS109'
     barcode_kit         = 'NBD103/NBD104'
-    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
-    skip_bigwig         = true
-    skip_bigbed         = true
+    trim_barcodes       = true
+    dorado_model        = 'dna_r9.4.1_e8_hac@v3.3' // NOTE(review): reconstructed from garbled text; confirm the model name
+    dorado_device       = 'cpu'
+    run_nanolyse        = true
     skip_quantification = true
     skip_fusion_analysis= true
     skip_modification_analysis=true
-    aligner             = 'graphmap2'
+
+    // This variable is just for reference and isn't actually required for the tests
+    // Files are downloaded and staged using the "GetTestData" process
+    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded_multi/'
 }
diff --git a/conf/test_bc_nodx.config b/conf/test_bc_nodx.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/nanoseq -profile test_bc_nodx,<docker/singularity>
+ */
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus            = 2
+    max_memory          = 6.GB
+    max_time            = 12.h
+
+    // Input data to perform basecalling and to skip demultiplexing
+    input               = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_nodx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS108'
+    dorado_model        = 'dna_r9.4.1_e8_hac@v3.3' // NOTE(review): reconstructed from garbled text; confirm the model name
+    dorado_device       = 'cpu'
+    skip_bigbed         = true
+    skip_bigwig         = true
+    skip_demultiplexing = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+}
diff --git a/conf/test_nobc_dx.config b/conf/test_nobc_dx.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
+ */
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources
+    max_cpus            = 2
+    max_memory          = 6.GB
+    max_time            = 12.h
+
+    // Input data to perform demultiplexing
+    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
+    gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
+    skip_basecalling    = true
+    run_nanolyse        = true
+    protocol            = 'DNA'
+    barcode_kit         = 'NBD103/NBD104'
+    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
+    skip_bigwig         = true
+    skip_bigbed         = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+}
diff --git a/conf/test_nodx_noaln.config → conf/test_nobc_nodx_noaln.config b/conf/test_nodx_noaln.config → conf/test_nobc_nodx_noaln.config
@@ -20,6 +20,7 @@ params {
     input                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_noaln.csv'
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17550000.fa'
     gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17500000.gtf'
+    skip_basecalling    = true
     protocol              = 'directRNA'
     skip_demultiplexing   = true
     skip_alignment        = true

diff --git a/conf/test_nodx_rnamod.config → conf/test_nobc_nodx_rnamod.config b/conf/test_nodx_rnamod.config → conf/test_nobc_nodx_rnamod.config
@@ -20,6 +20,7 @@ params {
     input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_rnamod.csv'
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.fa'
     gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.gtf'
+    skip_basecalling    = true
     protocol            = 'directRNA'
     run_nanolyse        = true
     skip_bigbed         = true

diff --git a/conf/test_nodx_stringtie.config → conf/test_nobc_nodx_stringtie.config b/conf/test_nodx_stringtie.config → conf/test_nobc_nodx_stringtie.config
@@ -21,6 +21,7 @@ params {
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
     gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
     protocol            = 'directRNA'
+    skip_basecalling    = true
     skip_demultiplexing = true
     skip_fusion_analysis= true
     skip_modification_analysis=true

diff --git a/conf/test_nodx_vc.config → conf/test_nobc_nodx_vc.config b/conf/test_nodx_vc.config → conf/test_nobc_nodx_vc.config
@@ -19,6 +19,7 @@ params {
     // Input data to skip demultiplexing and variant call
     input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_vc.csv'
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    skip_basecalling    = true
     protocol            = 'DNA'
     skip_quantification = true
     skip_demultiplexing = true

diff --git a/conf/test_withpull.config b/conf/test_withpull.config
@@ -0,0 +1,39 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data to perform both basecalling and demultiplexing
+    input               = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS109'
+    barcode_kit         = 'EXP-NBD103'
+    trim_barcodes=true
+    output_demultiplex_fast5 = true
+    run_nanolyse        = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+
+    // This variable is just for reference and isn't actually required for the tests
+    // Files are downloaded and staged using the "GetTestData" process
+    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded/'
+}
diff --git a/modules/local/dorado.nf b/modules/local/dorado.nf
@@ -0,0 +1,35 @@
+// Basecall raw signal with dorado: download the requested model, basecall the
+// staged input to FASTQ, gzip the result and record the dorado version.
+process DORADO {
+    tag "$meta.id"
+    label 'process_medium'
+
+    // NOTE(review): the image reference carries no tag — consider pinning a release for reproducibility
+    container "docker.io/ontresearch/dorado"
+
+    input:
+    tuple val(meta), path(pod5_path) // meta map (meta.id is used as the task tag) and the input handed to `dorado basecaller`
+    val dorado_device                // forwarded to `dorado basecaller --device`
+    val dorado_model                 // model name used for both `dorado download --model` and `dorado basecaller`
+
+    output:
+    tuple val(meta), path("*.fastq.gz")  , emit: fastq
+    path "versions.yml"                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // The sed expression drops the first 81 characters of the `dorado --version` output
+    // before it is written to versions.yml.
+    """
+    dorado download --model $dorado_model
+    dorado basecaller $dorado_model $pod5_path --device $dorado_device --emit-fastq > basecall.fastq
+    gzip basecall.fastq
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/fast5_to_pod5.nf b/modules/local/fast5_to_pod5.nf
@@ -0,0 +1,34 @@
+// Convert the staged FAST5 input to POD5 with `pod5 convert fast5`, writing
+// to pod5/converted.pod5, and record the pod5 version.
+process FAST5_TO_POD5 {
+    tag "$meta.id"
+    label 'process_medium'
+
+    // NOTE(review): this conda spec lists R/bambu packages and looks copied from another module;
+    // presumably it does not provide the `pod5` CLI called below — confirm the intended package.
+    conda "conda-forge::r-base=4.0.3 bioconda::bioconductor-bambu=3.0.8 bioconda::bioconductor-bsgenome=1.66.0"
+    container "docker.io/yuukiiwa/pod5:0.2.4"
+
+    input:
+    tuple val(meta), path(input_path) // meta map (meta.id is used as the task tag) and the FAST5 input to convert
+
+    output:
+    tuple val(meta), path("pod5/")    , emit: pod5
+    path "versions.yml"               , emit: versions // written by the script below
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def output_name = "pod5/converted.pod5"
+    // The sed expression strips a fixed-length leading prefix from the `pod5 --version`
+    // output before it is written to versions.yml.
+    """
+    pod5 convert fast5 $input_path --output $output_name
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pod5: \$(echo \$(pod5 --version 2>&1) | sed -r 's/..............//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/get_test_data.nf b/modules/local/get_test_data.nf
@@ -4,15 +4,15 @@ process GET_TEST_DATA {
     container "docker.io/yuukiiwa/git:latest"
 
     output:
-    path "test-datasets/fast5/$barcoded/*"        , emit: ch_input_fast5s_path
+    path "test-datasets/fast5/$barcoded/"         , emit: ch_input_fast5_dir_path
     path "test-datasets/modification_fast5_fastq/", emit: ch_input_dir_path
     path "versions.yml"                           , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
-    barcoded = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "nonbarcoded" : "barcoded"
+    barcoded = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "nonbarcoded_multi" : "barcoded_multi"
     """
     git clone https://github.com/nf-core/test-datasets.git --branch nanoseq --single-branch
 

diff --git a/modules/local/nanopolish_index_eventalign.nf b/modules/local/nanopolish_index_eventalign.nf
@@ -20,7 +20,7 @@ process NANOPOLISH_INDEX_EVENTALIGN {
     script:
     sample_summary = "$meta.id" +"_summary.txt"
     sample_eventalign = "$meta.id" +"_eventalign.txt"
-    fast5 = "$meta.nanopolish_fast5"
+    fast5 = "$meta.fast5"
     """
     nanopolish index -d $fast5 $fastq
     nanopolish eventalign  --reads $fastq --bam $bam --genome $genome --scale-events --signal-index --summary $sample_summary --threads $task.cpus > $sample_eventalign

diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
@@ -19,7 +19,7 @@ process SAMPLESHEET_CHECK {
     task.ext.when == null || task.ext.when
 
     script: // This script is bundled with the pipeline, in nf-core/nanoseq/bin/
-    updated_path = workflow.profile.contains('test_nodx_rnamod') ? "$input_path" : "not_changed"
+    updated_path = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "$input_path" : "not_changed"
     """
     check_samplesheet.py \\
         $samplesheet \\

diff --git a/nextflow.config b/nextflow.config
@@ -19,15 +19,18 @@ params {
     gtf                        = null
 
 
-    // Options: Demultiplexing
+    // Options: Basecalling and Demultiplexing
     input_path                 = null
+    flowcell                   = null
+    kit                        = null
     barcode_kit                = null
     barcode_both_ends          = false
     trim_barcodes              = false
-    gpu_device                 = 'auto'
-    gpu_cluster_options        = null
+    dorado_model               = null
+    dorado_device              = 'cuda:all'
     qcat_min_score             = 60
     qcat_detect_middle         = false
+    skip_basecalling           = false
     skip_demultiplexing        = false
 
     // Options: Raw read cleaning
@@ -221,12 +224,14 @@ profiles {
         executor.cpus          = 16
         executor.memory        = 60.GB
     }
+    test      { includeConfig 'conf/test.config'      }
     test_full { includeConfig 'conf/test_full.config' }
-    test                { includeConfig 'conf/test.config'                }
-    test_nodx_stringtie { includeConfig 'conf/test_nodx_stringtie.config' }
-    test_nodx_noaln     { includeConfig 'conf/test_nodx_noaln.config'     }
-    test_nodx_vc        { includeConfig 'conf/test_nodx_vc.config'        }
-    test_nodx_rnamod    { includeConfig 'conf/test_nodx_rnamod.config'    }
+    test_bc_nodx             { includeConfig 'conf/test_bc_nodx.config'             }
+    test_nobc_dx             { includeConfig 'conf/test_nobc_dx.config'             }
+    test_nobc_nodx_stringtie { includeConfig 'conf/test_nobc_nodx_stringtie.config' }
+    test_nobc_nodx_noaln     { includeConfig 'conf/test_nobc_nodx_noaln.config'     }
+    test_nobc_nodx_vc        { includeConfig 'conf/test_nobc_nodx_vc.config'        }
+    test_nobc_nodx_rnamod    { includeConfig 'conf/test_nobc_nodx_rnamod.config'    }
 }