diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 741538bc..be927121 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: matrix: parameters: - "" - - "--preset ONT_R10 --input https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/testdata/samplesheet_multisample_bam_ont.csv --split_fastq 2 --parallel_snv 1" + - "--preset ONT_R10 --input https://github.com/genomic-medicine-sweden/test-datasets/raw/e2266a34c14d1e0a9ef798de3cd81a76c9216fc1/testdata/samplesheet_multisample_bam_ont.csv --split_fastq 2 --parallel_snv 1" NXF_VER: - "23.04.0" - "latest-everything" diff --git a/CHANGELOG.md b/CHANGELOG.md index 1eaa0a6d..b54e6a6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#252](https://github.com/genomic-medicine-sweden/nallo/pull/252) - Added a new `SCATTER_GENOME` subworkflow - [#255](https://github.com/genomic-medicine-sweden/nallo/pull/255) - Added a new `RANK_VARIANTS` subworkflow to rank SNVs using genmod - [#261](https://github.com/genomic-medicine-sweden/nallo/pull/261) - Added a `--skip_rank_variants` parameter to skip the rank_variants subworkflow +- [#264](https://github.com/genomic-medicine-sweden/nallo/pull/264) - Added a `project` column to the samplesheet - [#266](https://github.com/genomic-medicine-sweden/nallo/pull/266) - Added CADD to dynamically calculate indel CADD-scores - [#270](https://github.com/genomic-medicine-sweden/nallo/pull/270) - Added SNV phasing stats to MultiQC - [#271](https://github.com/genomic-medicine-sweden/nallo/pull/271) - Added a `--skip_aligned_read_qc` parameter to skip the qc aligned reads subworkflow @@ -43,6 +44,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#261](https://github.com/genomic-medicine-sweden/nallo/pull/261) - Changed SNV annotation to run in parallel - 
[#261](https://github.com/genomic-medicine-sweden/nallo/pull/261) - Changed SNV output file names and directory structure - [#262](https://github.com/genomic-medicine-sweden/nallo/pull/262) - Updated README +- [#264](https://github.com/genomic-medicine-sweden/nallo/pull/264) - Changed PED file creation from groovy script to process +- [#264](https://github.com/genomic-medicine-sweden/nallo/pull/264) - Changed all `multisample` filenames to `{project}` from samplesheet - [#268](https://github.com/genomic-medicine-sweden/nallo/pull/268) - Only output unphased alignments when phasing is off - [#268](https://github.com/genomic-medicine-sweden/nallo/pull/268) - Changed alignment output file names and directory structure - [#270](https://github.com/genomic-medicine-sweden/nallo/pull/270) - Changed whatshap stats to always run, regardless of phasing software, and changed the output from `*.stats.tsv.gz` to `*.stats.tsv` to allow being picked up by MultiQC @@ -57,6 +60,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#243](https://github.com/genomic-medicine-sweden/nallo/pull/243) - Removed VEP report from output files - [#257](https://github.com/genomic-medicine-sweden/nallo/pull/257) - Removed obsolete TODO statements - [#258](https://github.com/genomic-medicine-sweden/nallo/pull/258) - Removed VCF report from DeepVariant output +- [#264](https://github.com/genomic-medicine-sweden/nallo/pull/264) - Removed the option to provide extra SNF files to Sniffles with `--extra_snfs` ### `Fixed` @@ -69,14 +73,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | Old parameter | New parameter | | ------------------ | -------------------------- | -| | `--deepvariant_model_type` | -| `--extra_gvcfs` | | | `--skip_repeat_wf` | `--skip_repeat_calling` | | `--skip_repeat_wf` | `--skip_repeat_annotation` | +| | `--deepvariant_model_type` | | | `--skip_rank_variants` | | | `--skip_aligned_read_qc` | | | 
`--cadd_resources` | | | `--cadd_prescored` | +| `--extra_gvcfs` | | +| `--extra_snfs` | | > [!NOTE] > Parameter has been updated if both old and new parameter information is present. diff --git a/README.md b/README.md index 3c4b2acb..a557844f 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,9 @@ Prepare a samplesheet with input data: `samplesheet.csv` ``` -sample,file,family_id,paternal_id,maternal_id,sex,phenotype -HG002,/path/to/HG002.fastq.gz,FAM1,HG003,HG004,1,2 -HG005,/path/to/HG005.bam,FAM1,HG003,HG004,2,1 +project,sample,file,family_id,paternal_id,maternal_id,sex,phenotype +testrun,HG002,/path/to/HG002.fastq.gz,FAM1,HG003,HG004,1,2 +testrun,HG005,/path/to/HG005.bam,FAM1,HG003,HG004,2,1 ``` Now, you can run the pipeline using: diff --git a/assets/schema_gvcfs.json b/assets/schema_gvcfs.json deleted file mode 100644 index 0ae2d250..00000000 --- a/assets/schema_gvcfs.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_gvcfs.json", - "title": "genomic-medicine-sweden/nallo pipeline - params.extra_gvcfs schema", - "description": "Schema for the file provided with params.extra_gvcfs", - "type": "array", - "items": { - "type": "object", - "properties": { - "sample": { - "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces", - "meta": ["id"] - }, - "file": { - "format": "file-path", - "type": "string", - "pattern": "^\\S+\\.(g\\.)?(g)?vcf\\.gz$", - "errorMessage": "gVCF file must be provided, cannot contain spaces and must have extension 'g.vcf.gz' or 'gvcf.gz'" - } - }, - "required": ["sample", "file"] - } -} diff --git a/assets/schema_input.json b/assets/schema_input.json index 63c61f99..46c00ba6 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -48,8 +48,14 @@ "enum": [0, 1, 2], "errorMessage": "Phenoype must be provided as 0 (missing), 1 
(unaffected) or 2 (affected)", "meta": ["phenotype"] + }, + "project": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Project name must be provided and cannot contain spaces, needs to be the same for all samples", + "meta": ["project"] } }, - "required": ["sample", "file", "family_id", "paternal_id", "maternal_id", "sex", "phenotype"] + "required": ["sample", "file", "family_id", "paternal_id", "maternal_id", "sex", "phenotype", "project"] } } diff --git a/assets/schema_snfs.json b/assets/schema_snfs.json deleted file mode 100644 index 59d45232..00000000 --- a/assets/schema_snfs.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_snfs.json", - "title": "genomic-medicine-sweden/nallo pipeline - params.extra_snfs schema", - "description": "Schema for the file provided with params.extra_snfs", - "type": "array", - "items": { - "type": "object", - "properties": { - "sample": { - "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces", - "meta": ["id"] - }, - "file": { - "format": "file-path", - "type": "string", - "pattern": "^\\S+\\.snf$", - "errorMessage": "SNF file must be provided, cannot contain spaces and must have extension '.snf" - } - }, - "required": ["sample", "file"] - } -} diff --git a/conf/modules/general.config b/conf/modules/general.config index ea2e3e13..fe3718e5 100644 --- a/conf/modules/general.config +++ b/conf/modules/general.config @@ -94,6 +94,13 @@ process { ] } + withName: '.*:NALLO:CREATE_PEDIGREE_FILE' { + publishDir = [ + path: { "${params.outdir}/pedigree" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + withName: '.*:NALLO:SPLIT_BED_CHUNKS' { publishDir = [ enabled: false diff --git a/conf/modules/structural_variant_calling.config b/conf/modules/structural_variant_calling.config index b5b73eed..8e1e5a28 100644 --- a/conf/modules/structural_variant_calling.config +++ b/conf/modules/structural_variant_calling.config @@ -33,10 +33,10 @@ process { withName: '.*:STRUCTURAL_VARIANT_CALLING:SNIFFLES_MULTISAMPLE' { - ext.prefix = 'multisample_sniffles' + ext.prefix = { "${meta.id}_sniffles" } publishDir = [ - path: { "${params.outdir}/sv_calling/sniffles/multi_sample" }, + path: { "${params.outdir}/sv_calling/sniffles/multi_sample/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] diff --git a/conf/test.config b/conf/test.config index ea54eaea..7c96ecd8 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,7 +23,7 @@ params { // Genome references fasta = params.pipelines_testdata_base_path + 'nallo/reference/hg38.test.fa.gz' - input = params.pipelines_testdata_base_path + 'nallo/testdata/samplesheet.csv' + input = 'https://github.com/genomic-medicine-sweden/test-datasets/raw/2948776ddf24ea131f527aa1f2dc23a43bb7b952/testdata/samplesheet.csv' bed = params.pipelines_testdata_base_path + 'nallo/reference/test_data.bed' diff --git a/docs/output.md b/docs/output.md index a4a0302f..d288ad65 100644 --- a/docs/output.md +++ b/docs/output.md @@ -24,7 +24,7 @@ - [Repeat annotation](#repeat-annotation) - [SNV Annotation](#snv-annotation) - [Ranked Variants](#ranked-variants) - - [SNV Calling](#snv-calling) + - [SV Calling](#sv-calling) ## Pipeline overview @@ -245,10 +245,10 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
Output files from Somalier -- `{outputdir}/qc_aligned_reads/somalier/relate/mutlisample/` - - `*.html`: HTML report - - `*.pairs.tsv`: Output information in sample pairs - - `*.samples.tsv`: Output information per sample +- `{outputdir}/qc_aligned_reads/somalier/relate/{project}/` + - `{project}.html`: HTML report + - `{project}.pairs.tsv`: Output information in sample pairs + - `{project}.samples.tsv`: Output information per sample
### Raw read QC @@ -274,7 +274,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
Output files from TRGT -- `{outputdir}/repeat_calling/trgt/multi_sample/multisample/` +- `{outputdir}/repeat_calling/trgt/multi_sample/{project}/` - `*.vcf.gz`: Merged VCF for all samples - `*.vcf.gz.tbi`: Index of the corresponding VCF file - `{outputdir}/repeat_calling/trgt/single_sample/{sample}/` @@ -305,9 +305,9 @@ In case of affected samples, [echtvar](https://github.com/brentp/echtvar) and [V
Output files from SNV Annotation -- `{outputdir}/databases/echtvar/encode/multisample/` +- `{outputdir}/databases/echtvar/encode/{project}/` - `*.zip`: Database with AF and AC for all samples run -- `{outputdir}/snvs/{single_sample,multi_sample/multisample}/` +- `{outputdir}/snvs/{single_sample,multi_sample/{project}}/` - `*_snvs_annotated*.vcf.gz`: VCF with annotated variants - `*_snvs_annotated*.vcf.gz.tbi`: Index of the corresponding VCF file - `{outputdir}/snvs/stats/single_sample/` @@ -327,29 +327,26 @@ In case of affected samples, [echtvar](https://github.com/brentp/echtvar) and [V
Output files -- `{outputdir}/snvs/{single_sample,multi_sample/multisample}/` - - `*_snvs_annotated_ranked.vcf.gz`: VCF with annotated and ranked variants - - `*_snvs_annotated_ranked.vcf.gz.tbi`: Index of the corresponding VCF file +- `{outputdir}/snvs/single_sample/{sample}/` + - `{sample}_snv_annotated_ranked.vcf.gz`: VCF with annotated and ranked variants + - `{sample}_snv_annotated_ranked.vcf.gz.tbi`: Index of the corresponding VCF file +- `{outputdir}/snvs/multi_sample/{project}/` + - `{project}_snv_annotated_ranked.vcf.gz`: VCF with annotated and ranked variants + - `{project}_snv_annotated_ranked.vcf.gz.tbi`: Index of the corresponding VCF file
-### SNV Calling +### SV Calling [Sniffles](https://github.com/fritzsedlazeck/Sniffles) is used to call and merge structural variants.
Output files from SNV Calling -- `{outputdir}/sv_calling/multi_sample/` +- `{outputdir}/sv_calling/sniffles/multi_sample/{project}/` - `*.vcf.gz`: VCF with variants - `*.vcf.gz.tbi`: Index of the corresponding VCF file - `{outputdir}/sv_calling/single_sample/{sample}` - `*.snf`: Sniffles SNF file - `*.vcf.gz`: VCF with variants - `*.vcf.gz.tbi`: Index of the corresponding VCF file -- `{outputdir}/snv_calling/single_sample/deepvariant/gvcf/{sample}/` - - `*.g.vcf.gz`: gVCF with variants - - `*.g.vcf.gz.tbi`: Index of the corresponding gVCF file -- `{outputdir}/snv_calling/single_sample/deepvariant/vcf/{sample}/` - - `*.vcf.gz`: VCF with variants - - `*.vcf.gz.tbi`: Index of the corresponding VCF file
diff --git a/docs/usage.md b/docs/usage.md index 7dc2f7ad..7eb15085 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -57,18 +57,20 @@ You will need to create a samplesheet with information about the samples you wou --input '[path to samplesheet file]' ``` -It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. +It has to be a comma-separated file with 8 columns, and a header row as shown in the examples below. `file` can either be a gzipped-fastq file or an aligned or unalinged BAM file (BAM files will be converted to FASTQ and aligned again). +`project` needs to be the same for all samples in a run. If you don't have related samples, `family_id` could be set to sample name, and `paternal_id` and `maternal_id` should be set to 0. ```console -sample,file,family_id,paternal_id,maternal_id,sex,phenotype -HG002,/path/to/HG002.fastq.gz,FAM,HG003,0,1,2 -HG003,/path/to/HG003.bam,FAM,0,0,2,1 +project,sample,file,family_id,paternal_id,maternal_id,sex,phenotype +testrun,HG002,/path/to/HG002.fastq.gz,FAM,HG003,0,1,2 +testrun,HG003,/path/to/HG003.bam,FAM,0,0,2,1 ``` | Fields | Description | | ------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `project` | Project name must be provided and cannot contain spaces, needs to be the same for all samples. | | `sample` | Custom sample name, cannot contain spaces. | | `file` | Absolute path to gzipped FASTQ or BAM file. File has to have the extension ".fastq.gz", .fq.gz" or ".bam". | | `family_id` | "Family ID must be provided and cannot contain spaces. 
If no family ID is available you can use the same ID as the sample | @@ -127,16 +129,6 @@ cadd,/path/to/cadd.v1.6.hg38.zip - If running without `--skip_cnv_calling`, expected CN regions for your reference genome can be downloaded from [HiFiCNV GitHub](https://github.com/PacificBiosciences/HiFiCNV/tree/main/data) to supply with `--hificnv_xy`, `--hificnv_xx` (expected_cn) and `--hificnv_exclude` (excluded_regions). -- If you want to include extra samples for mili-sample calling of SVs - prepare a samplesheet with .snf files from Sniffles to supply with `--extra_snfs`: - -`extra_snfs.csv` - -``` -sample,file -HG01123,/path/to/HG01123_sniffles.snf -HG01124,/path/to/HG01124_sniffles.snf -``` - - If running without `--skip_call_paralogs`, the reference genome needs to be hg38 - If running without `--skip_mapping_wf`, a VCF of known polymorphic sites (e.g. [sites.hg38.vcg.gz](https://github.com/brentp/somalier/files/3412456/sites.hg38.vcf.gz)) needs to be supplied with `--somalier_sites`, from which sex will be inferred if possible. 
@@ -255,7 +247,6 @@ Different processes may need extra input files | Parameter | Description | Type | Default | Required | Hidden | | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | | `dipcall_par` | Provide a bed file of chrX PAR regions for dipcall | `string` | | | | -| `extra_snfs` | Extra input files for Sniffles | `string` | | | | | `tandem_repeats` | Tandem repeat BED-file for sniffles | `string` | | | | | `trgt_repeats` | BED-file for repeats to be genotyped | `string` | | | | | `snp_db` | Extra echtvar-databases to annotate SNVs with | `string` | | | | diff --git a/lib/CustomFunctions.groovy b/lib/CustomFunctions.groovy deleted file mode 100644 index 4f9979b4..00000000 --- a/lib/CustomFunctions.groovy +++ /dev/null @@ -1,21 +0,0 @@ -import nextflow.Nextflow - -class CustomFunctions { - - // Function to generate a pedigree file - public static File makePed(samples, outdir) { - def case_name = "multisample" - def outfile = new File(outdir +"/pipeline_info/${case_name}" + '.ped') - outfile.text = ['#family_id', 'sample_id', 'father', 'mother', 'sex', 'phenotype'].join('\t') - def samples_list = [] - for(int i = 0; i${project}.ped + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + create_pedigree_file: v1.0 + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + touch ${project}.ped + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + create_pedigree_file: v1.0 + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 2f2dd9f5..592e5f28 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,7 +15,6 @@ params { cadd_resources = null 
cadd_prescored = null dipcall_par = null - extra_snfs = null tandem_repeats = null trgt_repeats = null variant_catalog = null diff --git a/nextflow_schema.json b/nextflow_schema.json index ef0e0217..4b9c4470 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -422,14 +422,6 @@ "format": "file-path", "exists": true }, - "extra_snfs": { - "type": "string", - "description": "Extra input files for Sniffles", - "pattern": "^\\S+\\.csv$", - "format": "file-path", - "schema": "assets/schema_snfs.json", - "exists": true - }, "tandem_repeats": { "type": "string", "format": "file-path", diff --git a/subworkflows/local/bam_infer_sex.nf b/subworkflows/local/bam_infer_sex.nf index 3f49e5e5..a394d7ff 100644 --- a/subworkflows/local/bam_infer_sex.nf +++ b/subworkflows/local/bam_infer_sex.nf @@ -11,7 +11,6 @@ workflow BAM_INFER_SEX { ch_ped // channel: [ val(meta), path(ped) ] main: - ch_versions = Channel.empty() // Extract sites @@ -24,9 +23,9 @@ workflow BAM_INFER_SEX { ch_versions = ch_versions.mix(SOMALIER_EXTRACT.out.versions) SOMALIER_EXTRACT.out.extract - .map { meta, extract -> [ [ id: 'multisample' ], extract ] } + .map { meta, extract -> [ [ id: meta.project ], extract ] } .groupTuple() - .join( ch_ped.map { ped -> [ [ id:'multisample'], ped ] } ) + .join( ch_ped ) .set { ch_somalier_relate_in } // Infer sex @@ -59,7 +58,8 @@ workflow BAM_INFER_SEX { maternal_id : meta.maternal_id, sex : meta.sex == 0 ? 
somalier.sex.toInteger() : meta.sex, phenotype : meta.phenotype, - single_end : meta.single_end + single_end : meta.single_end, + project : meta.project ] [ new_meta, bam, bai ] } diff --git a/subworkflows/local/call_repeat_expansions/main.nf b/subworkflows/local/call_repeat_expansions/main.nf index 5b6a43d1..4999e0bf 100644 --- a/subworkflows/local/call_repeat_expansions/main.nf +++ b/subworkflows/local/call_repeat_expansions/main.nf @@ -33,7 +33,7 @@ workflow CALL_REPEAT_EXPANSIONS { BCFTOOLS_SORT_TRGT.out.vcf .join( BCFTOOLS_SORT_TRGT.out.tbi ) - .map { meta, bcf, csi -> [ [ id : 'multisample' ], bcf, csi ] } + .map { meta, bcf, csi -> [ [ id : meta.project ], bcf, csi ] } .groupTuple() .set{ ch_bcftools_merge_in } diff --git a/subworkflows/local/short_variant_calling/main.nf b/subworkflows/local/short_variant_calling/main.nf index dacaf4ac..002ab504 100644 --- a/subworkflows/local/short_variant_calling/main.nf +++ b/subworkflows/local/short_variant_calling/main.nf @@ -60,13 +60,16 @@ workflow SHORT_VARIANT_CALLING { // This creates a multisample VCF, with regions from ONE bed file DEEPVARIANT.out.gvcf - .map { meta, gvcf -> [ meta.region.name, meta.phenotype == 2, gvcf ] } + .map { meta, gvcf -> + [ meta.region.name, meta.project, meta.phenotype == 2, gvcf ] + } .groupTuple() // Group all files together per region // If any of the samples in the VCF have an affected phenotype (2) // add this to the meta of the multisample VCF to know if we should run RANK_VARIANTS or not - .map { region, affected, gvcfs -> + .map { meta, project, affected, gvcfs -> new_meta = [ - 'id': region, + 'id': meta, + 'project': project.first(), // Works only because only one project per run is allowed 'contains_affected': affected.any(), ] [ new_meta, gvcfs ] diff --git a/subworkflows/local/short_variant_calling/tests/main.nf.test.snap b/subworkflows/local/short_variant_calling/tests/main.nf.test.snap index 589e60ef..8212abbf 100644 --- 
a/subworkflows/local/short_variant_calling/tests/main.nf.test.snap +++ b/subworkflows/local/short_variant_calling/tests/main.nf.test.snap @@ -75,10 +75,10 @@ } ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:07:01.955811332" + "timestamp": "2024-08-09T12:33:33.642550865" }, "2 samples - 2 bed, fasta, fai, bed": { "content": [ @@ -214,10 +214,10 @@ } ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:08:56.682263691" + "timestamp": "2024-08-09T12:34:36.087668576" }, "2 samples - 2 bed, fasta, fai, bed - stub": { "content": [ @@ -353,10 +353,10 @@ } ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:09:52.997168986" + "timestamp": "2024-08-09T12:35:27.910838148" }, "1 sample - no bed, fasta, fai, []": { "content": [ @@ -442,10 +442,10 @@ } ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:06:05.618203586" + "timestamp": "2024-08-09T12:32:38.562683632" }, "1 sample - 1 bed, fasta, fai, []": { "content": [ @@ -523,10 +523,10 @@ } ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:06:33.975664464" + "timestamp": "2024-08-09T12:33:06.128266568" }, "1 sample - 1 bed, fasta, fai, [] - stub": { "content": [ @@ -604,10 +604,10 @@ } ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:09:16.393719634" + "timestamp": "2024-08-09T12:34:56.051878451" }, "1 sample - 1 bed, fasta, fai, bed - stub": { "content": [ @@ -685,10 +685,10 @@ } ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:09:26.530459549" + "timestamp": "2024-08-09T12:35:05.660557092" }, "1 sample - no bed, fasta, fai, [] - stub": { "content": [ @@ -774,10 +774,10 @@ } ], "meta": { - "nf-test": 
"0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:09:06.541104295" + "timestamp": "2024-08-09T12:34:45.861028555" }, "1 sample - 2 bed, fasta, fai, bed": { "content": [ @@ -891,10 +891,10 @@ } ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:07:43.362545869" + "timestamp": "2024-08-09T12:34:03.458045229" }, "1 sample - 2 bed, fasta, fai, bed - stub": { "content": [ @@ -1008,9 +1008,9 @@ } ], "meta": { - "nf-test": "0.9.0", + "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-24T11:09:38.575037214" + "timestamp": "2024-08-09T12:35:16.395171025" } } \ No newline at end of file diff --git a/subworkflows/local/snv_annotation/tests/main.nf.test.snap b/subworkflows/local/snv_annotation/tests/main.nf.test.snap index bc3feffa..c8c3f045 100644 --- a/subworkflows/local/snv_annotation/tests/main.nf.test.snap +++ b/subworkflows/local/snv_annotation/tests/main.nf.test.snap @@ -153,6 +153,39 @@ "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-08-05T11:12:17.324211719" + "timestamp": "2024-08-09T12:31:54.375357945" + }, + "bcf, db, vep_cache, '110', -stub": { + "content": [ + [ + "versions.yml:md5,797275193dd19766e99030e63c23bd5f", + "versions.yml:md5,992301857689684643c42695c032a7f2", + "versions.yml:md5,a07924ee4ebc2d4de5bb7ef897ddc30c", + "versions.yml:md5,c0e55e36a31ed71acf25702b7d059533" + ], + [ + [ + { + "id": "test_data.bed", + "contains_affected": false + }, + "test_data.bed.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + [ + { + "id": "test_data.bed", + "contains_affected": false + }, + "test_data.bed.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-09T12:32:10.118218292" } -} \ No newline at end of file +} diff --git a/subworkflows/local/structural_variant_calling.nf b/subworkflows/local/structural_variant_calling.nf index 
ace51636..be738c63 100644 --- a/subworkflows/local/structural_variant_calling.nf +++ b/subworkflows/local/structural_variant_calling.nf @@ -5,7 +5,6 @@ workflow STRUCTURAL_VARIANT_CALLING { take: ch_bam_bai // channel: [ val(meta), [[ bam ], [bai]] ] - ch_snfs ch_fasta ch_fai ch_tandem_repeats @@ -15,13 +14,10 @@ workflow STRUCTURAL_VARIANT_CALLING { SNIFFLES (ch_bam_bai, ch_fasta, ch_tandem_repeats, true, true) - // Combine sniffles output with supplied extra snfs SNIFFLES.out.snf - .map{ it [1] } - .concat(ch_snfs.map{ it[1] }) - .collect() - .sort{ it.name } - .map { snfs -> [ [id:'multisample'], snfs, [] ] } + .map { meta, snf -> [ [ 'id': meta.project ], snf ] } + .groupTuple() + .map { meta, snfs -> [ meta, snfs, [] ] } .set{ ch_multisample_input } SNIFFLES_MULTISAMPLE( ch_multisample_input, ch_fasta, ch_tandem_repeats, true, false ) diff --git a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf index ff0ca7b7..c25f7f3f 100644 --- a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf @@ -200,7 +200,16 @@ workflow PIPELINE_INITIALISATION { } } - + // Check that there's no more than one project + // TODO: Try to do this in nf-schema + ch_samplesheet + .map { meta, reads -> meta.project } + .unique() + .collect() + .filter{ it.size() == 1 } + .ifEmpty { + error("Only one project may be specified per run") + } emit: samplesheet = ch_samplesheet versions = ch_versions diff --git a/tests/main.nf.test b/tests/main.nf.test index d2ba3580..04bb42d9 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -14,7 +14,7 @@ nextflow_pipeline { pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/' // Test files fasta = params.pipelines_testdata_base_path + 'nallo/reference/hg38.test.fa.gz' - input = params.pipelines_testdata_base_path + 'nallo/testdata/samplesheet.csv' + input = 
'https://github.com/genomic-medicine-sweden/test-datasets/raw/e2266a34c14d1e0a9ef798de3cd81a76c9216fc1/testdata/samplesheet.csv' bed = params.pipelines_testdata_base_path + 'nallo/reference/test_data.bed' hificnv_xy = params.pipelines_testdata_base_path + 'nallo/reference/expected_cn.hg38.XY.bed' hificnv_xx = params.pipelines_testdata_base_path + 'nallo/reference/expected_cn.hg38.XX.bed' @@ -41,10 +41,10 @@ nextflow_pipeline { { assert workflow.success }, // Assert with snapshot multisample { assert snapshot( - file("$outputDir/pipeline_info/multisample.ped"), - file("$outputDir/qc_aligned_reads/somalier/relate/multisample/multisample.pairs.tsv"), - file("$outputDir/qc_aligned_reads/somalier/relate/multisample/multisample.samples.tsv"), - file("$outputDir/qc_aligned_reads/somalier/relate/multisample/multisample.html"), + file("$outputDir/pedigree/test.ped"), + file("$outputDir/qc_aligned_reads/somalier/relate/test/test.pairs.tsv"), + file("$outputDir/qc_aligned_reads/somalier/relate/test/test.samples.tsv"), + file("$outputDir/qc_aligned_reads/somalier/relate/test/test.html"), file("$outputDir/multiqc/multiqc_data/multiqc_citations.txt"), file("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"), file("$outputDir/multiqc/multiqc_data/multiqc_somalier.txt"), @@ -84,22 +84,22 @@ nextflow_pipeline { file("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio/HG002_Revio_sorted.vcf.gz"), file("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio/HG002_Revio_sorted.vcf.gz.tbi"), bam("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio/HG002_Revio_spanning_sorted.bam", stringency: 'silent').getReadsMD5(), - file("$outputDir/snvs/stats/single_sample/HG002_Revio.vcf.gz.bcftools_stats.txt"), + file("$outputDir/snvs/stats/single_sample/HG002_Revio.vcf.gz.bcftools_stats.txt").readLines()[0..2], ).match() }, // Assert exists multisample - { assert new File("$outputDir/databases/echtvar/encode/multisample/multisample.zip").exists() }, + { assert new 
File("$outputDir/databases/echtvar/encode/test/test.zip").exists() }, { assert new File("$outputDir/multiqc/multiqc_data/multiqc_data.json").exists() }, { assert new File("$outputDir/multiqc/multiqc_data/multiqc.log").exists() }, { assert new File("$outputDir/multiqc/multiqc_report.html").exists() }, { assert new File("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt").exists() }, { assert new File("$outputDir/multiqc/multiqc_data/multiqc_sources.txt").exists() }, { assert new File("$outputDir/multiqc/multiqc_data/multiqc_software_versions.txt").exists() }, - { assert new File("$outputDir/repeat_calling/trgt/multi_sample/multisample/multisample.vcf.gz").exists() }, - { assert new File("$outputDir/repeat_calling/trgt/multi_sample/multisample/multisample.vcf.gz.tbi").exists() }, - { assert new File("$outputDir/snvs/multi_sample/multisample/multisample_snv_annotated_ranked.vcf.gz").exists() }, - { assert new File("$outputDir/snvs/multi_sample/multisample/multisample_snv_annotated_ranked.vcf.gz.tbi").exists() }, - { assert new File("$outputDir/sv_calling/sniffles/multi_sample/multisample_sniffles.vcf.gz").exists() }, - { assert new File("$outputDir/sv_calling/sniffles/multi_sample/multisample_sniffles.vcf.gz.tbi").exists() }, + { assert new File("$outputDir/repeat_calling/trgt/multi_sample/test/test.vcf.gz").exists() }, + { assert new File("$outputDir/repeat_calling/trgt/multi_sample/test/test.vcf.gz.tbi").exists() }, + { assert new File("$outputDir/snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz").exists() }, + { assert new File("$outputDir/snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz.tbi").exists() }, + { assert new File("$outputDir/sv_calling/sniffles/multi_sample/test/test_sniffles.vcf.gz").exists() }, + { assert new File("$outputDir/sv_calling/sniffles/multi_sample/test/test_sniffles.vcf.gz.tbi").exists() }, // Assert exists HG002_Revio { assert new File("$outputDir/aligned_reads/HG002_Revio/HG002_Revio_phased.bam.bai").exists() }, { 
assert new File("$outputDir/assembly_variant_calling/dipcall/HG002_Revio/HG002_Revio.hap1.bam.bai").exists() }, @@ -146,7 +146,7 @@ nextflow_pipeline { pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/' // Test files fasta = params.pipelines_testdata_base_path + 'nallo/reference/hg38.test.fa.gz' - input = params.pipelines_testdata_base_path + 'nallo/testdata/samplesheet_multisample_bam.csv' + input = 'https://github.com/genomic-medicine-sweden/test-datasets/raw/e2266a34c14d1e0a9ef798de3cd81a76c9216fc1/testdata/samplesheet_multisample_bam.csv' bed = params.pipelines_testdata_base_path + 'nallo/reference/test_data.bed' hificnv_xy = params.pipelines_testdata_base_path + 'nallo/reference/expected_cn.hg38.XY.bed' hificnv_xx = params.pipelines_testdata_base_path + 'nallo/reference/expected_cn.hg38.XX.bed' @@ -173,7 +173,7 @@ nextflow_pipeline { { assert workflow.success }, // Assert with snapshot multisample { assert snapshot( - file("$outputDir/pipeline_info/multisample.ped"), + file("$outputDir/pedigree/test.ped"), file("$outputDir/multiqc/multiqc_data/multiqc_citations.txt"), file("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"), // Assert with snapshot HG002_Revio_A @@ -212,7 +212,7 @@ nextflow_pipeline { file("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio_A/HG002_Revio_A_sorted.vcf.gz"), file("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio_A/HG002_Revio_A_sorted.vcf.gz.tbi"), bam("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio_A/HG002_Revio_A_spanning_sorted.bam", stringency: 'silent').getReadsMD5(), - file("$outputDir/snvs/stats/single_sample/HG002_Revio_A.vcf.gz.bcftools_stats.txt"), + file("$outputDir/snvs/stats/single_sample/HG002_Revio_A.vcf.gz.bcftools_stats.txt").readLines()[0..2], // Assert with snapshot HG002_Revio_B bam("$outputDir/aligned_reads/HG002_Revio_B/HG002_Revio_B_phased.bam", stringency: 'silent').getReadsMD5(), 
file("$outputDir/assembly_haplotypes/gfastats/HG002_Revio_B/HG002_Revio_B.asm.bp.hap1.p_ctg.assembly_summary"), @@ -249,13 +249,13 @@ nextflow_pipeline { file("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio_B/HG002_Revio_B_sorted.vcf.gz"), file("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio_B/HG002_Revio_B_sorted.vcf.gz.tbi"), bam("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio_B/HG002_Revio_B_spanning_sorted.bam", stringency: 'silent').getReadsMD5(), - file("$outputDir/snvs/stats/single_sample/HG002_Revio_B.vcf.gz.bcftools_stats.txt"), + file("$outputDir/snvs/stats/single_sample/HG002_Revio_B.vcf.gz.bcftools_stats.txt").readLines()[0..2], ).match() }, // Assert exists multisample - note the trgt multisample that doesn't exist in singlesample - { assert new File("$outputDir/databases/echtvar/encode/multisample/multisample.zip").exists() }, - { assert new File("$outputDir/qc_aligned_reads/somalier/relate/multisample/multisample.pairs.tsv").exists() }, - { assert new File("$outputDir/qc_aligned_reads/somalier/relate/multisample/multisample.samples.tsv").exists() }, - { assert new File("$outputDir/qc_aligned_reads/somalier/relate/multisample/multisample.html").exists() }, + { assert new File("$outputDir/databases/echtvar/encode/test/test.zip").exists() }, + { assert new File("$outputDir/qc_aligned_reads/somalier/relate/test/test.pairs.tsv").exists() }, + { assert new File("$outputDir/qc_aligned_reads/somalier/relate/test/test.samples.tsv").exists() }, + { assert new File("$outputDir/qc_aligned_reads/somalier/relate/test/test.html").exists() }, { assert new File("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt").exists() }, { assert new File("$outputDir/multiqc/multiqc_data/multiqc_somalier.txt").exists() }, { assert new File("$outputDir/multiqc/multiqc_data/multiqc_data.json").exists() }, @@ -263,12 +263,12 @@ nextflow_pipeline { { assert new File("$outputDir/multiqc/multiqc_report.html").exists() }, { assert new 
File("$outputDir/multiqc/multiqc_data/multiqc_sources.txt").exists() }, { assert new File("$outputDir/multiqc/multiqc_data/multiqc_software_versions.txt").exists() }, - { assert new File("$outputDir/snvs/multi_sample/multisample/multisample_snv_annotated_ranked.vcf.gz").exists() }, - { assert new File("$outputDir/snvs/multi_sample/multisample/multisample_snv_annotated_ranked.vcf.gz.tbi").exists() }, - { assert new File("$outputDir/sv_calling/sniffles/multi_sample/multisample_sniffles.vcf.gz").exists() }, - { assert new File("$outputDir/sv_calling/sniffles/multi_sample/multisample_sniffles.vcf.gz.tbi").exists() }, - { assert new File("$outputDir/repeat_calling/trgt/multi_sample/multisample/multisample.vcf.gz").exists() }, - { assert new File("$outputDir/repeat_calling/trgt/multi_sample/multisample/multisample.vcf.gz.tbi").exists() }, + { assert new File("$outputDir/snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz").exists() }, + { assert new File("$outputDir/snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz.tbi").exists() }, + { assert new File("$outputDir/sv_calling/sniffles/multi_sample/test/test_sniffles.vcf.gz").exists() }, + { assert new File("$outputDir/sv_calling/sniffles/multi_sample/test/test_sniffles.vcf.gz.tbi").exists() }, + { assert new File("$outputDir/repeat_calling/trgt/multi_sample/test/test.vcf.gz").exists() }, + { assert new File("$outputDir/repeat_calling/trgt/multi_sample/test/test.vcf.gz.tbi").exists() }, // Assert exists HG002_Revio_A { assert new File("$outputDir/aligned_reads/HG002_Revio_A/HG002_Revio_A_phased.bam.bai").exists() }, { assert new File("$outputDir/assembly_variant_calling/dipcall/HG002_Revio_A/HG002_Revio_A.hap1.bam.bai").exists() }, diff --git a/tests/main.nf.test.snap b/tests/main.nf.test.snap index 942dcf0c..31a1d818 100644 --- a/tests/main.nf.test.snap +++ b/tests/main.nf.test.snap @@ -1,10 +1,10 @@ { "test profile": { "content": [ - "multisample.ped:md5,f52b1fb9647cb255313b9602841481fd", - 
"multisample.pairs.tsv:md5,4a0988fc3c0fe5cfd5dd205fe6755595", - "multisample.samples.tsv:md5,1685dc6cb8c6b9806ca636662980d686", - "multisample.html:md5,d05e0eceb70ada3a0c25f99a16ad1889", + "test.ped:md5,bd5cec27ba7337a85cf98e787131e2b5", + "test.pairs.tsv:md5,4a0988fc3c0fe5cfd5dd205fe6755595", + "test.samples.tsv:md5,1685dc6cb8c6b9806ca636662980d686", + "test.html:md5,d05e0eceb70ada3a0c25f99a16ad1889", "multiqc_citations.txt:md5,a27affce20d456d20ed387097a4f0350", "multiqc_fastqc.txt:md5,055c2c156136798feeb1658adf905e95", "multiqc_somalier.txt:md5,20b4c5b2d5b94b77fb800548e07a874e", @@ -43,17 +43,21 @@ "HG002_Revio_sorted.vcf.gz:md5,bc06de08b8e36b3b48e0d7b9e21df389", "HG002_Revio_sorted.vcf.gz.tbi:md5,08a5c82838264c558eb30726906f47e0", "110181f29066158df34abbad9e3becc8", - "HG002_Revio.vcf.gz.bcftools_stats.txt:md5,ba63db6ea639dc7080fabdedf19779b4" + [ + "# This file was produced by bcftools stats (1.20+htslib-1.20) and can be plotted using plot-vcfstats.", + "# The command line was:\tbcftools stats HG002_Revio.vcf.gz", + "#" + ] ], "meta": { "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-26T10:41:39.74321081" + "timestamp": "2024-08-09T13:23:09.224424657" }, "test profile - multisample": { "content": [ - "multisample.ped:md5,2d69697ac006715f975502a6578c9d1f", + "test.ped:md5,a1e82af069bce823564e204c316d5500", "multiqc_citations.txt:md5,a27affce20d456d20ed387097a4f0350", "multiqc_fastqc.txt:md5,234f2958710c30f62446a9406cbfcaae", "74b4822241bd8d1bc42f494f1f3e326c", @@ -91,7 +95,11 @@ "HG002_Revio_A_sorted.vcf.gz:md5,b95e709a27fe1df9ee1487b99f396bf4", "HG002_Revio_A_sorted.vcf.gz.tbi:md5,b1eb1f21f36782089b8e0bb0a54105ed", "110181f29066158df34abbad9e3becc8", - "HG002_Revio_A.vcf.gz.bcftools_stats.txt:md5,735742ed6775d3c3a22966ce9080b1ce", + [ + "# This file was produced by bcftools stats (1.20+htslib-1.20) and can be plotted using plot-vcfstats.", + "# The command line was:\tbcftools stats HG002_Revio_A.vcf.gz", + "#" + ], 
"fe7bb70701d1100b2874c10a512a2144", "HG002_Revio_B.asm.bp.hap1.p_ctg.assembly_summary:md5,4941730ceacb4012e771208be7a6673a", "HG002_Revio_B.asm.bp.hap2.p_ctg.assembly_summary:md5,be7dcb093d25922b72ef0f7bc1bf0706", @@ -127,12 +135,16 @@ "HG002_Revio_B_sorted.vcf.gz:md5,05ae66b46d2f87a2133fcdf93d30f38c", "HG002_Revio_B_sorted.vcf.gz.tbi:md5,244a3f966e3434220cd69fcb04b08d01", "18e3bd1fe43fc17ace2f57db5861498c", - "HG002_Revio_B.vcf.gz.bcftools_stats.txt:md5,999ab680e9f012d0f1cb6f7aaafdc772" + [ + "# This file was produced by bcftools stats (1.20+htslib-1.20) and can be plotted using plot-vcfstats.", + "# The command line was:\tbcftools stats HG002_Revio_B.vcf.gz", + "#" + ] ], "meta": { "nf-test": "0.8.4", "nextflow": "24.04.3" }, - "timestamp": "2024-07-26T10:46:15.4519051" + "timestamp": "2024-08-09T13:26:52.566315569" } } \ No newline at end of file diff --git a/workflows/nallo.nf b/workflows/nallo.nf index aead21cf..9cc76c2d 100644 --- a/workflows/nallo.nf +++ b/workflows/nallo.nf @@ -32,6 +32,7 @@ include { STRUCTURAL_VARIANT_CALLING } from '../subworkflows/local/stru */ // local +include { CREATE_PEDIGREE_FILE } from '../modules/local/create_pedigree_file' include { ECHTVAR_ENCODE } from '../modules/local/echtvar/encode/main' include { FQCRS } from '../modules/local/fqcrs' include { SAMTOOLS_MERGE } from '../modules/nf-core/samtools/merge/main' @@ -110,7 +111,17 @@ workflow NALLO { if (params.phaser.matches('hiphase_sv|hiphase_snv') && params.preset == 'ONT_R10') { error "The HiPhase license only permits analysis of data from PacBio. 
For details see: https://github.com/PacificBiosciences/HiPhase/blob/main/LICENSE.md" } // Create PED from samplesheet - ch_pedfile = ch_input.toList().map { file(CustomFunctions.makePed(it, params.outdir)) } + ch_input + .map { meta, files -> [ meta.project, meta ] } + .groupTuple() + .set { ch_ped_in } + + ch_pedfile = CREATE_PEDIGREE_FILE ( ch_ped_in ) + ch_versions = ch_versions.mix(CREATE_PEDIGREE_FILE.out.versions) + + CREATE_PEDIGREE_FILE.out.ped + .map { project, ped -> [ [ 'id': project ], ped ] } + .set { ch_pedfile } // // Convert BAM files to FASTQ @@ -284,8 +295,7 @@ workflow NALLO { // // Call SVs with Sniffles2 - // - STRUCTURAL_VARIANT_CALLING( bam_bai , ch_extra_snfs, fasta, fai, ch_tandem_repeats ) + STRUCTURAL_VARIANT_CALLING( bam_bai, fasta, fai, ch_tandem_repeats ) ch_versions = ch_versions.mix(STRUCTURAL_VARIANT_CALLING.out.versions) // @@ -356,7 +366,7 @@ workflow NALLO { // Only run if we have affected individuals RANK_VARIANTS_SNV ( ANN_CSQ_PLI_SNV.out.vcf_ann.filter { meta, vcf -> meta.contains_affected }, - ch_pedfile, + ch_pedfile.map { meta, ped -> ped }, ch_reduced_penetrance, ch_score_config_snv ) @@ -382,7 +392,7 @@ workflow NALLO { } ch_vcf_tbi_per_region - .map { meta, vcf, tbi -> [ [ id: 'multisample' ], vcf, tbi ] } + .map { meta, vcf, tbi -> [ [ id: meta.project ], vcf, tbi ] } .groupTuple() .set { ch_bcftools_concat_in }