Use project name instead of multisample (#264)

genomic-medicine-sweden · Aug 9, 2024 · 7f064db · 7f064db
1 parent 687743e
commit 7f064db
Show file tree

Hide file tree

Showing 25 changed files with 244 additions and 204 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -28,7 +28,7 @@ jobs:
       matrix:
         parameters:
           - ""
-          - "--preset ONT_R10 --input https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/testdata/samplesheet_multisample_bam_ont.csv --split_fastq 2 --parallel_snv 1"
+          - "--preset ONT_R10 --input https://github.com/genomic-medicine-sweden/test-datasets/raw/e2266a34c14d1e0a9ef798de3cd81a76c9216fc1/testdata/samplesheet_multisample_bam_ont.csv --split_fastq 2 --parallel_snv 1"
         NXF_VER:
           - "23.04.0"
           - "latest-everything"

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#252](https://github.com/genomic-medicine-sweden/nallo/pull/252) - Added a new `SCATTER_GENOME` subworkflow
 - [#255](https://github.com/genomic-medicine-sweden/nallo/pull/255) - Added a new `RANK_VARIANTS` subworkflow to rank SNVs using genmod
 - [#261](https://github.com/genomic-medicine-sweden/nallo/pull/261) - Added a `--skip_rank_variants` parameter to skip the rank_variants subworkflow
+- [#264](https://github.com/genomic-medicine-sweden/nallo/pull/264) - Added a `project` column to the samplesheet
 - [#266](https://github.com/genomic-medicine-sweden/nallo/pull/266) - Added CADD to dynamically calculate indel CADD-scores
 - [#270](https://github.com/genomic-medicine-sweden/nallo/pull/270) - Added SNV phasing stats to MultiQC
 - [#271](https://github.com/genomic-medicine-sweden/nallo/pull/271) - Added a `--skip_aligned_read_qc` parameter to skip the qc aligned reads subworkflow
@@ -43,6 +44,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#261](https://github.com/genomic-medicine-sweden/nallo/pull/261) - Changed SNV annotation to run in parallel
 - [#261](https://github.com/genomic-medicine-sweden/nallo/pull/261) - Changed SNV output file names and directory structure
 - [#262](https://github.com/genomic-medicine-sweden/nallo/pull/262) - Updated README
+- [#264](https://github.com/genomic-medicine-sweden/nallo/pull/264) - Changed PED file creation from groovy script to process
+- [#264](https://github.com/genomic-medicine-sweden/nallo/pull/264) - Changed all `multisample` filenames to `{project}` from samplesheet
 - [#268](https://github.com/genomic-medicine-sweden/nallo/pull/268) - Only output unphased alignments when phasing is off
 - [#268](https://github.com/genomic-medicine-sweden/nallo/pull/268) - Changed alignment output file names and directory structure
 - [#270](https://github.com/genomic-medicine-sweden/nallo/pull/270) - Changed whatshap stats to always run, regardless of phasing software, and changed the output from `*.stats.tsv.gz` to `*.stats.tsv` to allow being picked up by MultiQC
@@ -57,6 +60,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#243](https://github.com/genomic-medicine-sweden/nallo/pull/243) - Removed VEP report from output files
 - [#257](https://github.com/genomic-medicine-sweden/nallo/pull/257) - Removed obsolete TODO statements
 - [#258](https://github.com/genomic-medicine-sweden/nallo/pull/258) - Removed VCF report from DeepVariant output
+- [#264](https://github.com/genomic-medicine-sweden/nallo/pull/264) - Removed the option to provide extra SNF files to Sniffles with `--extra_snfs`
 
 ### `Fixed`
 
@@ -69,14 +73,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 | Old parameter      | New parameter              |
 | ------------------ | -------------------------- |
-|                    | `--deepvariant_model_type` |
-| `--extra_gvcfs`    |                            |
 | `--skip_repeat_wf` | `--skip_repeat_calling`    |
 | `--skip_repeat_wf` | `--skip_repeat_annotation` |
+|                    | `--deepvariant_model_type` |
 |                    | `--skip_rank_variants`     |
 |                    | `--skip_aligned_read_qc`   |
 |                    | `--cadd_resources`         |
 |                    | `--cadd_prescored`         |
+| `--extra_gvcfs`    |                            |
+| `--extra_snfs`     |                            |
 
 > [!NOTE]
 > Parameter has been updated if both old and new parameter information is present.

diff --git a/README.md b/README.md
@@ -57,9 +57,9 @@ Prepare a samplesheet with input data:
 `samplesheet.csv`
 
 ```
-sample,file,family_id,paternal_id,maternal_id,sex,phenotype
-HG002,/path/to/HG002.fastq.gz,FAM1,HG003,HG004,1,2
-HG005,/path/to/HG005.bam,FAM1,HG003,HG004,2,1
+project,sample,file,family_id,paternal_id,maternal_id,sex,phenotype
+testrun,HG002,/path/to/HG002.fastq.gz,FAM1,HG003,HG004,1,2
+testrun,HG005,/path/to/HG005.bam,FAM1,HG003,HG004,2,1
 ```
 
 Now, you can run the pipeline using:

diff --git a/assets/schema_gvcfs.json b/assets/schema_gvcfs.json
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -48,8 +48,14 @@
                 "enum": [0, 1, 2],
                 "errorMessage": "Phenoype must be provided as 0 (missing), 1 (unaffected) or 2 (affected)",
                 "meta": ["phenotype"]
+            },
+            "project": {
+                "type": "string",
+                "pattern": "^\\S+$",
+                "errorMessage": "Project name must be provided and cannot contain spaces, needs to be the same for all samples",
+                "meta": ["project"]
             }
         },
-        "required": ["sample", "file", "family_id", "paternal_id", "maternal_id", "sex", "phenotype"]
+        "required": ["sample", "file", "family_id", "paternal_id", "maternal_id", "sex", "phenotype", "project"]
     }
 }
diff --git a/assets/schema_snfs.json b/assets/schema_snfs.json
diff --git a/conf/modules/general.config b/conf/modules/general.config
@@ -94,6 +94,15 @@ process {
         ]
     }
 
+    // Publish the generated PED file; versions.yml is excluded from the published output
+    withName: '.*:NALLO:CREATE_PEDIGREE_FILE' {
+        publishDir = [
+            path: { "${params.outdir}/pedigree" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: '.*:NALLO:SPLIT_BED_CHUNKS' {
         publishDir = [
             enabled: false

diff --git a/conf/modules/structural_variant_calling.config b/conf/modules/structural_variant_calling.config
@@ -33,10 +33,10 @@ process {
 
     withName: '.*:STRUCTURAL_VARIANT_CALLING:SNIFFLES_MULTISAMPLE' {
 
-        ext.prefix = 'multisample_sniffles'
+        ext.prefix = { "${meta.id}_sniffles" }
 
         publishDir = [
-            path: { "${params.outdir}/sv_calling/sniffles/multi_sample" },
+            path: { "${params.outdir}/sv_calling/sniffles/multi_sample/${meta.id}" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]

diff --git a/conf/test.config b/conf/test.config
@@ -23,7 +23,7 @@ params {
     // Genome references
 
     fasta = params.pipelines_testdata_base_path + 'nallo/reference/hg38.test.fa.gz'
-    input = params.pipelines_testdata_base_path + 'nallo/testdata/samplesheet.csv'
+    input = 'https://github.com/genomic-medicine-sweden/test-datasets/raw/2948776ddf24ea131f527aa1f2dc23a43bb7b952/testdata/samplesheet.csv'
 
     bed = params.pipelines_testdata_base_path + 'nallo/reference/test_data.bed'
 

diff --git a/docs/output.md b/docs/output.md
@@ -24,7 +24,7 @@
     - [Repeat annotation](#repeat-annotation)
     - [SNV Annotation](#snv-annotation)
     - [Ranked Variants](#ranked-variants)
-    - [SNV Calling](#snv-calling)
+    - [SV Calling](#sv-calling)
 
 ## Pipeline overview
 
@@ -245,10 +245,10 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
 <details markdown="1">
 <summary>Output files from Somalier</summary>
 
-- `{outputdir}/qc_aligned_reads/somalier/relate/mutlisample/`
-  - `*.html`: HTML report
-  - `*.pairs.tsv`: Output information in sample pairs
-  - `*.samples.tsv`: Output information per sample
+- `{outputdir}/qc_aligned_reads/somalier/relate/{project}/`
+  - `{project}.html`: HTML report
+  - `{project}.pairs.tsv`: Output information in sample pairs
+  - `{project}.samples.tsv`: Output information per sample
   </details>
 
 ### Raw read QC
@@ -274,7 +274,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
 <details markdown="1">
 <summary>Output files from TRGT</summary>
 
-- `{outputdir}/repeat_calling/trgt/multi_sample/multisample/`
+- `{outputdir}/repeat_calling/trgt/multi_sample/{project}/`
   - `*.vcf.gz`: Merged VCF for all samples
   - `*.vcf.gz.tbi`: Index of the corresponding VCF file
 - `{outputdir}/repeat_calling/trgt/single_sample/{sample}/`
@@ -305,9 +305,9 @@ In case of affected samples, [echtvar](https://github.com/brentp/echtvar) and [V
 <details markdown="1">
 <summary>Output files from SNV Annotation</summary>
 
-- `{outputdir}/databases/echtvar/encode/multisample/`
+- `{outputdir}/databases/echtvar/encode/{project}/`
   - `*.zip`: Database with AF and AC for all samples run
-- `{outputdir}/snvs/{single_sample,multi_sample/multisample}/`
+- `{outputdir}/snvs/{single_sample,multi_sample/{project}}/`
   - `*_snvs_annotated*.vcf.gz`: VCF with annotated variants
   - `*_snvs_annotated*.vcf.gz.tbi`: Index of the corresponding VCF file
 - `{outputdir}/snvs/stats/single_sample/`
@@ -327,29 +327,26 @@ In case of affected samples, [echtvar](https://github.com/brentp/echtvar) and [V
 <details markdown="1">
 <summary>Output files</summary>
 
-- `{outputdir}/snvs/{single_sample,multi_sample/multisample}/`
-  - `*_snvs_annotated_ranked.vcf.gz`: VCF with annotated and ranked variants
-  - `*_snvs_annotated_ranked.vcf.gz.tbi`: Index of the corresponding VCF file
+- `{outputdir}/snvs/single_sample/{sample}/`
+  - `{sample}_snv_annotated_ranked.vcf.gz`: VCF with annotated and ranked variants
+  - `{sample}_snv_annotated_ranked.vcf.gz.tbi`: Index of the corresponding VCF file
+- `{outputdir}/snvs/multi_sample/{project}/`
+  - `{project}_snv_annotated_ranked.vcf.gz`: VCF with annotated and ranked variants
+  - `{project}_snv_annotated_ranked.vcf.gz.tbi`: Index of the corresponding VCF file
   </details>
 
-### SNV Calling
+### SV Calling
 
 [Sniffles](https://github.com/fritzsedlazeck/Sniffles) is used to call and merge structural variants.
 
 <details markdown="1">
 <summary>Output files from SNV Calling</summary>
 
-- `{outputdir}/sv_calling/multi_sample/`
+- `{outputdir}/sv_calling/multi_sample/{project}`
   - `*.vcf.gz`: VCF with variants
   - `*.vcf.gz.tbi`: Index of the corresponding VCF file
 - `{outputdir}/sv_calling/single_sample/{sample}`
   - `*.snf`: Sniffles SNF file
   - `*.vcf.gz`: VCF with variants
   - `*.vcf.gz.tbi`: Index of the corresponding VCF file
-- `{outputdir}/snv_calling/single_sample/deepvariant/gvcf/{sample}/`
-  - `*.g.vcf.gz`: gVCF with variants
-  - `*.g.vcf.gz.tbi`: Index of the corresponding gVCF file
-- `{outputdir}/snv_calling/single_sample/deepvariant/vcf/{sample}/`
-  - `*.vcf.gz`: VCF with variants
-  - `*.vcf.gz.tbi`: Index of the corresponding VCF file
   </details>
diff --git a/docs/usage.md b/docs/usage.md
@@ -57,18 +57,20 @@ You will need to create a samplesheet with information about the samples you wou
 --input '[path to samplesheet file]'
 ```
 
-It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below.
+It has to be a comma-separated file with 8 columns, and a header row as shown in the examples below.
 `file` can either be a gzipped-fastq file or an aligned or unalinged BAM file (BAM files will be converted to FASTQ and aligned again).
+`project` needs to be the same for all samples in a run.
 If you don't have related samples, `family_id` could be set to sample name, and `paternal_id` and `maternal_id` should be set to 0.
 
 ```console
-sample,file,family_id,paternal_id,maternal_id,sex,phenotype
-HG002,/path/to/HG002.fastq.gz,FAM,HG003,0,1,2
-HG003,/path/to/HG003.bam,FAM,0,0,2,1
+project,sample,file,family_id,paternal_id,maternal_id,sex,phenotype
+testrun,HG002,/path/to/HG002.fastq.gz,FAM,HG003,0,1,2
+testrun,HG003,/path/to/HG003.bam,FAM,0,0,2,1
 ```
 
 | Fields        | Description                                                                                                               |
 | ------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `project`     | Project name must be provided and cannot contain spaces, needs to be the same for all samples.                            |
 | `sample`      | Custom sample name, cannot contain spaces.                                                                                |
 | `file`        | Absolute path to gzipped FASTQ or BAM file. File has to have the extension ".fastq.gz", .fq.gz" or ".bam".                |
 | `family_id`   | "Family ID must be provided and cannot contain spaces. If no family ID is available you can use the same ID as the sample |
@@ -127,16 +129,6 @@ cadd,/path/to/cadd.v1.6.hg38.zip
 
 - If running without `--skip_cnv_calling`, expected CN regions for your reference genome can be downloaded from [HiFiCNV GitHub](https://github.com/PacificBiosciences/HiFiCNV/tree/main/data) to supply with `--hificnv_xy`, `--hificnv_xx` (expected_cn) and `--hificnv_exclude` (excluded_regions).
 
-- If you want to include extra samples for mili-sample calling of SVs - prepare a samplesheet with .snf files from Sniffles to supply with `--extra_snfs`:
-
-`extra_snfs.csv`
-
-```
-sample,file
-HG01123,/path/to/HG01123_sniffles.snf
-HG01124,/path/to/HG01124_sniffles.snf
-```
-
 - If running without `--skip_call_paralogs`, the reference genome needs to be hg38
 
 - If running without `--skip_mapping_wf`, a VCF of known polymorphic sites (e.g. [sites.hg38.vcg.gz](https://github.com/brentp/somalier/files/3412456/sites.hg38.vcf.gz)) needs to be supplied with `--somalier_sites`, from which sex will be inferred if possible.
@@ -255,7 +247,6 @@ Different processes may need extra input files
 | Parameter                          | Description                                                                                                                                                                                                                                                               | Type      | Default | Required | Hidden |
 | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ |
 | `dipcall_par`                      | Provide a bed file of chrX PAR regions for dipcall                                                                                                                                                                                                                        | `string`  |         |          |        |
-| `extra_snfs`                       | Extra input files for Sniffles                                                                                                                                                                                                                                            | `string`  |         |          |        |
 | `tandem_repeats`                   | Tandem repeat BED-file for sniffles                                                                                                                                                                                                                                       | `string`  |         |          |        |
 | `trgt_repeats`                     | BED-file for repeats to be genotyped                                                                                                                                                                                                                                      | `string`  |         |          |        |
 | `snp_db`                           | Extra echtvar-databases to annotate SNVs with                                                                                                                                                                                                                             | `string`  |         |          |        |

diff --git a/lib/CustomFunctions.groovy b/lib/CustomFunctions.groovy
diff --git a/modules/local/create_pedigree_file.nf b/modules/local/create_pedigree_file.nf
@@ -0,0 +1,51 @@
+// Writes a tab-separated pedigree file named <project>.ped from the `samples` entries,
+// keeping one row per unique sample id, and emits it together with a versions.yml.
+process CREATE_PEDIGREE_FILE {
+    tag "${project}"
+    label 'process_single'
+
+    conda "conda-forge::python=3.8.3"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
+        'biocontainers/python:3.8.3' }"
+
+    input:
+    tuple val(project), val(samples)
+
+    output:
+    tuple val(project), path("*.ped"), emit: ped
+    path "versions.yml"              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // '\\t' and '\\n' are literal backslash sequences here; `echo -e` expands them in the task shell.
+    def header = ['#family_id', 'sample_id', 'father', 'mother', 'sex', 'phenotype'].join('\\t')
+    def seen = [] as Set
+    // Set.add() returns false for an id that was already added, so only the first entry per sample is kept.
+    def rows = samples
+        .findAll { sample -> seen.add(sample.id) }
+        .collect { sample -> [sample.family_id, sample.id, sample.paternal_id, sample.maternal_id, sample.sex, sample.phenotype].join('\\t') }
+    def outfile_text = ([header] + rows).join('\\n')
+    """
+    echo -e "$outfile_text" >${project}.ped
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        create_pedigree_file: v1.0
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch ${project}.ped
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        create_pedigree_file: v1.0
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}