Merge pull request #65 from Duke-GCB/exomeseq-gatk4

exome-seq - GATK4 Preprocessing single-sample workflow
Duke-GCB · Oct 29, 2018 · 3e3c7d2 · 3e3c7d2
2 parents f251e51 + 5d30c87
commit 3e3c7d2
Show file tree

Hide file tree

Showing 18 changed files with 973 additions and 14 deletions.
diff --git a/examples/exome-seq/exomeseq-01-preprocessing-bespin-dev.json b/examples/exome-seq/exomeseq-01-preprocessing-bespin-dev.json
@@ -0,0 +1,57 @@
+{
+  "cwl_url": "https://gist.githubusercontent.com/dleehr/9bd2155a6cbf5783c4f6c8d9c32d16e3/raw/0c2fa4b829eef058ef34232ea46ec453da1537e2/exomeseq-gatk4-preprocessing.cwl",
+  "description": "WES 01 - Preprocessing of Whole Exome sequencing following GATK best practices workflow. Germline SNP & Indel Discovery using b37 human genome assembly on samples sequenced with an xGen Exome Research Panel v1.0 capture kit. Merges reads split into multiple files.",
+  "methods_template_url": "https://raw.githubusercontent.com/Duke-GCB/bespin-cwl/v0.9.3/workflows/exomeseq-methods.j2",
+  "name": "Whole Exome Sequence preprocessing - Human b37",
+  "share_group_name": "informatics",
+  "system_json": {
+    "GATKJar": {
+      "class": "File",
+      "path": "/data/exome-seq/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar"
+    },
+    "interval_padding": 100,
+    "target_intervals": [
+      {
+        "class": "File",
+        "path": "/data/exome-seq/capture/xgen-exome-research-panel-targetsae255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed"
+      }
+    ],
+    "knownSites": [
+      {
+        "class": "File",
+        "path": "/data/exome-seq/b37/dbsnp_138.b37.vcf"
+      },
+      {
+        "class": "File",
+        "path": "/data/exome-seq/b37/Mills_and_1000G_gold_standard.indels.b37.vcf"
+      },
+      {
+        "class": "File",
+        "path": "/data/exome-seq/b37/1000G_phase1.indels.b37.vcf"
+      }
+    ],
+    "platform": "Illumina",
+    "bait_intervals": [
+      {
+        "class": "File",
+        "path": "/data/exome-seq/capture/xgen-exome-research-panel-probesbe255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed"
+      }
+    ],
+    "reference_genome": {
+      "class": "File",
+      "path": "/data/exome-seq/b37/decoy/human_g1k_v37_decoy.fasta"
+    },
+    "resource_dbsnp": {
+      "class": "File",
+      "path": "/data/exome-seq/b37/dbsnp_138.b37.vcf"
+    },
+    "threads": 32
+  },
+  "vm_flavor_name": "m1.xxlarge",
+  "vm_settings_name": "lando_worker_cwltool_dev_20181012_151512",
+  "volume_size_base": 3000,
+  "volume_size_factor": 20,
+  "workflow_version_number": 1,
+  "workflow_tag": "whole-exome-sequence-analysis-01-preprocessing",
+  "type_tag": "b37-human-xgen"
+}
diff --git a/examples/exome-seq/exomeseq-gatk-preprocessing-bespin-dev.json b/examples/exome-seq/exomeseq-gatk-preprocessing-bespin-dev.json
@@ -0,0 +1,52 @@
+{
+  "cwl_url": "https://gist.githubusercontent.com/dleehr/9bd2155a6cbf5783c4f6c8d9c32d16e3/raw/a2db9f39ae4fdbd4aa8e6d9d7a7e27ffea920e03/exomeseq-gatk4-preprocessing.cwl",
+  "description": "WES-GATK4 01 - Preprocessing of Whole Exome sequencing following GATK4 best practices workflow. Germline SNP & Indel Discovery using b37 human genome assembly on samples sequenced with an xGen Exome Research Panel v1.0 capture kit. Merges reads split into multiple files.",
+  "methods_template_url": "https://raw.githubusercontent.com/Duke-GCB/bespin-cwl/v0.9.3/workflows/exomeseq-methods.j2",
+  "name": "Whole Exome Sequence preprocessing - GATK4/Human b37",
+  "share_group_name": "informatics",
+  "system_json": {
+    "target_intervals": [
+      {
+        "class": "File",
+        "path": "/data/exome-seq/capture/xgen-exome-research-panel-targetsae255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed"
+      }
+    ],
+    "known_sites": [
+      {
+        "class": "File",
+        "path": "/data/exome-seq/b37/dbsnp_138.b37.vcf"
+      },
+      {
+        "class": "File",
+        "path": "/data/exome-seq/b37/Mills_and_1000G_gold_standard.indels.b37.vcf"
+      },
+      {
+        "class": "File",
+        "path": "/data/exome-seq/b37/1000G_phase1.indels.b37.vcf"
+      }
+    ],
+    "platform": "Illumina",
+    "bait_intervals": [
+      {
+        "class": "File",
+        "path": "/data/exome-seq/capture/xgen-exome-research-panel-probesbe255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed"
+      }
+    ],
+    "reference_genome": {
+      "class": "File",
+      "path": "/data/exome-seq/b37/decoy/human_g1k_v37_decoy.fasta"
+    },
+    "resource_dbsnp": {
+      "class": "File",
+      "path": "/data/exome-seq/b37/dbsnp_138.b37.vcf"
+    },
+    "threads": 32
+  },
+  "vm_flavor_name": "m1.xxlarge",
+  "vm_settings_name": "lando_worker_cwltool_dev_20180918_183036",
+  "volume_size_base": 3000,
+  "volume_size_factor": 20,
+  "workflow_version_number": 1,
+  "workflow_tag": "wes-gatk4-preprocessing",
+  "type_tag": "b37-human-xgen"
+}
diff --git a/examples/tools/fixtags.json b/examples/tools/fixtags.json
@@ -0,0 +1,10 @@
+{
+  "input_file":
+    {
+      "path": "/Users/dcl9/Data/sam-header/mapped.bam",
+      "class": "File"
+    }
+  ,
+  "output_filename": "fixed.bam",
+  "reference": { "path": "/Users/dcl9/Data/genomes/hg38.fa", "class": "File"}
+}
diff --git a/examples/tools/markduplicates.json b/examples/tools/markduplicates.json
@@ -1,10 +1,13 @@
 {
-  "input_file": 
+  "input_file":
     {
-      "path": "/home/ubuntu/bespin-cwl/data/mapped-sorted.bam",
+      "path": "/Users/dcl9/Data/sam-header/mapped.bam",
       "class": "File"
     }
-  , 
+  ,
   "output_filename": "mapped-duplicates-removed.bam",
-  "metrics_filename": "mapped-duplicates-metrics.txt"
+  "metrics_filename": "mapped-duplicates-metrics.txt",
+  "optical_duplicate_pixel_distance": 2500,
+  "assume_sort_order": "queryname",
+  "validation_stringency": "SILENT"
 }
diff --git a/subworkflows/exomeseq-01-preprocessing.cwl b/subworkflows/exomeseq-01-preprocessing.cwl
@@ -6,7 +6,6 @@ requirements:
   - class: ScatterFeatureRequirement
   - $import: ../types/bespin-types.yml
 inputs:
-  # NOTE: How long is this expected to take?
   # Intervals should come from capture kit in bed format
   intervals: File[]?
   # target intervals in picard interval_list format (created from intervals bed file)
@@ -15,18 +14,14 @@ inputs:
   bait_interval_list: File
   interval_padding: int?
   # Read samples, fastq format
-  # NOTE: Broad recommends the illumina basecalls and converts to unmapped SAM
-  #   but do we typically have fastq?
+  # NOTE: GATK best practices recommend unmapped SAM/BAM files
   read_pair:
     type: ../types/bespin-types.yml#FASTQReadPairType
   # reference genome, fasta
   # NOTE: GATK can't handle compressed fasta reference genome
-  # NOTE: is b37 appropriate to use?
-  # NOTE: Indexed with bwa and avoided .64 files
-  # NOTE: For mapping, they recommend a merge step, but this may only apply to having raw basecalls
   reference_genome: File
   # Number of threads to use for mapping
-  threads: int?
+  threads: int
   # Read Group annotations
   # Can be the project name
   library: string
@@ -131,7 +126,7 @@ steps:
     run: ../tools/bwa-mem-samtools.cwl
     requirements:
       - class: ResourceRequirement
-        coresMin: 8
+        coresMin: $(inputs.threads)
         ramMin: 16000
         outdirMin: 12000
         tmpdirMin: 12000

diff --git a/subworkflows/exomeseq-gatk4-01-preprocessing.cwl b/subworkflows/exomeseq-gatk4-01-preprocessing.cwl
@@ -0,0 +1,200 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.0
+class: Workflow
+requirements:
+  - class: ScatterFeatureRequirement
+  - $import: ../types/bespin-types.yml
+inputs:
+  # Capture-kit intervals in BED format (optional).
+  # Forwarded to both recalibration steps.
+  intervals:
+    type: File[]?
+  # Target intervals in Picard interval_list format (created from the
+  # intervals BED file). Declared here, but no step in this workflow reads it.
+  target_interval_list:
+    type: File
+  # Bait intervals in Picard interval_list format.
+  # Declared here, but no step in this workflow reads it.
+  bait_interval_list:
+    type: File
+  # Read samples, FASTQ format
+  # NOTE: GATK best practices recommend unmapped SAM/BAM files
+  read_pair:
+    type: ../types/bespin-types.yml#FASTQReadPairType
+  # Reference genome, FASTA format
+  # NOTE: GATK can't handle a compressed FASTA reference genome
+  reference_genome:
+    type: File
+  # Number of threads to use for mapping; the map step also uses this
+  # value as its coresMin
+  threads:
+    type: int
+  # Read group annotations
+  # Can be the project name
+  library:
+    type: string
+  # e.g. Illumina
+  platform:
+    type: string
+  # VCF files of known sites, with indexing
+  known_sites:
+    type: File[]
+outputs:
+  # One report per combined reads file (the qc step is scattered)
+  fastqc_reports:
+    outputSource: qc/output_qc_report
+    type: File[]
+  # Reports written by the trim step
+  trim_reports:
+    outputSource: trim/trim_reports
+    type: File[]
+  # BAM produced by the mark_duplicates step
+  markduplicates_bam:
+    outputSource: mark_duplicates/output_dedup_bam_file
+    type: File
+  # Recalibration: the report from the analysis pass ...
+  recalibration_table:
+    outputSource: recalibrate_01_analyze/output_recalibration_report
+    type: File
+  # ... and the BAM with that report applied
+  recalibrated_reads:
+    outputSource: recalibrate_02_apply_bqsr/output_recalibrated_bam
+    type: File
+
+steps:
+  # Takes the read pair plus the library/platform annotations and exposes the
+  # reads, the read pair name and a read group header to the later steps.
+  file_pair_details:
+    run: ../tools/extract-named-file-pair-details.cwl
+    in:
+      read_pair: read_pair
+      library: library
+      platform: platform
+    out: [reads, read_pair_name, read_group_header]
+  # Derives every per-sample output filename from the read pair name, so the
+  # downstream steps only have to wire the names through.
+  generate_sample_filenames:
+    run: ../tools/generate-sample-filenames.cwl
+    in:
+      sample_name: file_pair_details/read_pair_name
+    out:
+      # Names consumed by steps of this workflow
+      - combined_reads_output_filenames
+      - mapped_reads_output_filename
+      - sorted_reads_output_filename
+      - dedup_reads_output_filename
+      - dedup_metrics_output_filename
+      - fixedtag_reads_output_filename
+      - recal_table_output_filename
+      - recal_reads_output_filename
+      # Names not referenced by any step of this workflow
+      - raw_variants_output_filename
+      - haplotypes_bam_output_filename
+  # Concatenates gzipped read files. Scattered as a dot product, so the i-th
+  # group of files is written to the i-th combined output filename.
+  combine_reads:
+    run: ../tools/concat-gz-files.cwl
+    scatterMethod: dotproduct
+    scatter:
+      - files
+      - output_filename
+    in:
+      files: file_pair_details/reads
+      output_filename: generate_sample_filenames/combined_reads_output_filenames
+    out: [output]
+  # Runs FastQC once per combined reads file.
+  qc:
+    run: ../tools/fastqc.cwl
+    scatter: input_fastq_file
+    requirements:
+      - class: ResourceRequirement
+        # Same value as the threads default below
+        coresMin: 4
+        ramMin: 2500
+    in:
+      input_fastq_file: combine_reads/output
+      threads:
+        default: 4
+    out: [output_qc_report]
+  # Runs trim_galore on the combined reads in paired mode.
+  trim:
+    run: ../tools/trim_galore.cwl
+    requirements:
+      - class: ResourceRequirement
+        coresMin: 4
+        ramMin: 8000
+    in:
+      reads: combine_reads/output
+      paired:
+        default: true
+    out: [trimmed_reads, trim_reports]
+  # Maps the trimmed reads to the reference genome, tagging them with the
+  # read group header.
+  map:
+    run: ../tools/gitc-bwa-mem-samtools.cwl
+    requirements:
+      - class: ResourceRequirement
+        # Request as many cores as the step is given threads
+        coresMin: $(inputs.threads)
+        ramMin: 16000
+        outdirMin: 12000
+        tmpdirMin: 12000
+    in:
+      reads: trim/trimmed_reads
+      reference: reference_genome
+      read_group_header: file_pair_details/read_group_header
+      output_filename: generate_sample_filenames/mapped_reads_output_filename
+      threads: threads
+    out: [output]
+  # Coordinate-sorts the mapped reads.
+  sort:
+    run: ../tools/GATK4-SortSam.cwl
+    requirements:
+      - class: ResourceRequirement
+        ramMin: 5000
+    in:
+      input_file: map/output
+      output_sorted_bam_filename: generate_sample_filenames/sorted_reads_output_filename
+      sort_order:
+        default: "coordinate"
+      # Initial JVM heap, kept below ramMin
+      java_opt:
+        default: "-Xms4000m"
+    out: [output_sorted_bam]
+  # Marks duplicates in the coordinate-sorted reads and writes a metrics file.
+  mark_duplicates:
+    run: ../tools/GATK4-MarkDuplicates.cwl
+    requirements:
+      - class: ResourceRequirement
+        ramMin: 7000
+        outdirMin: 12000
+        tmpdirMin: 12000
+    in:
+      input_file: sort/output_sorted_bam
+      output_filename: generate_sample_filenames/dedup_reads_output_filename
+      metrics_filename: generate_sample_filenames/dedup_metrics_output_filename
+      validation_stringency:
+        default: "SILENT"
+      # Matches the sort_order used by the sort step
+      assume_sort_order:
+        default: "coordinate"
+      optical_duplicate_pixel_distance:
+        default: 2500
+      # Initial JVM heap, kept below ramMin
+      java_opt:
+        default: "-Xms4000m"
+    out: [output_dedup_bam_file, output_metrics_file]
+  # Recomputes the alignment tags (NM and UQ; per the GATK documentation the
+  # current tool, SetNmMdAndUqTags, also sets MD) by comparing each read with
+  # the reference genome.
+  # Reads the mark_duplicates output rather than the sorted BAM: this step
+  # feeds both recalibration steps, so wiring it to sort/output_sorted_bam
+  # would drop the duplicate flags from base recalibration and from the final
+  # recalibrated reads.
+  fixtags:
+    run: ../tools/GATK4-SetNmAndUqTags.cwl
+    requirements:
+      - class: ResourceRequirement
+        ramMin: 1000
+    in:
+      input_file: mark_duplicates/output_dedup_bam_file
+      output_filename: generate_sample_filenames/fixedtag_reads_output_filename
+      reference: reference_genome
+      # Initial JVM heap, kept below ramMin
+      java_opt:
+        default: "-Xms500m"
+    out:
+      - output_fixed_tags_bam
+  # Recalibration, pass 1: analyse the tag-fixed reads against the known sites
+  # and write the recalibration report.
+  recalibrate_01_analyze:
+    run: ../tools/GATK4-BaseRecalibrator.cwl
+    requirements:
+      - class: ResourceRequirement
+        ramMin: 6000
+    in:
+      reference: reference_genome
+      input_bam: fixtags/output_fixed_tags_bam
+      use_original_qualities:
+        default: true
+      output_recalibration_report_filename: generate_sample_filenames/recal_table_output_filename
+      known_sites: known_sites
+      intervals: intervals
+      # Initial JVM heap, kept below ramMin
+      java_opt:
+        default: "-Xms4000m"
+    out: [output_recalibration_report]
+  # Recalibration, pass 2: apply the report from pass 1 to the same tag-fixed
+  # reads and write the recalibrated BAM.
+  recalibrate_02_apply_bqsr:
+    run: ../tools/GATK4-ApplyBQSR.cwl
+    requirements:
+      - class: ResourceRequirement
+        ramMin: 3500
+    in:
+      reference: reference_genome
+      input_bam: fixtags/output_fixed_tags_bam
+      output_recalibrated_bam_filename: generate_sample_filenames/recal_reads_output_filename
+      intervals: intervals
+      bqsr_report: recalibrate_01_analyze/output_recalibration_report
+      static_quantized_quals:
+        default: [10, 20, 30]
+      add_output_sam_program_record:
+        default: true
+      use_original_qualities:
+        default: true
+      # Initial JVM heap, kept below ramMin
+      java_opt:
+        default: "-Xms3000m"
+    out: [output_recalibrated_bam]
+