Skip to content
This repository has been archived by the owner on Jan 28, 2020. It is now read-only.

Commit

Permalink
Merge pull request #65 from Duke-GCB/exomeseq-gatk4
Browse files Browse the repository at this point in the history
exome-seq - GATK4 Preprocessing single-sample workflow
  • Loading branch information
dleehr authored Oct 29, 2018
2 parents f251e51 + 5d30c87 commit 3e3c7d2
Show file tree
Hide file tree
Showing 18 changed files with 973 additions and 14 deletions.
57 changes: 57 additions & 0 deletions examples/exome-seq/exomeseq-01-preprocessing-bespin-dev.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"cwl_url": "https://gist.githubusercontent.com/dleehr/9bd2155a6cbf5783c4f6c8d9c32d16e3/raw/0c2fa4b829eef058ef34232ea46ec453da1537e2/exomeseq-gatk4-preprocessing.cwl",
"description": "WES 01 - Preprocessing of Whole Exome sequencing following GATK best practices workflow. Germline SNP & Indel Discovery using b37 human genome assembly on samples sequenced with an xGen Exome Research Panel v1.0 capture kit. Merges reads split into multiple files.",
"methods_template_url": "https://raw.githubusercontent.com/Duke-GCB/bespin-cwl/v0.9.3/workflows/exomeseq-methods.j2",
"name": "Whole Exome Sequence preprocessing - Human b37",
"share_group_name": "informatics",
"system_json": {
"GATKJar": {
"class": "File",
"path": "/data/exome-seq/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar"
},
"interval_padding": 100,
"target_intervals": [
{
"class": "File",
"path": "/data/exome-seq/capture/xgen-exome-research-panel-targetsae255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed"
}
],
"knownSites": [
{
"class": "File",
"path": "/data/exome-seq/b37/dbsnp_138.b37.vcf"
},
{
"class": "File",
"path": "/data/exome-seq/b37/Mills_and_1000G_gold_standard.indels.b37.vcf"
},
{
"class": "File",
"path": "/data/exome-seq/b37/1000G_phase1.indels.b37.vcf"
}
],
"platform": "Illumina",
"bait_intervals": [
{
"class": "File",
"path": "/data/exome-seq/capture/xgen-exome-research-panel-probesbe255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed"
}
],
"reference_genome": {
"class": "File",
"path": "/data/exome-seq/b37/decoy/human_g1k_v37_decoy.fasta"
},
"resource_dbsnp": {
"class": "File",
"path": "/data/exome-seq/b37/dbsnp_138.b37.vcf"
},
"threads": 32
},
"vm_flavor_name": "m1.xxlarge",
"vm_settings_name": "lando_worker_cwltool_dev_20181012_151512",
"volume_size_base": 3000,
"volume_size_factor": 20,
"workflow_version_number": 1,
"workflow_tag": "whole-exome-sequence-analysis-01-preprocessing",
"type_tag": "b37-human-xgen"
}
52 changes: 52 additions & 0 deletions examples/exome-seq/exomeseq-gatk-preprocessing-bespin-dev.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"cwl_url": "https://gist.githubusercontent.com/dleehr/9bd2155a6cbf5783c4f6c8d9c32d16e3/raw/a2db9f39ae4fdbd4aa8e6d9d7a7e27ffea920e03/exomeseq-gatk4-preprocessing.cwl",
"description": "WES-GATK4 01 - Preprocessing of Whole Exome sequencing following GATK4 best practices workflow. Germline SNP & Indel Discovery using b37 human genome assembly on samples sequenced with an xGen Exome Research Panel v1.0 capture kit. Merges reads split into multiple files.",
"methods_template_url": "https://raw.githubusercontent.com/Duke-GCB/bespin-cwl/v0.9.3/workflows/exomeseq-methods.j2",
"name": "Whole Exome Sequence preprocessing - GATK4/Human b37",
"share_group_name": "informatics",
"system_json": {
"target_intervals": [
{
"class": "File",
"path": "/data/exome-seq/capture/xgen-exome-research-panel-targetsae255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed"
}
],
"known_sites": [
{
"class": "File",
"path": "/data/exome-seq/b37/dbsnp_138.b37.vcf"
},
{
"class": "File",
"path": "/data/exome-seq/b37/Mills_and_1000G_gold_standard.indels.b37.vcf"
},
{
"class": "File",
"path": "/data/exome-seq/b37/1000G_phase1.indels.b37.vcf"
}
],
"platform": "Illumina",
"bait_intervals": [
{
"class": "File",
"path": "/data/exome-seq/capture/xgen-exome-research-panel-probesbe255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed"
}
],
"reference_genome": {
"class": "File",
"path": "/data/exome-seq/b37/decoy/human_g1k_v37_decoy.fasta"
},
"resource_dbsnp": {
"class": "File",
"path": "/data/exome-seq/b37/dbsnp_138.b37.vcf"
},
"threads": 32
},
"vm_flavor_name": "m1.xxlarge",
"vm_settings_name": "lando_worker_cwltool_dev_20180918_183036",
"volume_size_base": 3000,
"volume_size_factor": 20,
"workflow_version_number": 1,
"workflow_tag": "wes-gatk4-preprocessing",
"type_tag": "b37-human-xgen"
}
10 changes: 10 additions & 0 deletions examples/tools/fixtags.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"input_file":
{
"path": "/Users/dcl9/Data/sam-header/mapped.bam",
"class": "File"
}
,
"output_filename": "fixed.bam",
"reference": { "path": "/Users/dcl9/Data/genomes/hg38.fa", "class": "File"}
}
11 changes: 7 additions & 4 deletions examples/tools/markduplicates.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
{
"input_file":
"input_file":
{
"path": "/home/ubuntu/bespin-cwl/data/mapped-sorted.bam",
"path": "/Users/dcl9/Data/sam-header/mapped.bam",
"class": "File"
}
,
,
"output_filename": "mapped-duplicates-removed.bam",
"metrics_filename": "mapped-duplicates-metrics.txt"
"metrics_filename": "mapped-duplicates-metrics.txt",
"optical_duplicate_pixel_distance": 2500,
"assume_sort_order": "queryname",
"validation_stringency": "SILENT"
}
11 changes: 3 additions & 8 deletions subworkflows/exomeseq-01-preprocessing.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ requirements:
- class: ScatterFeatureRequirement
- $import: ../types/bespin-types.yml
inputs:
# NOTE: How long is this expected to take?
# Intervals should come from capture kit in bed format
intervals: File[]?
# target intervals in picard interval_list format (created from intervals bed file)
Expand All @@ -15,18 +14,14 @@ inputs:
bait_interval_list: File
interval_padding: int?
# Read samples, fastq format
# NOTE: Broad recommends the illumina basecalls and converts to unmapped SAM
# but do we typically have fastq?
# NOTE: GATK best practices recommends unmapped SAM/BAM files
read_pair:
type: ../types/bespin-types.yml#FASTQReadPairType
# reference genome, fasta
# NOTE: GATK can't handle compressed fasta reference genome
# NOTE: is b37 appropriate to use?
# NOTE: Indexed with bwa and avoided .64 files
# NOTE: For mapping, they recommend a merge step, but this may only apply to having raw basecalls
reference_genome: File
# Number of threads to use for mapping
threads: int?
threads: int
# Read Group annotations
# Can be the project name
library: string
Expand Down Expand Up @@ -131,7 +126,7 @@ steps:
run: ../tools/bwa-mem-samtools.cwl
requirements:
- class: ResourceRequirement
coresMin: 8
coresMin: $(inputs.threads)
ramMin: 16000
outdirMin: 12000
tmpdirMin: 12000
Expand Down
200 changes: 200 additions & 0 deletions subworkflows/exomeseq-gatk4-01-preprocessing.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
#!/usr/bin/env cwl-runner

# Exome-seq preprocessing of a single sample using GATK4 tools.
#
# Data flow through the steps defined below:
#   file_pair_details + generate_sample_filenames -> combine_reads
#   combine_reads -> qc (FastQC reports)
#   combine_reads -> trim -> map -> sort -> mark_duplicates -> fixtags
#   fixtags -> recalibrate_01_analyze -> recalibrate_02_apply_bqsr
cwlVersion: v1.0
class: Workflow
requirements:
  - class: ScatterFeatureRequirement
  - $import: ../types/bespin-types.yml
inputs:
  # Intervals should come from capture kit in bed format
  intervals: File[]?
  # target intervals in picard interval_list format (created from intervals bed file)
  # NOTE(review): not consumed by any step in this workflow
  target_interval_list: File
  # bait intervals in picard interval_list format
  # NOTE(review): not consumed by any step in this workflow
  bait_interval_list: File
  # Read samples, fastq format
  # NOTE: GATK best practices recommends unmapped SAM/BAM files
  read_pair:
    type: ../types/bespin-types.yml#FASTQReadPairType
  # reference genome, fasta
  # NOTE: GATK can't handle compressed fasta reference genome
  reference_genome: File
  # Number of threads to use for mapping (also used as coresMin of the map step)
  threads: int
  # Read Group annotations
  # Can be the project name
  library: string
  # e.g. Illumina
  platform: string
  known_sites: File[]  # vcf files of known sites, with indexing
outputs:
  fastqc_reports:
    type: File[]
    outputSource: qc/output_qc_report
  trim_reports:
    type: File[]
    outputSource: trim/trim_reports
  markduplicates_bam:
    type: File
    outputSource: mark_duplicates/output_dedup_bam_file
  # Recalibration
  recalibration_table:
    type: File
    outputSource: recalibrate_01_analyze/output_recalibration_report
  recalibrated_reads:
    type: File
    outputSource: recalibrate_02_apply_bqsr/output_recalibrated_bam

steps:
  # Derive the read files, sample name and read group header from the read pair
  file_pair_details:
    run: ../tools/extract-named-file-pair-details.cwl
    in:
      read_pair: read_pair
      library: library
      platform: platform
    out:
      - reads
      - read_pair_name
      - read_group_header
  # Derive every per-sample output filename from the sample name
  generate_sample_filenames:
    run: ../tools/generate-sample-filenames.cwl
    in:
      sample_name: file_pair_details/read_pair_name
    out:
      - combined_reads_output_filenames
      - mapped_reads_output_filename
      - sorted_reads_output_filename
      - dedup_reads_output_filename
      - dedup_metrics_output_filename
      - recal_reads_output_filename
      - recal_table_output_filename
      - raw_variants_output_filename
      - haplotypes_bam_output_filename
      - fixedtag_reads_output_filename
  # Concatenate reads that were split into multiple files; file lists and
  # output filenames are paired element-wise (dotproduct)
  combine_reads:
    run: ../tools/concat-gz-files.cwl
    scatter: [files, output_filename]
    scatterMethod: dotproduct
    in:
      files: file_pair_details/reads
      output_filename: generate_sample_filenames/combined_reads_output_filenames
    out:
      - output
  # FastQC, run once per combined fastq file
  qc:
    run: ../tools/fastqc.cwl
    requirements:
      - class: ResourceRequirement
        coresMin: 4
        ramMin: 2500
    scatter: input_fastq_file
    in:
      input_fastq_file: combine_reads/output
      threads:
        default: 4
    out:
      - output_qc_report
  trim:
    run: ../tools/trim_galore.cwl
    requirements:
      - class: ResourceRequirement
        coresMin: 4
        ramMin: 8000
    in:
      reads: combine_reads/output
      paired:
        default: true
    out:
      - trimmed_reads
      - trim_reports
  map:
    run: ../tools/gitc-bwa-mem-samtools.cwl
    requirements:
      - class: ResourceRequirement
        # Reserve as many cores as the number of mapping threads requested
        coresMin: $(inputs.threads)
        ramMin: 16000
        outdirMin: 12000
        tmpdirMin: 12000
    in:
      reads: trim/trimmed_reads
      reference: reference_genome
      read_group_header: file_pair_details/read_group_header
      output_filename: generate_sample_filenames/mapped_reads_output_filename
      threads: threads
    out:
      - output
  sort:
    run: ../tools/GATK4-SortSam.cwl
    requirements:
      - class: ResourceRequirement
        ramMin: 5000
    in:
      input_file: map/output
      output_sorted_bam_filename: generate_sample_filenames/sorted_reads_output_filename
      sort_order: { default: "coordinate" }
      java_opt: { default: "-Xms4000m" }
    out:
      - output_sorted_bam
  mark_duplicates:
    run: ../tools/GATK4-MarkDuplicates.cwl
    requirements:
      - class: ResourceRequirement
        ramMin: 7000
        outdirMin: 12000
        tmpdirMin: 12000
    in:
      input_file: sort/output_sorted_bam
      output_filename: generate_sample_filenames/dedup_reads_output_filename
      metrics_filename: generate_sample_filenames/dedup_metrics_output_filename
      validation_stringency: { default: "SILENT" }
      # Input comes from the coordinate sort step above
      assume_sort_order: { default: "coordinate" }
      optical_duplicate_pixel_distance: { default: 2500 }
      java_opt: { default: "-Xms4000m" }
    out:
      - output_dedup_bam_file
      - output_metrics_file
  # NOTE(review): presumably wraps Picard/GATK4 SetNmAndUqTags, which recalculates
  # the NM and UQ tags by comparing reads against the reference - confirm against
  # ../tools/GATK4-SetNmAndUqTags.cwl
  fixtags:
    run: ../tools/GATK4-SetNmAndUqTags.cwl
    requirements:
      - class: ResourceRequirement
        ramMin: 1000
    in:
      # Consume the duplicate-marked BAM rather than sort/output_sorted_bam;
      # otherwise mark_duplicates is bypassed and the recalibrated reads carry
      # no duplicate marks.
      input_file: mark_duplicates/output_dedup_bam_file
      output_filename: generate_sample_filenames/fixedtag_reads_output_filename
      reference: reference_genome
      java_opt: { default: "-Xms500m" }
    out:
      - output_fixed_tags_bam
  # Now recalibrate
  recalibrate_01_analyze:
    run: ../tools/GATK4-BaseRecalibrator.cwl
    requirements:
      - class: ResourceRequirement
        ramMin: 6000
    in:
      reference: reference_genome
      input_bam: fixtags/output_fixed_tags_bam
      use_original_qualities: { default: true }
      output_recalibration_report_filename: generate_sample_filenames/recal_table_output_filename
      known_sites: known_sites
      intervals: intervals
      java_opt: { default: "-Xms4000m" }
    out:
      - output_recalibration_report
  recalibrate_02_apply_bqsr:
    run: ../tools/GATK4-ApplyBQSR.cwl
    requirements:
      - class: ResourceRequirement
        ramMin: 3500
    in:
      reference: reference_genome
      input_bam: fixtags/output_fixed_tags_bam
      output_recalibrated_bam_filename: generate_sample_filenames/recal_reads_output_filename
      intervals: intervals
      # Recalibration table produced by recalibrate_01_analyze
      bqsr_report: recalibrate_01_analyze/output_recalibration_report
      static_quantized_quals: { default: [10, 20, 30] }
      add_output_sam_program_record: { default: true }
      use_original_qualities: { default: true }
      java_opt: { default: "-Xms3000m" }
    out:
      - output_recalibrated_bam

Loading

0 comments on commit 3e3c7d2

Please sign in to comment.