This repository has been archived by the owner on Jan 28, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #65 from Duke-GCB/exomeseq-gatk4
exome-seq - GATK4 Preprocessing single-sample workflow
- Loading branch information
Showing
18 changed files
with
973 additions
and
14 deletions.
There are no files selected for viewing
57 changes: 57 additions & 0 deletions
57
examples/exome-seq/exomeseq-01-preprocessing-bespin-dev.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
{ | ||
"cwl_url": "https://gist.githubusercontent.com/dleehr/9bd2155a6cbf5783c4f6c8d9c32d16e3/raw/0c2fa4b829eef058ef34232ea46ec453da1537e2/exomeseq-gatk4-preprocessing.cwl", | ||
"description": "WES 01 - Preprocessing of Whole Exome sequencing following GATK best practices workflow. Germline SNP & Indel Discovery using b37 human genome assembly on samples sequenced with an xGen Exome Research Panel v1.0 capture kit. Merges reads split into multiple files.", | ||
"methods_template_url": "https://raw.githubusercontent.com/Duke-GCB/bespin-cwl/v0.9.3/workflows/exomeseq-methods.j2", | ||
"name": "Whole Exome Sequence preprocessing - Human b37", | ||
"share_group_name": "informatics", | ||
"system_json": { | ||
"GATKJar": { | ||
"class": "File", | ||
"path": "/data/exome-seq/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar" | ||
}, | ||
"interval_padding": 100, | ||
"target_intervals": [ | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/capture/xgen-exome-research-panel-targetsae255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed" | ||
} | ||
], | ||
"knownSites": [ | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/dbsnp_138.b37.vcf" | ||
}, | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/Mills_and_1000G_gold_standard.indels.b37.vcf" | ||
}, | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/1000G_phase1.indels.b37.vcf" | ||
} | ||
], | ||
"platform": "Illumina", | ||
"bait_intervals": [ | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/capture/xgen-exome-research-panel-probesbe255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed" | ||
} | ||
], | ||
"reference_genome": { | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/decoy/human_g1k_v37_decoy.fasta" | ||
}, | ||
"resource_dbsnp": { | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/dbsnp_138.b37.vcf" | ||
}, | ||
"threads": 32 | ||
}, | ||
"vm_flavor_name": "m1.xxlarge", | ||
"vm_settings_name": "lando_worker_cwltool_dev_20181012_151512", | ||
"volume_size_base": 3000, | ||
"volume_size_factor": 20, | ||
"workflow_version_number": 1, | ||
"workflow_tag": "whole-exome-sequence-analysis-01-preprocessing", | ||
"type_tag": "b37-human-xgen" | ||
} |
52 changes: 52 additions & 0 deletions
52
examples/exome-seq/exomeseq-gatk-preprocessing-bespin-dev.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
{ | ||
"cwl_url": "https://gist.githubusercontent.com/dleehr/9bd2155a6cbf5783c4f6c8d9c32d16e3/raw/a2db9f39ae4fdbd4aa8e6d9d7a7e27ffea920e03/exomeseq-gatk4-preprocessing.cwl", | ||
"description": "WES-GATK4 01 - Preprocessing of Whole Exome sequencing following GATK4 best practices workflow. Germline SNP & Indel Discovery using b37 human genome assembly on samples sequenced with an xGen Exome Research Panel v1.0 capture kit. Merges reads split into multiple files.", | ||
"methods_template_url": "https://raw.githubusercontent.com/Duke-GCB/bespin-cwl/v0.9.3/workflows/exomeseq-methods.j2", | ||
"name": "Whole Exome Sequence preprocessing - GATK4/Human b37", | ||
"share_group_name": "informatics", | ||
"system_json": { | ||
"target_intervals": [ | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/capture/xgen-exome-research-panel-targetsae255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed" | ||
} | ||
], | ||
"known_sites": [ | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/dbsnp_138.b37.vcf" | ||
}, | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/Mills_and_1000G_gold_standard.indels.b37.vcf" | ||
}, | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/1000G_phase1.indels.b37.vcf" | ||
} | ||
], | ||
"platform": "Illumina", | ||
"bait_intervals": [ | ||
{ | ||
"class": "File", | ||
"path": "/data/exome-seq/capture/xgen-exome-research-panel-probesbe255a1532796e2eaa53ff00001c1b3c-trimmed-chr.bed" | ||
} | ||
], | ||
"reference_genome": { | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/decoy/human_g1k_v37_decoy.fasta" | ||
}, | ||
"resource_dbsnp": { | ||
"class": "File", | ||
"path": "/data/exome-seq/b37/dbsnp_138.b37.vcf" | ||
}, | ||
"threads": 32 | ||
}, | ||
"vm_flavor_name": "m1.xxlarge", | ||
"vm_settings_name": "lando_worker_cwltool_dev_20180918_183036", | ||
"volume_size_base": 3000, | ||
"volume_size_factor": 20, | ||
"workflow_version_number": 1, | ||
"workflow_tag": "wes-gatk4-preprocessing", | ||
"type_tag": "b37-human-xgen" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"input_file": | ||
{ | ||
"path": "/Users/dcl9/Data/sam-header/mapped.bam", | ||
"class": "File" | ||
} | ||
, | ||
"output_filename": "fixed.bam", | ||
"reference": { "path": "/Users/dcl9/Data/genomes/hg38.fa", "class": "File"} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,13 @@ | ||
{ | ||
"input_file": | ||
"input_file": | ||
{ | ||
"path": "/home/ubuntu/bespin-cwl/data/mapped-sorted.bam", | ||
"path": "/Users/dcl9/Data/sam-header/mapped.bam", | ||
"class": "File" | ||
} | ||
, | ||
, | ||
"output_filename": "mapped-duplicates-removed.bam", | ||
"metrics_filename": "mapped-duplicates-metrics.txt" | ||
"metrics_filename": "mapped-duplicates-metrics.txt", | ||
"optical_duplicate_pixel_distance": 2500, | ||
"assume_sort_order": "queryname", | ||
"validation_stringency": "SILENT" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
#!/usr/bin/env cwl-runner | ||
|
||
# GATK4 exome-seq preprocessing for a single sample: takes a FASTQ read pair | ||
# and produces QC/trim reports, a duplicate-marked BAM, and recalibrated reads. | ||
cwlVersion: v1.0 | ||
class: Workflow | ||
requirements: | ||
  # Required because the combine_reads and qc steps use scatter | ||
  - class: ScatterFeatureRequirement | ||
  # Brings in FASTQReadPairType, the type of the read_pair input | ||
  - $import: ../types/bespin-types.yml | ||
inputs: | ||
  # Intervals should come from capture kit in bed format | ||
  # Optional; forwarded to both recalibration steps | ||
  intervals: File[]? | ||
  # target intervals in picard interval_list format (created from intervals bed file) | ||
  # NOTE(review): required, but not referenced by any step in this workflow - confirm it is still needed | ||
  target_interval_list: File | ||
  # bait intervals in picard interval_list format | ||
  # NOTE(review): required, but not referenced by any step in this workflow - confirm it is still needed | ||
  bait_interval_list: File | ||
  # Read samples, fastq format | ||
  # NOTE: GATK best practices recommends unmapped SAM/BAM files | ||
  read_pair: | ||
    type: ../types/bespin-types.yml#FASTQReadPairType | ||
  # reference genome, fasta | ||
  # NOTE: GATK can't handle compressed fasta reference genome | ||
  reference_genome: File | ||
  # Number of threads to use for mapping | ||
  # Also used as coresMin for the map step | ||
  threads: int | ||
  # Read Group annotations | ||
  # Can be the project name | ||
  library: string | ||
  # e.g. Illumina | ||
  platform: string | ||
  # Consumed by recalibrate_01_analyze | ||
  known_sites: File[] # vcf files of known sites, with indexing | ||
outputs: | ||
  # Reports from the scattered qc step (one job per combined read file) | ||
  fastqc_reports: | ||
    type: File[] | ||
    outputSource: qc/output_qc_report | ||
  # Reports emitted by the trim step | ||
  trim_reports: | ||
    type: File[] | ||
    outputSource: trim/trim_reports | ||
  # BAM written by the mark_duplicates step | ||
  # NOTE(review): mark_duplicates/output_metrics_file is not exposed as a workflow output | ||
  markduplicates_bam: | ||
    type: File | ||
    outputSource: mark_duplicates/output_dedup_bam_file | ||
  # Recalibration | ||
  # Report produced by the BaseRecalibrator step | ||
  recalibration_table: | ||
    type: File | ||
    outputSource: recalibrate_01_analyze/output_recalibration_report | ||
  # BAM produced by the ApplyBQSR step | ||
  recalibrated_reads: | ||
    type: File | ||
    outputSource: recalibrate_02_apply_bqsr/output_recalibrated_bam | ||
|
||
steps: | ||
  # Takes the read_pair record plus the library/platform annotations and | ||
  # exposes reads, read_pair_name and read_group_header to the later steps. | ||
  file_pair_details: | ||
    run: ../tools/extract-named-file-pair-details.cwl | ||
    in: { read_pair: read_pair, library: library, platform: platform } | ||
    out: [reads, read_pair_name, read_group_header] | ||
  # Supplies the output file names that the later steps pass to their tools, | ||
  # based on the read pair name reported by file_pair_details. | ||
  generate_sample_filenames: | ||
    run: ../tools/generate-sample-filenames.cwl | ||
    in: | ||
      sample_name: file_pair_details/read_pair_name | ||
    out: | ||
      # consumed by combine_reads (scattered alongside the read files) | ||
      - combined_reads_output_filenames | ||
      # consumed by map | ||
      - mapped_reads_output_filename | ||
      # consumed by sort | ||
      - sorted_reads_output_filename | ||
      # both dedup names are consumed by mark_duplicates | ||
      - dedup_reads_output_filename | ||
      - dedup_metrics_output_filename | ||
      # consumed by recalibrate_02_apply_bqsr | ||
      - recal_reads_output_filename | ||
      # consumed by recalibrate_01_analyze | ||
      - recal_table_output_filename | ||
      # NOTE(review): the next two names are not consumed by any step in this workflow | ||
      - raw_variants_output_filename | ||
      - haplotypes_bam_output_filename | ||
      # consumed by fixtags | ||
      - fixedtag_reads_output_filename | ||
combine_reads: | ||
run: ../tools/concat-gz-files.cwl | ||
scatter: [files, output_filename] | ||
scatterMethod: dotproduct | ||
in: | ||
files: file_pair_details/reads | ||
output_filename: generate_sample_filenames/combined_reads_output_filenames | ||
out: | ||
- output | ||
qc: | ||
run: ../tools/fastqc.cwl | ||
requirements: | ||
- class: ResourceRequirement | ||
coresMin: 4 | ||
ramMin: 2500 | ||
scatter: input_fastq_file | ||
in: | ||
input_fastq_file: combine_reads/output | ||
threads: | ||
default: 4 | ||
out: | ||
- output_qc_report | ||
trim: | ||
run: ../tools/trim_galore.cwl | ||
requirements: | ||
- class: ResourceRequirement | ||
coresMin: 4 | ||
ramMin: 8000 | ||
in: | ||
reads: combine_reads/output | ||
paired: | ||
default: true | ||
out: | ||
- trimmed_reads | ||
- trim_reports | ||
  # Maps the trimmed reads against the reference genome, tagging them with the | ||
  # read group header produced by file_pair_details. | ||
  map: | ||
    run: ../tools/gitc-bwa-mem-samtools.cwl | ||
    requirements: | ||
      - class: ResourceRequirement | ||
        # Request as many cores as the step's own threads input | ||
        coresMin: $(inputs.threads) | ||
        ramMin: 16000 | ||
        outdirMin: 12000 | ||
        tmpdirMin: 12000 | ||
    in: | ||
      reads: trim/trimmed_reads | ||
      reference: reference_genome | ||
      read_group_header: file_pair_details/read_group_header | ||
      output_filename: generate_sample_filenames/mapped_reads_output_filename | ||
      # Workflow-level threads input; also drives coresMin above | ||
      threads: threads | ||
    out: | ||
      - output | ||
sort: | ||
run: ../tools/GATK4-SortSam.cwl | ||
requirements: | ||
- class: ResourceRequirement | ||
ramMin: 5000 | ||
in: | ||
input_file: map/output | ||
output_sorted_bam_filename: generate_sample_filenames/sorted_reads_output_filename | ||
sort_order: { default: "coordinate" } | ||
java_opt: { default: "-Xms4000m" } | ||
out: | ||
- output_sorted_bam | ||
  # Runs GATK4-MarkDuplicates.cwl on the sorted BAM, writing a BAM and a | ||
  # metrics file under the names supplied by generate_sample_filenames. | ||
  mark_duplicates: | ||
    run: ../tools/GATK4-MarkDuplicates.cwl | ||
    requirements: | ||
      - class: ResourceRequirement | ||
        ramMin: 7000 | ||
        outdirMin: 12000 | ||
        tmpdirMin: 12000 | ||
    in: | ||
      input_file: sort/output_sorted_bam | ||
      output_filename: generate_sample_filenames/dedup_reads_output_filename | ||
      metrics_filename: generate_sample_filenames/dedup_metrics_output_filename | ||
      validation_stringency: { default: "SILENT" } | ||
      # Matches the sort_order default used by the sort step that feeds this one | ||
      assume_sort_order: { default: "coordinate" } | ||
      optical_duplicate_pixel_distance: { default: 2500 } | ||
      java_opt: { default: "-Xms4000m" } | ||
    out: | ||
      - output_dedup_bam_file | ||
      # NOTE(review): not wired to any step or workflow output in this file | ||
      - output_metrics_file | ||
fixtags: | ||
run: ../tools/GATK4-SetNmAndUqTags.cwl # what does this do? | ||
requirements: | ||
- class: ResourceRequirement | ||
ramMin: 1000 | ||
in: | ||
input_file: sort/output_sorted_bam | ||
output_filename: generate_sample_filenames/fixedtag_reads_output_filename | ||
reference: reference_genome | ||
java_opt: { default: "-Xms500m" } | ||
out: | ||
- output_fixed_tags_bam | ||
# Now recalibrate | ||
recalibrate_01_analyze: | ||
run: ../tools/GATK4-BaseRecalibrator.cwl | ||
requirements: | ||
- class: ResourceRequirement | ||
ramMin: 6000 | ||
in: | ||
reference: reference_genome | ||
input_bam: fixtags/output_fixed_tags_bam | ||
use_original_qualities: { default: true } | ||
output_recalibration_report_filename: generate_sample_filenames/recal_table_output_filename | ||
known_sites: known_sites | ||
intervals: intervals | ||
java_opt: { default: "-Xms4000m" } | ||
out: | ||
- output_recalibration_report | ||
recalibrate_02_apply_bqsr: | ||
run: ../tools/GATK4-ApplyBQSR.cwl | ||
requirements: | ||
- class: ResourceRequirement | ||
ramMin: 3500 | ||
in: | ||
reference: reference_genome | ||
input_bam: fixtags/output_fixed_tags_bam | ||
output_recalibrated_bam_filename: generate_sample_filenames/recal_reads_output_filename | ||
intervals: intervals | ||
bqsr_report: recalibrate_01_analyze/output_recalibration_report | ||
static_quantized_quals: { default: [10, 20, 30]} | ||
add_output_sam_program_record: { default: true } | ||
use_original_qualities: { default: true } | ||
java_opt: { default: "-Xms3000m" } | ||
out: | ||
- output_recalibrated_bam | ||
|
Oops, something went wrong.