-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmicroc.wdl
371 lines (324 loc) · 11.6 KB
/
microc.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
version 1.0
workflow microc {
    # "dev" builds run against the "latest" docker image tag; any other
    # pipeline_ver value would be used as the image tag directly.
    String pipeline_ver = 'dev'
    String image_id = sub(pipeline_ver, "dev", "latest")

    meta {
        author: ''
        email: ''
        description: 'Process microC data. Inspired in pipeline from: https://micro-c.readthedocs.io/en/latest/index.html'
        organization: ''
        parameter_group: {
            reference_genome: {
                title: 'Reference genome',
                description: 'Genome specific files.',
                help: 'For now, only working with hardcoded hg38 genome. The idea is to input the reference genome with the indexes'
            },
            input_genomic_data: {
                title: 'Input genomic data',
                description: 'Genomic input files for experiment.',
                help: 'Paired end fastq microC files'
            },
            alignment: {
                title: 'Alignment',
                description: 'Parameters for alignment.',
                help: 'The alignment is done using bwa, and requires the reference genome indexes, tar and gziped. The number of cores is hardoded for now.'
            }
        }
    }

    input {
        String sample_id
        Array[File] fastq_r1
        Array[File] fastq_r2
        # Provide exactly one of reference_bwa_idx (tar.gz of the bwa index
        # files) or reference_bwa_idx_prefix (URI prefix of the index files);
        # microc_align errors out if both are set.
        File? reference_bwa_idx
        String? reference_bwa_idx_prefix
        File chrom_sizes
        # NOTE(review): not referenced anywhere in this workflow; kept for
        # input-JSON backward compatibility -- confirm before removing.
        Int num_reads_per_chunk = 10000000
        File resource_monitor_script
        File top_monitor_script
    }

    # Total fastq size in GB, used below to scale the merge_pairs disk request.
    call sum_fastq_size {input: fastq_r1 = fastq_r1, fastq_r2 = fastq_r2}

    # Align each R1/R2 fastq pair in parallel; each shard emits a sorted
    # pairsam chunk plus its resource/top monitoring logs.
    scatter (fastq_pair in zip(fastq_r1, fastq_r2)) {
        call microc_align {input:
            image_id = image_id,
            sample_id = sample_id,  # fix: was assigned twice in the original call
            fastq_r1 = fastq_pair.left,
            fastq_r2 = fastq_pair.right,
            reference_index = reference_bwa_idx,
            reference_index_prefix = reference_bwa_idx_prefix,
            chrom_sizes = chrom_sizes,
            resource_monitor_script = resource_monitor_script,
            top_monitor_script = top_monitor_script
        }
    }

    # Merge per-chunk pairsams, dedup, and split into .pairs + BAM.
    call merge_pairs {input:
        image_id = image_id,
        sample_id = sample_id,
        pairsams = microc_align.pairsam,
        resource_monitor_script = resource_monitor_script,
        disk_gb = 30 + sum_fastq_size.gb * 40
    }

    # Build contact matrices in both .hic (juicer) and .mcool (cooler) formats.
    call juicer_hic {input:
        image_id = image_id,
        sample_id = sample_id,
        chrom_sizes = chrom_sizes,
        mapped_pairs = merge_pairs.mapped_pairs
    }
    call cooler {input:
        image_id = image_id,
        sample_id = sample_id,
        chrom_sizes = chrom_sizes,
        mapped_pairs = merge_pairs.mapped_pairs
    }

    call version_info {input: image_id = image_id}

    # Extract headline QC metrics from the pairtools dedup stats file.
    call run_qc {input:
        image_id = image_id,
        #mapped_pairs = merge_pairs.mapped_pairs,
        mapped_stats = merge_pairs.microc_stats,
        sample_id = sample_id
    }

    output {
        File stats = merge_pairs.microc_stats
        File mapped_pairs = merge_pairs.mapped_pairs
        File bam = merge_pairs.bam
        File bai = merge_pairs.bai
        File hic = juicer_hic.hic
        File raw_mcool = cooler.raw_mcool
        File balanced_mcool = cooler.balanced_mcool
        String pipeline_version = version_info.pipeline_version
        String reads_total = run_qc.reads_total
        String reads_mapped = run_qc.reads_mapped
        String reads_nodups = run_qc.reads_nodups
        String reads_cis_1kb = run_qc.reads_cis_1kb
        String reads_cis_10kb = run_qc.reads_cis_10kb
        Array[File] resources_align = microc_align.resources
        Array[File] top_align = microc_align.top
    }
}
# Computes the combined size (GB, rounded) of all input fastq files.
# The real work happens in the input-expression `size()` call; the command
# body is a no-op placeholder so the task has something to execute.
task sum_fastq_size {
input {
Array[File] fastq_r1
Array[File] fastq_r2
# Evaluated by the engine from file metadata before the command runs.
Int size_gb = round(size(fastq_r1, "GB") + size(fastq_r2, "GB"))
}
command <<<
echo "Calculating fastq file size"
>>>
runtime {
docker: "debian:stretch"
# NOTE(review): 2000 GB looks oversized for a task whose command only
# echoes; presumably sized to hold localized fastq inputs -- confirm
# whether the backend localizes File inputs here before shrinking.
disks: "local-disk 2000 SSD"
}
output {
# Total fastq size in GB; used by the workflow to scale merge_pairs disk.
Int gb = size_gb
}
}
# Aligns one paired-end fastq chunk with bwa and converts the alignments into
# a sorted pairsam chunk with pairtools parse/sort. Also runs two background
# monitoring scripts whose logs are returned as outputs.
task microc_align {
    input {
        String image_id
        String sample_id
        File fastq_r1
        File fastq_r2
        # Exactly one of reference_index (tar.gz of bwa index files) or
        # reference_index_prefix (URI prefix, fetched with gsutil) must be set.
        File? reference_index
        String? reference_index_prefix
        File chrom_sizes
        Int bwa_cores = 5
        String memory = "20GB"
        String disk = "100"
        String mapq = "20"
        Int preemptible = 0
        # Chunk label derived from the R1 filename; used to name the logs.
        String chunk_id = basename(fastq_r1, "_R1.fastq.gz")
        File resource_monitor_script
        File top_monitor_script
    }
    command {
        # Fail the task if any stage of the bwa|pairtools pipeline fails.
        # (fix: without pipefail only `pairtools sort`'s status was observed,
        # so bwa/parse failures produced truncated chunks that "succeeded")
        set -eo pipefail
        # Start process monitoring script (with top)
        chmod +x ${top_monitor_script}
        ${top_monitor_script} > ${chunk_id}.top.log &
        # Start resource monitoring script
        chmod +x ${resource_monitor_script}
        ${resource_monitor_script} > ${chunk_id}.resources.log &
        # Check if bwa index is provided as a tar.gz file ("reference_index")
        # or a URI prefix ("reference_index_prefix", e.g. gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.64)
        if [ ! -z "${reference_index}" ] && [ ! -z "${reference_index_prefix}" ]
        then
            echo "ERROR: Both reference_index and reference_index_prefix provided. Please provide only one."
            # fix: bare `exit` returned status 0, so this error path did not
            # actually fail the task.
            exit 1
        fi
        if [ ! -z "${reference_index}" ]
        then
            echo "Using provided reference .tar.gz: ${reference_index}"
            mkdir genome_index
            tar zxvf ${reference_index} -C genome_index
        else
            echo "Using reference_index_prefix: ${reference_index_prefix}"
            mkdir genome_index
            cd genome_index
            gsutil cp ${reference_index_prefix}.amb .
            gsutil cp ${reference_index_prefix}.ann .
            gsutil cp ${reference_index_prefix}.bwt .
            gsutil cp ${reference_index_prefix}.pac .
            gsutil cp ${reference_index_prefix}.sa .
            echo "Downloaded bwa index files:"
            ls -lh
            cd ..
        fi
        # Get genome index name (the common prefix of the bwa index files,
        # recovered from the .bwt file's path)
        BWT=$(find genome_index -name '*.bwt')
        GENOME_INDEX_FA="$(dirname $BWT)"/"$(basename $BWT .bwt)"
        echo "Using bwa index: $GENOME_INDEX_FA"
        bwa mem -5SP -T0 -t${bwa_cores} $GENOME_INDEX_FA ${fastq_r1} ${fastq_r2}| \
        pairtools parse --min-mapq ${mapq} --walks-policy 5unique \
        --max-inter-align-gap 30 --add-columns pos5,pos3,dist_to_5,dist_to_3,read_len \
        --nproc-in ${bwa_cores} --nproc-out ${bwa_cores} --chroms-path ${chrom_sizes} | \
        pairtools sort --nproc ${bwa_cores} -o chunk.pairsam.gz
    }
    runtime {
        docker: "us-central1-docker.pkg.dev/aryeelab/docker/microc:${image_id}"
        preemptible: preemptible
        bootDiskSizeGb: 40
        cpu: bwa_cores
        memory: memory
        disks: "local-disk " + disk + " SSD"
    }
    output {
        File pairsam = "chunk.pairsam.gz"
        File resources = "${chunk_id}.resources.log"
        File top = "${chunk_id}.top.log"
    }
}
# Merges the per-chunk pairsams, marks/removes duplicates (writing pairtools
# stats), and splits the result into a .pairs file plus a coordinate-sorted,
# indexed BAM.
task merge_pairs {
    input {
        String image_id
        String sample_id
        Array[File] pairsams
        File resource_monitor_script
        # Sized by the workflow as 30 + total_fastq_gb * 40.
        Int disk_gb
    }
    command {
        # Fail the task if any stage of the merge|dedup|split|view|sort
        # pipeline fails (fix: without pipefail only `samtools sort`'s exit
        # status was observed, silently accepting truncated merges).
        set -eo pipefail
        chmod u+x ${resource_monitor_script}
        ${resource_monitor_script} > resources.log &
        pairtools merge --nproc 12 ${sep=' ' pairsams} | \
        pairtools dedup --nproc-in 2 --nproc-out 8 --mark-dups --output-stats ${sample_id}.stats.txt | \
        pairtools split --nproc-in 2 --nproc-out 8 --output-pairs ${sample_id}.mapped.pairs --output-sam -| \
        samtools view -bS -@6 | \
        samtools sort -@6 -o ${sample_id}.bam
        samtools index ${sample_id}.bam
    }
    runtime {
        continueOnReturnCode: false
        docker: "us-central1-docker.pkg.dev/aryeelab/docker/microc:${image_id}"
        bootDiskSizeGb: 20
        cpu: 16
        memory: "16GB"
        disks: "local-disk " + disk_gb + " SSD"
    }
    output {
        File resources = "resources.log"
        File mapped_pairs = "${sample_id}.mapped.pairs"
        File microc_stats = "${sample_id}.stats.txt"
        File bam = "${sample_id}.bam"
        File bai = "${sample_id}.bam.bai"
    }
}
# Builds a .hic contact matrix from the deduplicated .pairs file using
# juicer_tools pre.
task juicer_hic {
    input {
        String image_id
        String sample_id
        File chrom_sizes
        File mapped_pairs
        Int cores = 4
        String disk = "200"
    }
    command {
        # -Xmx120g leaves headroom below the 128GB runtime memory request.
        java -Xmx120g -Djava.awt.headless=true -jar /usr/local/bin/juicer_tools_1.22.01.jar pre \
            --threads ${cores} \
            ${mapped_pairs} \
            ${sample_id}.hic \
            ${chrom_sizes}
    }
    runtime {
        docker: "us-central1-docker.pkg.dev/aryeelab/docker/juicer:${image_id}"
        bootDiskSizeGb: 40
        # fix: cpu was not requested, so juicer ran ${cores} threads on the
        # backend's default single-cpu allocation.
        cpu: cores
        memory: "128GB"
        disks: "local-disk " + disk + " SSD"
    }
    output {
        File hic = "${sample_id}.hic"
    }
}
# Bins the .pairs file into a single-resolution .cool, then zoomifies it into
# raw and balanced multi-resolution .mcool files.
task cooler {
    input {
        String image_id
        String sample_id
        File chrom_sizes
        File mapped_pairs
        # Base bin size in bp (fix: was `Int resolution = "10000"`, a String
        # literal assigned to an Int declaration).
        Int resolution = 10000
        String memory = "32"
        String disk = "2000"
    }
    command {
        # Columns 2-5 of a .pairs file are chrom1/pos1/chrom2/pos2.
        cooler cload pairs -c1 2 -p1 3 -c2 4 -p2 5 ${chrom_sizes}:${resolution} ${mapped_pairs} ${sample_id}.cool
        # The "N" suffix tells zoomify to emit the base resolution plus its
        # progressive 2x multiples.
        cooler zoomify --resolutions ${resolution}N -o ${sample_id}.raw.mcool -p 4 ${sample_id}.cool
        cooler zoomify --resolutions ${resolution}N -o ${sample_id}.balanced.mcool -p 4 --balance --balance-args '--nproc 4' ${sample_id}.cool
    }
    runtime {
        docker: "us-central1-docker.pkg.dev/aryeelab/docker/cooler:${image_id}"
        cpu: 4
        memory: memory + "GB"
        disks: "local-disk " + disk + " SSD"
    }
    output {
        File raw_mcool = "${sample_id}.raw.mcool"
        File balanced_mcool = "${sample_id}.balanced.mcool"
    }
}
# Reports the pipeline version baked into the docker image at /VERSION.
task version_info {
input {
String image_id
}
command {
cat /VERSION
}
runtime {
continueOnReturnCode: false
docker: "us-central1-docker.pkg.dev/aryeelab/docker/microc:${image_id}"
cpu: 1
memory: "1GB"
}
output {
# First/only content of /VERSION, captured from stdout.
String pipeline_version = read_string(stdout())
}
}
# Extracts the headline QC metrics from the pairtools dedup stats file
# (tab-separated key/value lines) and exposes them as workflow outputs.
task run_qc {
    input {
        String image_id
        File mapped_stats
        String sample_id
    }
    command {
        # Print the value (column 2) of each metric, one per line, in the
        # exact order consumed by the qc_stats indices below.
        for metric in total total_mapped total_nodups cis_1kb+ cis_10kb+
        do
            grep -w "$metric" ${mapped_stats} | cut -f2
        done
    }
    runtime {
        docker: "us-central1-docker.pkg.dev/aryeelab/docker/utils:${image_id}"
        cpu: 1
        memory: "4GB"
        disks: "local-disk 10 SSD"
    }
    output {
        Array[String] qc_stats = read_lines(stdout())
        String reads_total = qc_stats[0]
        String reads_mapped = qc_stats[1]
        String reads_nodups = qc_stats[2]
        String reads_cis_1kb = qc_stats[3]
        String reads_cis_10kb = qc_stats[4]
    }
}