-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmbin_nmdc.wdl
executable file
·544 lines (509 loc) · 17.8 KB
/
mbin_nmdc.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
version 1.0
workflow nmdc_mags {
input {
String proj
String contig_file
String sam_file
String gff_file
String proteins_file
String cog_file
String ec_file
String ko_file
String pfam_file
String tigrfam_file
String crispr_file
String product_names_file
String gene_phylogeny_file
String lineage_file
String? map_file
String? scratch_dir
Int cpu=32
Int threads=64
Int pthreads=1
String gtdbtk_db="/refdata/GTDBTK_DB/gtdbtk_release207_v2"
String checkm_db="/refdata/checkM_DB/checkm_data_2015_01_16"
String eukcc2_db="/refdata/EUKCC2_DB/eukcc2_db_ver_1.2"
String package_container = "microbiomedata/nmdc_mbin_vis:0.7.0"
String container = "microbiomedata/nmdc_mbin@sha256:2d9d79740eef27b2959d00c34fa5816ff911ec7c4dc80fedb110ab170ce0b868"
}
call stage {
input:
container=container,
contig_file=contig_file,
sam_file=sam_file,
gff_file=gff_file,
proteins_file=proteins_file,
cog_file=cog_file,
ec_file=ec_file,
ko_file=ko_file,
pfam_file=pfam_file,
tigrfam_file=tigrfam_file,
crispr_file=crispr_file,
product_names_file=product_names_file,
gene_phylogeny_file=gene_phylogeny_file,
lineage_file=lineage_file,
map_file=map_file
}
call check_id_map {
input:
container=container,
contig_file=stage.contig,
proteins_file=stage.proteins
}
call mbin_nmdc {
input:
name=proj,
fna = check_id_map.contig,
aln = stage.sam,
gff = stage.gff,
lineage=stage.lineage_tsv,
threads = threads,
pthreads = pthreads,
gtdbtk_env = gtdbtk_db,
checkm_env = checkm_db,
eukcc2_env = eukcc2_db,
map_file = stage.map_tsv,
mbin_container = container
}
call package {
input: proj = proj,
bins=flatten([mbin_nmdc.hqmq_bin_fasta_files,mbin_nmdc.lq_bin_fasta_files]),
json_stats=mbin_nmdc.stats_json,
gff_file=stage.gff,
proteins_file=stage.proteins,
cog_file=stage.cog,
ec_file=stage.ec,
ko_file=stage.ko,
pfam_file=stage.pfam,
tigrfam_file=stage.tigrfam,
crispr_file=stage.crispr,
gene_phylogeny_file=stage.gene_phylogeny,
product_names_file=stage.product_names,
container=package_container
}
call finish_mags {
input: container="microbiomedata/workflowmeta:1.1.1",
proj=proj,
bacsum= mbin_nmdc.bacsum,
arcsum = mbin_nmdc.arcsum,
short = mbin_nmdc.short,
low = mbin_nmdc.low,
unbinned = mbin_nmdc.unbinned,
checkm = mbin_nmdc.checkm,
mbin_sdb = mbin_nmdc.mbin_sdb,
mbin_version = mbin_nmdc.mbin_version,
stats_json = package.stats_json,
stats_tsv = mbin_nmdc.stats_tsv,
hqmq_bin_tarfiles = package.hqmq_bin_tarfiles,
lq_bin_tarfiles = package.lq_bin_tarfiles,
barplot = package.barplot,
heatmap = package.heatmap,
kronaplot = package.kronaplot,
eukcc_file=mbin_nmdc.eukcc_csv,
ko_matrix = package.ko_matrix
}
output {
File final_hqmq_bins_zip = finish_mags.final_hqmq_bins_zip
File final_lq_bins_zip = finish_mags.final_lq_bins_zip
File final_gtdbtk_bac_summary = finish_mags.final_gtdbtk_bac_summary
File final_gtdbtk_ar_summary = finish_mags.final_gtdbtk_ar_summary
File short = finish_mags.final_short
File low = finish_mags.final_lowDepth_fa
File final_unbinned_fa = finish_mags.final_unbinned_fa
File final_checkm = finish_mags.final_checkm
File mags_version = finish_mags.final_version
File final_stats_json = finish_mags.final_stats_json
File barplot = finish_mags.final_barplot
File heatmap = finish_mags.final_heatmap
File kronaplot = finish_mags.final_kronaplot
}
}
task mbin_nmdc {
input{
File fna
File aln
File gff
File lineage
String name
File? map_file
Int? threads
Int? pthreads
String gtdbtk_env
String checkm_env
String? eukcc2_env
String mbin_container
}
command<<<
set -euo pipefail
export GTDBTK_DATA_PATH=~{gtdbtk_env}
export CHECKM_DATA_PATH=~{checkm_env}
mbin.py ~{"--threads " + threads} ~{"--pthreads " + pthreads} ~{"--map " + map_file} ~{"--eukccdb " + eukcc2_env} --fna ~{fna} --gff ~{gff} --aln ~{aln} --lintsv ~{lineage}
mbin_stats.py $PWD
mbin_versions.py > mbin_nmdc_versions.log
touch MAGs_stats.tsv
if [ -f gtdbtk-output/gtdbtk.bac120.summary.tsv ]; then
echo "bacterial summary exists."
else
mkdir -p gtdbtk-output
echo "No Bacterial Results for ~{name}" > gtdbtk-output/gtdbtk.bac120.summary.tsv
fi
if [ -f gtdbtk-output/gtdbtk.ar122.summary.tsv ]; then
echo "archaeal summary exists."
else
mkdir -p gtdbtk-output
echo "No Archaeal Results for ~{name}" > gtdbtk-output/gtdbtk.ar122.summary.tsv
fi
if [ -f checkm-qa.out ]; then
echo "checkm summary exists."
else
mkdir -p gtdbtk-output
echo "No Checkm Results for ~{name}" > checkm-qa.out
fi
if [ -f mbin.sdb ]; then
echo "mbin.sdb exists."
else
mkdir -p gtdbtk-output
echo "Mbin Sdb Could not be created for ~{name}" && touch mbin.sdb
fi
if [ -f eukcc_output/eukcc.csv.final ]; then
echo "eukcc.csv.final exists."
else
mkdir -p eukcc_output
echo "No EUKCC2 result for ~{name}" > eukcc_output/eukcc.csv.final
fi
>>>
runtime{
docker : mbin_container
memory : "120 G"
time : "2:00:00"
cpu : threads
}
output{
File short = "bins.tooShort.fa"
File low = "bins.lowDepth.fa"
File unbinned = "bins.unbinned.fa"
File checkm = "checkm-qa.out"
File stats_json = "MAGs_stats.json"
File stats_tsv = "MAGs_stats.tsv"
File mbin_sdb = "mbin.sdb"
File mbin_version = "mbin_nmdc_versions.log"
File bacsum = "gtdbtk-output/gtdbtk.bac120.summary.tsv"
File arcsum = "gtdbtk-output/gtdbtk.ar122.summary.tsv"
File eukcc_csv = "eukcc_output/eukcc.csv.final"
Array[File] hqmq_bin_fasta_files = glob("hqmq-metabat-bins/*fa")
Array[File] lq_bin_fasta_files = glob("filtered-metabat-bins/*fa")
}
}
task stage {
input{
String container
String contig_file
String sam_file
String gff_file
String proteins_file
String cog_file
String ec_file
String ko_file
String pfam_file
String tigrfam_file
String crispr_file
String product_names_file
String gene_phylogeny_file
String lineage_file
String? map_file
String contigs_out="contigs.fasta"
String bam_out="pairedMapped_sorted.bam"
String gff_out="functional_annotation.gff"
String proteins_out="proteins.faa"
String cog_out="cog.gff"
String ec_out="ec.tsv"
String ko_out="ko.tsv"
String pfam_out="pfam.gff"
String tigrfam_out="tigrfam.gff"
String crispr_out="crispr.tsv"
String products_out="products.tsv"
String gene_phylogeny_out="gene_phylogeny.tsv"
String lineage_out="lineage.tsv"
String map_out="map_file.tsv"
}
command<<<
set -euo pipefail
function stage() {
in=$1
out=$2
if [ $( echo $in |egrep -c "https*:") -gt 0 ] ; then
wget $in -O $out
else
ln $in $out || cp $in $out
fi
}
function fail {
printf '%s\n' "$1" >&2 ## Send message to stderr.
exit "${2-1}" ## Return a code specified by $2, or 1 by default.
}
stage ~{contig_file} ~{contigs_out} &
stage ~{sam_file} ~{bam_out} &
stage ~{gff_file} ~{gff_out} &
stage ~{proteins_file} ~{proteins_out} &
stage ~{cog_file} ~{cog_out} &
stage ~{ec_file} ~{ec_out} &
stage ~{ko_file} ~{ko_out} &
stage ~{pfam_file} ~{pfam_out} &
stage ~{tigrfam_file} ~{tigrfam_out} &
stage ~{crispr_file} ~{crispr_out} &
stage ~{product_names_file} ~{products_out} &
stage ~{gene_phylogeny_file} ~{gene_phylogeny_out} &
stage ~{lineage_file} ~{lineage_out}
~{"stage " + map_file + " " + map_out}
wait
if grep -q "Retrying" stderr; then
if [ ! -s ~{contigs_out} ]; then
fail "Staged Contig file is empty."
fi
if [ ! -s ~{bam_out} ]; then
fail "Staged Bam file is empty."
fi
if [ ! -s ~{gff_out} ]; then
fail "Staged gff file is empty."
fi
if [ ! -s ~{proteins_out} ]; then
fail "Staged proteins file is empty."
fi
if [ ! -s ~{cog_out} ]; then
fail "Staged cog file is empty."
fi
if [ ! -s ~{ec_out} ]; then
fail "Staged ec file is empty."
fi
if [ ! -s ~{ko_out} ]; then
fail "Staged ko file is empty."
fi
if [ ! -s ~{pfam_out} ]; then
fail "Staged pfam file is empty."
fi
if [ ! -s ~{tigrfam_out} ]; then
fail "Staged tigrfam file is empty."
fi
if [ ! -s ~{crispr_out} ]; then
fail "Staged crispr file is empty."
fi
if [ ! -s ~{products_out} ]; then
fail "Staged products file is empty."
fi
if [ ! -s ~{gene_phylogeny_out} ]; then
fail "Staged gene_phylogeny file is empty."
fi
if [ ! -s ~{lineage_out} ]; then
fail "Staged lineage file is empty."
fi
if [ ! -s ~{map_out} ]; then
fail "Staged map file is empty."
fi
fi
if grep -q "ERROR" stderr; then
fail "Staging task has Error. Please check the $PWD/stderr and make sure there are no incomplete/partial files."
fi
date --iso-8601=seconds > start.txt
>>>
output{
File contig = contigs_out
File sam = bam_out
File gff = gff_out
File proteins = proteins_out
File cog = cog_out
File ec = ec_out
File ko = ko_out
File pfam = pfam_out
File tigrfam = tigrfam_out
File crispr = crispr_out
File product_names = products_out
File gene_phylogeny = gene_phylogeny_out
File lineage_tsv = lineage_out
File? map_tsv = map_out
String start = read_string("start.txt")
}
runtime {
memory: "1 GiB"
cpu: 2
maxRetries: 1
docker: container
}
}
task check_id_map{
input{
String container
File contig_file
File proteins_file
String contig_file_name=basename(contig_file)
}
command<<<
set -euo pipefail
python <<CODE
import sys
contigIDs={}
with open("~{contig_file}","r") as c_file:
for line in c_file:
if line.startswith(">"):
seq_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001
contigIDs[seq_id]=1
with open("~{proteins_file}","r") as p_file:
for line in p_file:
if line.startswith(">"):
seq_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001_1_225
contig_id = "_".join(seq_id.split("_")[0:-2]) # nmdc:wfmgan-12-gbysvd76.1_0000001
if contig_id not in contigIDs:
print(f"{contig_id} is not in ~{contig_file_name}.", file=sys.stderr)
sys.exit(1)
CODE
>>>
output{
File contig = contig_file
}
runtime {
memory: "1 GiB"
cpu: 1
docker: container
}
}
task package{
input{
String proj
String prefix=sub(proj, ":", "_")
Array[File] bins
File json_stats
File gff_file
File proteins_file
File cog_file
File ec_file
File ko_file
File pfam_file
File tigrfam_file
File crispr_file
File gene_phylogeny_file
File product_names_file
String container
}
command<<<
set -euo pipefail
create_tarfiles.py ~{prefix} \
~{json_stats} ~{gff_file} ~{proteins_file} ~{cog_file} \
~{ec_file} ~{ko_file} ~{pfam_file} ~{tigrfam_file} \
~{crispr_file} ~{gene_phylogeny_file} \
~{product_names_file} \
~{sep=" " bins}
if [ -f ~{prefix}_heatmap.pdf ]; then
if [ -f ~{prefix}_barplot.pdf ]; then
echo "KO analysis plot exists."
else
echo "There are no modules above 80% completeness. No barplot will be generated." && touch ~{prefix}_barplot.pdf
fi
else
echo "No KO analysis result for ~{proj}" && touch ~{prefix}_heatmap.pdf
echo "No KO analysis result for ~{proj}" && touch ~{prefix}_barplot.pdf
echo "No KO analysis result for ~{proj}" > ~{prefix}_ko_krona.html
echo "No KO analysis result for ~{proj}" > ~{prefix}_module_completeness.tab
fi
>>>
output {
Array[File] hqmq_bin_tarfiles = flatten([glob("*_HQ.tar.gz"), glob("*_MQ.tar.gz")])
Array[File] lq_bin_tarfiles = glob("*_LQ.tar.gz")
File barplot = prefix + "_barplot.pdf"
File heatmap = prefix + "_heatmap.pdf"
File kronaplot = prefix + "_ko_krona.html"
File ko_matrix = prefix + "_module_completeness.tab"
File stats_json = prefix + "_stats.json"
}
runtime {
docker: container
memory: "1 GiB"
cpu: 1
}
}
task finish_mags {
input{
String container
File mbin_sdb
File mbin_version
String proj
String prefix=sub(proj, ":", "_")
File bacsum
File arcsum
File? short
File? low
File? unbinned
File? checkm
Array[File] hqmq_bin_tarfiles
Array[File] lq_bin_tarfiles
File stats_json
File stats_tsv
Int n_hqmq=length(hqmq_bin_tarfiles)
Int n_lq=length(lq_bin_tarfiles)
File barplot
File heatmap
File kronaplot
File ko_matrix
File eukcc_file
}
command<<<
set -euo pipefail
end=`date --iso-8601=seconds`
ln ~{low} ~{prefix}_bins.lowDepth.fa
ln ~{short} ~{prefix}_bins.tooShort.fa
ln ~{unbinned} ~{prefix}_bins.unbinned.fa
ln ~{checkm} ~{prefix}_checkm_qa.out
ln ~{mbin_version} ~{prefix}_bin.info
ln ~{bacsum} ~{prefix}_gtdbtk.bac122.summary.tsv
ln ~{arcsum} ~{prefix}_gtdbtk.ar122.summary.tsv
ln ~{barplot} ~{prefix}_barplot.pdf
ln ~{heatmap} ~{prefix}_heatmap.pdf
ln ~{kronaplot} ~{prefix}_kronaplot.html
ln ~{ko_matrix} ~{prefix}_ko_matrix.txt
ln ~{eukcc_file} ~{prefix}_eukcc.csv
# cp all tarfiles, zip them under prefix, if empty touch no_mags.txt
mkdir -p hqmq
if [ ~{n_hqmq} -gt 0 ] ; then
(cd hqmq && cp ~{sep=" " hqmq_bin_tarfiles} .)
(cd hqmq && cp ~{mbin_sdb} .)
(cd hqmq && zip -j ../~{prefix}_hqmq_bin.zip *tar.gz mbin.sdb ../*pdf ../*kronaplot.html ../*ko_matrix.txt)
else
(cd hqmq && touch no_hqmq_mags.txt)
(cd hqmq && cp ~{mbin_sdb} .)
(cd hqmq && zip -j ../~{prefix}_hqmq_bin.zip *.txt mbin.sdb)
fi
mkdir -p lq
if [ ~{n_lq} -gt 0 ] ; then
(cd lq && cp ~{sep=" " lq_bin_tarfiles} .)
(cd lq && cp ~{mbin_sdb} .)
(cd lq && zip -j ../~{prefix}_lq_bin.zip *tar.gz mbin.sdb ../~{prefix}_eukcc.csv ../*pdf ../*kronaplot.html ../*ko_matrix.txt)
else
(cd lq && touch no_lq_mags.txt)
(cd lq && cp ~{mbin_sdb} .)
(cd lq && zip -j ../~{prefix}_lq_bin.zip *.txt mbin.sdb ../~{prefix}_eukcc.csv )
fi
# Fix up attribute name
cat ~{stats_json} | \
sed 's/: null/: "null"/g' | \
sed 's/lowDepth_/low_depth_/' > ~{prefix}_mags_stats.json
>>>
output {
File final_checkm = "~{prefix}_checkm_qa.out"
File final_hqmq_bins_zip = "~{prefix}_hqmq_bin.zip"
File final_lq_bins_zip = "~{prefix}_lq_bin.zip"
File final_stats_json = "~{prefix}_mags_stats.json"
File final_gtdbtk_bac_summary = "~{prefix}_gtdbtk.bac122.summary.tsv"
File final_gtdbtk_ar_summary = "~{prefix}_gtdbtk.ar122.summary.tsv"
File final_lowDepth_fa = "~{prefix}_bins.lowDepth.fa"
File final_unbinned_fa = "~{prefix}_bins.unbinned.fa"
File final_short = "~{prefix}_bins.tooShort.fa"
File final_version = "~{prefix}_bin.info"
File final_kronaplot = "~{prefix}_kronaplot.html"
File final_heatmap = "~{prefix}_heatmap.pdf"
File final_barplot = "~{prefix}_barplot.pdf"
}
runtime {
memory: "10 GiB"
cpu: 4
maxRetries: 1
docker: container
}
}