#!/usr/bin/env nextflow
def helpMessage() {
log.info"""
=============================================================
NKI-Atlas taxonomic profiling v${workflow.manifest.version}
Author: ${workflow.manifest.author}
============================================================
Usage:
The typical command for running the pipeline is as follows:
nextflow run main.nf \
--csv 'hmf-manifest.csv' \
--krakenDB 'gs://microbe-resources/kraken2/standard_db' \
--pathseqDB 'gs://microbe-resources/pathseq' \
--outdir_local 'results' \
-work-dir 'gs://nki-atlas/workdir'
Required arguments:
--csv [file] Path to a CSV file listing the input CRAM/BAM file locations from HMF.
-work-dir [file] Google Cloud bucket that holds the intermediate files. Must be a GCP bucket location (e.g. gs://XXX)
-profile [str] Select the local or Google Cloud execution profile. (Default: googlev2)
Optional arguments:
--bam [file] Path to a folder of CRAM/BAM files to process. Can be used instead of the CSV input.
--outdir [folder] Output directory for results; requires write permissions on the bucket (Default: gs://nki-atlas/results)
--blood [bool] Profile the blood (normal) sample instead of the tumor (Default: false)
Quality filtering options:
--min_length [int] Minimum read length to keep during quality filtering (Default: 50)
--min_quality [int] Minimum base quality used for quality trimming (Default: 20)
Kraken2 database:
--krakenDB [file] Location of the Kraken2 database (must be a GCP bucket location, e.g. gs://XXX)
--kraken_len [int] Read length used for Bracken re-estimation (Default: 150)
--confidence [float] Confidence cutoff for Kraken2 quantification. Must be in [0-1] (Default: 0)
--min_counts [int] Minimum number of Kraken2 counts for a taxon to be re-estimated by Bracken (Default: 10)
PathSeq options:
--pathseqDB [file] Location of the microbe index database for PathSeq (must be a GCP bucket location, e.g. gs://XXX)
--phred_cutoff [int] Minimum Phred score during quality filtering (Default: 15)
--min_clip_length [int] Minimum clipped read length during the quality filtering step (Default: 60)
--microbe_cutoff [int] Minimum score threshold for microbe alignments (Default: 30)
--normalize_abundance [bool] Divide abundance scores by each taxon's reference genome length (in millions) (Default: false)
--identity_margin [float] Identity margin, as a fraction of the best hit (between 0 and 1) (Default: 0.02)
--min_score_identity [float] Alignment identity score threshold, as a fraction of the read length (between 0 and 1) (Default: 0.9)
--skip_pre_bwa [bool] Skip the pre-BWA index partition (faster for samples with a higher microbial read fraction)
""".stripIndent()
}
// Show help message
if (params.help){
helpMessage()
exit 0
}
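// The params referenced throughout this script are expected to be declared with
// defaults in the accompanying nextflow.config. A minimal illustrative sketch,
// using the defaults stated in the help text above (flags without a documented
// default are assumptions shown as false/null):
// params {
//     help                = false
//     print               = false
//     csv                 = null
//     blood               = false
//     outdir              = 'gs://nki-atlas/results'
//     min_length          = 50
//     min_quality         = 20
//     krakenDB            = null
//     kraken_len          = 150
//     confidence          = 0
//     min_counts          = 10
//     skip_kraken2        = false
//     pathseqDB           = null
//     phred_cutoff        = 15
//     min_clip_length     = 60
//     microbe_cutoff      = 30
//     normalize_abundance = false
//     identity_margin     = 0.02
//     min_score_identity  = 0.9
//     skip_pathseq        = false
// }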
// Parse CSV file of CRAM files locations
if (params.print) {
Channel
.fromPath(params.csv)
.splitCsv(header:true)
.map{ row-> tuple(row.sampleId, file(row.Tumor)) }
.println()
} else {
if(params.blood) {
Channel
.fromPath(params.csv)
.splitCsv(header:true)
.map{ row-> tuple(row.sampleId, file(row.Normal)) }
.set { file_inputs }
} else {
Channel
.fromPath(params.csv)
.splitCsv(header:true)
.map{ row-> tuple(row.sampleId, file(row.Tumor)) }
.set { file_inputs }
}
}
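// Expected CSV layout (column names are taken from the row accessors above;
// the sample ID and file paths below are purely illustrative):
//   sampleId,Tumor,Normal
//   SAMPLE01,gs://bucket/SAMPLE01_tumor.cram,gs://bucket/SAMPLE01_blood.cram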
// Extract unaligned reads from the CRAM/BAM input
// -f 12 -F 256: keep pairs where both the read and its mate are unmapped, and drop secondary alignments
// -f 4: would keep reads where the read itself is unmapped
process preprocess {
publishDir "$params.outdir/${sample_id}/quality_filter", mode: 'copy', pattern: '*.txt'
publishDir "$params.outdir/${sample_id}/quality_filter", mode: 'copy', pattern: '*.{html, zip}'
publishDir "gs://nki-atlas/atlas-results/${sample_id}/quality_filter", mode: 'copy', pattern: '*-trim-R{1,2}.fq.gz'
input:
set sample_id, file(reads) from file_inputs
output:
set val(sample_id), file("*-trim-R{1,2}.fq.gz") optional true into bam_fastq
set val(sample_id), file("${sample_id}.unmapped.bam") optional true into unmapped_bam
file("${sample_id}.unmapped.bam.md5") optional true into unmapped_bam_md5
file("${sample_id}-flagstat.txt") into cram_flagstat
file("${sample_id}.unmapped_fastqc.{html, zip}") optional true into unmapped_fastqc
file("${sample_id}-qc-report.txt") optional true into qc_report
file("${sample_id}-trim-R1_fastqc.{html, zip}") optional true into fastqc_r1
file("${sample_id}-trim-R2_fastqc.{html, zip}") optional true into fastqc_r2
script:
"""
# Extract unmapped reads
samtools view \
-f 12 -F 256 \
--output-fmt BAM \
--threads 8 \
-o ${sample_id}.unmapped.bam \
${reads}
# Create MD5 checksum for file integrity
md5sum ${sample_id}.unmapped.bam > ${sample_id}.unmapped.bam.md5
# Get the number of initial reads per sample
samtools index ${reads}
samtools flagstat --threads 8 ${reads} > ${sample_id}-flagstat.txt
# Remove CRAM file after processing
rm ${reads}
# Run FastQC to profile unmapped reads
fastqc \
-o . \
--format bam \
--threads 8 \
${sample_id}.unmapped.bam
# Convert to paired fastq
picard \
-Xmx6G \
SamToFastq \
I=${sample_id}.unmapped.bam \
F=${sample_id}_R1.fq.gz \
F2=${sample_id}_R2.fq.gz
# Remove low quality reads & adapters
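# AGATCGGAAGAGC is the common prefix of the Illumina TruSeq adapters; with only
# -a given, cutadapt trims the adapter from R1 only (add -A to also trim R2)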
cutadapt \
-o ${sample_id}-trim-R1.fq.gz \
-p ${sample_id}-trim-R2.fq.gz \
-a AGATCGGAAGAGC \
--minimum-length ${params.min_length} \
--quality-cutoff ${params.min_quality} \
--cores 8 \
${sample_id}_R1.fq.gz \
${sample_id}_R2.fq.gz &> ${sample_id}-qc-report.txt
# Run FastQC on the filtered reads
fastqc \
-o . \
--threads 8 \
${sample_id}-trim-R1.fq.gz \
${sample_id}-trim-R2.fq.gz
"""
}
process kraken2 {
publishDir "$params.outdir/${sample_id}/kraken2", mode: 'copy', pattern: "*.txt*"
input:
set sample_id, file(reads) from bam_fastq.filter{ it.size()>0 }
path index from params.krakenDB
output:
file("${sample_id}-output.txt.gz") optional true into output
file("${sample_id}-report.txt") into report
file("${sample_id}-report-mpa.txt") optional true into mpa_report
file("${sample_id}-bracken-species.txt") into species
file("${sample_id}-bracken-genus.txt") into genus
file("${sample_id}-map-genus-R{1,2}.fq.gz") optional true into mapped_genus
when:
params.skip_kraken2 == false
script:
"""
# Run Kraken2
kraken2 \
--output ${sample_id}-output.txt \
--report ${sample_id}-report.txt \
--db ${index} \
--confidence ${params.confidence} \
--threads 8 \
--paired \
--gzip-compressed \
${reads[0]} \
${reads[1]}
# (Disabled) Convert the Kraken2 report to MetaPhlAn2-style output
#kreport2mpa.py \
#--report-file ${sample_id}-report.txt \
#--output ${sample_id}-report-mpa.txt \
#--no-intermediate-ranks
# Run Bracken to re-estimate abundances at the species level
est_abundance.py \
-i ${sample_id}-report.txt \
-k ${index}/database${params.kraken_len}mers.kmer_distrib \
-o ${sample_id}-bracken-species.txt \
-l 'S' \
-t ${params.min_counts}
# Run Bracken to re-estimate abundances at the genus level
est_abundance.py \
-i ${sample_id}-report.txt \
-k ${index}/database${params.kraken_len}mers.kmer_distrib \
-o ${sample_id}-bracken-genus.txt \
-l 'G' \
-t ${params.min_counts}
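# Bracken's kmer distribution files follow the database<readlen>mers.kmer_distrib
# naming convention, so the default --kraken_len of 150 expects
# database150mers.kmer_distrib inside the Kraken2 database directory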
# Compress large files (currently disabled)
#gzip ${sample_id}-map-genus-R1.fq
#gzip ${sample_id}-map-genus-R2.fq
#gzip ${sample_id}-output.txt
"""
}
// Run PathSeq
process pathseq {
publishDir "$params.outdir/${sample_id}/pathseq", pattern: '*.txt', mode: 'copy'
input:
set sample_id, file(reads) from unmapped_bam.filter{ it.size()>0 }
path index from params.pathseqDB
output:
file("${sample_id}-pathseq.bam") optional true into pathseq_bam
file("${sample_id}-scores.txt") optional true into pathseq_scores
file("${sample_id}-metrics.txt") optional true into pathseq_metrics
file("${sample_id}-score-metrics.txt") optional true into pathseq_scores_metrics
when:
params.skip_pathseq == false
script:
"""
# Make temporary folder
mkdir -p tmp/
# Run PathSeq
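# -Xmx104G assumes a high-memory worker; the host filter image/kmer file and the
# combined microbe reference below are expected to be pre-built and staged under
# --pathseqDB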
gatk --java-options '-Xmx104G -XX:ParallelGCThreads=6' \
PathSeqPipelineSpark \
--input ${reads} \
--output ${sample_id}-pathseq.bam \
--scores-output ${sample_id}-scores.txt \
--filter-metrics ${sample_id}-metrics.txt \
--score-metrics ${sample_id}-score-metrics.txt \
--microbe-fasta ${index}/combined_refgenome.fasta \
--microbe-bwa-image ${index}/combined_refgenome.img \
--taxonomy-file ${index}/combined_refgenome.db \
--filter-bwa-image ${index}/hg19_noEBV/pathseq_noEBV/pathseq_host_hg19_noEBV.img \
--kmer-file ${index}/hg19_noEBV/pathseq_noEBV/pathseq_host_hg19_noEBV.bfi \
--quality-threshold ${params.phred_cutoff} \
--min-clipped-read-length ${params.min_clip_length} \
--bwa-score-threshold ${params.microbe_cutoff} \
--divide-by-genome-length ${params.normalize_abundance} \
--identity-margin ${params.identity_margin} \
--min-score-identity ${params.min_score_identity} \
--tmp-dir tmp/ \
-- \
--spark-runner LOCAL \
--spark-master local[*]
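# --spark-runner LOCAL with local[*] runs Spark on this single worker, using all
# available cores instead of a separate Spark cluster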
"""
}