-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwgs_flow.nf
171 lines (134 loc) · 5.39 KB
/
wgs_flow.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
// Test workflow to run a simple run with the genalice server
// before running you have to get a reference and start the server:
// gaReference --input=/home/genome/build/GRCh37/GRCh37.fa --output=/store/nf/reference/GRCh37.grf
// gaServer --server_name=DEMO --reference=/store/nf/reference/GRCh37.grf
//fq1=file("/store/G15511_S1/tiny_1.fastq")
//fq2=file("/store/G15511_S1/tiny_2.fastq")
//fq1=file("/store/G15511_S1/C09DFACXX111207.1.TTGAGCCT_1.fastq.gz")
//fq2=file("/store/G15511_S1/C09DFACXX111207.1.TTGAGCCT_2.fastq.gz")
import nextflow.extension.FilesEx
// Validate the --sample argument BEFORE touching the filesystem:
// calling file(params.sample) with a missing/null parameter can throw
// and abort the run before the usage message below is ever printed.
if (!params.sample) {
    text = Channel.from(
        "GENALICE WGS WORKFLOW - Missing the sample TSV config file: ",
        " Usage",
        " nextflow run wgs_flow.nf -c <file.config> --sample <sample.tsv>")
    text.subscribe { println "$it" }
    exit 1
}
// TSV describing the sample: one row per fastq pair (see parsing below)
sampleTSVconfig = file(params.sample)
// Parse the sample TSV into a channel of tuples, one per fastq pair:
//   [ sampleID, type, fastq_R1, fastq_R2 ]
rawFastqFiles = Channel
    .from(sampleTSVconfig.readLines())
    .map { line ->
        // 'def' scopes these to the closure; undeclared, each assignment
        // writes to the script binding, and the leaked 'idSample' could
        // clobber the global idSample interpolated by the processes below
        def list = line.split()
        def idSample = list[0]
        def idType = list[1]
        def fastqFile1 = file(list[2])
        def fastqFile2 = file(list[3])
        [ idSample, idType, fastqFile1, fastqFile2 ]
    }
/*
To handle read groups, Genalice mapping has to be set up in a certain way.
Advice from Bas Tolhuis:
We have data set from two lanes (L001 and L002).
myData-L001.R1.fastq.gz
myData-L001.R2.fastq.gz
myData-L002.R1.fastq.gz
myData-L002.R2.fastq.gz
We create two directories:
mkdir myData.pair1
mkdir myData.pair2
We make symbolic links of the R1 files in the myData.pair1 folder
ln -s /full/path/to/myData-L001.R1.fastq.gz myData.pair1/myData-L001.R1.fastq.gz
ln -s /full/path/to/myData-L002.R1.fastq.gz myData.pair1/myData-L002.R1.fastq.gz
We make symbolic links of the R2 files in the myData.pair2 folder
ln -s /full/path/to/myData-L001.R2.fastq.gz myData.pair2/myData-L001.R2.fastq.gz
ln -s /full/path/to/myData-L002.R2.fastq.gz myData.pair2/myData-L002.R2.fastq.gz
We give directories as input to gaMap
Note that its a comma separated list and each directory ends with a "/" character
gaMap --server_name=myServer --input=myData.pair1/,myData.pair2/ --output=... etc
Now the fastq links in the myData.pair* directories will be mapped into one GAR file
using alphabetical order. The order in pair1 and pair2 directories should be the same
(i.e. first L001 and then L002). It is not possible to give the individual fastq inputs
different read group names.
*/
// Derive the sample ID from the first TSV column without consuming the
// fastq channel (getIdSample duplicates it and returns the untouched copy)
(rawFastqFiles, idSample) = getIdSample(rawFastqFiles)
println "Sample ID: " + idSample
// make a directory that contains links to all the important outputs
resultsDir = file("${idSample}.results")
resultsDir.mkdir()
// Per-orientation staging directories: gaMap consumes whole directories,
// so the R1/R2 fastqs are symlinked into these via makeLink below
R1_dir = file("${idSample}.R1")
R2_dir = file("${idSample}.R2")
R1_dir.mkdir()
R2_dir.mkdir()
// create files containing full path names for R1 and R2 reads
// NOTE(review): R1File and R2File are never referenced again anywhere in
// this script -- they look like dead code; confirm before removing
R1File = file("R1.readgroups")
R2File = file("R2.readgroups")
// Duplicate each TSV tuple into two identical streams, one per orientation
R1Channel = Channel.create()
R2Channel = Channel.create()
Channel.from rawFastqFiles.separate(R1Channel,R2Channel) {x -> [x,x] }
// Now we have two channels for reads, make symlinks in the corresponding directories
// TODO: these lines are not checking for existence and are throwing an error if
// files are already there
// Symlink every file emitted by aChannel into the "<idSample><readOrientation>"
// staging directory (e.g. ".R1/"), logging each file as it arrives.
// Returns a human-readable status string for the caller to print.
// Links are made asynchronously by the subscribe callback, so the returned
// message may precede completion of the actual linking.
def makeLink(aChannel,readOrientation) {
    aChannel = logChannelContent("Linking file: ",aChannel)
    aChannel.subscribe { it ->
        // 'def' keeps linkName local to this event; undeclared it leaks
        // into the script binding, and concurrent channel events could
        // clobber each other's value before mklink runs
        def linkName = idSample + readOrientation + it.getName()
        try {
            it.mklink(linkName)
        } catch ( java.nio.file.FileAlreadyExistsException e) {
            // best-effort: re-runs find the links already in place
            println "File already exists, ignoring. " + linkName
        }
    }
    return "Links ready in " + idSample + readOrientation
}
// Project out the R1 (tuple index 2) and R2 (tuple index 3) fastq paths
// and symlink them into their respective staging directories
println makeLink ( R1Channel.map { x -> x.get(2)}, ".R1/" )
println makeLink ( R2Channel.map { x -> x.get(3)}, ".R2/" )
// Map both read-orientation directories against the running Genalice
// server and emit the alignment (GAR), the mapping report and QC metrics.
process map {
    // copy the GAR, report and QC JSON into the per-sample results dir
    publishDir resultsDir, mode: 'copy'
    input:
    file R1_dir
    file R2_dir
    output:
    file "${idSample}.gar" into result_gar
    file "${idSample}.report" into result_report
    file "${idSample}.qc.json" into result_metrics
    // gaMap reads every fastq link in the two directories; note the input
    // is a comma-separated list and each directory must end with "/"
    // (see the read-group note near the top of this file)
    """
    gaMap --server_name=${params.serverName} --input=${R1_dir}/,${R2_dir}/ --output=`pwd`/${idSample}.gar --run_name=${idSample} --report_file=`pwd`/${idSample}.report --cmd_file=${params.humanMapCfg}
    gaMetrics --input=`pwd`/${idSample}.gar --metric='full detail' --format=json --output=`pwd`/${idSample}.qc.json
    """
}
// Call variants on the mapped GAR file, producing a VCF plus a JSON
// statistics report, both published to the results directory.
process variant_call {
    publishDir resultsDir, mode: 'copy'
    input:
    file result_gar
    output:
    file "${idSample}.vcf" into VCF
    file "${idSample}.variant.json" into vcfjson
    """
    gaVariant --input=$result_gar --output=`pwd`/${idSample}.vcf --run_name=${idSample} --cmd_file=${params.humanVariantCfg} --output_statistics=true --statistics_file=`pwd`/${idSample}.variant.json
    """
}
// HELPER FUNCTIONS
// Extract the patient/sample ID (first tuple element) from a channel
// without consuming it: returns [ untouchedCopyOfChannel, idPatient ].
// Assumes every row carries the same patient ID -- comparing samples
// from different patients is not supported.
def getIdSample(aCh) {
    // 'def' scopes these to the function; undeclared they would leak into
    // the script binding and repeated calls could interfere with each other
    def consCh = Channel.create()
    def originalCh = Channel.create()
    // duplicate the channel: one copy is consumed here to peek at the ID,
    // the other is handed back to the caller untouched
    Channel.from aCh.separate(consCh,originalCh) {x -> [x,x]}
    // blocking read of the first unique value in column 0
    def idPatient = consCh.map { x -> [x.get(0)]}.unique().getVal()[0]
    // close the consumed copy so its remaining items cannot stall the run
    consCh.close()
    return [ originalCh, idPatient]
}
// Tee a channel for debugging: log every item at info level prefixed
// with aMessage, and return an equivalent channel for further processing.
def logChannelContent (aMessage, aChannel) {
    // 'def' keeps these channels local; undeclared they leak into the
    // script binding, and this helper is called more than once (R1 and R2
    // via makeLink), so successive calls could clobber each other
    def resChannel = Channel.create()
    def logChannel = Channel.create()
    Channel
        .from aChannel
        .separate(resChannel,logChannel) {a -> [a, a]}
    logChannel.subscribe {log.info aMessage + " -- $it"}
    return resChannel
}