-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommands.txt
143 lines (116 loc) · 5.03 KB
/
commands.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
## Commands
This section lists command(s) run by mutect2consensustumoronly workflow
* Running mutect2consensustumoronly
### Derive file name
```
basename ~{fileName} | cut -d. -f1
```
### Preprocess and CombineVariants
```
python3<<CODE
import subprocess
import sys
inputStrings = []
v = "~{sep=' ' inputVcfs}"
vcfFiles = v.split()
w = "~{sep=' ' workflows}"
workflowIds = w.split()
priority = "~{priority}"
if len(vcfFiles) != len(workflowIds):
print("The arrays with input files and their respective workflow names are not of equal size!")
else:
for f in range(0, len(vcfFiles)):
inputStrings.append("--variant:" + workflowIds[f] + " " + vcfFiles[f])
javaMemory = ~{jobMemory} - 6
gatkCommand = "$JAVA_ROOT/bin/java -Xmx" + str(javaMemory) + "G -jar $GATK_ROOT/GenomeAnalysisTK.jar "
gatkCommand += "-T CombineVariants "
gatkCommand += " ".join(inputStrings)
gatkCommand += " -R ~{referenceFasta} "
gatkCommand += "-o ~{outputPrefix}_combined.vcf.gz "
gatkCommand += "-genotypeMergeOptions PRIORITIZE "
gatkCommand += "-priority " + priority
gatkCommand += " 2>&1"
result_output = subprocess.run(gatkCommand, shell=True)
sys.exit(result_output.returncode)
CODE
```
### Annotate with bcftools
```
bcftools annotate -a ~{uniqueVcf} \
-c FMT/AD,FMT/DP ~{mergedVcf} -Oz \
-o "~{outputPrefix}.merged.vcf.gz"
tabix -p vcf "~{outputPrefix}.merged.vcf.gz"
```
### Generate consensus calls
```
python3<<CODE
## Adapted from https://github.com/oicr-gsi/djerba/blob/GCGI-806_v1.0.0-dev/src/lib/djerba/plugins/tar/snv_indel/plugin.py
## this code will filter a maf file, generated from tumor-only mutect2 calls
import pandas as pd
maf_file_path = "~{mafFile}"
maf_normal_path = "~{mafNormalFile}"
freq_list_path = "~{freqList}"
output_path_prefix = "~{outputPrefix}"
genes_to_keep_path = "~{genesToKeep}"
if maf_normal_path:
df_bc = pd.read_csv(maf_normal_path,
sep = "\t",
on_bad_lines="error",
compression='gzip',
skiprows=[0])
df_pl = pd.read_csv(maf_file_path,
sep = "\t",
on_bad_lines="error",
compression='gzip',
skiprows=[0])
df_freq = pd.read_csv(freq_list_path,
sep = "\t")
with open(genes_to_keep_path) as f:
GENES_TO_KEEP = f.read()
for row in df_pl.iterrows():
hugo_symbol = row[1]['Hugo_Symbol']
chromosome = row[1]['Chromosome']
start_position = row[1]['Start_Position']
reference_allele = row[1]['Reference_Allele']
allele = row[1]['Allele']
# If there is normal input, annotate rows with information from the matched normal and from the frequency table
if maf_normal_path:
# Lookup the entry in the BC and annotate the tumour maf with
# n_depth, n_ref_count, n_alt_count
row_lookup = df_bc[(df_bc['Hugo_Symbol'] == hugo_symbol) &
(df_bc['Chromosome'] == chromosome) &
(df_bc['Start_Position'] == start_position) &
(df_bc['Reference_Allele'] == reference_allele) &
(df_bc['Allele'] == allele)]
# If there's only one entry, take its normal values
if len(row_lookup) == 1:
df_pl.at[row[0], "n_depth"] = row_lookup['n_depth'].item()
df_pl.at[row[0], "n_ref_count"] = row_lookup['n_ref_count'].item()
df_pl.at[row[0], "n_alt_count"] = row_lookup['n_alt_count'].item()
# If the entry isn't in the table,
# or if there is more than one value and so you can't choose which normal values to take,
# set them as 0
else:
df_pl.at[row[0], "n_depth"] = 0
df_pl.at[row[0], "n_ref_count"] = 0
df_pl.at[row[0], "n_alt_count"] = 0
# Lookup the entry in the frequency table and annotate the tumour maf with Freq
row_lookup = df_freq[(df_freq['Start_Position'] == row[1]['Start_Position']) &
(df_freq['Reference_Allele'] == row[1]['Reference_Allele']) &
((df_freq['Tumor_Seq_Allele'] == row[1]['Tumor_Seq_Allele1']) |
(df_freq['Tumor_Seq_Allele'] == row[1]['Tumor_Seq_Allele2']))]
if len(row_lookup) > 0:
df_pl.at[row[0], 'Freq'] = row_lookup['Freq'].item()
else:
df_pl.at[row[0], 'Freq'] = 0
# Filter the maf to remove rows based on various criteria, but always maintaining genes in the GENES_TO_KEEP list
for row in df_pl.iterrows():
hugo_symbol = row[1]['Hugo_Symbol']
frequency = row[1]['Freq']
gnomAD_AF = row[1]['gnomAD_AF']
n_alt_count = row[1]['n_alt_count']
if hugo_symbol not in GENES_TO_KEEP or frequency > 0.1 or n_alt_count > 4 or gnomAD_AF > 0.001:
df_pl = df_pl.drop(row[0])
df_pl.to_csv(output_path_prefix + '_filtered_maf_for_tar.maf.gz', sep = "\t", compression='gzip', index=False)
CODE
```