-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvcf2fasta.py
185 lines (160 loc) · 5.2 KB
/
vcf2fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
####FROM https://github.com/aihardin/utils/blob/master/vcf2fasta.py USED WITH SLIGHT MODIFICATIONS
#! /usr/bin/env python
import os
import sys
from Bio import SeqIO
def readFasta(fastafn):
'''Read a fasta file, return a dictionary keyed by sequence name containing the respective sequences.'''
fastafh = open(fastafn, 'r')
# parse the FASTA file
seqs = {}
for s in list(SeqIO.parse(fastafh, format='fasta')):
seqs[s.name] = s.seq.tostring()
return seqs
def parseVCF(snp_file,filter_vcf=True,filter_het=True,write_filtered=None):
subs = list()
indels = list()
#strain = snp_file.split('.')[0]
if write_filtered:
filtered_fh = open(write_filtered,'w')
for line in snp_file:
if line.startswith('#'):
if write_filtered:
filtered_fh.write(line)
if line.startswith('#CHROM'):
strain = line.split()[9].split('.')[0]
continue
if filter_vcf:
if 'PASS' not in line:
continue
if filter_het:
if not '1/1' in line and not '1|1' in line:
continue
if write_filtered:
filtered_fh.write(line)
p = line.split('\t')
chr =p[0]
pos = int(p[1])
ref = p[3]
var = p[4]
genotype = p[9].split(':')[0]
if len(var) != len(ref):
indels.append((chr,pos, ref, var, genotype))
else:
subs.append((chr, pos, ref, var, genotype))
if write_filtered:
filtered_fh.close()
return subs, indels
def insertSNPs(subs, indels, ref):
if subs:
new_chrs = insertSubs(subs, ref)
if indels:
new_chrs = insertIndels(indels, new_chrs)
return new_chrs
def insertSubs(subs, ref):
''' subs are 1 based, ref is 0'''
new_chrs = dict()
for chr in ref:
new_chrs[chr] = list(ref[chr])
chr_snp = [s for s in subs if s[0] == chr]
#print len(chr_snp), chr
for sub in chr_snp:
pos = sub[1]
alt = sub[3]
genotype = sub[4]
if '1/1' not in genotype:
continue
new_chrs[chr][pos - 1] = alt
#print 'pre subs:', ref[chr]
#print 'postsubs:', ''.join(new_chrs[chr])
return new_chrs
def insertIndels(indels, ref):
# construct a new chr adding insertions and ommiting deleted parts
# if a deletion, make a new part from the old idx to pos, setting the new
# pos to after the deletion
# if an insertion, add a new part from idx to pos + the insertion and
# setting the idx to after the insertion
new_chrs = dict()
for chr, seq in ref.items():
chr_indels = [i for i in indels if i[0] == chr]
chr_indels.sort()
if not chr_indels:
new_chrs[chr] = ''.join(seq)
#print chr+' has no indels'
continue
p = []
idx = 0
net = 0
for indel in chr_indels:
pos = indel[1]
ref = indel[2]
alt = indel[3]
if ',' in alt:
continue #heterozygous
#print 'before processing ',indel, p, idx
if len(alt) < len(ref):
#deletion
del_len = len(ref)-1
p.extend(seq[idx:pos])
idx = pos + del_len
net -= del_len
elif len(alt) > len(ref):
#insertion
p.extend(seq[idx:pos+1]) # insertions goes after pos
p.extend(list(alt[1:]))
idx = pos
net += len(alt)-1
else:
print 'error: alt has same len as ref',indel
sys.exit()
#print 'after:', p, idx
# add last bit
p.extend(seq[idx:])
#print 'end:', p
new_chrs[chr] = ''.join(p)
#print len(chr_indels), chr, 'net change:', net
#print 'post indels:', new_chrs
return new_chrs
def writeFasta(chrs,out_file):
fh = open(out_file, 'w')
keys = chrs.keys()
keys.sort()
for chr in keys:
fh.write('>'+chr+'\n')
fh.write("".join(chrs[chr])+'\n')
fh.close()
def main(argv):
if len(argv) < 2:
print 'usage: script.py ref_seq.fasta sample -|vcf_file [filtered_vcf_out]'
sys.exit()
ref_file = argv[1]
sample = argv[2]
vcf_file = argv[3]
if vcf_file == '-':
fh = sys.stdin
else:
fh = open(vcf_file)
out_vcf = None
try:
out_vcf = sys.argv[4]
except:
pass
subs, indels = parseVCF(fh,write_filtered=out_vcf)
print 'subs', len(subs)
print 'indels', len(indels)
if len(indels) != 0:
print len(subs), len(indels), float(len(subs))/len(indels)
chrs = readFasta(ref_file)
new_chrs = insertSNPs(subs, False, chrs)
for chr in new_chrs:
print chr, 'diff:', len(new_chrs[chr])-len(chrs[chr])
keys = new_chrs.keys()
for k in keys:
new_header = k + ' ' + sample
new_chrs[new_header] = new_chrs[k][:]
del new_chrs[k]
print new_chrs.keys()
#print new_chrs["chrM bla_snp"]
writeFasta(new_chrs, sample+'.fasta')
if __name__ == '__main__':
main(sys.argv)