forked from Kevinlega/DBGDE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdbgDif.py
331 lines (260 loc) · 8.83 KB
/
dbgDif.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# code taken from https://raw.githubusercontent.com/pmelsted/dbg/master/dbg.py
# import multiprocessing as mp
import sys
import argparse
import collections
from Bio import Seq, SeqIO, SeqRecord
import gfapy
# Create reverse complement
def twin(km):
return Seq.reverse_complement(km)
# Chunk reads into k sized chunks
def kmers(seq,k):
for i in range(len(seq)-k+1):
yield seq[i:i+k]
# Move sequence analysis one nucleotide forward
def fw(km):
for x in 'ACGT':
yield km[1:]+x
# Move sequence analysis one nucleotide backward
def bw(km):
for x in 'ACGT':
yield x + km[:-1]
# count kmers and build dictionary with counts
# use limit as cut off of very down regulated seq
def build(fn,k=31,limit=1):
d = collections.defaultdict(int)
# For each filename in input parse fast.q file
for f in fn:
reads = SeqIO.parse(f,'fastq')
# Extract reads
for read in reads:
seq_s = str(read.seq)
seq_l = seq_s.split('N')
#
for seq in seq_l:
for km in kmers(seq,k):
d[km] +=1
seq = twin(seq)
for km in kmers(seq,k):
d[km] += 1
# Remove k-mer elements of dict that
d1 = [x for x in d if d[x] <= limit]
for x in d1:
del d[x]
return d
def merge_dicts(d1,d2):
merge = {}
for i in d1.keys():
merge[i] = (d1[i], d2[i])
for i in d2.keys():
merge[i] = (d1[i], d2[i])
return merge
#
def contig_to_string(c):
return c[0] + ''.join(x[-1] for x in c[1:])
# we will work here today
def get_contig(d,km):
c_fw = get_contig_forward(d,km)
c_bw = get_contig_forward(d,twin(km))
if km in fw(c_fw[-1]):
c = c_fw
else:
c = [twin(x) for x in c_bw[-1:0:-1]] + c_fw
return contig_to_string(c),c
# And here
def get_contig_forward(d,km):
c_fw = [km]
while True:
if sum(x in d for x in fw(c_fw[-1])) != 1:
break
cand = [x for x in fw(c_fw[-1]) if x in d][0]
if cand == km or cand == twin(km):
break # break out of cycles or mobius contigs
if cand == twin(c_fw[-1]):
break # break out of hairpins
if sum(x in d for x in bw(cand)) != 1:
break
c_fw.append(cand)
return c_fw
def all_contigs(d,k):
done = set()
r = []
for x in d:
if x not in done:
s,c = get_contig(d,x)
for y in c:
done.add(y)
done.add(twin(y))
r.append(s)
G = {}
heads = {}
tails = {}
for i,x in enumerate(r):
G[i] = ([],[])
heads[x[:k]] = (i,'+')
tails[twin(x[-k:])] = (i,'-')
for i in G:
x = r[i]
for y in fw(x[-k:]):
if y in heads:
G[i][0].append(heads[y])
if y in tails:
G[i][0].append(tails[y])
for z in fw(twin(x[:k])):
if z in heads:
G[i][1].append(heads[z])
if z in tails:
G[i][1].append(tails[z])
return G,r
# This returns True if complies with the threshold else False
def coveragekmer(C,kmerA,kmerB,k):
global g
kA,kB = kmerA,kmerB
kA = kA.split("A:")
kA = kA[1]
kA = kA.split(',B:')
kA[1] = kA[1].split(")")[0]
kB = kB.split("A:")
kB = kB[1]
kB = kB.split(',B:')
kB[1] = kB[1].split(")")[0]
coverageA = abs(int(kA[0]) - int(kA[1]))
coverageB = abs(int(kB[0]) - int(kB[1]))
DifEx = (coverageA + coverageB)/2
if DifEx < C:
return False
else:
a = ("L\t%s\t+\t%s\t+\t%sM\tKC:i:%d"%(kmerA,kmerB,(k-1),int(DifEx)))
return a
def coverageSegmentF(C,kmerA,kmerB,Db,k):
global g
kA,kB = kmerA,kmerB
kA = kA.split("A:")
kA = kA[1]
kA = kA.split(',B:')
kA[1] = kA[1].split(")")[0]
kB = kB.split("A:")
kB = kB[1]
kB = kB.split(',B:')
kB[1] = kB[1].split(")")[0]
coverageA = abs(int(kA[0]) - int(kA[1]))
coverageB = abs(int(kB[0]) - int(kB[1]))
DifEx = (coverageA + coverageB)/2
if DifEx < C:
return False
else:
a = ("L\t%s\t+\t%s\t%s\t%dM\tKC:i:%d"%(kmerA,kmerB,dB,k-1,int(DifEx)))
return a
def coverageSegmentR(C,kmerA,kmerB,Db,k):
global g
kA,kB = kmerA,kmerB
kA = kA.split("A:")
kA = kA[1]
kA = kA.split(',B:')
kA[1] = kA[1].split(")")[0]
kB = kB.split("A:")
kB = kB[1]
kB = kB.split(',B:')
kB[1] = kB[1].split(")")[0]
coverageA = abs(int(kA[0]) - int(kA[1]))
coverageB = abs(int(kB[0]) - int(kB[1]))
DifEx = (coverageA + coverageB)/2
if DifEx < C:
return False
else:
a = ("L\t%s\t-\t%s\t%s\t%dM\tKC:i:%d"%(kmerA,kmerB,dB,k-1,int(DifEx)))
return a
def get_kmers_and_links(cs,d, k, s,C):
global g, listofkmers,listoflinks,lastkmerid
kmers = []
links = []
for x in range(0,len(cs)+1-k): # to get all subsegmet, holds the all the kmers
key = cs[x:x+k]
kmers.append("S\t%s:%s:(A:%s,B:%s)\t%s"%(s,x,d[key][0],d[key][1],key))
for x in range(len(kmers)-1):
if C == 0:
links.append("L\t%s\t+\t%s\t+\t%sM"%(kmerA,kmerB,(k-1)))
else:
kmerA = kmers[x]
kmerB = kmers[x+1]
kmerA = kmerA.split("\t")
kmerB = kmerB.split("\t")
kmerA = kmerA[1]
kmerB = kmerB[1]
a =coveragekmer(C,kmerA,kmerB,k)
if a == False:
pass
else:
links.append(a)
kmer = kmers[len(kmers)-1] #dictionary for links taken by all contigs
kmer = kmer.split("\t")
kmer = kmer[1]
lastkmerid[s] = kmer
listofkmers += kmers
listoflinks += links
def write_GFA2(G,cs,k,d,C):
global args, g,listofkmers, listoflinks,lastkmerid
if args.output: # If the output file name is given use it
filename = args.output
else: # else use standard one
filename = "output.gfa"
g.add_line("H\tVN:Z:1.0") # Get the header with the GFA version to the GFA
for i,x in enumerate(cs): # Get the one contig and a number id for the contig
# g.add_line("S\t%d\t%s"%(i, x )) # Write the segment(contig) <segment> <- S <sid:id> <slen:int> <sequence> <tag>*
get_kmers_and_links(x,d,k,i,C) # Function to get the fragments of organism A and B if included
for kmer in listofkmers:
g.add_line(kmer)
for link in listoflinks:
g.add_line(link)
if C == 0:
for i in G: # Get the links to the gfa
for j,o in G[i][0]:
g.add_line("L\t%s\t+\t%s\t%s\t%dM"%(lastkmerid[i],lastkmerid[j],o,k-1)) #need to change to the
for j,o in G[i][1]:
g.add_line("L\t%s\t-\t%s\t%s\t%dM"%(lastkmerid[i],lastkmerid[j],o,k-1))
else:
for i in G: # Get the links to the gfa
for j,o in G[i][0]:
l = []
try:
l.append(coverageSegmentF(C,lastkmerid[i],lastkmerid[j],o,k-1))
g.add_line(l[0])
except:
pass
for j,o in G[i][1]:
l = []
try:
l.append(coverageSegmentF(C,lastkmerid[i],lastkmerid[j],o,k-1))
g.add_line(l[0])
except:
pass
g.to_file(filename) # Write to file
def positive(C):
C = int(C)
if C <= 0:
sys.exit("Coverage must be positive integer")
return C
def main():
global args
dA = build(args.A, args.k, 1)
dB = build(args.B, args.k, 1)
d = merge_dicts(dA, dB)
G,cs = all_contigs(d,args.k)
write_GFA2(G,cs,args.k,d,args.C)
parser = argparse.ArgumentParser(description="Creates a GFA file with one or two organisms given a kmer size. \
If coverage is give eliminates below that coverage ")
parser.add_argument("-k", type=int, required=True, help="the kmer size")
parser.add_argument("-A", nargs='+', required=True, help="Organism_A_files")
parser.add_argument("-B", nargs='+', required=True, help="Organism_B_files")
parser.add_argument("-output",required=False,help="Output GFA file name")
parser.add_argument("-C", required=False,default=0,type=positive,help="If given eliminates links with Diferential Expresion below coverage")
args = parser.parse_args()
g = gfapy.Gfa()
listofkmers = []
listoflinks = []
lastkmerid = {}
# To add more organisms add this parser.add_argument("-B", nargs='+', required=True, help="Organism_B_files")
# change the name and do another call to build and do multiple merge_dicts calls
if __name__ == "__main__":
main()