##########################################################################################
#                                                                                        #
# Python 3.6 de Bruijn graph generator and differential expression encoder              #
#                                                                                        #
# Base code taken from https://raw.githubusercontent.com/pmelsted/dbg/master/dbg.py     #
#                                                                                        #
# Developed at Megaprobe-lab, PI Dr. Humberto Ortiz-Zuazaga                             #
# Developers: Kevin Legarreta, Louis Gil, Israel O. Dilan-Pantojas                      #
# Program description: Given FASTQ files encoding sequences for two organisms, this     #
#                      program generates a de Bruijn graph that encodes the coverage    #
#                      of k-sized regions across both organisms. This information is    #
#                      useful for differential expression analysis; in theory, the      #
#                      graph-based approach should recover more information about the   #
#                      organisms' genetic make-up than traditionally assembled          #
#                      contigs.                                                          #
#                                                                                        #
##########################################################################################
# Future Works ;)
# import multiprocessing as mp
#########################
#                       #
#    Module Imports     #
#                       #
#########################
import argparse
import collections
from Bio import Seq, SeqIO, SeqRecord  # SeqRecord is currently unused
import gfapy  # currently unused; requires the gfapy package to be installed
#########################
#                       #
#   Sequence Methods    #
#                       #
#########################
# Create reverse complement
def twin(km):
    return Seq.reverse_complement(km)
# Yield all k-sized substrings (k-mers) of a sequence
def kmers(seq, k):
    for i in range(len(seq)-k+1):
        yield seq[i:i+k]
# Yield the four possible successor k-mers (shift forward one nucleotide)
def fw(km):
    for x in 'ACGT':
        yield km[1:] + x
# Yield the four possible predecessor k-mers (shift backward one nucleotide)
def bw(km):
    for x in 'ACGT':
        yield x + km[:-1]
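# Illustrative sketch (comments only, not executed): for a hypothetical k-mer
# "AAGT" with k = 4, these helpers behave as follows:
#   twin("AAGT")           -> "ACTT"  (reverse complement)
#   list(kmers("AAGT", 3)) -> ["AAG", "AGT"]
#   list(fw("AAGT"))       -> ["AGTA", "AGTC", "AGTG", "AGTT"]
#   list(bw("AAGT"))       -> ["AAAG", "CAAG", "GAAG", "TAAG"]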
#########################
#                       #
#   File Read Methods   #
#                       #
#########################
# Count k-mers and build a dictionary of counts.
# Use `limit` as a cutoff to drop very lowly covered k-mers.
def build(fn, k, limit):
    d = collections.defaultdict(int)
    # For each filename in the input, parse the FASTQ file
    for f in fn:
        reads = SeqIO.parse(f, 'fastq')
        # Extract reads
        for read in reads:
            # Maybe it's possible to reduce memory overhead here!
            for seq in str(read.seq).split("N"):
                # Extract k-mers from the forward strand
                for km in kmers(seq, k):
                    d[km] += 1
                # Extract k-mers from the reverse complement
                for km in kmers(twin(seq), k):
                    d[km] += 1
    # Remove k-mers that appear at most `limit` times
    for i in [x for x in d if d[x] <= limit]:
        del d[i]
    return d
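# Worked example (hypothetical read, not executed): for a single read "ACGTA"
# with k = 3, build() counts the forward k-mers ACG, CGT, GTA plus the k-mers
# of the reverse complement "TACGT": TAC, ACG, CGT. The resulting counts are
#   {"ACG": 2, "CGT": 2, "GTA": 1, "TAC": 1}
# and with limit = 1 the singletons GTA and TAC would then be removed.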
#########################
#                       #
#    Helper Methods     #
#                       #
#########################
# Merge the two per-organism count dicts into one dict of (count_A, count_B) tuples
def merge_dicts(d1, d2):
    return {i: (d1[i], d2[i]) for i in set(d1) | set(d2)}
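# Sketch of the merged shape (hypothetical counts): if organism A's counts are
# {"ACG": 5} and organism B's are {"ACG": 2, "CGT": 1}, the merge yields
#   {"ACG": (5, 2), "CGT": (0, 1)}
# The 0 relies on build() returning defaultdict(int); plain dicts would raise
# KeyError for k-mers seen in only one organism.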
# Stringify a contig: the first k-mer plus the last base of each following k-mer
def contig_to_string(c):
    return c[0] + ''.join(x[-1] for x in c[1:])
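# Example (hypothetical contig): contig_to_string(["AAT", "ATC", "TCC"])
# returns "AATCC": each successive k-mer is shifted one base, so only its
# final character is new.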
#########################
#                       #
#     Graph Methods     #
#                       #
#########################
# Assemble the maximal contig containing k-mer `km`
def get_contig(d, km):
    c_fw = get_contig_forward(d, km)
    c_bw = get_contig_forward(d, twin(km))
    if km in fw(c_fw[-1]):
        # The contig is circular; the forward walk already covers it
        c = c_fw
    else:
        # Prepend the twins of the backward walk (minus its seed), reversed
        c = [twin(x) for x in c_bw[-1:0:-1]] + c_fw
    return contig_to_string(c), c
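# Sketch (hypothetical walks): if c_fw = ["AAT", "ATC"] and the backward walk
# from twin("AAT") = "ATT" returned c_bw = ["ATT", "TTG"], then c_bw[-1:0:-1]
# drops the seed and reverses to ["TTG"], whose twins are ["CAA"], giving
# c = ["CAA", "AAT", "ATC"] and the contig string "CAATC".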
# Extend from km one k-mer at a time; stop at branches, cycles, and hairpins
def get_contig_forward(d, km):
    c_fw = [km]
    while True:
        # Stop unless the current tip has exactly one forward neighbour in d
        if sum(x in d for x in fw(c_fw[-1])) != 1:
            break
        cand = [x for x in fw(c_fw[-1]) if x in d][0]
        if cand == km or cand == twin(km):
            break  # break out of cycles or mobius contigs
        if cand == twin(c_fw[-1]):
            break  # break out of hairpins
        # Stop unless the candidate has exactly one backward neighbour in d
        if sum(x in d for x in bw(cand)) != 1:
            break
        c_fw.append(cand)
    # Return the contig walked in the forward direction
    return c_fw
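# Walk-through (hypothetical dict; a real one from build() would also hold the
# reverse-complement k-mers): with d = {"AAT": 1, "ATC": 1, "TCC": 1} and
# km = "AAT", the walk extends AAT -> ATC -> TCC and then stops because TCC
# has no forward neighbour in d, returning ["AAT", "ATC", "TCC"].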
# Create the graph from all contigs
def all_contigs(d, k):
    done = set()
    r = []
    for x in d:
        if x not in done:
            s, c = get_contig(d, x)
            # Mark every k-mer of the contig (and its twin) as consumed
            for y in c:
                done.add(y)
                done.add(twin(y))
            r.append(s)
    G = {}
    heads = {}
    tails = {}
    for i, x in enumerate(r):
        G[i] = ([], [])
        heads[x[:k]] = (i, '+')
        tails[twin(x[-k:])] = (i, '-')
    for i in G:
        x = r[i]
        # Forward links out of the contig's tail
        for y in fw(x[-k:]):
            if y in heads:
                G[i][0].append(heads[y])
            if y in tails:
                G[i][0].append(tails[y])
        # Backward links out of the contig's head (via its twin)
        for z in fw(twin(x[:k])):
            if z in heads:
                G[i][1].append(heads[z])
            if z in tails:
                G[i][1].append(tails[z])
    # Return the graph and the list of contig strings
    return G, r
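# Shape of the result (sketch): G maps each contig index to a pair of link
# lists ([forward links], [backward links]), where each link is a
# (contig_index, orientation) tuple such as (3, '+') or (0, '-'), and r holds
# the contig strings in index order.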
#########################
#                       #
#    Output Methods     #
#                       #
#########################
# Write the intra-contig k-mer links (GFA L lines) to the file
def get_links(cs, d, k, s, fd):
    for x in range(0, len(cs)-k):
        keyA = cs[x:x+k]
        keyB = cs[(x+1):(x+1)+k]
        kmerA = '%s:%s:(A:%s,B:%s)' % (s, x, d[keyA][0], d[keyA][1])
        kmerB = '%s:%s:(A:%s,B:%s)' % (s, x+1, d[keyB][0], d[keyB][1])
        fd.write("L\t%s\t+\t%s\t+\t%sM\n" % (kmerA, kmerB, k-1))
# Write the k-mer segments (GFA S lines) to the file
def get_kmers(cs, d, k, s, fd):
    global lastkmerid
    # Iterate over every k-length subsegment of the contig
    for x in range(0, len(cs)+1-k):
        key = cs[x:x+k]
        fd.write("S\t%s:%s:(A:%s,B:%s)\t%s\n" % (s, x, d[key][0], d[key][1], key))
        # Remember the id of the last k-mer written for this contig
        lastkmerid[s] = '%s:%s:(A:%s,B:%s)' % (s, x, d[key][0], d[key][1])
# Write the GFA-encoded de Bruijn graph
def write_GFA2(G, cs, k, d, fd):
    global lastkmerid
    fd.write("H\tVN:Z:1.0\n")  # note: a GFA 1.0 header, despite the function name
    for i, x in enumerate(cs):  # each contig string x with numeric id i
        get_kmers(x, d, k, i, fd)  # write the k-mer segments with A/B coverage
    for i, x in enumerate(cs):
        get_links(x, d, k, i, fd)  # write the intra-contig links
    # Drop this function's reference to free some memory
    del cs
    for i in G:  # write the contig-to-contig links
        for j, o in G[i][0]:
            fd.write("L\t%s\t+\t%s\t%s\t%dM\n" % (lastkmerid[i], lastkmerid[j], o, k-1))
        for j, o in G[i][1]:
            fd.write("L\t%s\t-\t%s\t%s\t%dM\n" % (lastkmerid[i], lastkmerid[j], o, k-1))
    # Free up some memory
    del lastkmerid
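# Sample of the emitted GFA records (hypothetical ids and counts, k = 3;
# fields are tab-separated in the real output):
#   H   VN:Z:1.0
#   S   0:0:(A:5,B:2)   AAT
#   L   0:0:(A:5,B:2)   +   0:1:(A:4,B:2)   +   2M
# Segment names encode contig id, offset, and per-organism (A, B) coverage;
# the 2M overlap is the k-1 matching bases between adjacent k-mers.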
#########################
#                       #
#      Main Method      #
#                       #
#########################
def main():
    global args
    # Build per-organism k-mer counts, then merge into (count_A, count_B) pairs
    d = merge_dicts(build(args.A, args.k, args.c), build(args.B, args.k, args.c))
    with open(args.output, "w") as file:
        G, cs = all_contigs(d, args.k)
        write_GFA2(G, cs, args.k, d, file)
parser = argparse.ArgumentParser(description="Creates a GFA file for two organisms given a k-mer size")
parser.add_argument("-k", default=21, type=int, help="The k-mer size (default = 21)")
parser.add_argument("-A", nargs='+', required=True, help="Organism A FASTQ files")
parser.add_argument("-B", nargs='+', required=True, help="Organism B FASTQ files")
parser.add_argument("-c", default=0, type=int, help="Minimum coverage of a k-mer (default = 0)")
parser.add_argument("-output", default="output.gfa", help="Output GFA file name")
args = parser.parse_args()
# Global map from contig id to the id of its last written k-mer
lastkmerid = {}
# To add more organisms, add another argument like the -B one above under a new
# name, call build() on the new files, and chain additional merge_dicts calls.
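# Example invocation (hypothetical file names):
#   python dbg_optimized.py -k 21 -A orgA_1.fastq orgA_2.fastq -B orgB.fastq -output graph.gfa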
if __name__ == "__main__":
    main()