-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDBGCompletev2.py
292 lines (257 loc) · 9.59 KB
/
DBGCompletev2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
# -*- coding:utf-8 -*-
'''
使用DBG对结果串的两侧补全
python一直在报段错误,只能写c++了
DB图部分见DBGcpp.cpp,通过文件传输信息。
'''
import argparse
import os
import Levenshtein as leve
from tqdm import tqdm
from collections import Counter
import json
import sys
import math
import heapq
import numpy as np
import heapq
# Deep recursion limit for the (currently commented-out) recursive DFS helpers.
sys.setrecursionlimit(90000)
# Default file names; each is resolved relative to DATA_DIR in parse_args().
DEFAULT_PARAM_FILE = 'param.json'
DEFAULT_LONG_FILE = 'long.fasta'
DEFAULT_SHORT_1_FILE = 'short_1.fasta'
DEFAULT_SHORT_2_FILE = 'short_2.fasta'
DEFAULT_FIXED_LONG_FILE = 'long_repair.fasta'  # 'fixed_long.fasta' # use the repaired long reads
DEFAULT_EXTEND_FILE = 'extend_ans_v2_repair_17w.fasta'
# Parsed command-line namespace; populated by parse_args().
ARGS = None
# k-mer length: every de Bruijn graph node is a DNALEN-character substring.
DNALEN = 30
def parse_args():
    """Parse command-line options into the module-global ARGS.

    After parsing, every file-name option is rewritten to a full path
    under the positional DATA_DIR argument.
    """
    global ARGS
    parser = argparse.ArgumentParser()
    parser.add_argument('DATA_DIR', type=str, help="the dataset's directory")
    parser.add_argument('ANS_FILE', type=str, help="ans data file's name")
    parser.add_argument('-fl', '--FIXED_LONG_FILE', type=str,
                        default=DEFAULT_FIXED_LONG_FILE,
                        help="fixed long data file's name")
    parser.add_argument('-p', '--PARAM_FILE', type=str,
                        default=DEFAULT_PARAM_FILE,
                        help="the param file's name")
    parser.add_argument('-l', '--LONG_FILE', type=str,
                        default=DEFAULT_LONG_FILE,
                        help="the long data file's name")
    parser.add_argument('-s1', '--SHORT_1_FILE', type=str,
                        default=DEFAULT_SHORT_1_FILE,
                        help="the short_1 file's name")
    parser.add_argument('-s2', '--SHORT_2_FILE', type=str,
                        default=DEFAULT_SHORT_2_FILE,
                        help="the short_2 file's name")
    parser.add_argument('-e', '--EXTEND_FILE', type=str,
                        default=DEFAULT_EXTEND_FILE,
                        help="the extended ans file's name")
    parser.add_argument('-done', '--DONE', type=int,
                        default=0, help='the done number of ansdna')
    ARGS = parser.parse_args()
    # Prefix every file-name option with the dataset directory.
    for attr in ('PARAM_FILE', 'LONG_FILE', 'SHORT_1_FILE', 'SHORT_2_FILE',
                 'FIXED_LONG_FILE', 'ANS_FILE', 'EXTEND_FILE'):
        setattr(ARGS, attr, os.path.join(ARGS.DATA_DIR, getattr(ARGS, attr)))
def prepare_fasta_data(filename):
    """Load a two-line-per-record FASTA file.

    Even lines (0-based) carry the record name, odd lines the sequence.
    Returns a list of {'name': ..., 's': ...} dicts.
    """
    print('Load data', filename)
    with open(filename, 'r') as f:
        raw_lines = f.readlines()
    records = []
    current_name = 'Unknown'
    for idx, raw in enumerate(tqdm(raw_lines)):
        text = raw.strip('\n')
        if idx % 2 == 0:
            current_name = text
        else:
            records.append({'name': current_name, 's': text})
    return records
def get_comp_rev_dna(dna):
    """Return the reverse complement of a DNA string (A<->T, G<->C)."""
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return ''.join(complement[base] for base in reversed(dna))
def get_comp_rev_dataset(dataset):
    """Return a reverse-complemented copy of every record in *dataset*.

    Each input record is a {'name': ..., 's': ...} dict; the copy's name
    is tagged with '(comp_rev)' and its sequence is reverse-complemented
    via get_comp_rev_dna.  (The original kept an unused duplicate of the
    base-translation table here; it has been removed — the mapping lives
    in get_comp_rev_dna.)
    """
    return [{
        'name': data['name'] + '(comp_rev)',
        's': get_comp_rev_dna(data['s']),
    } for data in dataset]
def generate_graph(datasets):
    """Build a de Bruijn graph over all DNALEN-length substrings (k-mers).

    Every distinct k-mer becomes a node; consecutive overlapping k-mers of
    each read are connected by a directed edge (stored once per pair).

    Args:
        datasets: iterable of datasets, each a list of {'name', 's'} dicts.

    Returns:
        point_to_dna: node id -> k-mer string
        dna_to_point: k-mer string -> node id
        out_edges: node id -> Counter of successor ids (edge multiplicity)
        g: node id -> list of distinct successor ids
    """
    point_to_dna = []
    dna_to_point = {}
    out_edges = []
    g = []

    def get_id(dna):
        # Return the node id of a k-mer, creating the node on first sight.
        point = dna_to_point.get(dna)
        if point is not None:  # fixed: was `!= None`
            return point
        now = len(point_to_dna)
        dna_to_point[dna] = now
        point_to_dna.append(dna)
        out_edges.append(Counter())
        g.append([])
        return now

    def add_edge(u, v):
        # Record v in g[u] only the first time the edge u->v is seen.
        pre = out_edges[u][v]
        if pre == 0:  # Threshold
            out_edges[u].update([v])  # NOTE: kept here (not above) for efficiency
            g[u].append(v)

    def generate_from_dna(dna):
        # Slide a DNALEN window over the read, chaining consecutive k-mers.
        assert len(dna) >= DNALEN
        for i in range(len(dna) - DNALEN):
            u = get_id(dna[i:i + DNALEN])
            v = get_id(dna[i + 1:i + 1 + DNALEN])
            add_edge(u, v)

    for dataset in datasets:
        for data in tqdm(dataset):
            generate_from_dna(data['s'])
    return point_to_dna, dna_to_point, out_edges, g
def dynamic_save(i, s):
    """Append answer number *i* (sequence *s*) to the extend output file."""
    with open(ARGS.EXTEND_FILE, 'a') as out:
        # Log progress, then write one FASTA record.
        print('{:3d} length: {:d}'.format(i, len(s)))
        out.write('>ans_{0}/1\n{1}\n'.format(i, s))
if __name__ == "__main__":
    # Load command-line arguments into the global ARGS.
    parse_args()
    # Load the datasets; reverse complements are built for the short reads.
    long_dataset = prepare_fasta_data(ARGS.LONG_FILE)
    rc_long_dataset = get_comp_rev_dataset(long_dataset)
    short1_dataset = prepare_fasta_data(ARGS.SHORT_1_FILE)
    rc_short1_dataset = get_comp_rev_dataset(short1_dataset)
    short2_dataset = prepare_fasta_data(ARGS.SHORT_2_FILE)
    rc_short2_dataset = get_comp_rev_dataset(short2_dataset)
    fixed_dataset = prepare_fasta_data(ARGS.FIXED_LONG_FILE)
    # rc_fixed_dataset = get_comp_rev_dataset(fixed_dataset)
    ans_dataset = prepare_fasta_data(ARGS.ANS_FILE)
    # Touch the output file so later appends succeed.
    # NOTE(review): the bare except also swallows permission errors;
    # os.path.exists would be a safer existence check.
    try:
        with open(ARGS.EXTEND_FILE, 'r') as f:
            pass
    except:
        with open(ARGS.EXTEND_FILE, 'w') as f:
            pass
    # Build the de Bruijn graph from short reads (both strands) and the
    # repaired long reads.
    print('Building DB graph')
    point_to_dna, dna_to_point, out_edges, g = generate_graph(
        [short1_dataset, short2_dataset, fixed_dataset, rc_short1_dataset, rc_short2_dataset])
    total_point = len(point_to_dna)
    print('Total points', total_point)
    # Drop datasets that are no longer needed to free memory.
    del long_dataset, rc_long_dataset, short1_dataset, rc_short1_dataset, short2_dataset, rc_short2_dataset, fixed_dataset  # , rc_fixed_dataset
    del out_edges
    # Dump the graph to a text file so the C++ side (DBGcpp.cpp) can load it:
    # first the node count, then per node its out-degree and successor list.
    with open('Graph.txt', 'w') as f:
        f.write(str(total_point)+'\n')
        for u, outs in enumerate(g):
            f.write(str(len(outs))+'\n')
            for v in outs:
                f.write(str(v)+' ')
            f.write('\n')
# def gen_chain_c(u, maxd=100000):
# os.system('./DBGcpp')
# res = None
# return res
# def gen_chain_v2(u, maxd=100000): # 树形dp(伪)
# res = []
# # 是否访问过,访问过则认为反向边不存在,理论上最长路最多减少一半的长度
# visited = {}
# maxto = []
# orgid = []
# def get_id(u):
# ret = visited.get(u)
# if ret == None:
# newid = len(maxto)
# visited[u] = newid
# maxto.append(-1)
# orgid.append(u)
# return newid, False # new id
# else:
# return ret, True # old id
# def dfs(u, dep=0):
# uid, _ = get_id(u)
# maxdeepu = 1
# if dep == maxd:
# # print('Reach maxd!')
# return maxdeepu
# for v in g[u]:
# vid, exist = get_id(v)
# if exist:
# continue
# maxdeepv = dfs(v, dep+1)
# if maxdeepv+1 > maxdeepu:
# maxdeepu = maxdeepv+1
# maxto[uid] = vid
# return maxdeepu
# dfs(u)
# u = maxto[visited[u]]
# while u != -1:
# res.append(orgid[u])
# u = maxto[u]
# return res
# generate str from a start point
def gen_str(u, points):
res = point_to_dna[u]
for u in points:
res += point_to_dna[u][-1]
return res
    # extend all ans
    def extend_tail(dna):
        # Extend the tail (3' end) of *dna* by walking the DB graph.  The
        # heavy path search is delegated to the external C++ binary
        # ./DBGcpp, communicating via Start.txt (inputs) / Result.txt
        # (outputs) — see DBGcpp.cpp.
        TRY_STEP = 87  # how many candidate start offsets from the tail to try
        assert len(dna) >= DNALEN
        step = min(TRY_STEP, len(dna)-DNALEN+1)
        max_ext = 0      # best net extension length found so far
        best_res = dna   # best extended string so far
        startpoints = []
        # Collect graph ids of the last `step` k-mers (offset i from the end);
        # k-mers absent from the graph are simply skipped.
        for i in range(step):
            seg = dna[-(DNALEN+i):][:DNALEN]
            st = dna_to_point.get(seg)
            if st != None:
                startpoints.append(st)
        # Hand the start nodes to the C++ helper.
        with open('Start.txt', 'w') as f:
            f.write(str(len(startpoints))+'\n')
            for st in startpoints:
                f.write(str(st)+'\n')
        os.system('./DBGcpp')
        # Read back one node-chain per start point, in the same order.
        results = []
        with open('Result.txt', 'r') as f:
            for i, line in enumerate(f.readlines()):
                if i >= len(startpoints):
                    break
                res = line.strip('\n').split(' ')
                # NOTE(review): assumes each line carries exactly one empty
                # token (trailing space); remove('') raises ValueError
                # otherwise — confirm against DBGcpp's output format.
                res.remove('')
                res = [int(n) for n in res]
                results.append(res)
        # Replay the same offsets; results[0] pairs with the next found start.
        for i in range(step):
            seg = dna[-(DNALEN+i):][:DNALEN]
            st = dna_to_point.get(seg)
            if st != None:
                ext = gen_str(st, results[0])
                results.pop(0)
                # Net bases gained beyond the original tail at offset i.
                extlen = len(ext)-DNALEN-i
                if extlen <= max_ext:
                    # print('extend at', i, 'failed', extlen, 'continue')
                    continue
                else:
                    print('extend at', i, 'length', extlen)
                    max_ext = extlen
                    best_res = dna[:-(DNALEN+i)]+ext
                continue  # just break here directly? otherwise it takes too long
        return best_res
    # ans = []
    # Extend every answer contig on both ends.  Results are appended to the
    # output file as soon as each contig finishes (checkpoint style), so a
    # crashed run can resume via the -done option.
    for i, data in enumerate(ans_dataset):
        print('Now new dna', i)
        # Skip contigs already written by a previous (interrupted) run.
        if i < ARGS.DONE:
            print('Skip')
            continue
        pre = len(data['s'])
        print('extend tail')
        res = extend_tail(data['s'])
        # Extend the head by reverse-complementing, extending the new tail,
        # and flipping back to the original orientation afterwards.
        res = get_comp_rev_dna(res)
        print('extend head')
        res = extend_tail(res)
        end = len(res)
        print('Pre', pre, 'Extended', end-pre)
        res = get_comp_rev_dna(res)
        dynamic_save(i, res)
        # ans.append(res)
    # ans = sorted(ans, key=lambda x: len(x), reverse=True)
    # with open(ARGS.EXTEND_FILE, 'w') as f:
    #     for i, s in enumerate(ans):
    #         print('{:3d} length: {:d}'.format(i, len(s)))
    #         f.write('>ans_{0}/1\n{1}\n'.format(i, s))