m4topre.py (forked from cschin/pbdagcon)
#!/usr/bin/env python
"""Super-simple converter from blasr m4 alignments to pbdagcon 'pre'
alignments. For use in the pre-assembler dagcon workflow.
"""
import sys
import heapq
import string # pylint: disable=W0402
from itertools import ifilter
from collections import namedtuple, defaultdict
import numpy as np
from pbcore.io.FastaIO import FastaReader
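
# Usage (positional arguments read in main() below):
#   m4topre.py mym4 allm4 reads bestn
# mym4  - the m4 chunk handled by this process
# allm4 - fofn listing all m4 chunks (pass mym4 again if blasr was not chunked)
# reads - fasta of the reads referenced by the alignments
# bestn - number of top alignments to keep per query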
# qname tname score pctsimilarity qstrand qstart qend qseqlength tstrand tstart
# ... tend tseqlength mapqv
#
# store only fields we need
__m4fields__ = [0, 1, 2, 5, 6, 8, 9, 10, 11]
M4RECORD = namedtuple(
    'M4RECORD', 'qname tname score qstart qend tstrand tstart tend tseqlength')
__tuplfy__ = M4RECORD._make # pylint: disable=W0212
# dna complement
__rc__ = string.maketrans('actgACTG', 'tgacTGAC')


def parse_m4(rec):
    """Parse one m4 record line, returning only the fields we need"""
    return [y for (x, y) in enumerate(rec.split()) if x in __m4fields__]


def rating(rec):
    """Rates the alignment by length and score (revisit at some point)"""
    score = -int(rec.score)
    alen = int(rec.tend) - int(rec.tstart)
    return score + alen
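# rating() note: blasr m4 scores are typically negative (more negative is
# better), so negating the score and adding the aligned target length yields a
# single larger-is-better rating, which is what the TopAlignments heap expects.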


def schwartzian(rec):
    """Provides a schwartzian transform for the given record, used for
    sorting
    """
    flds = rec.split()
    return (flds[1], float(flds[2]), rec)


def sort_targ_score(recs):
    """Sorts the list in place by target id (string), then score (float)"""
    recs[:] = [schwartzian(x) for x in recs]
    recs.sort()
    recs[:] = [rec for (target, score, rec) in recs]  # pylint: disable=W0612
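# sort_targ_score() note: scores are negative ("more negative is better"), so
# the ascending sort places the best-scoring alignments first within each
# target group.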


def rescore(recs):
    """Rescore alignments using coverage based statistics"""
    prev = ""
    cov = np.zeros(1)
    for idx, rec in enumerate(recs):
        fields = rec.split()
        rec = __tuplfy__(fields)
        if rec.tname != prev:
            # new target: reset the per-base coverage track
            prev = rec.tname
            cov = np.zeros(int(rec.tseqlength), dtype=np.float16)

        # map the aligned interval onto forward-strand target coordinates
        # (tstrand is a string field, so compare against '1' explicitly)
        if rec.tstrand == '1':
            start = int(rec.tseqlength) - int(rec.tend)
            end = int(rec.tseqlength) - int(rec.tstart)
        else:
            start = int(rec.tstart)
            end = int(rec.tend)

        cov[start:end] += 1
        score = np.sum(1/cov[start:end])
        fields[2] = str(-score)
        recs[idx] = " ".join(fields)
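# rescore() note: each alignment is rescored as the sum of 1/coverage over the
# target bases it spans, so alignments covering otherwise thinly-covered
# regions are favored; the result is negated to keep the "more negative is
# better" sort convention.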


def bestn_true(recstr, myq):
    """Checks if the record falls inside bestn (used when blasr is chunked)"""
    rec = __tuplfy__(recstr.split())
    rate = rating(rec)
    return rate in myq[rec.qname[32:]].top
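# bestn_true() note: myq[...].top holds the bestn highest ratings seen for the
# query across all chunks, so membership means this alignment survives bestn.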


class AlnLimiter(object):  # pylint: disable=R0903
    """Functor that returns alignments until some count is reached. Alignments
    should be sorted.
    """
    def __init__(self, limit=76):
        self.count = 0
        self.target = ''
        self.limit = limit

    def __call__(self, rec):
        target = rec.split()[1]
        if target != self.target:
            self.count = 0
            self.target = target
        self.count += 1
        return self.count < self.limit


class TopAlignments(object):  # pylint: disable=R0903
    """Tracks the top alignments for a given query, used for bestn calc"""
    bestn = 10

    def __init__(self):
        self.top = [0] * TopAlignments.bestn

    def __call__(self):
        return  # noop

    def add(self, aln):
        """Adds an alignment to a bounded list, kicking out another if
        necessary
        """
        heapq.heappushpop(self.top, aln)
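# TopAlignments.add() note: heappushpop on the fixed-size min-heap evicts the
# smallest rating, leaving the bestn largest ratings in self.top.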


def main():  # pylint: disable=R0914
    """Drives the program"""
    mym4 = sys.argv[1]
    allm4 = sys.argv[2]
    reads = sys.argv[3]
    TopAlignments.bestn = int(sys.argv[4])

    # tracks bestn
    my_queries = defaultdict(TopAlignments)
    my_m4recs = []

    # load my m4 chunk
    m4h = open(mym4)
    rec_add = my_m4recs.append
    for line in m4h:
        flds = parse_m4(line)
        rec = __tuplfy__(flds)
        rate = rating(rec)
        my_queries[rec.qname[32:]].add(rate)
        rec_add(' '.join(flds))
    m4h.close()

    # if we're chunked, locate relevant alignments in the other chunks
    if mym4 != allm4:
        # assuming fofn here
        m4files = [x.rstrip() for x in open(allm4) if x.rstrip() != mym4]
        for m4f in m4files:
            m4h = open(m4f)
            for recstr in m4h:
                rec = __tuplfy__(parse_m4(recstr))
                if rec.qname[32:] in my_queries:
                    rate = rating(rec)
                    my_queries[rec.qname[32:]].add(rate)
            m4h.close()

    # remove alignments that fall outside of bestn
    my_m4recs[:] = [x for x in my_m4recs if bestn_true(x, my_queries)]

    # sort by target name/score
    sort_targ_score(my_m4recs)

    # rescore based on coverage
    rescore(my_m4recs)

    # sort one more time by new score
    sort_targ_score(my_m4recs)

    # take a max number of alignments for each target
    limiter = AlnLimiter()
    my_m4recs[:] = [x for x in ifilter(limiter, my_m4recs)]

    # load only related sequences
    seqs = {}
    frea = FastaReader(reads)
    for fent in frea:
        if fent.name[32:] in my_queries:
            seqs[fent.name] = fent.sequence

    # may or may not help
    del my_queries
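
    # Output note: each 'pre' record printed below is one space-separated line:
    #   qname tname strand tseqlength tstart tend qseq tseq
    # where tseq is taken from the reverse complement of the target when the
    # alignment strand is '-'.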
    # generate pre-alignments
    for recstr in my_m4recs:
        rec = __tuplfy__(recstr.split())
        # Bug 24538, rare case missing self hit
        if rec.tname not in seqs:
            msg = "Warning: skipping query %s target %s\n"
            sys.stderr.write(msg % (rec.qname, rec.tname))
            continue
        qst = int(rec.qstart)
        qnd = int(rec.qend)
        qseq = seqs[rec.qname][qst:qnd]
        strand = '-' if rec.tstrand == '1' else '+'
        tst = int(rec.tstart)
        tnd = int(rec.tend)
        if strand == '+':
            tseq = seqs[rec.tname][tst:tnd]
        else:
            tseq = seqs[rec.tname].translate(__rc__)[::-1][tst:tnd]
        print ' '.join([rec.qname, rec.tname, strand,
                        rec.tseqlength, str(tst), str(tnd), qseq, tseq])

if __name__ == '__main__':
    sys.exit(main())