-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathba9o.py
49 lines (41 loc) · 1.72 KB
/
ba9o.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Find All Approximate Occurrences of a Collection of Patterns in a String
from .ba9q import partial_suffix_array
from .ba9i import bwt
from .ba9m import first_occurrence, count_symbols
from .ba9n import find_location
class BWMatch:
def __init__(self, seq, k=10):
self.psa = dict(partial_suffix_array(seq + "$", k))
self.seq = bwt(seq + "$")
self.fo = first_occurrence(self.seq)
self.cs = count_symbols(self.seq)
def update(self, ptrs, x):
t, b = ptrs
return (self.fo[x] + self.cs[t][x], self.fo[x] + self.cs[b + 1][x] - 1)
# pattern: the pattern to match
# ptrs: top and bottom pointers within last column
# mi: counter for number of mismatches
# mn: total number of acceptable mismatches
def bwm(self, pattern, ptrs, mi):
if not pattern:
return list(range(ptrs[0], ptrs[1] + 1))
matches = []
pattern, sym = pattern[:-1], pattern[-1]
if sym in self.seq[ptrs[0] : ptrs[1] + 1]:
matches += self.bwm(pattern, self.update(ptrs, sym), mi)
if mi < self.mn:
for mm in ["A", "C", "G", "T"]:
if mm != sym:
matches += self.bwm(pattern, self.update(ptrs, mm), mi + 1)
return matches
def match_patterns(self, patterns, mn):
self.mn = mn
for pattern in patterns:
for match in self.bwm(pattern, (0, len(self.seq) - 1), 0):
yield find_location(match, self.psa, self.seq, self.fo, self.cs)
def main(file):
seq, patterns, mismatches = open(file).read().splitlines()
patterns = patterns.split()
mismatches = int(mismatches)
matcher = BWMatch(seq)
print(*sorted(matcher.match_patterns(patterns, mismatches)))