-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement exact string searching using FM index and LZ index #57
Changes from 9 commits
347c84d
b495f66
700fbb9
51882c6
72fb475
a85b978
d92588a
acb7f7f
af7a06f
f0a1256
11f445d
e05c1d3
2354d3a
55c7193
e7a6db0
7f301cb
356fdf6
7d91b4c
07e0a6c
0fc0941
92c15d9
7791ef7
a5b244e
5a67d93
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
|
||
class _FMIndex: | ||
def __init__ (self, SA, BWT, text, n): | ||
self.L = BWT | ||
self.F = '#$' + ''.join(text[SA[i]] for i in range(1, n + 1)) | ||
self.n = n | ||
self.SA = SA | ||
self.sample_size = 8 # const for sampling | ||
|
||
#prepare char mapping for F | ||
self.mapper_of_chars = { self.F[2] : 0} | ||
self.beginnings = [2] | ||
last = self.F[2] | ||
for i in range(3, n+2): | ||
if self.F[i] != last: | ||
last = self.F[i] | ||
self.beginnings.append(i) | ||
self.mapper_of_chars[last] = len(self.beginnings) - 1 | ||
|
||
self.len_of_alphabet = len(self.mapper_of_chars) | ||
|
||
#prepare closest samplings | ||
current_sample = 0 | ||
self.closest_sample = [0] | ||
for i in range(1, n+2): | ||
if abs(current_sample-i) > abs(current_sample + self.sample_size-i) and (i + self.sample_size < self.n): | ||
current_sample += self.sample_size | ||
self.closest_sample.append(current_sample) | ||
|
||
#Generate values for occ for given samples O(|A|*n) | ||
self.occ_in_sample_for_char = { self.L[i]: [0] for i in range(1, n+2)} | ||
for c in self.mapper_of_chars: | ||
current_value = 0 | ||
next_sample = self.sample_size | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. current_value, next_sample = 0, self.sample_size |
||
for i in range(1, n+2): | ||
if self.L[i] == c: | ||
current_value += 1 | ||
if i == next_sample: | ||
self.occ_in_sample_for_char[c].append(current_value) | ||
next_sample = next_sample + self.sample_size | ||
|
||
def from_suffix_array_and_bwt(SA, BWT, text, n):
    """Build an FM index from a suffix array, a BWT, the text and its length.

    The arguments are forwarded unchanged to the _FMIndex constructor.
    """
    index = _FMIndex(SA, BWT, text, n)
    return index
|
||
# O(|p|)
def count(fm, p, size):
    """Return the number of occurrences of pattern p (of length size).

    The range lookup reports a miss with a negative lower bound, in which
    case the count is 0; otherwise it is the size of the inclusive range.
    """
    low, high = _get_range_of_occurrences(fm, p, size)
    if low <= -1:
        return 0
    return max(high - low + 1, 0)
|
||
# O(|p| + k) to locate the range, plus sorting the k reported positions,
# where k is the number of occurrences of p in text
def contains(fm, p, l):
    """Yield the positions of pattern p (of length l) in increasing order.

    Positions are read from the suffix array for every index of the range
    returned by the lookup; nothing is yielded when the lookup misses.
    """
    # NOTE(review): a reviewer suggested naming the index FM (as with SA and
    # BWT); the parameter stays `fm` here so keyword callers keep working.
    low, high = _get_range_of_occurrences(fm, p, l)
    if low > -1:
        # Index i in F corresponds to entry i - 1 of the suffix array.
        yield from sorted(fm.SA[i - 1] for i in range(low, high + 1))
|
||
|
||
def _get_occ(fm, c, i): | ||
if fm.closest_sample[i] < i: | ||
to_add = sum(1 for c_prim in fm.L[fm.closest_sample[i] + 1:i + 1] if c_prim == c) | ||
else: | ||
to_add = sum(-1 for c_prim in fm.L[i + 1:fm.closest_sample[i] + 1] if c_prim == c) | ||
return fm.occ_in_sample_for_char[c][fm.closest_sample[i] // fm.sample_size] + to_add | ||
|
||
def _get_range_of_occurrences(fm, p, size):
    """Return the inclusive range (low, high) of positions in F matching p.

    The pattern is processed from its last character towards p[1] (p[0] is
    never read), narrowing the range with two occ queries per character.
    Returns (-1, -1) when size is 0 or exceeds fm.n, when a character of p
    is unknown to the index, or when the range becomes empty.
    """
    if size > fm.n or size == 0:
        return -1, -1

    last_char = p[-1]
    if last_char not in fm.mapper_of_chars:
        return -1, -1

    # Initial range: the whole run of the last pattern character in F.
    map_idx = fm.mapper_of_chars[last_char]
    low = fm.beginnings[map_idx]
    high = (
        fm.beginnings[map_idx + 1] - 1
        if map_idx != fm.len_of_alphabet - 1
        else fm.n + 1
    )

    for i in range(size - 1, 0, -1):
        char = p[i]
        if char not in fm.mapper_of_chars:
            return -1, -1

        occurrences_before = _get_occ(fm, char, low - 1)
        occurrences_after = _get_occ(fm, char, high)
        if occurrences_before == occurrences_after:
            # No occurrence of char in L[low..high]: the pattern is absent.
            return -1, -1

        run_start = fm.beginnings[fm.mapper_of_chars[char]]
        low = run_start + occurrences_before
        high = run_start + occurrences_after - 1

    if high < low:
        return -1, -1
    return low, high
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,13 +6,20 @@ | |
|
||
from generator import rand | ||
from exact_string_matching import forward, backward, other | ||
from string_indexing import lcp, suffix_tree, suffix_array | ||
from string_indexing import lcp, suffix_tree, suffix_array, fm_index | ||
from compression import burrows_wheeler | ||
|
||
def lcp_lr_contains(t, w, n, m):
    """Search for w in t using a suffix array and an LCP-LR array.

    Builds the suffix array with skew, derives the LCP array with kasai and
    the LCP-LR array from it, then delegates the search to lcp.contains.
    """
    SA = suffix_array.skew(t, n)
    LCP = lcp.kasai(SA, t, n)
    LCP_LR = lcp.build_lcp_lr(LCP, n)
    return lcp.contains(SA, LCP_LR, t, w, n, m)
|
||
def fm_index_contains(t, w, n, m):
    """Search for w in t using an FM index.

    Builds the suffix array with skew, the BWT from that suffix array and
    the FM index from both, then delegates the search to fm_index.contains.
    """
    SA = suffix_array.skew(t, n)
    BWT = burrows_wheeler.transform_from_suffix_array(SA, t, n)
    # Upper-case name follows the convention used for SA and BWT above.
    FM = fm_index.from_suffix_array_and_bwt(SA, BWT, t, n)
    return fm_index.contains(FM, w, m)
|
||
EXACT_STRING_MATCHING_ALGORITHMS = [ | ||
[ 'Morris-Pratt', forward.morris_pratt ], | ||
[ 'Knuth-Morris-Pratt', forward.knuth_morris_pratt ], | ||
|
@@ -45,6 +52,7 @@ def lcp_lr_contains(t, w, n, m): | |
suffix_array.prefix_doubling(t, n), t, w, n, m), | ||
], | ||
[ 'lcp-lr array', lcp_lr_contains ], | ||
[ 'Fm index', fm_index_contains] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Either FM index or fm index |
||
] | ||
|
||
class TestExactStringMatching(unittest.TestCase): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please make it a class-level constant: