-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement exact string searching using FM index and LZ index #57
Changes from 5 commits
347c84d
b495f66
700fbb9
51882c6
72fb475
a85b978
d92588a
acb7f7f
af7a06f
f0a1256
11f445d
e05c1d3
2354d3a
55c7193
e7a6db0
7f301cb
356fdf6
7d91b4c
07e0a6c
0fc0941
92c15d9
7791ef7
a5b244e
5a67d93
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
backup*/* | ||
benchar/cbenchar/build/* | ||
.vscode | ||
*/__pycache__ | ||
*/*/__pycache__ | ||
prolik123 marked this conversation as resolved.
Show resolved
Hide resolved
|
prolik123 marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
|
||
class FMIndex:
    """FM-index for exact pattern matching over a 1-indexed text.

    Rows of the conceptual sorted-suffix table are numbered 1..n+1 and row
    ``i`` is reported as ``SA[i - 1]``.  ``F`` and ``L`` are the first and
    last columns of that table; index 0 of both is a placeholder that is
    never consulted by the search.

    Patterns are assumed not to contain the placeholder characters '#'/'$'.
    """

    def __init__(self, SA, BWT, text, n):
        """Precompute the lookup tables used by backward search.

        Args:
            SA: suffix array.  ``SA[1..n]`` are read to build ``F`` and
                ``SA[i - 1]`` is returned for row ``i``.  Presumably
                ``SA[0]`` is the position of the terminal '$' suffix
                (``n + 1``) - confirm against the suffix_array module.
            BWT: last column ``L``; read at indices ``1..n+1``.
            text: indexed text with a placeholder at index 0 (the tests
                use '#'), so real characters live at ``text[1..n]``.
            n: length of the text without the placeholder.  ``n == 0``
                (empty text) is supported.
        """
        self.L = BWT
        self.F = '#$' + ''.join(text[SA[i]] for i in range(1, n + 1))
        self.n = n
        self.SA = SA
        self.sampleSize = 8  # distance between stored occurrence samples

        # For every distinct character of F[2:], remember the first row in
        # which it appears and its rank in the alphabet.  Starting from an
        # empty mapping (instead of F[2]) keeps the empty text working.
        self.mapperOfChar = {}
        self.begginings = []
        last = None
        for i in range(2, n + 2):
            if self.F[i] != last:
                last = self.F[i]
                self.mapperOfChar[last] = len(self.begginings)
                self.begginings.append(i)

        self.lenOfAlphabet = len(self.mapperOfChar)

        # closestSample[i] is the sampled row whose stored counts are used
        # to answer occurrence queries at row i.  The second condition keeps
        # the chosen sample inside the range that is actually stored.
        currentSample = 0
        self.closestSample = [0]
        for i in range(1, n + 2):
            nextSample = currentSample + self.sampleSize
            closer = abs(currentSample - i) > abs(nextSample - i)
            if closer and i + self.sampleSize < self.n:
                currentSample = nextSample
            self.closestSample.append(currentSample)

        # occInSampleForChar[c][k] = number of c in L[1 .. k * sampleSize].
        # One pass over L with running counters replaces one full scan per
        # alphabet symbol and yields the same table.
        self.occInSampleForChar = {self.L[i]: [0] for i in range(1, n + 2)}
        running = dict.fromkeys(self.mapperOfChar, 0)
        for i in range(1, n + 2):
            symbol = self.L[i]
            if symbol in running:
                running[symbol] += 1
            if i % self.sampleSize == 0:
                for char, seen in running.items():
                    self.occInSampleForChar[char].append(seen)

    # should be private
    def getRangeOfOccurence(self, p, size):
        """Return the inclusive row range ``[l, r]`` of suffixes prefixed by p.

        Args:
            p: pattern (0-indexed).
            size: number of leading characters of ``p`` to match.

        Returns:
            ``[l, r]`` with ``l <= r`` when the pattern occurs, otherwise
            ``[-1, -1]``.  An empty pattern (``size <= 0``) matches every
            row, i.e. ``[1, n + 1]``, which agrees with a naive scan that
            accepts all ``n + 1`` start positions.
        """
        if size <= 0:
            # Previously p[size - 1] raised IndexError for an empty pattern.
            return [1, self.n + 1]
        if size > self.n:
            return [-1, -1]

        currChar = p[size - 1]
        if currChar not in self.mapperOfChar:
            return [-1, -1]

        # Rows of F that start with the last pattern character.
        mapIdx = self.mapperOfChar[currChar]
        first = self.begginings[mapIdx]
        if mapIdx != self.lenOfAlphabet - 1:
            last = self.begginings[mapIdx + 1] - 1
        else:
            last = self.n + 1

        # Backward search: extend the match one character to the left.
        for i in range(size - 2, -1, -1):
            currChar = p[i]
            if currChar not in self.mapperOfChar:
                return [-1, -1]
            occurencesBefore = self._getOcc(currChar, first - 1)
            occurencesAfter = self._getOcc(currChar, last)
            if occurencesBefore >= occurencesAfter:
                # No row in the current range is preceded by currChar.
                return [-1, -1]
            base = self.begginings[self.mapperOfChar[currChar]]
            first = base + occurencesBefore
            last = base + occurencesAfter - 1
        return [first, last]

    # O(|p|)
    def count(self, p, size):
        """Return the number of occurrences of ``p[:size]`` in the text."""
        first, last = self.getRangeOfOccurence(p, size)
        if first == -1:
            return 0
        return max(last - first + 1, 0)

    # Should be private
    def _getOcc(self, c, i):
        """Return how many times ``c`` occurs in ``L[1..i]``.

        Starts from the stored count at ``closestSample[i]`` and corrects it
        by scanning the rows between that sample and ``i``.
        """
        sample = self.closestSample[i]
        stored = self.occInSampleForChar[c][sample // self.sampleSize]
        if sample < i:
            # Sample lies before i: add the occurrences in (sample, i].
            return stored + sum(
                1 for j in range(sample + 1, i + 1) if self.L[j] == c)
        # Sample lies at or after i: remove the occurrences in (i, sample].
        return stored - sum(
            1 for j in range(i + 1, sample + 1) if self.L[j] == c)

    # O(|p|)
    def query(self, p, l):
        """Return True when ``p[:l]`` occurs in the text."""
        return self.count(p, l) > 0

    # O(|p| + k) where k is the number of occurrences of p in text
    def get_all_occurrance(self, p, l):
        """Return the start positions of all occurrences, in row order."""
        first, last = self.getRangeOfOccurence(p, l)
        if first == -1:
            return []
        return [self.SA[row - 1] for row in range(first, last + 1)]

    # O(|p|)
    def get_any_occurrance(self, p, l):
        """Return the start position of one occurrence, or -1 if none."""
        first, _ = self.getRangeOfOccurence(p, l)
        if first == -1:
            return -1
        return self.SA[first - 1]
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please replace with a function "contains" like in suffix_tree and suffix_array packages, which returns values sequentially using |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import itertools | ||
import os | ||
import unittest | ||
|
||
from compression import burrows_wheeler | ||
from string_indexing import suffix_array | ||
from string_indexing import fm_index | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. from string_indexing import suffix_array, fm_index |
||
from generator import rand | ||
|
||
class TestFMIndex(unittest.TestCase):
    """Checks the FM-index query API against a naive pattern scan."""

    # Decorator that skips a test unless the LARGE environment variable is set.
    run_large = unittest.skipUnless(
        os.environ.get('LARGE', False), 'Skip test in small runs')

    def get_all_occurences_of_pattern_naive(self, text, n, pattern, l):
        """Return every 1-based position where ``pattern[:l]`` occurs in text.

        ``text`` carries a placeholder at index 0, so candidate start
        positions run from 1 to ``n - l + 1`` inclusive.
        """
        prefix = pattern[:l]
        return [i for i in range(1, n - l + 2) if text[i:i + l] == prefix]

    def check_fm_api_for_pattern(self, FMIndex, all_occurences_of_pattern, pattern, l):
        """Assert that all four index queries agree with the expected positions."""
        cnt = FMIndex.count(pattern, l)
        occurance = FMIndex.get_all_occurrance(pattern, l)
        any_occurance = FMIndex.get_any_occurrance(pattern, l)
        exists = FMIndex.query(pattern, l)

        self.assertEqual(cnt, len(all_occurences_of_pattern))
        self.assertEqual(sorted(occurance), sorted(all_occurences_of_pattern))
        # Dedicated assertions report the offending values on failure.
        if all_occurences_of_pattern:
            self.assertIn(any_occurance, all_occurences_of_pattern)
        else:
            self.assertEqual(any_occurance, -1)
        self.assertEqual(exists, len(all_occurences_of_pattern) > 0)

    def check_patterns_for_text_naive(self, text, n, patterns):
        """Build the index for ``text`` and verify every pattern against it."""
        SA = suffix_array.naive(text, n)
        BWT = burrows_wheeler.transform_from_suffix_array(SA, text, n)
        index = fm_index.FMIndex(SA, BWT, text, n)
        for pattern in patterns:
            # subTest names the failing pattern and lets the others still run.
            with self.subTest(text=text, pattern=pattern):
                l = len(pattern)
                pattern_occurances = self.get_all_occurences_of_pattern_naive(
                    text, n, pattern, l)
                self.check_fm_api_for_pattern(
                    index, pattern_occurances, pattern, l)

    # Each entry is [text with leading placeholder, patterns to look up].
    api_naive_test_cases = [
        ['#ababa', ['a', 'a', 'aba', 'aa', 'ba', 'ab', 'bb', 'c', 'abc', 'ababa', 'ababaa']],
        ['#aaababcaaabba', ['a', 'b', 'c', 'aab', 'aabb', 'aaababcaaabba']],
        ['#aaabaababaababaababaaababaaabaabaaa', ['a', 'ab', 'aab', 'aaab', 'aaaab', 'aba', 'abaa',
         'abaaa', 'aaba', 'aabaa', 'aabaaa', 'aaaba', 'aaabaa']]
    ]

    def test_fm_api_naive(self):
        """Run the hand-written texts and patterns through the index."""
        for text, patterns in self.api_naive_test_cases:
            n = len(text) - 1  # the placeholder at index 0 is not counted
            self.check_patterns_for_text_naive(text, n, patterns)

    @run_large
    def test_large_random(self):
        """Compare index and naive scan on a random binary text."""
        n = 10000
        text = '#' + rand.random_word(n, ['a', 'b'])
        q = 1000
        patterns = [rand.random_word(100, ['a', 'b']) for i in range(q)]
        self.check_patterns_for_text_naive(text, n, patterns)
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove it from the PR - it's IDE-dependent, shouldn't be in the commit