-
-
Notifications
You must be signed in to change notification settings - Fork 46.2k
/
Copy pathbitap_string_match.py
79 lines (63 loc) · 2.31 KB
/
bitap_string_match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Bitap exact string matching
https://en.wikipedia.org/wiki/Bitap_algorithm
Searches for a pattern inside text, and returns the index of the first occurrence
of the pattern. Both text and pattern consist of lowercase alphabetical characters only.
Complexity: O(m*n)
n = length of text
m = length of pattern
Python doctests can be run using this command:
python3 -m doctest -v bitap_string_match.py
"""
def bitap_string_match(text: str, pattern: str) -> int:
"""
Retrieves the index of the first occurrence of pattern in text.
Args:
text: A string consisting only of lowercase alphabetical characters.
pattern: A string consisting only of lowercase alphabetical characters.
Returns:
int: The index where pattern first occurs. Return -1 if not found.
>>> bitap_string_match('abdabababc', 'ababc')
5
>>> bitap_string_match('aaaaaaaaaaaaaaaaaa', 'a')
0
>>> bitap_string_match('zxywsijdfosdfnso', 'zxywsijdfosdfnso')
0
>>> bitap_string_match('abdabababc', '')
0
>>> bitap_string_match('abdabababc', 'c')
9
>>> bitap_string_match('abdabababc', 'fofosdfo')
-1
>>> bitap_string_match('abdab', 'fofosdfo')
-1
"""
if not pattern:
return 0
m = len(pattern)
if m > len(text):
return -1
# Initial state of bit string 1110
state = ~1
# Bit = 0 if character appears at index, and 1 otherwise
pattern_mask: list[int] = [~0] * 27 # 1111
for i, char in enumerate(pattern):
# For the pattern mask for this character, set the bit to 0 for each i
# the character appears.
pattern_index: int = ord(char) - ord("a")
pattern_mask[pattern_index] &= ~(1 << i)
for i, char in enumerate(text):
text_index = ord(char) - ord("a")
# If this character does not appear in pattern, it's pattern mask is 1111.
# Performing a bitwise OR between state and 1111 will reset the state to 1111
# and start searching the start of pattern again.
state |= pattern_mask[text_index]
state <<= 1
# If the mth bit (counting right to left) of the state is 0, then we have
# found pattern in text
if (state & (1 << m)) == 0:
return i - m + 1
return -1
if __name__ == "__main__":
import doctest
doctest.testmod()