diff --git a/common/wavelet_tree.py b/common/wavelet_tree.py new file mode 100644 index 0000000..cb0dd36 --- /dev/null +++ b/common/wavelet_tree.py @@ -0,0 +1,126 @@ +import itertools + +# pylint: disable=too-many-instance-attributes +class WaveletTree: + def __init__(self, t, n, A = None): + t = t[1:] + if A is not None: + self.alphabet = set(A) + else: + self.alphabet = set(t) + A = sorted(list(self.alphabet)) + self.n, self.smallest, self.largest = n, A[0], A[-1] + if len(A) == 1: + self.leaf = True + return + self.leaf = False + A_left, A_right = A[:(len(A) + 1) // 2], A[(len(A) + 1) // 2:] + self.zero_indexed, self.one_indexed = set(A_left), set(A_right) + value_array = [1 if c in self.one_indexed else 0 for c in t] + self.prefix_sum = list(itertools.accumulate(value_array, initial = 0)) + self.left_indices, self.right_indices = [0], [0] + for i, c in enumerate(t, start = 1): + if c in self.zero_indexed: + self.left_indices.append(i) + else: + self.right_indices.append(i) + left_text = ['#'] + [c for c in t if c in self.zero_indexed] + right_text = ['#'] + [c for c in t if c in self.one_indexed] + self.left = WaveletTree(left_text, len(left_text) - 1, A_left) + self.right = WaveletTree(right_text, len(right_text) - 1, A_right) + + def _left_tree_range(self, l, r): + return l - self.prefix_sum[l - 1], r - self.prefix_sum[r] + + def _right_tree_range(self, l, r): + return (self.prefix_sum[l - 1] + 1, self.prefix_sum[r]) + + def rank(self, c, l, r): + if c not in self.alphabet or l > r or l > self.n or r < 1: + return 0 + if self.leaf: + return r - l + 1 + if c in self.zero_indexed: + new_l, new_r = self._left_tree_range(l, r) + return self.left.rank(c, new_l, new_r) + new_l, new_r = self._right_tree_range(l, r) + return self.right.rank(c, new_l, new_r) + + def prefix_rank(self, c, r): + return self.rank(c, 1, r) + + def select(self, c, k, l, r): + if c not in self.alphabet or l > r or l > self.n or r < 1 : + return None + if self.leaf: + return k + l - 1 if k <= 
r - l + 1 else None + if c in self.zero_indexed: + new_l, new_r = self._left_tree_range(l, r) + result = self.left.select(c, k, new_l, new_r) + return self.left_indices[result] if result is not None else None + new_l, new_r = self._right_tree_range(l, r) + result = self.right.select(c, k, new_l, new_r) + return self.right_indices[result] if result is not None else None + + def quantile(self, k, l, r): + if k < 1 or k > r - l + 1: + return None + if self.leaf: + return self.smallest if k <= self.n else None + left_num = self.prefix_sum[r] - self.prefix_sum[l-1] + if r - l + 1 - left_num >= k: + new_l, new_r = self._left_tree_range(l, r) + return self.left.quantile(k, new_l, new_r) + new_l, new_r = self._right_tree_range(l, r) + return self.right.quantile(k-r+l-1+left_num, new_l, new_r) + + def _does_one_range_end_in_another(self, l, r, i, j): + return (i <= l <= j) or (i <= r <= j) + + def _ranges_intersect(self, l, r, i, j): + return (self._does_one_range_end_in_another(l, r, i ,j) or + self._does_one_range_end_in_another(i, j, l, r)) + + def range_count(self, l, r, x, y): + if l > r or l > self.n or l < 1 or x > y: + return 0 + if x <= self.smallest and self.largest <= y: + return r-l+1 + if self.leaf or y < self.smallest or x > self.largest: + return 0 + l_node, r_node = self.left, self.right + if (self._ranges_intersect(l_node.smallest, l_node.largest, x, y) and + self._ranges_intersect(r_node.smallest, r_node.largest, x, y)): + new_left_l, new_left_r = self._left_tree_range(l, r) + new_right_l, new_right_r = self._right_tree_range(l, r) + return (self.left.range_count(new_left_l, new_left_r, x, y) + + self.right.range_count(new_right_l, new_right_r, x, y)) + if self._ranges_intersect(self.right.smallest, self.right.largest, x, y): + new_l, new_r = self._right_tree_range(l, r) + return self.right.range_count(new_l, new_r, x, y) + new_l, new_r = self._left_tree_range(l, r) + return self.left.range_count(new_l, new_r, x, y) + + def range_search(self, l, r, x, y): 
+ if l > r or l > self.n or l < 1 or x > y: + return [] + if x <= self.smallest and self.largest <= y: + return list(range(l, r + 1)) + if self.leaf or y < self.smallest or x > self.largest: + return [] + l_node, r_node = self.left, self.right + if (self._ranges_intersect(l_node.smallest, l_node.largest, x, y) + and self._ranges_intersect(r_node.smallest, r_node.largest, x, y)): + new_left_l, new_left_r = self._left_tree_range(l, r) + new_right_l, new_right_r = self._right_tree_range(l, r) + return (([self.left_indices[x] for x in + self.left.range_search(new_left_l, new_left_r, x, y)]) + + ([self.right_indices[x] for x in + self.right.range_search(new_right_l, new_right_r, x, y)])) + if self._ranges_intersect(self.right.smallest, self.right.largest, x, y): + return [ + self.right_indices[x] + for x in self.right.range_search(*self._right_tree_range(l, r), x, y)] + return [ + self.left_indices[x] + for x in self.left.range_search(*self._left_tree_range(l, r), x, y)] diff --git a/string_indexing/fm_index.py b/string_indexing/fm_index.py new file mode 100644 index 0000000..e9d451b --- /dev/null +++ b/string_indexing/fm_index.py @@ -0,0 +1,99 @@ +#pylint: disable=too-few-public-methods +#pylint: disable=invalid-name +class _RankSearcher: + SAMPLE_SIZE = 8 + + def __init__(self, L, mapper_of_chars, n): + self.L = L + #prepare closest samplings + current_sample = 0 + self.closest_sample = [0] + for i in range(1, n+2): + if (abs(current_sample-i) > abs(current_sample + self.SAMPLE_SIZE-i) and + (i + self.SAMPLE_SIZE < n)): + current_sample += self.SAMPLE_SIZE + self.closest_sample.append(current_sample) + + #Generate values for occ for given samples O(|A|*n) + self.occ_for_char = { self.L[i]: [0] for i in range(1, n+2)} + for c in mapper_of_chars: + current_value, next_sample = 0, self.SAMPLE_SIZE + for i in range(1, n+2): + if L[i] == c: + current_value += 1 + if i == next_sample: + self.occ_for_char[c].append(current_value) + next_sample = next_sample + 
self.SAMPLE_SIZE + + def prefix_rank(self, c, i): + if self.closest_sample[i] < i: + to_add = sum( + 1 for c_it in self.L[self.closest_sample[i] + 1:i + 1] if c_it == c) + else: + to_add = sum( + -1 for c_it in self.L[i + 1:self.closest_sample[i] + 1] if c_it == c) + return (self.occ_for_char[c][self.closest_sample[i] // self.SAMPLE_SIZE] + + to_add) + +#pylint: disable=too-few-public-methods +#pylint: disable=invalid-name +class _FMIndex: + def __init__ (self, SA, BWT, text, n, rank_searcher = None): + self.L = BWT + F = '#$' + ''.join(text[SA[i]] for i in range(1, n + 1)) + self.n = n + self.SA = SA + + #prepare char mapping for F + self.mapper_of_chars = { F[2] : 0} + self.beginnings = [2] + last = F[2] + for i in range(3, n+2): + if F[i] != last: + last = F[i] + self.beginnings.append(i) + self.mapper_of_chars[last] = len(self.beginnings) - 1 + + self.len_of_alphabet = len(self.mapper_of_chars) + self.rank_searcher = (_RankSearcher(self.L, self.mapper_of_chars, n) + if rank_searcher is None else rank_searcher) + +def from_suffix_array_and_bwt(SA, BWT, text, n, rank_searcher = None): + return _FMIndex(SA, BWT, text, n, rank_searcher) + +# O(|p|) +def count(FM, p, size): + low, high = _get_range_of_occurrences(FM, p, size) + return max(high - low + 1, 0) if low > -1 else 0 + +# O(|p| + k) where k is the number of occurrences of p in text +def contains(FM, p, l): + low, high = _get_range_of_occurrences(FM, p, l) + yield from sorted([FM.SA[i-1] for i in range(low, high + 1) if low > -1]) + +def _get_range_of_occurrences(FM, p, size): + if size > FM.n or size == 0: + return -1, -1 + + if p[-1] not in FM.mapper_of_chars: + return -1, -1 + + map_idx = FM.mapper_of_chars[p[-1]] + l= FM.beginnings[map_idx] + r = (FM.beginnings[map_idx + 1] - 1 + if map_idx != FM.len_of_alphabet - 1 else FM.n + 1) + + for c in p[-2:0:-1]: + if c not in FM.mapper_of_chars: + return -1, -1 + occurrences_before = FM.rank_searcher.prefix_rank(c, l - 1) + occurrences_after = 
FM.rank_searcher.prefix_rank(c, r) + if occurrences_before == occurrences_after: + return -1, -1 + map_idx = FM.mapper_of_chars[c] + l = FM.beginnings[map_idx] + occurrences_before + r = FM.beginnings[map_idx] + occurrences_after - 1 + if r < l: + return -1, -1 + + return l, r diff --git a/string_indexing/lz_index.py b/string_indexing/lz_index.py new file mode 100644 index 0000000..73d2aa5 --- /dev/null +++ b/string_indexing/lz_index.py @@ -0,0 +1,283 @@ +from common import wavelet_tree + +#pylint: disable=too-many-instance-attributes +#pylint: disable=too-few-public-methods +class _LZTreeNode: + def __init__(self, parent, character, idx, position): + self.parent = parent + self.position = position + if parent is not None: + parent.children[character] = self + self.depth = parent.depth + 1 + else: + self.depth = 0 + self.idx = idx + self.children = {} + self.character = character + self.rank = None + self.left_rank = None + self.right_rank = None + + def set_ranks(self, rank): + if self.idx is not None: + self.rank = rank + self.left_rank = rank + self.right_rank = rank + rank = rank + 1 + if len(self.children) > 0: + for child_key in sorted(self.children): + rank = self.children[child_key].set_ranks(rank) + min_key = min(self.children) + max_key = max(self.children) + self.left_rank = (self.children[min_key].left_rank + if (self.rank is None or + self.children[min_key].left_rank < self.rank) + else self.rank) + self.right_rank = (self.children[max_key].right_rank + if (self.rank is None or + self.children[max_key].right_rank > self.rank) + else self.rank) + return rank + +def search(tree, t, n): + return _search_internal(t, 0, n, tree.root) + +def _search_internal(t, idx, n, node): + if idx == n: + return node + if t[idx + 1] not in node.children: + return None + return _search_internal(t, idx + 1, n, node.children[t[idx + 1]]) + +#pylint: disable=too-few-public-methods +class _LZTrie: + def __init__(self, t, n): + t += '$' #guaranteeing unique last node + self.root 
= _LZTreeNode(None, '#', 0, None) + current_node = self.root + idx, position = 1, 1 + for i in range(1, n+2): + current_char = t[i] + if current_char not in current_node.children: + _LZTreeNode(current_node, current_char, idx, position) + idx += 1 + current_node = self.root + position = i+1 + else: + current_node = current_node.children[current_char] + self.size = idx + self.root.set_ranks(0) + +#pylint: disable=too-few-public-methods +class _NodeMapper: + def __init__(self, lz_trie, size): + self.arr = [None] * size + self._map_tree_to_list(lz_trie.root) + + def _map_tree_to_list(self, node): + if node.idx is not None: + self.arr[node.idx] = node + for child in node.children.values(): + self._map_tree_to_list(child) + + def get_node_by_idx(self, idx): + return self.arr[idx] + +#pylint: disable=too-few-public-methods +class _RankMapper: + def __init__(self, lz_trie, size): + self.arr = [None] * size + self._map_tree_to_list(lz_trie.root) + + def _map_tree_to_list(self, node): + if node.rank is not None: + self.arr[node.rank] = node + for child in node.children.values(): + self._map_tree_to_list(child) + + def get_node_by_rank(self, rank): + return self.arr[rank] + +#pylint: disable=too-few-public-methods +class _NaiveRangeSearcher: + def __init__(self, points): + self.points = points + + def search_in_range(self, l1, r1, l2, r2): + return [(x, y) for (x, y) in self.points if l1 <= x <= r1 and l2 <= y <= r2] + +#pylint: disable=too-few-public-methods +class _RangeSearcher: + def __init__(self, points): + self.points = sorted(points, key= lambda x: x[0]) + values = ['#'] + [y for x, y in self.points] + self.wavelet_tree = wavelet_tree.WaveletTree(values, len(values)-1) + + def search_in_range(self, l1, r1, l2, r2): + l, r = 0, len(self.points) + while l < r: + s = (l+r)//2 + x, _ = self.points[s] + if x < l1: + l = s + 1 + else: + r = s + left = l + l, r = -1, len(self.points) - 1 + while l < r: + s = (l+r+1)//2 + x, _ = self.points[s] + if x <= r1: + l = s + else: + 
r = s - 1 + right = l + if left > right or left == len(self.points) or right == -1: + return [] + return ([self.points[x-1] for x in + self.wavelet_tree.range_search(left + 1, right + 1, l2, r2)]) + +#pylint: disable=too-few-public-methods +class _RevLZTrie: + def __init__(self, lz_trie): + self.root = _LZTreeNode(None, '#', 0, None) + self._add_recursive(lz_trie.root) + self.root.set_ranks(0) + + def _add_recursive(self, node): + for child in node.children.values(): + self._add_recursive(child) + self._add_block(child, self.root, child.idx) + + def _add_block(self, lz_node, rev_node, idx): + if lz_node.parent is None or lz_node.parent.character == '#': + if lz_node.character in rev_node.children: + rev_node.children[lz_node.character].idx = idx + else: + rev_node.children[lz_node.character] = (_LZTreeNode(rev_node, + lz_node.character, idx, None)) + else: + if lz_node.character not in rev_node.children: + rev_node.children[lz_node.character] = (_LZTreeNode(rev_node, + lz_node.character, None, None)) + self._add_block(lz_node.parent, + rev_node.children[lz_node.character], idx) + +#pylint: disable=too-few-public-methods +class _LZIndex: + def __init__(self, lz_trie, rev_lz_trie, lz_node_mapper, rev_lz_node_mapper, + range_searcher, lz_rank_mapper, rev_lz_rank_mapper): + self.lz_trie = lz_trie + self.rev_lz_trie = rev_lz_trie + self.lz_node_mapper = lz_node_mapper + self.range_searcher = range_searcher + self.rev_lz_node_mapper = rev_lz_node_mapper + self.lz_rank_mapper = lz_rank_mapper + self.rev_lz_rank_mapper = rev_lz_rank_mapper + +def _contains_internal(lz_index : _LZIndex, s, m): + yield from _contains_in_single_block(lz_index, s, m) + yield from _contains_within_two_blocks(lz_index, s, m) + yield from _contains_within_three_or_more_blocks(lz_index, s, m) + + +def _contains_in_single_block(lz_index : _LZIndex, s, m): + v = '#' + (s[::-1])[:-1] + root = search(lz_index.rev_lz_trie, v, m) + if root is not None: + for i in range(root.left_rank, root.right_rank + 
1): + rev_node = lz_index.rev_lz_rank_mapper.get_node_by_rank(i) + node = lz_index.lz_node_mapper.get_node_by_idx(rev_node.idx) + for j in range(node.left_rank, node.right_rank + 1): + result_node = lz_index.lz_rank_mapper.get_node_by_rank(j) + yield result_node.position + node.depth - m + +def _contains_within_two_blocks(lz_index : _LZIndex, s, m): + for i in range(1, m): + rev_prefix = '#' + (s[::-1])[m-i:m] + sufix = '#' + s[i+1:] + rev_node = search(lz_index.rev_lz_trie, rev_prefix, i) + node = search(lz_index.lz_trie, sufix, m-i) + if rev_node is None or node is None: + continue + for (x, _) in (lz_index.range_searcher.search_in_range(rev_node.left_rank, + rev_node.right_rank, node.left_rank, node.right_rank)): + rev_node = lz_index.rev_lz_rank_mapper.get_node_by_rank(x) + node = lz_index.lz_node_mapper.get_node_by_idx(rev_node.idx) + yield node.position + node.depth - i + +def _prepare_structures_for_third_case(lz_index : _LZIndex, s, m): + used = [[False]*(m+1) for _ in range(m+1)] + existance = [[None]*(m+1) for _ in range(m+1)] + arr = [{}] + for i in range(1, m+1): + recorded = {} + current_node = lz_index.lz_trie.root + for j in range(i, m+1): + if current_node is not None and s[j] not in current_node.children: + current_node = None + elif current_node is not None: + current_node = current_node.children[s[j]] + existance[i][j] = current_node + if current_node is not None: + recorded[current_node.idx] = j + arr.append(recorded) + return used, existance, arr + +def _contains_within_three_or_more_blocks(lz_index : _LZIndex, s, m): + used, existance, arr = _prepare_structures_for_third_case(lz_index, s, m) + for i in range(1, m+1): + for j in range(i, m+1): + if existance[i][j] is None or used[i][j] is True: + continue + start_idx = existance[i][j].idx + current_idx = start_idx + current_end = j + while current_end < m and (current_idx + 1) in arr[current_end+1]: + current_idx = current_idx + 1 + used[current_end + 1][arr[current_end + 1][current_idx]] = 
True + current_end = arr[current_end + 1][current_idx] + size = current_idx - start_idx + 1 + if i > 1: + size = size + 1 + if current_end < m: + size = size + 1 + if size < 3 or (current_end != m and existance[current_end+1][m] is None): + continue + if (lz_index.lz_trie.size > current_idx + 1 and (current_end == m or + (existance[current_end+1][m].left_rank <= + lz_index.lz_node_mapper.get_node_by_idx(current_idx + 1).rank <= + existance[current_end+1][m].right_rank ))): + if i == 1: + yield lz_index.lz_node_mapper.get_node_by_idx(start_idx).position + continue + if start_idx == 1: + continue + current_node = lz_index.lz_node_mapper.get_node_by_idx(start_idx - 1) + prev = i - 1 + while (prev > 0 and current_node.parent is not None and + s[prev] in current_node.parent.children and + current_node.parent.children[s[prev]] == current_node): + prev = prev - 1 + current_node = current_node.parent + if prev == 0: + node = lz_index.lz_node_mapper.get_node_by_idx(start_idx) + yield node.position - i + 1 + +def create_lz_index(t, n): + lz_trie = _LZTrie(t, n) + rev_trie = _RevLZTrie(lz_trie) + lz_node_mapper = _NodeMapper(lz_trie, lz_trie.size) + rev_node_mapper = _NodeMapper(rev_trie, lz_trie.size) + + points = [(rev_node_mapper.get_node_by_idx(i).rank, + lz_node_mapper.get_node_by_idx(i+1).rank) + for i in range(1, lz_trie.size - 1)] + range_searcher = _RangeSearcher(points) + lz_rank_mapper = _RankMapper(lz_trie, lz_trie.size) + rev_lz_rank_mapper = _RankMapper(rev_trie, lz_trie.size) + return _LZIndex(lz_trie, rev_trie, lz_node_mapper, rev_node_mapper, + range_searcher, lz_rank_mapper, rev_lz_rank_mapper) + +def contains(lz_index, s, m): + yield from sorted(_contains_internal(lz_index, s, m)) diff --git a/test/test_exact_string_matching.py b/test/test_exact_string_matching.py index b92cef8..507d761 100644 --- a/test/test_exact_string_matching.py +++ b/test/test_exact_string_matching.py @@ -6,13 +6,32 @@ from generator import rand from exact_string_matching import 
forward, backward, other -from string_indexing import lcp, suffix_tree, suffix_array +from string_indexing import lcp, suffix_tree, suffix_array, fm_index, lz_index +from compression import burrows_wheeler +from common import wavelet_tree def lcp_lr_contains(t, w, n, m): SA = suffix_array.skew(t, n) LCP_LR = lcp.build_lcp_lr(lcp.kasai(SA, t, n), n) return lcp.contains(SA, LCP_LR, t, w, n, m) +def fm_index_wavelet_contains(t, w, n, m): + SA = suffix_array.skew(t, n) + BWT = burrows_wheeler.transform_from_suffix_array(SA, t, n) + FM = fm_index.from_suffix_array_and_bwt(SA, BWT, t, n, 0) + FM.rank_searcher = wavelet_tree.WaveletTree(FM.L, len(FM.L) - 1) + return fm_index.contains(FM, w, m) + +def fm_index_contains(t, w, n, m): + SA = suffix_array.skew(t, n) + BWT = burrows_wheeler.transform_from_suffix_array(SA, t, n) + FM = fm_index.from_suffix_array_and_bwt(SA, BWT, t, n) + return fm_index.contains(FM, w, m) + +def lz_index_contains(t, w, n, m): + LZ = lz_index.create_lz_index(t, n) + return lz_index.contains(LZ, w, m) + EXACT_STRING_MATCHING_ALGORITHMS = [ [ 'Morris-Pratt', forward.morris_pratt ], [ 'Knuth-Morris-Pratt', forward.knuth_morris_pratt ], @@ -33,7 +52,7 @@ def lcp_lr_contains(t, w, n, m): [ 'Karp-Rabin', other.karp_rabin ], [ 'fast-on-average', other.fast_on_average ], [ 'two-way constant space', other.two_way ], - [ 'fft', other.fft ], + [ 'FFT', other.fft ], [ 'suffix tree', lambda t, w, n, m: suffix_tree.contains( @@ -44,7 +63,10 @@ def lcp_lr_contains(t, w, n, m): lambda t, w, n, m: suffix_array.contains( suffix_array.prefix_doubling(t, n), t, w, n, m), ], - [ 'lcp-lr array', lcp_lr_contains ], + [ 'LCP-LR array', lcp_lr_contains ], + [ 'FM index', fm_index_contains ], + [ 'FM index with wavelet tree', fm_index_wavelet_contains ], + [ 'LZ index', lz_index_contains ], ] class TestExactStringMatching(unittest.TestCase): diff --git a/test/test_wavelet_tree.py b/test/test_wavelet_tree.py new file mode 100644 index 0000000..8da2d3d --- /dev/null +++ 
b/test/test_wavelet_tree.py @@ -0,0 +1,257 @@ +import os +import unittest + +from common import wavelet_tree +from generator import rand + +class DummySolver: + def __init__(self, t, n): + self.t = t + self.n = n + + def rank(self, c, l ,r): + if l > r or l > self.n or l < 1: + return 0 + return sum(1 if x == c else 0 for x in self.t[l:r+1]) + + def prefix_rank(self, c, r): + return self.rank(c, 1, r) + + def select(self, c, k, l, r): + current_occurrence = 0 + if l > r or l > self.n or l < 1: + return None + for i in range(l, r+1): + if self.t[i] == c: + current_occurrence = current_occurrence + 1 + if current_occurrence == k: + return i + return None + + def quantile(self, k, l, r): + if l > r or l > self.n or l < 1 or k > r - l + 1: + return None + substring = self.t[l:r + 1] + return sorted(substring)[k - 1] + + def range_count(self, l, r, x, y): + if l > r or l > self.n or l < 1: + return None + result = 0 + for i in range(l, r + 1): + if x <= self.t[i] <= y: + result = result + 1 + return result + +def rank_result(solver, queries): + return [solver.rank(c, l, r) for (c, l, r) in queries] + +def select_result(solver, queries): + return [solver.select(c, k, l, r) for (c, k, l, r) in queries] + +def range_count_result(solver, queries): + return [solver.range_count(l, r, x, y) for (l, r, x, y) in queries] + +def quantile_result(solver, queries): + return [solver.quantile(k, l, r) for (k, l, r) in queries] + +def create_range_for_query(n): + l = rand.random.randint(1, n) + r = rand.random.randint(l, n) + return (l, r) + +def create_rank_query(n, A): + l, r = create_range_for_query(n) + return (rand.random.choice(A), l, r) + +def create_select_query(n, A): + l, r = create_range_for_query(n) + return (rand.random.choice(A), rand.random.randint(1, r - l + 1), l, r) + +def create_quantile_query(n, _): + l, r = create_range_for_query(n) + return (rand.random.randint(1, r - l + 1), l, r) + +def create_range_count_query(n, A): + l, r = create_range_for_query(n) + x, y = 
rand.random.choice(A), rand.random.choice(A) + return (l, r, min(x, y), max(x, y)) + + +class TestWaveletTree(unittest.TestCase): + run_large = unittest.skipUnless( + os.environ.get('LARGE', False), 'Skip test in small runs') + + test_classes = [wavelet_tree.WaveletTree] + + runner_functions = [ + (create_rank_query, rank_result), + (create_select_query, select_result), + (create_quantile_query, quantile_result), + (create_range_count_query, range_count_result) + ] + + random_small_test_data = [ + (12, 10, ['a', 'b', 'c']), + (10, 20, ['a', 'b', 'c']), + (5, 12, ['a', 'b']), + (7, 12, ['a', 'c']), + (5, 25, ['a', 'b', 'c', 'd', 'w', 'e']) + ] + + def create_queries(self, n, q, alphabet, genaration_function): + return [genaration_function(n, alphabet) for _ in range(q)] + + def test_tree_api_handmade(self): + # pylint: disable=consider-using-enumerate + for test_idx in range(len(self.test_inputs)): + for cls in self.test_classes: + text, test_cases = self.test_inputs[test_idx] + solver = cls(text, len(text)-1) + # pylint: disable=consider-using-enumerate + for i in range(len(self.runner_functions)): + _, runner = self.runner_functions[i] + result = runner(solver, test_cases[i]) + self.assertEqual(self.test_expected_outputs[test_idx][i], result) + + def test_small_random(self): + for (n, q, alphabet) in self.random_small_test_data: + self.tree_api_random_test(n, q, alphabet) + + def tree_api_random_test(self, n, q, alphabet): + text = rand.random_word(n, alphabet) + model_solver = DummySolver(text, n) + runners_args = [(runner, self.create_queries(n, q, alphabet, fun)) + for (fun, runner) in self.runner_functions] + model_results = [runner(model_solver, queries) + for (runner, queries) in runners_args] + for cls in self.test_classes: + solver = cls(text, n) + results = [runner(solver, queries) for (runner, queries) in runners_args] + self.assertEqual(model_results, results) + + + large_test_case_data = [ + (1000, 10000, ['a', 'b']), + (1000, 10000, 
'''qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM +[];,./?><+_)(*&^%$#@!1234567890-=)'''.split()), + ] + + @run_large + def test_large_random(self): + for (n, q, alphabet) in self.large_test_case_data: + self.tree_api_random_test(n, q, alphabet) + + test_expected_outputs = [ + [ + [2, 1, 3, 0, 1], + [3, 5, None, 4, None], + ['a', 'a', 'a', 'b', None], + [5, 3, 1, 1, 3, 0, 0] + ], + [ + [4, 1, 4, 1, 1, 0, 0, 3, 1, 2, 1, 0], + [None, None, 3, None, 5, 6, 10, None, None, 7, 8, 10], + ['c', 'a', 'c', 'a', 'a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], + [0, 1, 10, 2, 5, 1, 1, 3, 3, 1, 3, 1] + ] + ] + + test_inputs = [ + ( + '#ababa', + [ + [ + ('a', 1, 3), + ('b', 1, 3), + ('a',1 , 5), + ('c', 1, 5), + ('a', 3, 3) + ], + [ + ('a', 1, 3, 3), + ('a', 3, 1, 5), + ('b', 1, 3, 3), + ('b', 2, 2, 4), + ('c', 1, 1, 5) + ], + [ + (1, 1, 5), + (1, 3, 3), + (3, 1, 5), + (4, 1, 4), + (2, 1, 1) + ], + [ + (1, 5, 'a', 'b'), + (1, 5, 'a', 'a'), + (2, 4, 'a', 'a'), + (3, 3, 'a', 'a'), + (1, 3, ' ', 'c'), + (1, 3, ' ', ' '), + (2, 4, 'c', 'c') + ] + ] + ), + ( + '#bcbbbaabca', + [ + [ + ('b', 3, 10), + ('a', 6, 6), + ('b', 3, 8), + ('a', 9, 10), + ('c', 6, 9), + ('a', 8, 8), + ('c', 4, 4), + ('b', 4, 10), + ('a', 9, 10), + ('a', 5, 8), + ('c', 1, 2), + ('c', 3, 5) + ], + [ + ('c', 2, 8, 10), + ('c', 2, 4, 5), + ('b', 1, 3, 9), + ('c', 1, 5, 8), + ('b', 1, 5, 5), + ('a', 1, 5, 8), + ('a', 1, 10, 10), + ('c', 6, 3, 8), + ('a', 1, 3, 3), + ('a', 2, 6, 9), + ('b', 4, 3, 9), + ('a', 1, 10, 10) + ], + [ + (1, 9, 9), + (2, 6, 8), + (1, 9, 9), + (2, 6, 8), + (2, 4, 7), + (2, 1, 5), + (2, 4, 6), + (1, 10, 10), + (2, 5, 7), + (2, 1, 2), + (3, 8, 10), + (4, 7, 10) + ], + [ + (10, 10, 'b', 'b'), + (3, 3, 'b', 'b'), + (1, 10, 'a', 'c'), + (6, 7, 'a', 'c'), + (1, 6, 'a', 'b'), + (5, 5, 'a', 'b'), + (9, 10, 'b', 'c'), + (3, 7, 'b', 'c'), + (4, 8, 'b', 'c'), + (10, 10, 'a', 'b'), + (4, 6, 'a', 'b'), + (9, 9, 'a', 'c') + ] + ] + ) + ] diff --git a/text/README.md b/text/README.md index 53324ca..12fb6af 
100644 --- a/text/README.md +++ b/text/README.md @@ -4,4 +4,4 @@ Lecture notes for the _String Algorithms_ course (summer semester 2019/20) at [J ### Submission -Please put the pull requests in separate directories with your name +Please put the pull requests in separate directories with your project name diff --git a/text/fm_lz_index/FMIndexLZIndex.pdf b/text/fm_lz_index/FMIndexLZIndex.pdf new file mode 100644 index 0000000..23f0b4d Binary files /dev/null and b/text/fm_lz_index/FMIndexLZIndex.pdf differ diff --git a/text/fm_lz_index/tex/chapters/Comparation/Compare.tex b/text/fm_lz_index/tex/chapters/Comparation/Compare.tex new file mode 100644 index 0000000..3907160 --- /dev/null +++ b/text/fm_lz_index/tex/chapters/Comparation/Compare.tex @@ -0,0 +1,3 @@ +\chapter{Implementation and comparison of LZ-Index and FM-Index algorithms} + +\input{chapters/Comparation/Sections/Comp} \ No newline at end of file diff --git a/text/fm_lz_index/tex/chapters/Comparation/Sections/Comp.tex b/text/fm_lz_index/tex/chapters/Comparation/Sections/Comp.tex new file mode 100644 index 0000000..cd17988 --- /dev/null +++ b/text/fm_lz_index/tex/chapters/Comparation/Sections/Comp.tex @@ -0,0 +1,476 @@ +\section{Implementation} +The algorithms discussed in this work were implemented in Python. The source code of the implementation is publicly available in the \textbf{string-algorithms} GitHub repository and can be accessed at \url{https://github.com/krzysztof-turowski/string-algorithms/pull/57}. + +\subsection{Implementation details} +The implementation of the discussed structures is split across several files. We outline the primary files, their contents and usage below: +\begin{itemize} + \item $\texttt{common/wavelet\_tree.py}$ -- File containing the implementation of the Wavelet Tree structure. + \item $\texttt{string\_indexing/fm\_index.py}$ -- File containing the FM-Index implementation, all required classes and creation methods. 
+ \item $\texttt{string\_indexing/lz\_index.py}$ -- File containing the implementation of the LZ-Index, all required structures and creation methods. + \item $\texttt{test/test\_wavelet\_tree.py}$ -- Unit tests that ensure the correctness of the wavelet tree. + \item $\texttt{test/test\_exact\_string\_matching.py}$ -- Unit tests that ensure the correctness of the implemented string-matching algorithms. +\end{itemize} + +\section{Comparison of FM-Index and LZ-Index} + +We will compare FM-Index and LZ-Index in terms of peak memory used, time needed and accesses to their structures. For the test cases, we will use different alphabet sizes, different input sizes and non-uniform character distributions. A detailed description of all test cases is as follows: +\begin{itemize} + \item Small alphabet -- In this test case we will focus on the properties that the described structures have for a small alphabet. In practice, we set the size of the alphabet to $4$ and generate strings over that alphabet. + \item Medium alphabet -- In this test case, we generate the words over an alphabet of size $16$. We also choose the characters of the text and patterns with uniform distribution over $\mathcal{A}$. This test case will also show the differences in performance in comparison to the small alphabet. + \item Large alphabet -- This test case has the size of the alphabet fixed to $64$. The text and patterns are still chosen randomly with uniform distribution over $\mathcal{A}$. It also provides data for comparison with the smaller alphabet sizes. + \item Alphabet with non-uniform distribution with parameter 1/2 -- This test case differs from the previous ones in the probability with which each character of the alphabet is chosen for the text and patterns. The alphabet used here is infinite and the probability of choosing the $i$-th character from the alphabet is $(\frac{1}{2})^i$. 
+ \item Alphabet with non-uniform distribution with parameter 1/3 -- This test case is similar to the previous test case. The only difference is that the $i$-th character of the alphabet is chosen with probability $(\frac{1}{3})^i$, for $i > 1$, and $\frac{5}{6}$, for $i = 1$, to ensure that the sum of probabilities is equal to $1$. +\end{itemize} + +\subsection{Tests for small alphabet} +\subsubsection{Peak memory comparison} +As for the peak memory for the creation of structures and the search, the results are as follows: + +\begin{table}[H] +\begin{center} +\caption{Memory peak during structure creation for small alphabet} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 103 KB & 116 KB & 38 KB \\ \hline +$10^{3}$ & 765 KB & 921 KB & 392 KB \\ \hline +$10^{4}$ & 6 MB & 8 MB & 4 MB \\ \hline +$10^{5}$ & 58 MB & 77 MB & 41 MB \\ \hline +$10^{6}$ & 534 MB & 708 MB & 391 MB \\ \hline +$3 \cdot 10^{6}$ & 2018 MB & 2103 MB & 1154 MB \\ \hline +\end{tabular} +\end{center} +\end{table} + +To summarize the memory results for this case, the LZ-Index and the standard FM-Index require a similar amount of memory for creation. The exception is the FM-Index with the Wavelet Tree implementation, which uses around $2$ times less memory. 
+ +\begin{table}[H] +\begin{center} +\caption{Memory peak during search for small alphabet} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 6 KB & 1443 B & 1056 B \\ \hline +$10^{3}$ & $10^{1}$ & 6 KB & 1530 B & 1059 B \\ \hline +$10^{3}$ & $10^{2}$ & 194 KB & 1623 B & 1152 B \\ \hline +$10^{4}$ & $10^{1}$ & 6 KB & 1601 B & 1220 B \\ \hline +$10^{4}$ & $10^{2}$ & 195 KB & 1688 B & 1300 B \\ \hline +$10^{5}$ & $10^{2}$ & 203 KB & 1693 B & 1311 B \\ \hline +$10^{6}$ & $10^{2}$ & 204 KB & 1694 B & 1309 B \\ \hline +$10^{6}$ & $10^{3}$ & 15 MB & 2 KB & 2 KB \\ \hline +$10^{6}$ & $10^{4}$ & 1533 MB & 11 KB & 10 KB \\ \hline +$3 \cdot 10^{6}$ & $10^{4}$ & 1533 MB & 11 KB & 10 KB \\ \hline +\end{tabular} +\end{center} +\end{table} + +The memory results for searching in this case show that both implementations of the FM-Index need nearly the same amount of memory. However, the LZ-Index search always requires $\bigO(m^2)$ space, which results in much more memory used in practice. + + +\subsubsection{Time comparison} +The time needed for the creation of structures and for the search is as follows: + +\begin{table}[H] +\begin{center} +\caption{Time needed to create structures for small alphabet} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 2 ms & 1 ms & 1 ms \\ \hline +$10^{3}$ & 21 ms & 8 ms & 8 ms \\ \hline +$10^{4}$ & 187 ms & 96 ms & 131 ms \\ \hline +$10^{5}$ & 1859 ms & 1027 ms & 1319 ms \\ \hline +$10^{6}$ & 19.45 s & 9.8 s & 15.13 s \\ \hline +$3 \cdot 10^{6}$ & 76.83 s & 30.47 s & 44.68 s \\ \hline +\end{tabular} +\end{center} +\end{table} + +The construction of the standard FM-Index is the fastest: it is about $2$ times faster than the construction of the LZ-Index and almost $1.5$ times faster than the construction of the FM-Index with the Wavelet Tree implementation. 
+ +\begin{table}[H] +\begin{center} +\caption{Time needed to perform search for small alphabet} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 1 ms & 1 ms & 1 ms \\ \hline +$10^{3}$ & $10^{1}$ & 1 ms & 1 ms & 1 ms \\ \hline +$10^{3}$ & $10^{2}$ & 2 ms & 1 ms & 1 ms \\ \hline +$10^{4}$ & $10^{1}$ & 2 ms & 1 ms & 1 ms \\ \hline +$10^{4}$ & $10^{2}$ & 3 ms & 1 ms & 1 ms \\ \hline +$10^{5}$ & $10^{2}$ & 3 ms & 1 ms & 1 ms \\ \hline +$10^{6}$ & $10^{2}$ & 3 ms & 1 ms & 1 ms \\ \hline +$10^{6}$ & $10^{3}$ & 759 ms & 2 ms & 2 ms \\ \hline +$10^{6}$ & $10^{4}$ & 76.61 s & 3 ms & 2 ms \\ \hline +$3 \cdot 10^{6}$ & $10^{4}$ & 79.03 s & 3 ms & 3 ms \\ \hline +\end{tabular} +\end{center} +\end{table} + +Time performance of both implementations of the FM-Index is the same. Unfortunately, the time needed by the LZ-Index to find all occurrences is huge, since its time complexity is at least $\bigO(m^2)$ even in the best case. + +\subsection{Alphabet of medium size} + +\subsubsection{Peak memory comparison} + +\begin{table}[H] +\begin{center} +\caption{Memory peak during structure creation for medium alphabet} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 666 KB & 939 KB & 676 KB \\ \hline +$10^{3}$ & 2 MB & 5 MB & 7 MB \\ \hline +$10^{4}$ & 7 MB & 41 MB & 70 MB \\ \hline +$10^{5}$ & 26 MB & 337 MB & 685.0 MB \\ \hline +$10^{6}$ & 129 MB & 3.12 GB & 6.52 GB \\ \hline +$3 \cdot 10^{6}$ & 302 MB & 9.01 GB & 16.89 GB \\ \hline +\end{tabular} +\end{center} +\end{table} + +The memory results for the medium alphabet are very different from those for the small alphabet. In this case the memory used to construct the LZ-Index is around $30$ times smaller than for the standard FM-Index and around $60$ times smaller than for the FM-Index with Wavelet Tree implementation. 
+ +\begin{table}[H] +\begin{center} +\caption{Memory peak during search for medium alphabet} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 524 KB & 2 KB & 2 KB \\ \hline +$10^{3}$ & $10^{1}$ & 757 KB & 17 KB & 17 KB \\ \hline +$10^{3}$ & $10^{2}$ & 44 MB & 15 KB & 15 KB \\ \hline +$10^{4}$ & $10^{1}$ & 1289 KB & 162 KB & 162 KB \\ \hline +$10^{4}$ & $10^{2}$ & 52 MB & 161 KB & 161 KB \\ \hline +$10^{5}$ & $10^{2}$ & 85 MB & 1563 KB & 1563 KB \\ \hline +$10^{6}$ & $10^{2}$ & 89 MB & 1590 KB & 1591 KB \\ \hline +$10^{6}$ & $10^{3}$ & 1.2 GB & 2 MB & 2 MB \\ \hline +$10^{6}$ & $10^{4}$ & 13.4 GB & 4 MB & 4 MB \\ \hline +$3 \cdot 10^{6}$ & $10^{4}$ & 15 GB & 4.2 MB & 4.1 MB \\ \hline +\end{tabular} +\end{center} +\end{table} + +The memory needed to obtain all occurrences is similar for both implementations of the FM-Index, but the LZ-Index still needs a huge amount of memory. + +\subsubsection{Time comparison} + +\begin{table}[H] +\begin{center} +\caption{Time needed to create structures for medium alphabet} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 18 ms & 34 ms & 25 ms \\ \hline +$10^{3}$ & 81 ms & 367 ms & 335 ms \\ \hline +$10^{4}$ & 409 ms & 3.92 s & 3.6 s \\ \hline +$10^{5}$ & 2.79 s & 39.91 s & 38.11 s \\ \hline +$10^{6}$ & 22.3 s & 390 s & 378 s \\ \hline +$3 \cdot 10^{6}$ & 56.3 s & 18.9 min & 16.76 min \\ \hline +\end{tabular} +\end{center} +\end{table} + +In this case, the time needed for construction of the LZ-Index is around $17$ times less than for both implementations of the FM-Index. This is very different from the construction results for the small alphabet. 
+ +\begin{table}[H] +\begin{center} +\caption{Time needed to perform search for medium alphabet} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 6 ms & 3 ms & 2 ms \\ \hline +$10^{3}$ & $10^{1}$ & 36 ms & 4 ms & 3 ms \\ \hline +$10^{3}$ & $10^{2}$ & 2.41 s & 28 ms & 19 ms \\ \hline +$10^{4}$ & $10^{1}$ & 164 ms & 14 ms & 12 ms \\ \hline +$10^{4}$ & $10^{2}$ & 3.53 s & 39 ms & 29 ms \\ \hline +$10^{5}$ & $10^{2}$ & 5.64 s & 76 ms & 62 ms \\ \hline +$10^{6}$ & $10^{2}$ & 6.21 s & 82 ms & 71 ms \\ \hline +$10^{6}$ & $10^{3}$ & 43 s & 120 ms & 92 ms \\ \hline +$10^{6}$ & $10^{4}$ & 392 s & 124 ms & 93 ms \\ \hline +$3 \cdot 10^{6}$ & $10^{4}$ & 417 s & 124 ms & 94 ms \\ \hline +\end{tabular} +\end{center} +\end{table} + +The search times for the medium alphabet are quite similar for both implementations of the FM-Index, but they are much higher than for the small alphabet. That is probably connected with the high memory usage. As for the LZ-Index, the time needed is around $5$ times higher than in the small alphabet case. + +\subsection{Alphabet of large size} +\subsubsection{Peak memory comparison} + +\begin{table}[H] +\begin{center} +\caption{Memory peak during structure creation for large alphabet} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 2 MB & 4.0 MB & 3.0 MB \\ \hline +$10^{3}$ & 11 MB & 23.0 MB & 41.0 MB \\ \hline +$10^{4}$ & 33 MB & 169.0 MB & 406.0 MB \\ \hline +$10^{5}$ & 115.0 MB & 1352.0 MB & 1.1 GB \\ \hline +$10^{6}$ & 402 MB & 9.32 GB & 8.52 GB \\ \hline +\end{tabular} +\end{center} +\end{table} + +The space used by the LZ-Index in this case is still much smaller than that used by both FM-Index implementations. 
An interesting fact is that, for greater text lengths, the FM-Index with Wavelet Tree uses less space than the standard FM-Index implementation. + +\begin{table}[H] +\begin{center} +\caption{Memory peak during search for large alphabet} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 6 MB & 2 KB & 2 KB \\ \hline +$10^{3}$ & $10^{1}$ & 8 MB & 17 KB & 17 KB \\ \hline +$10^{3}$ & $10^{2}$ & 648 MB & 15 KB & 15 KB \\ \hline +$10^{4}$ & $10^{1}$ & 10 MB & 162 KB & 162 KB \\ \hline +$10^{4}$ & $10^{2}$ & 681 MB & 161 KB & 161 KB \\ \hline +$10^{5}$ & $10^{2}$ & 822 MB & 1563 KB & 1614 KB \\ \hline +$10^{6}$ & $10^{2}$ & 894 MB & 2 MB & 2 MB \\ \hline +$10^{6}$ & $10^{3}$ & 10.2 GB & 11.2 MB & 11.1 MB \\ \hline +\end{tabular} +\end{center} +\end{table} + +The memory results of searching in this case are nearly the same as the results of searching for the medium alphabet. We can conclude that the size of the alphabet does not have a high influence on memory usage during search for all these structures. + +\subsubsection{Time comparison} + +\begin{table}[H] +\begin{center} +\caption{Time needed to create structures for large alphabet} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 73 ms & 521 ms & 129 ms \\ \hline +$10^{3}$ & 324 ms & 5.25 s & 1916 ms \\ \hline +$10^{4}$ & 1907 ms & 58.18 s & 22.56 s \\ \hline +$10^{5}$ & 8.2 s & 9.4 min & 3.2 min \\ \hline +$10^{6}$ & 63 s & 89.9 min & 29.5 min \\ \hline +\end{tabular} +\end{center} +\end{table} + +The time needed to construct these structures differs a lot from the previous cases. The size of the alphabet has a high influence on both FM-Index implementations, but a small influence on the LZ-Index. In the largest case, creation of the LZ-Index is almost $30$ times faster than creation of the FM-Index. 
+ +\begin{table}[H] +\begin{center} +\caption{Time needed to perform search for large alphabet} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 297 ms & 10 ms & 7 ms \\ \hline +$10^{3}$ & $10^{1}$ & 458 ms & 12 ms & 14 ms \\ \hline +$10^{3}$ & $10^{2}$ & 32.61 s & 116 ms & 98 ms \\ \hline +$10^{4}$ & $10^{1}$ & 952 ms & 22 ms & 21 ms \\ \hline +$10^{4}$ & $10^{2}$ & 38.89 s & 156 ms & 135 ms \\ \hline +$10^{5}$ & $10^{2}$ & 41.2 s & 172 ms & 149 ms \\ \hline +$10^{6}$ & $10^{2}$ & 44.7 s & 183 ms & 158 ms \\ \hline +$10^{6}$ & $10^{3}$ & 7.2 min & 382 ms & 331 ms \\ \hline +$10^{6}$ & $10^{4}$ & 81.3 min & 472 ms & 403 ms \\ \hline +\end{tabular} +\end{center} +\end{table} + +In contrast to the memory performance, the time performance of these structures depends a lot on the size of the alphabet. The time needed to obtain all occurrences for the LZ-Index is almost $12$ times higher than for the medium-size alphabet. + +\subsection{Non-uniform character distribution with parameter 1/2} + + +\subsubsection{Peak memory comparison} + +\begin{table}[H] +\begin{center} +\caption{Memory peak during structure creation for non-uniform distribution with parameter 1/2} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 101 KB & 114 KB & 45 KB \\ \hline +$10^{3}$ & 740 KB & 897 KB & 502 KB \\ \hline +$10^{4}$ & 6 MB & 8 MB & 5 MB \\ \hline +$10^{5}$ & 58 MB & 78 MB & 57 MB \\ \hline +$10^{6}$ & 534 MB & 717 MB & 591 MB \\ \hline +$3 \cdot 10^{6}$ & 5.21 GB & 7.01 GB & 6.05 GB \\ \hline +\end{tabular} +\end{center} +\end{table} + +The memory results for construction of the structures for the non-uniform distribution are quite similar to the results for the small alphabet. However, here the results for the individual structures are even closer to one another. 
+ +\begin{table}[H] +\begin{center} +\caption{Memory peak during search for non-uniform distribution with parameter 1/2} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 5 KB & 1433 B & 1056 B \\ \hline +$10^{3}$ & $10^{1}$ & 6 KB & 1551 B & 1148 B \\ \hline +$10^{3}$ & $10^{2}$ & 195 KB & 1620 B & 1175 B \\ \hline +$10^{4}$ & $10^{1}$ & 6 KB & 1602 B & 1249 B \\ \hline +$10^{4}$ & $10^{2}$ & 198 KB & 1691 B & 1342 B \\ \hline +$10^{5}$ & $10^{2}$ & 200 KB & 1693 B & 1327 B \\ \hline +$10^{6}$ & $10^{2}$ & 206 KB & 1694 B & 1366 B \\ \hline +$10^{6}$ & $10^{3}$ & 15 MB & 2 KB & 2 KB \\ \hline +$10^{6}$ & $10^{4}$ & 1533 MB & 11 KB & 10 KB \\ \hline +$3 \cdot 10^{6}$ & $10^{4}$ & 1535 MB & 11 KB & 11 KB \\ \hline +\end{tabular} +\end{center} +\end{table} + +The memory results during search are nearly the same as the memory results during search for the small alphabet. + + +\subsubsection{Time comparison} + +\begin{table}[H] +\begin{center} +\caption{Time needed to create structures for non-uniform distribution with parameter 1/2} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 3 ms & 1 ms & 1 ms \\ \hline +$10^{3}$ & 21 ms & 19 ms & 10 ms \\ \hline +$10^{4}$ & 218 ms & 234 ms & 210 ms \\ \hline +$10^{5}$ & 1906 ms & 2.84 s & 2.32 s \\ \hline +$10^{6}$ & 22.44 s & 34.21 s & 29.25 s \\ \hline +$3 \cdot 10^{6}$ & 57.2 s & 98.8 s & 94.3 s \\ \hline +\end{tabular} +\end{center} +\end{table} +The first larger difference between the results for the alphabet with non-uniform distribution and for the small alphabet can be seen in the time needed to construct the structures. For the non-uniform distribution with parameter 1/2, the creation of both FM-Index implementations is around $2$ times slower than the creation of the LZ-Index. 
+ +\begin{table}[H] +\begin{center} +\caption{Time needed to perform search for non-uniform distribution with parameter 1/2} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 1 ms & 1 ms & 1 ms \\ \hline +$10^{3}$ & $10^{1}$ & 1 ms & 1 ms & 1 ms \\ \hline +$10^{3}$ & $10^{2}$ & 1 ms & 1 ms & 1 ms \\ \hline +$10^{4}$ & $10^{1}$ & 1 ms & 1 ms & 1 ms \\ \hline +$10^{4}$ & $10^{2}$ & 2 ms & 1 ms & 1 ms \\ \hline +$10^{5}$ & $10^{2}$ & 3 ms & 1 ms & 1 ms \\ \hline +$10^{6}$ & $10^{2}$ & 5 ms & 1 ms & 1 ms \\ \hline +$10^{6}$ & $10^{3}$ & 851 ms & 1 ms & 1 ms \\ \hline +$10^{6}$ & $10^{4}$ & 79.9 s & 4 ms & 4 ms \\ \hline +$3 \cdot 10^{6}$ & $10^{4}$ & 82.3 s & 4 ms & 4 ms \\ \hline +\end{tabular} +\end{center} +\end{table} + +Time performance of search in this case is exactly the same as in the case with small alphabet. + +\subsection{Non-uniform character distribution with parameter 1/3} + +\subsubsection{Peak memory comparison} + +\begin{table}[H] +\begin{center} +\caption{Memory peak during structure creation for non-uniform distribution with parameter 1/3} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 60 KB & 74 KB & 34 KB \\ \hline +$10^{3}$ & 429 KB & 595 KB & 432 KB \\ \hline +$10^{4}$ & 3 MB & 5 MB & 4 MB \\ \hline +$10^{5}$ & 29 MB & 48 MB & 52 MB \\ \hline +$10^{6}$ & 267 MB & 446 MB & 536 MB \\ \hline +$3 \cdot 10^{6}$ & 713 MB & 1281 MB & 1513 MB \\ \hline +\end{tabular} +\end{center} +\end{table} + +The space results for the non-uniform distribution with parameter 1/3 are quite different from those for the non-uniform distribution with parameter 1/2. The space needed for creation of the LZ-Index is almost $2$ times less than the space needed for creation of the FM-Index. 
+ +\begin{table}[H] +\begin{center} +\caption{Memory peak during search for non-uniform distribution with parameter 1/3} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 6 KB & 1445 B & 1072 B \\ \hline +$10^{3}$ & $10^{1}$ & 8 KB & 1866 B & 1573 B \\ \hline +$10^{3}$ & $10^{2}$ & 206 KB & 1668 B & 1271 B \\ \hline +$10^{4}$ & $10^{1}$ & 32 KB & 12 KB & 12 KB \\ \hline +$10^{4}$ & $10^{2}$ & 216 KB & 1692 B & 1310 B \\ \hline +$10^{5}$ & $10^{2}$ & 227 KB & 1693 B & 1343 B \\ \hline +$10^{6}$ & $10^{2}$ & 231 KB & 1694 B & 1344 B \\ \hline +$10^{6}$ & $10^{3}$ & 16 MB & 2 KB & 2 KB \\ \hline +$10^{6}$ & $10^{4}$ & 1539 MB & 11 KB & 11 KB \\ \hline +$3 \cdot 10^{6}$ & $10^{4}$ & 1544 MB & 11 KB & 11 KB \\ \hline +\end{tabular} +\end{center} +\end{table} + +Unfortunately, the space performance during searching in this case is exactly the same as for the small alphabet case and for the non-uniform distribution with parameter 1/2, so we cannot observe any interesting properties here. + +\subsubsection{Time comparison} + +\begin{table}[H] +\begin{center} +\caption{Time needed to create structures for non-uniform distribution with parameter 1/3} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & 2 ms & 1 ms & 1 ms \\ \hline +$10^{3}$ & 13 ms & 13 ms & 14 ms \\ \hline +$10^{4}$ & 116 ms & 137 ms & 161 ms \\ \hline +$10^{5}$ & 1041 ms & 1829 ms & 1969 ms \\ \hline +$10^{6}$ & 10.83 s & 24.37 s & 26.54 s \\ \hline +$3 \cdot 10^{6}$ & 100.2 s & 274 s & 269 s \\ \hline +\end{tabular} +\end{center} +\end{table} + +The results of time performance for construction of structures are quite similar to the results for non-uniform distribution with parameter 1/2. The only difference is that all the results here are multiplied by $2$. 
+ +\begin{table}[H] +\begin{center} +\caption{Time needed to perform search for non-uniform distribution with parameter 1/3} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +n & m & LZ-Index & Standard FM-Index & FM-Index with Wavelet Tree \\ \hline +$10^{2}$ & $10^{1}$ & 1 ms & 1 ms & 1 ms \\ \hline +$10^{3}$ & $10^{1}$ & 1 ms & 1 ms & 1 ms \\ \hline +$10^{3}$ & $10^{2}$ & 2 ms & 1 ms & 1 ms \\ \hline +$10^{4}$ & $10^{1}$ & 3 ms & 1 ms & 1 ms \\ \hline +$10^{4}$ & $10^{2}$ & 4 ms & 1 ms & 1 ms \\ \hline +$10^{5}$ & $10^{2}$ & 4 ms & 1 ms & 1 ms \\ \hline +$10^{6}$ & $10^{2}$ & 5 ms & 1 ms & 1 ms \\ \hline +$10^{6}$ & $10^{3}$ & 844 ms & 1 ms & 1 ms \\ \hline +$10^{6}$ & $10^{4}$ & 87.4 s & 2 ms & 2 ms \\ \hline +$3 \cdot 10^{6}$ & $10^{4}$ & 92.3 s & 2 ms & 2 ms \\ \hline +\end{tabular} +\end{center} +\end{table} + +As for the time performance for searching, the results do not differ at all from those for the small alphabet and for the non-uniform distribution with parameter 1/2. + +\subsection{Summary} +The advantage of the LZ-Index is its memory usage and the time needed for construction when working with a large or medium alphabet. The differences in that case are huge and can reach even $25$ times less memory used and $30$ times less time for construction. In the other cases the results for construction are quite similar. However, the time and memory needed to find occurrences of a long pattern are large and may be over 100 times higher than for both implementations of the FM-Index. To summarize, the LZ-Index is the better choice for short patterns over a large alphabet. +\\ +\\ +The two implementations of the FM-Index do not differ a lot. The time needed to search for a pattern is practically the same except when the alphabet is large. However, there are differences in their construction. The implementation based on Wavelet Tree uses less memory, but requires more time. 
The differences are rather small, except for the large alphabet, where the implementation based on Wavelet Tree is constructed $3$ times faster. In summary, the described FM-Index implementations are similar in performance, but in the case of a large alphabet it is better to use the implementation based on Wavelet Tree. \ No newline at end of file diff --git a/text/fm_lz_index/tex/chapters/Comparation/Sections/FMDetails.tex b/text/fm_lz_index/tex/chapters/Comparation/Sections/FMDetails.tex new file mode 100644 index 0000000..e69de29 diff --git a/text/fm_lz_index/tex/chapters/Comparation/Sections/LZDetails.tex b/text/fm_lz_index/tex/chapters/Comparation/Sections/LZDetails.tex new file mode 100644 index 0000000..e69de29 diff --git a/text/fm_lz_index/tex/chapters/FM-Index/FMIndex.tex b/text/fm_lz_index/tex/chapters/FM-Index/FMIndex.tex new file mode 100644 index 0000000..0123724 --- /dev/null +++ b/text/fm_lz_index/tex/chapters/FM-Index/FMIndex.tex @@ -0,0 +1,5 @@ +\chapter{FM-Index} + +\input{chapters/FM-Index/Sections/FM1} +\input{chapters/FM-Index/Sections/FM2} +\input{chapters/FM-Index/Sections/FM3} \ No newline at end of file diff --git a/text/fm_lz_index/tex/chapters/FM-Index/Sections/FM1.tex b/text/fm_lz_index/tex/chapters/FM-Index/Sections/FM1.tex new file mode 100644 index 0000000..6966d7e --- /dev/null +++ b/text/fm_lz_index/tex/chapters/FM-Index/Sections/FM1.tex @@ -0,0 +1,107 @@ +\section{Idea} + +The general idea of FM-Index \cite{FMIndex} is quite simple. We want to find a way to get a range of the suffix array such that the pattern occurs at the beginning of all the suffixes in that range. It can be seen that if the pattern occurs in text $t$ then its occurrences form a contiguous range of the suffix array of $t$. More formally: + +\begin{theorem} + If $s$ occurs in $t$ then there exists a range $[i, j]$ such that for all $k \in [i, j]$: $t[SA[k]: SA[k] + m] = s$ and for all $l \notin [i, j]$, it holds that $t[SA[l]: SA[l] + m] \neq s$. 
+\end{theorem} + +\begin{proof} + The proof follows from Observation 1. +\end{proof} + +Earlier it was shown that such a range can be obtained using two binary searches over the suffix array, but this requires $\bigO(m \cdot \log (n))$ time in the worst case. Using the FM-index we can improve on that complexity. The required structure for FM-index is $\texttt{RankSearcher}$ which can answer $\texttt{prefix\_rank(i, c)}$ query. $\texttt{prefix\_rank(i, c)}$ returns the number of characters $c$ in the prefix of length $i$ of a given text or array. Assuming that we can create $\texttt{RankSearcher}$ in $\bigO(f(n, |\mathcal{A}|))$ time and that the time required for $\texttt{prefix\_rank(i, c)}$ is $\bigO(g(n, |\mathcal{A}|))$, the complexity of FM-index will be $\bigO(n + f(n, |\mathcal{A}|))$ time for creation and $\bigO(m\cdot g(n, |\mathcal{A}|))$ time for query. The possible structures that may be used to implement $\texttt{RankSearcher}$ are as follows: + +\begin{enumerate} + \item Wavelet Tree -- it can be constructed in $\bigO(n\cdot \log(|\mathcal{A}|))$ time and can answer $\texttt{prefix\_rank(i, c)}$ query in $\bigO(\log(|\mathcal{A}|))$ time, which results in $\bigO(n\cdot \log(|\mathcal{A}|))$ time for creation of FM-index and $\bigO(m\cdot \log(|\mathcal{A}|))$ time to perform search. This structure is described in \Cref{subsection:WaveletTree}. + \item Prefix array for each character in $\mathcal{A}$ -- this structure can be built in $\bigO(n\cdot |\mathcal{A}|)$ time but answers a query in $\bigO(1)$ time, which means we can create the FM-index in $\bigO(n\cdot |\mathcal{A}|)$ time and perform a search in $\bigO(m)$ time. This structure is used more commonly in practice due to the small size of the alphabet. 
+\end{enumerate} + + +\subsection{Idea of algorithm} +\label{subsection:IdeaOfFMAlgo} + +Let us show the main idea on the following example: FM-index stores text $t = ababa\$$, suffix array of $t$, $BWT(t)$ and length of text $n$, but to better show the idea we will use the matrix from the BWT section. The pattern given for query is $s = aba$. So let us consider a matrix of circular shifts of $t$ sorted in lexicographic order. + +$$ +\begin{bmatrix} +\$ababa\\ +a\$abab\\ +aba\$ab\\ +ababa\$\\ +ba\$aba\\ +baba\$a +\end{bmatrix} +$$ +\newline +As was shown earlier, the first column of the matrix corresponds to the suffix array of $t$ and the last column is $BWT(t)$. Now we want to search for the pattern. Let us consider the pattern character by character backwards, so the first character we want to use is $\textbf{a}$ from $s = ab\textbf{a}$. Now we want to find a range such that all the suffixes begin with that character (the details of how to do it fast will be explained later); in this case this range is [1,3]. + +$$ +\begin{bmatrix} +\$ababa\\ +\textbf{a\$abab}\\ +\textbf{aba\$ab}\\ +\textbf{ababa\$}\\ +ba\$aba\\ +baba\$a +\end{bmatrix} +$$ + +The next operation to perform is to find the new range for the previous letter in the pattern, which is $\textbf{b}$, but it must be done in such a way that this range will only contain suffixes that start with $b$ and are followed by the already processed suffix of the pattern, so we want to find all corresponding suffixes which start with $\textbf{b}$ and are obtained from suffixes from the previous range. So the suffixes in range [1,3] have to have $\textbf{b}$ as the value of the last column (which means the previous character for that suffix was $b$). Only suffixes from the range [1,2] fulfill that condition. 
+ +$$ +\begin{bmatrix} +\$ababa\\ +\textbf{a\$abab}\\ +\textbf{aba\$ab}\\ +ababa\$\\ +ba\$aba\\ +baba\$a +\end{bmatrix} +$$ + +Next, we have to map that range to the corresponding suffixes starting with $\textbf{b}$, which is the range $[4,5]$. We will explain later a way of building such a range. + +$$ +\begin{bmatrix} +\$ababa\\ +a\$abab\\ +aba\$ab\\ +ababa\$\\ +\textbf{ba\$aba}\\ +\textbf{baba\$a} +\end{bmatrix} +$$ + +This way we construct the result range step by step. The algorithm ends after obtaining the range for the first character of the pattern, which in this example is the range $[2,3]$, which is the correct answer. +$$ +\begin{bmatrix} +\$ababa\\ +a\$abab\\ +\textbf{aba\$ab}\\ +\textbf{ababa\$}\\ +ba\$aba\\ +baba\$a +\end{bmatrix} +$$ + +If the algorithm cannot create a proper range at any step, the answer is that the pattern does not occur in the text. The correctness of that process follows from the fact that we construct ranges step by step and in each step we obtain a proper range for each consecutive suffix of $s$. + +\subsection{API of FM-Index} + +The basic function of FM-Index is finding a range of the suffix array such that the pattern occurs in all the suffixes in that range. The following operations can be derived from that range: +\begin{itemize} + \item $\texttt{count(s, m)}$ -- Counting the number of occurrences of the pattern has $\bigO(m\cdot g(n, |\mathcal{A}|))$ time complexity. To obtain the number of occurrences from the given range of suffix array we just retrieve the size of that range and it will be the expected result. + \item $\texttt{exists(s, m)}$ -- Checking if the pattern occurs in text in $\bigO(m\cdot g(n, |\mathcal{A}|))$ time. Getting the answer for that function is equivalent to checking if $\texttt{count(s, m)} > 0$. + \item $\texttt{all\_occurrences(s, m)}$ -- All occurrences of the pattern in text in $\bigO(m\cdot g(n, |\mathcal{A}|) + R)$ time complexity, where $R$ is the number of occurrences of the pattern in text. 
To obtain them, map each index of the given range to the value stored at that position in the suffix array. + \item $\texttt{any\_occurrence(s, m)}$ -- Any occurrence of the pattern in text in $\bigO(m\cdot g(n, |\mathcal{A}|))$ time. It can be done by mapping any index from the result range to the value stored at that index of the suffix array. + \item $\texttt{first\_occurrence(s, m)}$ -- Getting the first occurrence of the pattern in text. There are many ways to get that; one of them requires getting all occurrences and finding the minimal index, but it requires $\bigO(m\cdot g(n, |\mathcal{A}|) + R)$ time, which in the worst case can be a lot. Finding it faster requires solving the RMQ (Range Minimum Query) problem, which can be achieved in many ways, using the following data structures: + \begin{itemize} + \item Sparse table \cite{RMQ1} -- which will require $\bigO(n \cdot \log n)$ preprocessing time and space, and it returns an answer to such a query in $\bigO(1)$ time. + \item Segment tree \cite{SegmentTreeRMQ} -- using a segment tree allows us to compute the answer in $\bigO(\log n)$ time, but it will only require $\bigO(n)$ time and space for preprocessing. + \item Combined sparse table and segment tree -- combining both of the above techniques allows solving RMQ in $\bigO(\log \log n)$ for query and $\bigO(n \cdot \log \log n)$ time for preprocessing. + \item Optimal RMQ algorithm \cite{RMQ1} (Sparse table + Cartesian trees + segmentation) -- the optimal algorithm finds an answer in $\bigO(1)$ time and requires only $\bigO(n)$ time for preprocessing. + \end{itemize} + \item $\texttt{last\_occurrence(s, m)}$ -- Getting the last occurrence of the pattern. It can be done similarly to $\texttt{first\_occurrence(s, m)}$ with the only change that the answer is the maximum instead of the minimum. + \item $\texttt{count\_in\_range(s, m, i, j)}$ -- Getting the number of occurrences of $s$ in substring $t[i:j]$. 
It can be done using a separate Wavelet Tree built on the indices of $SA$, using at most $\bigO(m\cdot g(n, |\mathcal{A}|) + \log (n))$ operations. +\end{itemize} \ No newline at end of file diff --git a/text/fm_lz_index/tex/chapters/FM-Index/Sections/FM2.tex b/text/fm_lz_index/tex/chapters/FM-Index/Sections/FM2.tex new file mode 100644 index 0000000..45e5fa2 --- /dev/null +++ b/text/fm_lz_index/tex/chapters/FM-Index/Sections/FM2.tex @@ -0,0 +1,57 @@ +\section{Details of preprocessing for FM-Index} + +The creation of FM-Index will require the text $t$, the Burrows-Wheeler transform $BWT$ of $t$, the suffix array $SA$ of $t$, the $RankSearcher$ structure, which can perform $\texttt{prefix\_rank(i, c)}$ query, and the $Beginnings$ structure, which will help to get a range in which character $c$ occurs in a given array. + +\subsection[Creation of F and L]{Creation of F and L} + +The array $F$ will correspond to the first column of the matrix which was shown in \Cref{subsection:IdeaOfFMAlgo}; it is an array of characters created from the suffix array as $F[i+1] = t[SA[i]]$ and $F[0] = \$$. The array $L$ corresponds to the last column of the previous matrix and will be equal to $BWT$. This stage takes at most $\bigO(n)$ time and space. + +\subsection{Creation of range mapping for character} + +The next structure, denoted as $MAP$, is a dictionary that maps characters from the set $\{\$\} \cup \mathcal{A}$ to indices in the range $[0, |\mathcal{A}|]$. It can be done by iterating over $L$ and assigning the first unused index for each distinct character. +\newline \newline \newline +The next step involves the construction of a mapping, denoted as $FIRST$, from the range $[0, |\mathcal{A}| + 1]$ to the first occurrence of the corresponding character in $F$. It is known that all equal characters lie in one contiguous range because $F$ was created based on the suffix array. 
We build it by iterating over $F$ and, for each mapped character, assigning the current index in $F$ to our structure unless that character already has an index assigned in the mapping. Without loss of generality, we define $FIRST[|\mathcal{A}|] = n + 1$ to handle the edge case where a character is the last one in the mapping. At that index the character does not exist, but it will mark the end of the previous character's occurrences. Therefore, the range in which character $c$ exists in $F$ is confined to $[ FIRST[ MAP[ c ] ], + [ FIRST[ MAP[ c ] + 1 ] ] - 1 ]$. + +\subsection{Creation of Beginnings structure} +The next structure is very simple due to the dependence on the array $F$ where all first letters are sorted. It is sufficient to iterate over $F$ and record every index at which the character changes. + +\begin{minted}[xleftmargin=20pt, linenos]{python} +def create_beginnings_structure(F, n): + beginnings = [2] + last = F[2] + for i in range(3, n+2): + if F[i] != last: + last = F[i] + beginnings.append(i) + return beginnings +\end{minted} +\subsection{Creation of RankSearcher} + +Here we will focus on the approach based on prefix array. In practice the alphabet is small, so this approach can be faster than using Wavelet Tree. To save some space we can also compute the prefix array only for some selected indices, so during the query we will have to expand our range as in the naive approach. +\newline +\newline + +The code creating that data structure can be written as follows: +\begin{minted}[xleftmargin=20pt, linenos]{python} +# t - text, n - length of t, A - alphabet of t +def create_range_searcher(t, n, A): + result = dict() + for character in A: + prefix = [] + prefix.append(1 if t[0] == character else 0) + for i in range(1, n+1): + prefix.append(prefix[i-1] + (1 if t[i] == character else 0)) + result[character] = prefix + return result +\end{minted} + +The query for that structure looks like a lookup in a typical prefix array. 
+ +\begin{minted}{python} + def rank(self, i, c): + return self.prefix[c][i] +\end{minted} + + +By combining all the algorithms above we obtain a complete FM-index structure. diff --git a/text/fm_lz_index/tex/chapters/FM-Index/Sections/FM3.tex b/text/fm_lz_index/tex/chapters/FM-Index/Sections/FM3.tex new file mode 100644 index 0000000..c01f8f0 --- /dev/null +++ b/text/fm_lz_index/tex/chapters/FM-Index/Sections/FM3.tex @@ -0,0 +1,37 @@ +\section{Details of querying on FM-Index} + +The main function of FM-index is $\texttt{\_get\_range\_of\_occurrences(s, m)}$ which will return the range $[i, j]$ in the suffix array where pattern $s$ occurs. As described in the idea of the algorithm, we will iterate over $s$ backward and will store the proper range for the current suffix of $s$. The only thing to explain is how to expand the current suffix of $s$. Let us suppose that we are currently at the index $i > 1$ in $s$ and the current range is $[a, b]$. Then we: + +\begin{enumerate} + \item Get the next character $c = s[i-1]$. + \item Check if $c$ exists in $t$; it can be done using the character mapper. + \item Get the number of characters $c$ which are before $a$, which is $x = \texttt{prefix\_rank(a-1, c)}$, and the number of characters $c$ that are in the prefix of length $b$, which is $y = \texttt{prefix\_rank(b, c)}$; if $x = y$, there is no character $c$ in our range, so the search can be stopped here. + \item Set $a = \texttt{beginnings[mapper[c]]} + x$, $b = \texttt{beginnings[mapper[c]]} + y - 1$ as the new range and decrement $i$. +\end{enumerate} + +Using that procedure, we first initialize the range for the last character of the pattern and then execute the procedure in a loop to get an answer. 
+ +\begin{minted}[xleftmargin=20pt, linenos]{python} +def _get_range_of_occurrences(FM, p, size): + if size > FM.n or size == 0: + return -1, -1 + if p[-1] not in FM.mapper_of_chars: + return -1, -1 + map_idx = FM.mapper_of_chars[p[-1]] + l= FM.beginnings[map_idx] + r = (FM.beginnings[map_idx + 1] - 1 + if map_idx != FM.len_of_alphabet - 1 else FM.n + 1) + for c in p[-2:0:-1]: + if c not in FM.mapper_of_chars: + return -1, -1 + occurrences_before = FM.rank_searcher.prefix_rank(c, l - 1) + occurrences_after = FM.rank_searcher.prefix_rank(c, r) + if occurrences_before == occurrences_after: + return -1, -1 + map_idx = FM.mapper_of_chars[c] + l = FM.beginnings[map_idx] + occurrences_before + r = FM.beginnings[map_idx] + occurrences_after - 1 + if r < l: + return -1, -1 + return l, r +\end{minted} diff --git a/text/fm_lz_index/tex/chapters/Intro/Introduction.tex b/text/fm_lz_index/tex/chapters/Intro/Introduction.tex new file mode 100644 index 0000000..ffed9c4 --- /dev/null +++ b/text/fm_lz_index/tex/chapters/Intro/Introduction.tex @@ -0,0 +1,6 @@ +\chapter{Introduction} + +\input{chapters/Intro/Sections/Intro1} +\input{chapters/Intro/Sections/Intro2} +\input{chapters/Intro/Sections/Intro3} +\input{chapters/Intro/Sections/Intro4} \ No newline at end of file diff --git a/text/fm_lz_index/tex/chapters/Intro/Sections/Intro1.tex b/text/fm_lz_index/tex/chapters/Intro/Sections/Intro1.tex new file mode 100644 index 0000000..199d5f1 --- /dev/null +++ b/text/fm_lz_index/tex/chapters/Intro/Sections/Intro1.tex @@ -0,0 +1,44 @@ +\section{String-searching} + +\subsection{Definitions} +Let us first introduce basic concepts, used through this work. Let $\mathcal{A}$ denote an alphabet, that is the set of characters. A text, denoted as $t$, is the sequence of characters from the alphabet $\mathcal{A}$, we denote reversed text $t$ as $t^r$. A pattern $s$ is a text that we will try to find in given text $t$. 
+ +\begin{definition} + Substring $t[i: j+1]$ is a part of the text $t = a_1...a_n$ such that $t[i : j+1] = a_i,a_{i+1}...a_{j-1},a_j$. +\end{definition} + +\begin{definition} + Suffix $t[i:]$ is a substring of text $t = a_1...a_n$ such that $t[i:] = a_i...a_n$. +\end{definition} + +\begin{definition} + Prefix $t[:j+1]$ is a substring of text $t = a_1...a_n$, such that $t[:j+1] = a_1...a_j$. +\end{definition} + +\begin{definition} + Compression of text $t$ is a function $F: \mathcal{A}^\mathbb{N} \mapsto X^\mathbb{N}$, that maps text to any object used for compression. We denote length of the result of compression as $w$. +\end{definition} + +\begin{definition} + Compression rate is the ratio of the bit representation of uncompressed text and compressed text. That is $\frac{x}{y}$, where $x$ is the size of uncompressed text in bits and $y$ is length of bit representation of its compression. +\end{definition} + +\subsection{Problem definitions} +String-searching is a well-known problem which can also be found in everyday life. +\begin{problem}[String-searching] \cite{stringSearchingDef} + The string searching, sometimes also called string matching problem, consists of finding all occurrences (or the first occurrence) of a pattern $s$ in a text $t$. +\end{problem} + +\begin{problem}[Multiple string-searching] \cite{multiString} Given a finite set of $k$ pattern strings $S = \{s_1, s_2, ..., s_k\}$ and a text $t$. The problem is to find all the text positions where some $s_i$ occurs in $t$. More precisely the problem is to compute the set $\{ j \colon \exists_i \hspace{1mm} s_i = t_jt_{j+1}...t_{j+|s_i|-1}\}$. +\end{problem} + +\begin{problem}[String-searching with mismatches] \cite{missMatch} +Given a text $t = a_1a_2...a_n$, and a pattern $s$ = $s_1s_2...s_m$, return an index $i$, $(1 \leq i \leq n-m+1)$, such that strings $t[i : i+m]$ and $s$ differ at the minimal number of positions. 
+\end{problem} + +\begin{problem}[Wildcard matching] \cite{wildcardsMatch} + Let text $t= t_1...t_n$ and pattern $s = s_1...s_m$. The pattern $s$ is said to occur at location $i$ in $t$ if, for every position $j$ in the pattern, either $s_j = t_{i+j-1}$ +or at least one of $s_j$ and $t_{i+j-1}$ is the wildcard symbol. +\end{problem} + +Throughout this work we focus only on the string-searching and the multiple string-searching problems. We will mostly focus on the multiple string-searching problem due to the fact that we can save much more time solving that version compared to naive algorithms and it has more usages in practice. \ No newline at end of file diff --git a/text/fm_lz_index/tex/chapters/Intro/Sections/Intro2.tex b/text/fm_lz_index/tex/chapters/Intro/Sections/Intro2.tex new file mode 100644 index 0000000..b8cd421 --- /dev/null +++ b/text/fm_lz_index/tex/chapters/Intro/Sections/Intro2.tex @@ -0,0 +1,18 @@ +\section{Usage of string-searching in practice} +String-searching is used in many areas of IT and the easiest algorithms to solve those problems are already implemented in some programming languages by default \cite{stringSearchingUsage} \cite{Gusfield1997AlgorithmsOS}. String-searching can be used in: +\begin{itemize} + \item Spell Checker + \item Spam Filter + \item Intrusion Detection System Model + \item Search Engine Module + \item Plagiarism Detection System + \item DNA Sequencing Module + \item Digital Forensic Results + \item Information Retrieval Model + \item Databases + \item Search tools, such as grep + \item Educational CD-ROM applications + \item Internet browsers and crawlers +\end{itemize} + +As can be seen, many fields require solving that problem and many of them are commonly used in daily life. 
\ No newline at end of file diff --git a/text/fm_lz_index/tex/chapters/Intro/Sections/Intro3.tex b/text/fm_lz_index/tex/chapters/Intro/Sections/Intro3.tex new file mode 100644 index 0000000..d19a78e --- /dev/null +++ b/text/fm_lz_index/tex/chapters/Intro/Sections/Intro3.tex @@ -0,0 +1,39 @@ +\section{Simple algorithms for string-searching} + +%Naive search section +\subsection{Naive search} +The most natural approach to solve string-searching is naive search where for each index $i$ in text $t$ we compare a substring $t[i : i+m]$ with the pattern. This algorithm can be written as follows: +\begin{minted}[xleftmargin=20pt, linenos]{python} +# t - text, s - pattern, n - length of t, m - length of s +def naive_search(t, s, n, m): + for i in range(0, n-m+1): + equals_on_index = True + for j in range(0,m): + if t[i+j] != s[j]: + equals_on_index = False + break + if equals_on_index: + return True + return False +\end{minted} +It is a very simple algorithm, but in the worst case it can be very slow. For example $t = a^n$, $s = a^{m-1}b$ requires $(n-m+1) \cdot m$ operations. It is easy to observe that the worst time complexity will be $\bigO (n \cdot m)$. For version with multiple patterns we have to run this algorithm for each pattern independently, which results in $\bigO(n \cdot \sum m_i)$ running time. \newline + +However, if the patterns are generated randomly and the size of the alphabet is greater than $1$, this algorithm is quite fast. We observe that with such assumptions the expected number of comparisons in inner loop will be constant and for version with $q$ patterns we obtain $\bigO(q \cdot n)$ running time. 
+\newline +\begin{table}[!htb] +\begin{center} +\caption{Comparison of simple algorithms} +\label{table:simpleAlgos} +\begin{tabular}{|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +Algorithm & Average case & Worst case & Additional space \\ \hline +Naive algorithm & $\bigO(n)$ & $\bigO(n\cdot m)$ & $\bigO(1)$ \\ \hline +Rabin-Karp \cite{rabinKarp} & $\bigO(n)$ & $\bigO(n\cdot m)$ & $\bigO(1)$ \\ \hline +Knuth-Morris-Pratt \cite{KMP} & $\bigO(n)$ & $\bigO(n)$ & $\bigO(m)$ \\ \hline +Two-way algorithm \cite{twoWayAlgo} & $\bigO(n)$ & $\bigO(n)$ & $\bigO(\log m)$ \\ \hline +\end{tabular} +\end{center} +\end{table} + +If the average complexity of a naive algorithm is not enough in specific case there are some algorithms mentioned in table \ref{table:simpleAlgos}, which for single pattern achieves optimal running time. diff --git a/text/fm_lz_index/tex/chapters/Intro/Sections/Intro4.tex b/text/fm_lz_index/tex/chapters/Intro/Sections/Intro4.tex new file mode 100644 index 0000000..01d98bf --- /dev/null +++ b/text/fm_lz_index/tex/chapters/Intro/Sections/Intro4.tex @@ -0,0 +1,451 @@ +\section{Suffix array, BWT and Wavelet tree} +%Suffix array subsection +\subsection{Suffix Array} +Suffix array introduced in \cite{SuffixArray} is a data structure which describes the suffixes of the text. It stores all the suffixes of text sorted in lexicographic order, but there is one trick, it stores only the beginning index of each suffix. + +\begin{definition}\cite[p. 986]{cormenSufixArray} + Given a text $t = a_1,...,a_n$. The suffix array $SA$ of $t$ is defined such that if $SA[i] = j$ , then $t[j]$ is the $i$-th suffix of $t$ in lexicographic order. That is, the $i$-th suffix of $t$ in lexicographic order is $t[SA[i]:]$. +\end{definition} + +We add character $\$$ at the end of text which will work as an empty suffix and that character will also be less than any other character. 
+ +\begin{table}[H] +\caption*{Visualization of suffix array} +\begin{center} +\begin{tabular}{|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +Index & Suffix \\ \hline +0 & $ananas\$$ \\ \hline +1 & $nanas\$$ \\ \hline +2 & $anas\$$ \\ \hline +3 & $nas\$$ \\ \hline +4 & $as\$$ \\ \hline +5 & $s\$$ \\ \hline +6 & $\$$ \\ \hline +\end{tabular} +$\implies$ +\begin{tabular}{|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +SA-Index & Original index & Suffix \\ \hline +0 & 6 & $\$$ \\ \hline +1 & 0 & $ananas\$$ \\ \hline +2 & 2 & $anas\$$ \\ \hline +3 & 4 & $as\$$ \\ \hline +4 & 1 & $nanas\$$ \\ \hline +5 & 3 & $nas\$$ \\ \hline +6 & 5 & $s\$$ \\ \hline +\end{tabular} +\end{center} +\end{table} + +If we want to store each suffix as strings, it will require at least $\bigO(n^2)$ time and space, which is not optimal. The suffix array can be computed by many methods, such as: +\begin{table}[H] +\caption{Algorithms for computing suffix array} +\begin{center} +\label{table:suffixArrayAlgos} +\begin{tabular}{|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +Algorithm & Time complexity & Additional space \\ \hline +Naive algorithm & $\bigO(n^2 \log n)$ & $\bigO(n^2)$ \\ \hline +KMR algorithm \cite{KMR} & $\bigO(n \log^2 n)$ & $\bigO(n \log n)$ \\ \hline +KMR \cite{KMR} + radix sort \cite{radixSort} & $\bigO(n \log n)$ & $\bigO(n \log n)$ \\ \hline +SA-IS algorithm \cite{SA-IS} & $\bigO(n)$ & $\bigO(n)$ \\ \hline +Kärkkäinen-Sanders algorithm \cite{KS} & $\bigO(n)$ & $\bigO(n)$ \\ \hline +Larsson-Sadakane algorithm \cite{LS} & $\bigO(n \log n)$ & $\bigO(n)$ \\ \hline +Linear suffix tree algorithm \cite{suffixTree} & $\bigO(n)$ & $\bigO(n)$ \\ \hline +\end{tabular} +\end{center} +\end{table} + +Suffix array is used in many text algorithms. We will use this structure later for computing FM-index, but if we have suffix array for example we can solve string-matching for many patterns faster than naive algorithms. 
Note that the suffix array can be computed in $\bigO(n)$ running time using algorithms mentioned in table \ref{table:suffixArrayAlgos}. +\newline +\newline + +Let us consider the text $t$ of length $n$, its suffix array $SA$ and patterns $s_1,s_2...$. To solve string-matching using suffix array we will make a simple observation which will allow us to use binary search. + +\begin{observation} +\label{observation:suffixArrayObservation} +Pattern $s$ exists in text $t$ if and only if there is a suffix of $t$ that has a prefix equal to $s$. +\end{observation} + +Having that observation and a suffix array we can use binary search over the suffix array to find such suffix. Due to the definition of suffix array, all suffixes in it are sorted so that we can use binary search. + +\begin{minted}[xleftmargin=20pt, linenos]{python} +# t - text, SA - suffix array, n - length of t, s - pattern +# m - length of s +def pattern_exists(t, SA, n, s, m): + l = 0 + r = n - 1 + while l < r: + mid = (l+r)//2 + string_to_compare = ''.join( + t[i] for i in range(SA[mid], min(n, SA[mid] + m + 1))) + if string_to_compare < s: + l = mid + 1 + else: + r = mid + if (n - SA[l] < m) or (t[SA[l] : SA[l] + m] != s): + return False + return True +\end{minted} + +As we can see, the binary search will execute string comparison of length at most $m$, so the time complexity of this function is $\bigO(\log(n)\cdot m)$, which after the summation for all strings gives us $\bigO(\log(n) \cdot \sum m_i)$, which is not optimal because the input size is $\sum m_i$. In that analysis we are skipping the time needed for calculating the suffix array, which can be done using $\bigO(n)$ time. Comparison of suffix and pattern can be stopped at the first index where they differ, so we can save some time comparing characters one by one without materializing the suffix, but the worst complexity will be the same. 
+ +\subsection{Burrows-Wheeler transform (BWT)} + +The Burrows-Wheeler transform (BWT) is a transformation that permutes the characters of a string in a specific order. The easiest way to show how BWT work is to use an example. +\newline \newline +Let us consider text $t = baabb$. We can mark the end of the text with \$ to make it reversible, so $t = baabb\$$. Now let us construct a matrix of all the circular shifts of the text (circular shift on text is an operation that takes the first character from text and place it at the end of the text). +\bigskip +\newline +$$ +\begin{bmatrix} +baabb\$\\ +aabb\$b\\ +abb\$ba\\ +bb\$baa\\ +b\$baab\\ +\$baabb +\end{bmatrix} +$$ +Now we sort the texts in the matrix in lexicographic order. The matrix after sort will look like this: +$$ +\begin{bmatrix} + + +\$baabb\\ +aabb\$b\\ +abb\$ba\\ +baabb\$\\ +bb\$baa\\ +b\$baab +\end{bmatrix} +$$ + +The output of BWT is the last column of this matrix, that is, $BWT(t) = bba\$ab$. +\begin{definition} + $BWT(t)$ is the last column of the matrix of circular shifts, sorted lexicographically. +\end{definition} +Such definition immediately provide an algorithm for computing $BWT(t)$, which is, get all circular shifts of $t$, sort them and get the last column of matrix. Unfortunately, a direct approach is very slow, because sorting such a matrix in the worst case requires $\bigO(n^2 \log n)$ time, but it can be done much faster. This algorithm works similar to the naive algorithm for computing the suffix array because, the matrix after sort is like a suffix array (but it does not include the indexes), in detail the characters in the first column are the letters on indexes from suffix array. All the texts in this matrix are circular shifts, so the character in the last column is a previous character in text of the character in the first column. This follows to the new observation. 
+ +\begin{observation} + +$BWT(t)$ on index $i$ is the same as $t[SA[i] - 1]$ or $\$$ if $SA[i] = 0$, and at the first index of $BWT(t)$ is last character of $t$. +\end{observation} + +This observation provides a fast algorithm for computing $BWT(t)$, that requires a precomputed suffix array. + +\begin{minted}[xleftmargin=20pt, linenos]{python} +def BWT(SA, t, n): + bwt = [t[n - 1]] + for i in range(0, n): + if SA[i] == 0: + bwt.append('$') + else: + bwt.append(t[SA[i] - 1]) + return ''.join(bwt) +\end{minted} + +It can be proved that this algorithm requires only $\bigO(n)$ time, which means that it is optimal for computing $BWT(t)$. + +\subsection{Wavelet tree} +\label{subsection:WaveletTree} +Wavelet tree is a data structure for performing a certain set of operations on ranges for a given text. In particular, the required operations are: +\begin{itemize} + \item $\texttt{rank(i, j, x)}$ -- get the number of occurrences of character $x$ in $t[i : j+1]$. + \item $\texttt{select(i, j, k, x)}$ -- get the $k$-th occurrence of character $x$ in $t[i : j+1]$. + \item $\texttt{quantile(i, j, k)}$ -- get the $k$-th smallest character in $t[i : j+1]$. + \item $\texttt{range\_count(i, j, x, y)}$ -- get the number of positions $k \in [i,j]$ such that $x \leq t[k] \leq y$. +\end{itemize} + +The wavelet tree can be constructed in $\bigO(n\cdot \log(|\mathcal{A})|)$ time and each of the queries mentioned above can be answered in $\bigO(\log(|\mathcal{A}|))$ time. The only weakness of wavelet tree is its space complexity, equal to $\bigO(n\cdot \log(|\mathcal{A}|))$. + +\subsubsection{Idea} +The idea of Wavelet Tree is quite simple. We will represent Wavelet Tree as a binary tree. In each node we store the alphabet of $t$, minimal and maximal character in alphabet, prefix sum array and the information about each character and index to which division does it belong. 
+ + +For each node we sort an alphabet and divide it in half denoted as $\mathcal{A}_1, \mathcal{A}_2$ and create texts $t_1,t_2$ where $t_1$ (respectively, $t_2$) is a copy of $t$ with all characters for alphabet $\mathcal{A}_2$ (respectively, $\mathcal{A}_1$) removed, but the order of characters exactly preserved as in $t$. Next we recursively create left node with arguments $t_1,\mathcal{A}_1$ and right node with arguments $t_2, \mathcal{A}_2$, until we reach the base case, when the alphabet $\mathcal{A}$ contains only one element. Additionally, in each node we store a prefix sum array where the character contributes to the prefix sum as $0$ where character is in $\mathcal{A}_1$ and as $1$ if character is in $\mathcal{A}_2$. + +\begin{tikzpicture}[>=latex] +\matrix[mymat,row 2/.style={nodes=draw}] +at (3,0) +(mat1) +{ +0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 \\ +a & b & r & a & k & a & d & a & b & r & a \\ +}; +\matrix[mymat,row 2/.style={nodes=draw}] +at (0,-3) +(mat3) +{ +0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 \\ +a & b & a & a & d & a & b & a \\ +}; +\matrix[mymat,row 2/.style={nodes=draw}] +at (7, -3) +(mat4) +{ +0 & 1 & 2 \\ +r & k & r \\ +}; +\matrix[mymat,row 2/.style={nodes=draw}] +at (-1, -6) +(mat5) +{ +0 & 1 & 2 & 3 & 4 & 5 & 6\\ +a & b & a & a & a & b & a \\ +}; +\matrix[mymat,row 2/.style={nodes=draw}] +at (3, -6) +(mat6) +{ +0 \\ +d \\ +}; + +\matrix[mymat,row 2/.style={nodes=draw}] +at (6, -6) +(mat7) +{ +0\\ +k\\ +}; + +\matrix[mymat,row 2/.style={nodes=draw}] +at (8, -6) +(mat8) +{ +0 & 1 \\ +r & r \\ +}; +\matrix[mymat, row 2/.style={nodes=draw}] +at (-2, -9) +(mat9) +{ +0 & 1 & 2 & 3 & 4 \\ +a & a & a & a & a \\ +}; +\matrix[mymat, row 2/.style={nodes=draw}] +at (2, -9) +(mat10) +{ +0 & 1\\ +b & b\\ +}; + +\node[above=0pt of mat1] + (cella) {Example of Wavelet Tree for $t = abrakadabra$}; + +\begin{scope}[shorten <= -2pt] + +\draw[*->] + (mat1-2-4.south) -- (mat3-1-4.north); +\draw[*->] + (mat1-2-8.south) -- (mat4-1-2.north); +\draw[*->] + 
(mat3-2-4.south) -- (mat5-1-3.north); +\draw[*->] + (mat3-2-6.south) -- (mat6-1-1.north); +\draw[*->] + (mat4-2-1.south) -- (mat7-1-1.north); +\draw[*->] + (mat4-2-3.south) -- (mat8-1-2.north); +\draw[*->] + (mat1-2-4.south) -- (mat3-1-4.north); +\draw[*->] + (mat5-2-3.south) -- (mat9-1-2.north); +\draw[*->] + (mat5-2-6.south) -- (mat10-1-2.north); + +\end{scope} + +\end{tikzpicture} + +To save time we can sort the alphabet only once in the root node, split it and pass it as an argument for children nodes, which will result in total $\bigO(|\mathcal{A}|\log (|\mathcal{A}|))$ operations for sorting. +\newline +\newline +A construction of Wavelet Tree can be implemented in the following way: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +class WaveletTree: + def __init__(self, t, n, sorted_alphabet = None): + t = t[1:] + if sorted_alphabet is not None: + self.alphabet = set(sorted_alphabet) + else: + self.alphabet = set(t) + sorted_alphabet = sorted(list(self.alphabet)) + self.n = n + self.smallest = sorted_alphabet[0] + self.largest = sorted_alphabet[-1] + if len(sorted_alphabet) == 1: + self.leaf = True + return + self.leaf = False + left_alphabet = sorted_alphabet[:(len(sorted_alphabet) + 1)//2] + right_alphabet = sorted_alphabet[(len(sorted_alphabet) + 1)//2:] + self.zero_indexed = set(left_alphabet) + self.one_indexed = set(right_alphabet) + value_array = [1 if c in self.one_indexed else 0 for c in t ] + self.prefix_sum = [0] + for i in range(n): + self.prefix_sum.append(self.prefix_sum[i] + value_array[i]) + self.left_indices = [0] + self.right_indices = [0] + for i in range(n): + if t[i] in self.zero_indexed: + self.left_indices.append(i+1) + else: + self.right_indices.append(i+1) + left_t = ['#'] + [c for c in t if c in self.zero_indexed] + right_t = ['#'] + [c for c in t if c in self.one_indexed] + self.left = WaveletTree(left_t, len(left_t) - 1, left_alphabet) + self.right = WaveletTree(right_t, len(right_t) - 1, right_alphabet) +\end{minted} + +It 
can be seen that the creation of Wavelet Tree requires $\bigO(n \log(|\mathcal{A}|))$ time and space. We can observe that for each level of nodes the total time and space is the same, and it is equal to $\bigO(n)$; also, the height of the tree is $\bigO(\log(|\mathcal{A}|))$. The creation complexity can be compared to the complexity of Merge Sort \cite[p. 34-44]{cormenSufixArray} where we use similar approach. Also note that $|\mathcal{A}|$ is less than or equal to $n$, so the time used for the sorting at the root node will not be greater than $\bigO(n \log(|\mathcal{A}|))$. +\newline + +Using all the information stored in Wavelet Tree we can now proceed to answer the queries. +\newline + +\subsubsection{Rank query} +$\texttt{rank(i, j, x)}$ query can be answered using recursion and modifying the range as we go deeper into the tree to the leaf, it can be described in the following way: + +\begin{enumerate} + \item If $x$ is not in the alphabet or range $[i,j]$ is empty return $0$. + \item If the node is a leaf, then return $j-i+1$. + \item Modify $i$ and $j$, so they correspond to the correct positions in a recursive call. + \item Recursive call on proper child and range. +\end{enumerate} + +The range modification may be implemented in the following way: +\begin{minted}[xleftmargin=20pt, linenos]{python} + def _left_tree_range(self, l, r): + return l - self.prefix_sum[l-1], r - self.prefix_sum[r] + + def _right_tree_range(self, l, r): + return (self.prefix_sum[l-1] + 1, self.prefix_sum[r]) +\end{minted} + +The correctness of each query follows explicitly from the correctness of range mapping and the trivially true base case. + +For the right node we observe that the prefix sum corresponds to the number of elements in right tree at given index, so new indexes in right tree are just values of prefix sum. In the other case the indices in left node are the complement for right node, so it will be original index minus the prefix sum value. 
Using such functions, the $\texttt{rank(i, j, x)}$ query can be implemented in the following way: + +\begin{minted}[xleftmargin=20pt, linenos]{python} + def rank(self, c, l, r): + if c not in self.alphabet or l > r or l > self.n or r < 1: + return 0 + if self.leaf: + return r-l+1 + if c in self.zero_indexed: + new_l, new_r = self._left_tree_range(l, r) + return self.left.rank(c, new_l, new_r) + new_l, new_r = self._right_tree_range(l, r) + return self.right.rank(c, new_l, new_r) +\end{minted} + +Note that it is easy to modify $\texttt{rank(c, l, r)}$ function to return all indices. The only difference is that we return range $[l, r]$ in the base case and map all indices from the recursion call to current node indices. + +\subsubsection{Select query} +Answering the $\texttt{select(i, j, k, x)}$ query uses the same approach as the $\texttt{rank(i, j, x)}$ query but returns another information in the base case where we just return an index and as we leave the recursion call we map it to an index in currently considered text inside a node. It may be implemented as follows: + +\begin{minted}[xleftmargin=20pt, linenos]{python} + def select(self, c, k, l, r): + if c not in self.alphabet or l > r or l > self.n or r < 1 : + return None + if self.leaf: + return k+l-1 if k <= r-l+1 else None + if c in self.zero_indexed: + new_l, new_r = self._left_tree_range(l, r) + rec_result = self.left.select(c, k, new_l, new_r) + return (self.left_indices[rec_result] if rec_result is not None + else None) + new_l, new_r = self._right_tree_range(l, r) + rec_result = self.right.select(c, k, new_l, new_r) + return (self.right_indices[rec_result] if rec_result is not None + else None) +\end{minted} + +\subsubsection{Quantile query} +The $\texttt{quantile(i, j ,k)}$ will also use similar approach but in each step we will count the number of characters in the left node in the corresponding range and recursively call the function according to the $k$ and number of characters. 
As can be seen if number of characters in the left node in range $[i, j]$ is less than $k$ we will call recursively with arguments $mapped(i)$, $mapped(j)$ and $k - num$, due to that each character in left node are smaller than in the right node, and now we are searching for $(k-num)$-th element, in other case we call recursively $mapped(i)$, $mapped(j)$, $k$ on the left node. As for the base case we will just return the character which is in the leaf. + +\begin{minted}[xleftmargin=20pt, linenos]{python} + def quantile(self, k, l, r): + if k < 1 or k > r-l+1: + return None + if self.leaf: + return self.smallest if k <= self.n else None + left_num = self.prefix_sum[r] - self.prefix_sum[l-1] + if r-l+1-left_num >= k: + new_l, new_r = self._left_tree_range(l, r) + return self.left.quantile(k, new_l, new_r) + new_l, new_r = self._right_tree_range(l, r) + return self.right.quantile(k-r+l-1+left_num, new_l, new_r) +\end{minted} + +All the queries requires constant time in a single node and use one recursion call in one of the child, so the time complexity is proportional to the depth of the tree, which is $\bigO(\log(|\mathcal{A}|))$. + +\subsubsection{RangeCount query} +As for $\texttt{range\_count(i, j, x, y)}$ query here we use another approach. For each range that fulfills condition $x \leq node.smallest$ and $node.biggest \leq y$ the answer is simple, it is $j-i+1$. So we want to find the closest nodes that fulfill that condition and sum the answers. To find such nodes, we can use recursion in the following way (the order of conditions is important): + +\begin{enumerate} + \item Base case: if $x \leq node.smallest$ and $node.biggest \leq y$, then return (j - i + 1) + \item If the node is a leaf, then return 0 + \item If $[x, y]$ intersect with range of elements of left child and right child, then use recursive call on both children and return the sum of results. 
+ \item If $[x, y]$ intersect only in the right child range of elements, then return the result of recursion call for right child. + \item Otherwise, return the result of recursion call for left child. +\end{enumerate} + + + +The intersection of ranges can be implemented naively, just considering all cases: + +\begin{minted}[xleftmargin=20pt, linenos]{python} + def _does_one_range_end_in_another(self, l, r, i, j): + return (i <= l <= j) or (i <= r <= j) + + def _ranges_intersect(self, l, r, i, j): + return (self._does_one_range_end_in_another(l, r, i ,j) or + self._does_one_range_end_in_another(i, j, l, r)) +\end{minted} + +The first case is just a base case, so we return the length of the range. In the second case if the current node is leaf, then it means that no element fulfills the condition, so we terminate the recursion here. As for other case we just check if there are some elements in each child and invoke a proper recursion call. To prove the time complexity we need the following fact: + +\begin{observation} + After the first call of case $3$ for the next calls of case $3$ one of its child will fulfill base case condition. +\end{observation} + +\begin{proof} + We inductively assume that this property holds for the children of the node. Each element of left child is smaller than any element of the right child. That implies that for each case $3$ which occurs in the left child, the recursion call on the right child will fulfill the base case due to the fact that earlier in one of the ancestors of that node we used recursion call for right node, but all elements there were greater than in current child. Hence, that $node.right.biggest \leq y$ and also we used recursion call for the left side, but it can only happen if $x \leq node.left.biggest$, which implies that $x \leq node.right.smallest$, and it is exactly the base condition. The analysis for case $3$ calls on the nodes on the right after the first occurrence of case $3$ is similar. 
+\end{proof} + +Using that observation we can now calculate properly the time complexity of \\ $\texttt{range\_count(i, j, x, y)}$. If case $3$ did not occur at all during the query the answer is simple, it is just the height of tree which is $\bigO(\log(|\mathcal{A}|))$. In the other case the first case $3$ occurs after at most $\bigO(\log(|\mathcal{A}|))$ operations, and now we can calculate the time needed for left and right child. On each of the next occurrences of case $3$ we can observe that it will use $\bigO(1)$ operations for at least one of the child and in the other child in the worst case the recursion can go deeper which also can be represented as height of the tree which is $\bigO(\log(|\mathcal{A}|))$. In the worst case we need $\bigO(\log(|\mathcal{A}|))$ operations for the first occurrence and $\bigO(\log(|\mathcal{A}|))$ operations for both children which in total results in $\bigO(\log(|\mathcal{A}|))$ running time. The implementation of that function is shown below: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +def range_count(self, l, r, x, y): + if l > r or l > self.n or l < 1 or x > y: + return 0 + if x <= self.smallest and self.largest <= y: + return r-l+1 + if self.leaf or y < self.smallest or x > self.largest: + return 0 + l_node, r_node = self.left, self.right + if (self._ranges_intersect(l_node.smallest, l_node.largest, x, y) and + self._ranges_intersect(r_node.smallest, r_node.largest, x, y)): + new_left_l, new_left_r = self._left_tree_range(l, r) + new_right_l, new_right_r = self._right_tree_range(l, r) + return (self.left.range_count(new_left_l, new_left_r, x, y) + + self.right.range_count(new_right_l, new_right_r, x, y)) + if (self._ranges_intersect(self.right.smallest, + self.right.largest, x, y)): + new_l, new_r = self._right_tree_range(l, r) + return self.right.range_count(new_l, new_r, x, y) + new_l, new_r = self._left_tree_range(l, r) + return self.left.range_count(new_l, new_r, x, y) +\end{minted} + +Note that 
$\texttt{rank(c, l, r)}$ and $\texttt{range\_count(l, r, x, y)}$ functions are easy to modify to return all indices. The only difference is to return range $[l, r]$ in the base case and map all indices from the recursion call to current node indices. Modification of $\texttt{range\_count(l, r, x, y)}$ returning all indices will be called $\texttt{range\_search(l, r, x, y)}$ and will be useful in construction of LZ-Index. \ No newline at end of file diff --git a/text/fm_lz_index/tex/chapters/LZ-Index/LZIndex.tex b/text/fm_lz_index/tex/chapters/LZ-Index/LZIndex.tex new file mode 100644 index 0000000..2665de6 --- /dev/null +++ b/text/fm_lz_index/tex/chapters/LZ-Index/LZIndex.tex @@ -0,0 +1,7 @@ +\chapter{LZ-Index} + +\input{chapters/LZ-Index/Sections/LZ1} +\input{chapters/LZ-Index/Sections/LZ2} +\input{chapters/LZ-Index/Sections/LZ3} +\input{chapters/LZ-Index/Sections/LZ4} + diff --git a/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ1.tex b/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ1.tex new file mode 100644 index 0000000..a24de2e --- /dev/null +++ b/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ1.tex @@ -0,0 +1,404 @@ +\section{Preliminaries} +\subsection{LZ compression} +\label{subsection:LZCompression} +The most important part in LZ-Index \cite{LZIndex} is the fact that we want to search for patterns within the compressed text. The compressions that can be used for this particular structure are LZ78 \cite{LZ78} and LZW \cite{LZW}. Here we will focus on the LZ78 compression. + +\subsubsection{Idea of LZ78 compression} + +The main idea of LZ78 compression is to convert text $t$ into sequence of blocks, where the block will be in form $(i, c)$ where $i$ is an index and $c$ is the character of $t$, so the output of the LZ78 compression will be a sequence of $B_1,...,B_w$. Each of the block will clearly represent some substring of $t$ and after translating this block to that substring we will get exactly text $t$. 
The block $B_x$ is defined as a pair $(i, c)$ such that $B_x$ corresponds to the string obtained by decompressing $B_i$ and appending character $c$ at the end, or just $c$ if $i = 0$. So we will use the previous blocks to create a new one by extending them with one character, so the idea can be written as: + +\begin{enumerate} + \item Start with the empty string $q$ and previous block as $prev$ = 0. + \item Get next character $c$ of $t$. + \item If there is no block that represents string $q + c$ or $c$ is the last character of $t$, then add block $(prev, c)$ to the result, reset $q$ to the empty string, set $prev = 0$ and go to step 2. + \item Otherwise, which means that there is a block $B_z$ that represents string $q + c$, set $q = q + c$ and $prev = z$ and go to step 2. +\end{enumerate} + +To summarize it with one sentence: get next character of $t$ and add it to current string, if it is unique at that moment create a new block from the previous one and start again from empty string with next character of $t$. + +\subsubsection{Example of LZ78 compression} + +Let's try to find an LZ78 compression of text $t = aabb$. We start with: + +\begin{table}[H] +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +step & result & current string & previous block & blocks mapping \\ \hline +0 & {[} {]} & ' ' & 0 & \{\} \\ \hline +\end{tabular} + +\end{center} +\end{table} +Going forward with the loop, the next character is $a$. + +\begin{table}[H] +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +step & result & current string & previous block & blocks mapping \\ \hline +0 & {[} {]} & ' ' & 0 & \{\} \\ \hline +1 & {[} {]} & 'a' & 0 & \{\} \\ \hline +\end{tabular} +\end{center} +\end{table} +There is no block that corresponds to the string currently under consideration, so we add a new block, set default values and look at the next character. 
+ +\begin{table}[H] +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +step & result & current string & previous block & blocks mapping \\ \hline +0 & {[} {]} & ' ' & 0 & \{\} \\ \hline +1 & {[} {]} & 'a' & 0 & \{\} \\ \hline +2 & {[} (0, a) {]} & 'a' & 0 & \{ $B_1$ = 'a' \} \\ \hline +\end{tabular} + +\end{center} +\end{table} +There is already a block that correspond to string $'a'$, so we continue with the next character. + +\begin{table}[H] +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +Step & result & current string & previous block & blocks mapping \\ \hline +0 & {[} {]} & ' ' & 0 & \{\} \\ \hline +1 & {[} {]} & 'a' & 0 & \{\} \\ \hline +2 & {[} (0, a) {]} & 'a' & 0 & \{ $B_1$ = 'a' \} \\ \hline +3 & {[} (0, a) {]} & 'ab' & 1 & \{ $B_1$ = 'a' \} \\ \hline +\end{tabular} + +\end{center} +\end{table} +Now string currently under consideration is unique withing current mapped blocks, so we add it as a new block and goes with new character. + +\begin{table}[H] +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline +\rowcolor[HTML]{C0C0C0} +step & result & current string & previous block & blocks mapping \\ \hline +0 & {[} {]} & ' ' & 0 & \{\} \\ \hline +1 & {[} {]} & 'a' & 0 & \{\} \\ \hline +2 & {[} (0, a) {]} & 'a' & 0 & \{ $B_1$ = 'a' \} \\ \hline +3 & {[} (0, a) {]} & 'ab' & 1 & \{ $B_1$ = 'a' \} \\ \hline +4 & {[} (0, a), (1, b) {]} & 'b' & 0 & \{ $B_1$ = 'a', $B_2$ = 'ab' \} \\ \hline +\end{tabular} + +\end{center} +\end{table} +The last character is $b$ which is also unique string, so the result of compression will be $(0, a), (1, b), (0, b)$. + + +\subsubsection{Properties of LZ78 compression} + +Let us start by stating some interesting properties of the LZ78 compression. + +\begin{observation} \cite{LZIndex} + Let $B_1,...,B_w$ be the LZ78 compression of $t$. Each of these blocks is unique, possibly except the last one. 
We can make them all unique by appending to $t$ a character that does not appear in the alphabet. +\end{observation} + +\begin{proof} + Let us prove it by contradiction. Let us assume that there are two blocks $B_i, B_j$ such that $B_i = B_j$, $1 \leq i \neq j < w$. Without loss of generality we assume that $j > i$. Consider the LZ78 compression algorithm: if there already exists a block representing the same substring, the algorithm moves on to the next character of $t$, so it cannot add that block to the result. From the fact that $i, j < w$, none of the blocks $B_i, B_j$ is the last block, so the next character always exists. That implies that such a situation cannot happen. By appending a character that does not appear in the alphabet, we ensure that the last block is unique, because such a character is already unique in $t$. +\end{proof} + +The above observation is essential in the LZ-index pattern searching procedure, as we will need all the blocks to be unique. + +\begin{observation} \cite{LZ78} + LZ78 is a lossless compression, that is, it can be decompressed unambiguously, to obtain text $t$. +\end{observation} + +That observation can be easily proved by contradiction straight from the decompressing procedure. + +\begin{observation} \cite{entropyProof} + In the worst case LZ78 compression will use $x + o(x)$ bits, where $x$ is the number of bits of the uncompressed text. +\end{observation} + +\begin{observation} \cite{entropyProof} + The compression rate of LZ78 compression in the average case asymptotically approaches the entropy of text $t$. +\end{observation} + +The proof of the above observation is quite complicated and requires advanced knowledge of probability, so it won't be discussed here. + + +\subsection{LZTrie} +The first structure required for LZ-Index is LZTrie. It is a tree consisting of all blocks of LZ78 compression, where each node corresponds to a single block of compressed text. 
By creating that structure you can also compress the text at once, like in the compression procedure presented above. The only difference in this case is that all blocks are represented as nodes. That structure has to permit the following operations for each node: + +\begin{itemize} + \item $\texttt{id(x)}$ -- returns the identifier of block which can be just an index of the block in compressed text. + \item $\texttt{children(x)}$ -- computes a structure that holds all the children of given node. The returned collection has to answer fast for query like $\texttt{child(c)}$ which should return a child of $x$ such that string represented by $x$ is extended by character $c$ in that child or recognize that such child not exist. + \item $\texttt{parent(x)}$ -- provides the parent of given node. + \item $\texttt{depth(x)}$ -- retrieves the depth of given node counted from the root of the tree. It will be used to count the offset for pattern beginning index. + \item $\texttt{rank(x)}$ -- returns the rank of given node in lexicographic order. + \item $\texttt{position(x)}$ -- returns a position in which given node begins in uncompressed text. It is used for mapping the raw result of string-searching performed on LZ-Index, which is a set of pairs $(i, j)$, where $i$ is the index of a block in compressed text and $j$ is the offset for the beginning of that block. + \item $\texttt{left\_rank(x)}$ and $\texttt{right\_rank(x)}$ -- retrieves the minimum and maximum rank of proper node in given node subtree. +\end{itemize} +The LZTrie structure has to perform search for given string that will return the node which represents that string in LZTrie. 
+ +\begin{center} + \begin{tikzpicture}[shorten >=1pt,->] + \tikzstyle{vertex}=[circle,fill=black!25,minimum size=20pt,inner sep=4pt] + \node[vertex] (G_1) at (-2,-2) {1}; + \node[vertex] (G_2) at (0,0) {0}; + \node[vertex] (G_3) at (2,-2) {2}; + \node[vertex] (G_4) at (-1,-4) {3}; + \node[vertex] (G_5) at (-3,-4) {4}; + \draw[] (G_2) -- (G_1) node [midway, left] {a}; + \draw[] (G_2) -- (G_3) node [midway, right] {n}; + \draw[] (G_1) -- (G_4) node [midway, right] {n}; + \draw[] (G_1) -- (G_5) node [midway, left] {s}; + \end{tikzpicture} + \captionof{figure}{Example LZTrie for $t = ananas$} +\end{center} + +The creation of LZTrie node and also RevLZTrieNode can be implemented in the following way: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +class _LZTreeNode: + def __init__(self, parent, character, idx, position): + self.parent = parent + self.position = position + if parent is not None: + parent.children[character] = self + self.depth = parent.depth + 1 + else: + self.depth = 0 + self.idx = idx + self.children = {} + self.character = character + self.rank = None + self.left_rank = None + self.right_rank = None +\end{minted} + +For proper setting $rank$, $left\_rank$, $right\_rank$ properties we have to build the whole tree and then traverse it, so it requires separate function: + +\begin{minted}[xleftmargin=20pt, linenos]{python} + def set_ranks(self, rank): + if self.idx is not None: + self.rank = rank + self.left_rank = rank + self.right_rank = rank + rank = rank + 1 + if len(self.children) > 0: + for child_key in sorted(self.children): + rank = self.children[child_key].set_ranks(rank) + min_key = min(self.children) + max_key = max(self.children) + self.left_rank = (self.children[min_key].left_rank + if (self.rank is None or + self.children[min_key].left_rank < self.rank) + else self.rank) + self.right_rank = (self.children[max_key].right_rank + if (self.rank is None or + self.children[max_key].right_rank > self.rank) + else self.rank) + return rank 
+\end{minted} + +The construction of the LZTrie looks just like the compression algorithm mentioned in \Cref{subsection:LZCompression}. + +\begin{minted}[xleftmargin=20pt, linenos]{python} +class _LZTrie: + def __init__(self, t, n): + t += '$' + self.root = _LZTreeNode(None, '#', 0, None) + current_node = self.root + index, position = 1, 1 + for i in range(1, n+2): + current_char = t[i] + if current_char not in current_node.children: + _LZTreeNode(current_node, current_char, index, position) + index += 1 + current_node = self.root + position = i+1 + else: + current_node = current_node.children[current_char] + self.size = index + self.root.set_ranks(0) + +def search(tree, t, n): + return _search_private(t, 0, n, tree.root) + +def _search_private(tree, index, n, node): + if index == n: + return node + if tree[index+1] not in node.children: + return None + return _search_private(tree, index+1, n, node.children[tree[index+1]]) +\end{minted} + +LZTrie creation requires $\bigO(n \cdot \log (|\mathcal{A}|))$ time for a given text of length $n$. + +\subsection{RevLZTrie} +RevLZTrie structure behaves like LZTrie, but there are some differences. The first one is that the RevLZTrie is created by visiting each node of LZTrie and going from it to the root. Formally, it is created from the reversed strings of the blocks of the compressed text and not by passing the reversed text to LZTrie. The RevLZTrie consists of the same types of nodes as LZTrie so each of them can perform the same operations. The main difference is that in RevLZTrie there are some nodes that do not correspond to any blocks of the compressed text, so the size of that structure can be equal to the length of uncompressed text, but still the complexity of creating it is exactly the same as for LZTrie, which is $\bigO(n \cdot \log (|\mathcal{A}|))$. 
+\begin{center} + \begin{tikzpicture}[shorten >=1pt,->] + \tikzstyle{vertex}=[circle,fill=black!25,minimum size=20pt,inner sep=4pt] + \node[vertex] (G_1) at (-2,-2) {1}; + \node[vertex] (G_2) at (0,0) {0}; + \node[vertex] (G_3) at (2,-2) {2}; + \node[vertex] (G_4) at (3,-4) {3}; + \node[vertex] (G_5) at (0,-2) {4}; + \node[vertex] (G_6) at (0,-4) {5}; + \draw[] (G_2) -- (G_1) node [midway, left] {a}; + \draw[] (G_2) -- (G_3) node [midway, right] {n}; + \draw[] (G_3) -- (G_4) node [midway, right] {a}; + \draw[] (G_2) -- (G_5) node [midway, left] {s}; + \draw[] (G_5) -- (G_6) node [midway, left] {a}; + \end{tikzpicture} + \captionof{figure}{RevLZTrie for LZTrie from Figure 3.1} +\end{center} + +The example implementation looks like this: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +class _RevLZTrie: + def __init__(self, lz_trie): + self.root = _LZTreeNode(None, '#', 0, None) + self._add_recursive(lz_trie.root) + self.root.set_ranks(0) + + def _add_recursive(self, node): + for child in node.children.values(): + self._add_recursive(child) + self._add_block(child, self.root, child.idx) + + def _add_block(self, lz_node, rev_node, idx): + if lz_node.parent is None or lz_node.parent.character == '#': + if lz_node.character in rev_node.children: + rev_node.children[lz_node.character].idx = idx + else: + rev_node.children[lz_node.character] = (_LZTreeNode(rev_node, + lz_node.character, idx, None)) + else: + if lz_node.character not in rev_node.children: + rev_node.children[lz_node.character] = (_LZTreeNode(rev_node, + lz_node.character, None, None)) + self._add_block(lz_node.parent, + rev_node.children[lz_node.character], idx) +\end{minted} + +Note that $\texttt{search(tree, n, root)}$ function works properly for both LZTrie and RevLZTrie. + +\subsection{Range structure} +One of the structures needed for LZ-index is a structure that can perform two-dimensional searching in space $[0, n] \times [0,n]$. 
+ +\begin{problem}[Two-dimensional search in space $\lbrack0, n\rbrack \times \lbrack0,n\rbrack$] +Given a set of points $P = \{ (x, y): x \in \lbrack0, n\rbrack, y \in \lbrack0, n\rbrack \}$ of size $m$. For each query of form $[l_1, r_1]$, $[l_2, r_2]$ return all points from $P$ that $x \in [l_1, r_1]$ and $y \in [l_2, r_2]$. +\end{problem} + +There are many ways to solve that problem, the first one is naive approach which for each point $p \in P$ checks if it fulfills condition from the query. Such approach is very simple and will work in $\bigO(m)$ time as for each point we need constant time to check the query condition, but it can be done faster. First, we sort the points by their first coordinate. Then, we create Wavelet Tree from the list of second coordinates from sorted points. With such preprocessing, we can now answer queries by performing two binary searches to first find the indices $l, r$ such that all points in range $[l, r]$ fulfills search condition for first coordinate, and then we use $\texttt{range\_search(l, r, x, y)}$ function for Wavelet Tree with arguments $l$, $r$, $l_2$, $r_2$ and maps the result. Note that the result of $\texttt{range\_search(l, r, x, y)}$ is exactly the list of all indices for points that fulfil the condition for first and second coordinate. 
Implementation of that structure can be done in the following way: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +class _RangeSearcher: + def __init__(self, points): + self.points = sorted(points, key= lambda x: x[0]) + values = ['#'] + [y for x, y in self.points] + self.wavelet_tree = wavelet_tree.WaveletTree(values, len(values)-1) + + def search_in_range(self, l1, r1, l2, r2): + l, r = 0, len(self.points) + while l < r: + s = (l+r)//2 + x, _ = self.points[s] + if x < l1: + l = s+1 + else: + r = s + left = l + l, r = -1, len(self.points)-1 + while l < r: + s = (l+r+1)//2 + x, _ = self.points[s] + if x <= r1: + l = s + else: + r = s-1 + right = l + if left > right or left == len(self.points) or right == -1: + return [] + return ([self.points[x-1] for x in + self.wavelet_tree.range_search(left+1, right+1, l2, r2)]) +\end{minted} + +Time needed for constructing RangeSearcher is equal to $\bigO(m \log m)$ due to the sorting and the creation of Wavelet Tree. Moreover, answering the query uses only $\bigO(R\cdot \log(m) + \log(m))$ time, where $R$ is the number of points that are returned from the query. The input for creation of RangeSearcher will be all pairs in form $(B_i.rev\_rank, B_{i+1}.rank)$ for all $i$ less than the number of blocks in the compressed text, where $B_i.rev\_rank$ is the rank of the $i$-th block in RevLZTrie. + +\subsection{NodeMapper structure} +The NodeMapper structure retrieves the node of RevLZTrie that represents a block at the given index in LZ78 compression. Such a structure can be created by using information from RevLZTrie nodes and just visiting all of them in any order. It requires $\bigO(n)$ time due to the fact that some nodes of RevLZTrie do not represent any block of compressed text. 
+ +\begin{minted}[xleftmargin=20pt, linenos]{python} +class _NodeMapper: + def __init__(self, lz_trie, size): + self.arr = [None] * size + self._map_tree_to_list(lz_trie.root) + + def _map_tree_to_list(self, node): + if node.idx is not None: + self.arr[node.idx] = node + for child in node.children.values(): + self._map_tree_to_list(child) + + def get_node_by_idx(self, idx): + return self.arr[idx] +\end{minted} + +The creation of this structure requires to visit all $\bigO(n)$ nodes of RevLZTrie, and the query requires $\bigO(1)$ time. This structure is also used for receiving the text position from blocks, so it also has to be build for LZTrie. + +\subsection{RankMapper structure} +The last structure we need to perform string-searching in LZ78 compressed text is called RankMapper. That structure allows us to find the $i$-th node in lexicographical order. That structure has to be build for both LZtrie and RevLZTrie, but it can be also done in simple way using the idea already invoked for the creation of NodeMapper. To build RankMapper one can just visit all the nodes of the tree and set their index in a suitable array, as presented below: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +class _RankMapper: + def __init__(self, lz_trie, size): + self.arr = [None] * size + self._map_tree_to_list(lz_trie.root) + + def _map_tree_to_list(self, node): + if node.rank is not None: + self.arr[node.rank] = node + for child in node.children.values(): + self._map_tree_to_list(child) + + def get_node_by_rank(self, rank): + return self.arr[rank] +\end{minted} + +The complexity of creation RankMapper and performing a query is exactly the same as for NodeMapper. 
Having all of these structures in hand, LZ-Index can be constructed as follows: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +class _LZIndex: + def __init__(self, lz_trie, rev_lz_trie, lz_node_mapper, + rev_lz_node_mapper, range_searcher, lz_rank_mapper, + rev_lz_rank_mapper): + self.lz_trie = lz_trie + self.rev_lz_trie = rev_lz_trie + self.lz_node_mapper = lz_node_mapper + self.range_searcher = range_searcher + self.rev_lz_node_mapper = rev_lz_node_mapper + self.lz_rank_mapper = lz_rank_mapper + self.rev_lz_rank_mapper = rev_lz_rank_mapper + +def create_lz_index(t, n): + lz_trie = _LZTrie(t, n) + rev_trie = _RevLZTrie(lz_trie) + lz_node_mapper = _NodeMapper(lz_trie, lz_trie.size) + rev_node_mapper = _NodeMapper(rev_trie, lz_trie.size) + + points = [(rev_node_mapper.get_node_by_idx(i).rank, + lz_node_mapper.get_node_by_idx(i+1).rank) + for i in range(1, lz_trie.size - 1)] + range_searcher = _RangeSearcher(points) + lz_rank_mapper = _RankMapper(lz_trie, lz_trie.size) + rev_lz_rank_mapper = _RankMapper(rev_trie, lz_trie.size) + return _LZIndex(lz_trie, rev_trie, lz_node_mapper, rev_node_mapper, + range_searcher, lz_rank_mapper, rev_lz_rank_mapper) +\end{minted} + + + diff --git a/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ2.tex b/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ2.tex new file mode 100644 index 0000000..940a37f --- /dev/null +++ b/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ2.tex @@ -0,0 +1,225 @@ +\section{Idea of LZ-Index searching} +String-searching in compressed text using LZ78 compression may be split into three cases depending on the number of blocks in which the pattern is located. + +\begin{enumerate} + \item Pattern occurrence is fully located inside a single block. + \item Pattern occurrence lies in two consecutive blocks, which means that the prefix of the pattern matches the suffix of some block $B_i$ and the remaining suffix of the pattern matches the prefix of the block $B_{i+1}$ for some $i$. 
+ \item Pattern occurrence lies in three or more blocks, which means that it will be like in case 2, but there can be some blocks in between. +\end{enumerate} + +\begin{table}[H] +\begin{center} +\caption*{Types of occurrences} +\begin{tabular}{|l| +l +>{\columncolor[HTML]{C0C0C0}}l +l| +l +l +>{\columncolor[HTML]{C0C0C0}}l| +>{\columncolor[HTML]{C0C0C0}}l +>{\columncolor[HTML]{C0C0C0}}l +l +l +>{\columncolor[HTML]{C0C0C0}}l| +>{\columncolor[HTML]{C0C0C0}}l| +>{\columncolor[HTML]{C0C0C0}}l +>{\columncolor[HTML]{C0C0C0}}l +l|} +\hline + & & & & & & & & & & & & & & & \\ + $B_1$ & & $B_2$ & & & $B_3$& & & & $B_4$& & & $B_5$ & & $B_6$ &\\ + & & & & & & & & & & & & & & & + \\ \hline +\end{tabular} +\end{center} +\end{table} + +Let us consider all these cases in turn. + +\subsection{Occurrences inside one block} +To find all occurrences in that case we look at blocks as independent texts. We use the idea of searching in the suffix array, that is, we want to obtain all suffixes for each block and check if the pattern is its prefix. It can be done all at once, so we do not have to get all suffixes for each block. To achieve that we use RevLZTrie, but first we have to notice some properties. We know that if there is some block $B_i$ in LZTrie, then LZTrie also contains blocks that correspond to all prefixes of the substring compressed in $B_i$. That property leads to the observation below. + +\begin{observation} + Let $B_j$ be the parent block of $B_i$. If a pattern $s$ occurs inside the string compressed in $B_i$, and it is not a suffix of the string compressed in $B_i$, then the string compressed in $B_j$ also contains the pattern $s$. +\end{observation} + +That means that if we find the node corresponding to the reversed pattern $s^r$ in RevLZTrie, denoted as $v$, then every node in the subtree of $v$ that corresponds to some block is a candidate for a pattern occurrence, but even a little more follows from that observation. 
For each correct node $u$ in subtree of $v$ it is enough to find it in LZTrie and all nodes in subtree of $u$ will correspond to an occurrence. This leads to the algorithm for the first case. + +\begin{enumerate} + \item Find the node $v$ that corresponds to $s^r$ in $\texttt{RevLZTrie}$. + \item For each $i$ in range $[v.left\_rank, v.right\_rank]$ do: + \begin{enumerate} + \item Find node $u$ in LZTrie that corresponds to the node with rank $i$ in RevLZTrie; it can be done by using RevLZRankMapper and LZNodeMapper. + \item For each node $x$ in subtree of $u$ add to result index $x.position + u.depth - m$. + + \end{enumerate} +\end{enumerate} + +Finding the node $v$ requires at most $\bigO(m)$ time. From the observation above it follows that for each node in subtree of $v$ there is at least one unique occurrence. That gives us that by traversing the subtree of $v$ we obtain all occurrences of that type. This gives us in total $\bigO(m + R)$ time complexity. In short, the above procedure can be implemented with all edge cases handled as follows: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +def _contains_in_single_block(lz_index : _LZIndex, s, m): + v = '#' + (s[::-1])[:-1] + root = search(lz_index.rev_lz_trie, v, m) + if root is not None: + for i in range(root.left_rank, root.right_rank + 1): + rev_node = lz_index.rev_lz_rank_mapper.get_node_by_rank(i) + node = lz_index.lz_node_mapper.get_node_by_idx(rev_node.idx) + for j in range(node.left_rank, node.right_rank + 1): + result_node = lz_index.lz_rank_mapper.get_node_by_rank(j) + yield result_node.position + node.depth - m +\end{minted} + +\subsection{Occurrences spanning two blocks} +In the second type of occurrences we consider all partitions of the pattern. It was mentioned before that the occurrences are of the form such that the prefix of the pattern is the suffix of one block and the suffix of the pattern is the prefix of the next block. 
Therefore, for each partition of the pattern we find all nodes that contain the first part of partition as a suffix, which means searching for the reversed first part in RevLZTrie and for the second part in LZTrie. Let us denote $u$ as the result of search for the prefix of the pattern and $v$ the result of search for the suffix of the pattern. Now we want to check whether there are nodes $x, y$ such that $x$ is in subtree of $u$, $y$ is in subtree of $v$ and $x.idx + 1 = y.idx$. That way the nodes $x, y$ are neighbors. We can find such nodes using RangeSearcher structure in the following way. + +\begin{enumerate} + \item For all $i$ in range $[1, |s|-1]$ do: + \begin{enumerate} + \item Let $pref$ be the prefix of $s$ of length $i$ and $suf$ the suffix of $s$ of length $|s| - i$. + \item Find the node that corresponds to reversed $pref$ in RevLZTrie, let us denote the result as $u$, and find the node that corresponds to $suf$ in LZTrie, let us denote it as $v$. + \item For all points $(x, y)$ returned by RangeSearcher query with arguments \\ $u.left\_rank$, $u.right\_rank$, $v.left\_rank$, $v.right\_rank$: denote by $z$ the node of LZTrie that corresponds to the node with rank $x$ in RevLZTrie and add to result index $z.position + z.depth - i$. + \end{enumerate} +\end{enumerate} + +For each split position in the pattern we perform a query on RangeSearcher and return the results, so it is easy to observe that the time complexity is $\bigO(m^2 + m \cdot \log (w) + R \cdot \log (w))$. 
The above procedure can be implemented in the following way: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +def _contains_within_two_blocks(lz_index : _LZIndex, s, m): + for i in range(1, m): + rev_prefix = '#' + (s[::-1])[m-i:m] + sufix = '#' + s[i+1:] + rev_node = search(lz_index.rev_lz_trie, rev_prefix, i) + node = search(lz_index.lz_trie, sufix, m-i) + if rev_node is None or node is None: + continue + l1,r1 = rev_node.left_rank, rev_node.right_rank + l2,r2 = node.left_rank, node.right_rank + for (x, _) in (lz_index.range_searcher.search_in_range(l1, + r1, l2, r2)): + rev_node = lz_index.rev_lz_rank_mapper.get_node_by_rank(x) + node = lz_index.lz_node_mapper.get_node_by_idx(rev_node.idx) + yield node.position + node.depth - i + +\end{minted} + + +\subsection{Occurrences spanning three or more blocks} +For the last case we will use the property of LZ78 compression that all the blocks in the resulting compression are unique to make an observation. + +\begin{observation} \cite{LZIndex} + There is at most one block $B_i$ such that the string compressed in $B_i$ is equal to $s[j:k]$. +\end{observation} +\begin{proof} + We prove it by contradiction: if there were two blocks that match such substring it would mean that they are the same, which conflicts with a property of LZ78 compression that all blocks correspond to unique substrings. +\end{proof} +From that observation it can be also concluded that there are at most $\bigO(m^2)$ blocks that match some substring of $s$ in that case. It follows from the fact that there are at most $\bigO(m^2)$ substrings of $s$. + +\begin{observation} + If the substring compressed in block $B_k$ lies inside the occurrence of the pattern, and it matches substring $s[i:j]$, then it will not be used to match that substring in any other occurrence. +\end{observation} + +\begin{proof} + Again we prove it by contradiction. 
Suppose that there are two different occurrences of $s$ that contains block $B_k$ at exactly position $i$ in the pattern, which means that the next block are implied and the previous ones also because the next one has to be $b_{k+1}$, and it is also at the same position which means that these occurrences will be the same. +\end{proof} + +Using these observations we compute array $C[i][j]$, which will store a node that correspond to the substring $s[i: j+1]$, array of dictionaries $A[i]$, which store a dictionary that maps the node index to an index $j$ such that $C[i][j] = x$, array $visited[i][j]$, which store information that the block, which strings match $s[i : j+1]$ was already used. To compute these structures we use the following procedure: + +\begin{enumerate} + \item For each $i$ in range $[1, |s|]$ do: + \begin{enumerate} + \item Denote $LZTrie.root$ as $v$, and create empty dictionary $d$. + \item For each $j$ in range $[i, |s|]$ do: + \begin{enumerate} + \item If $s[j]$ not in $v.children$ then go to step 1 + \item Set $v = v.children[s[j]]$, $C[i][j] = v$ and $d[v.idx] = j$ + \end{enumerate} + \item Set $A[i] = d$ + \end{enumerate} +\end{enumerate} + +Above procedure requires $\bigO(m^2)$ time and space and can be implemented as follows: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +def _prepare_structures_for_third_case(lz_index : _LZIndex, s, m): + used = [[False]*(m+1) for _ in range(m+1)] + existance = [[None]*(m+1) for _ in range(m+1)] + arr = [{}] + for i in range(1, m+1): + recorded = {} + current_node = lz_index.lz_trie.root + for j in range(i, m+1): + if (current_node is not None and + s[j] not in current_node.children): + current_node = None + elif current_node is not None: + current_node = current_node.children[s[j]] + existance[i][j] = current_node + if current_node is not None: + recorded[current_node.idx] = j + arr.append(recorded) + return used, existance, arr +\end{minted} + +Using that information we can now search for all 
occurrences with case 3. + +\begin{enumerate} + \item For all $i$ in range $[1, |s|]$ do: + \begin{enumerate} + \item For all $j$ in range $[i, |s|]$ do: + \begin{enumerate} + \item If $C[i][j]$ is $None$ or $visited[i][j]$ is true, go back to step 2 + \item Try to expand range $[i,j]$ with the next blocks from $C[i][j]$ that matches exactly as far as it's possible and denote the new range as $[i, r]$ + \item If the next block does not contain $s_{r+1},...,s_m$ as a suffix, then go back to step 2 + \item If the suffix of block with $idx = C[i][j].idx - 1$ contains $s_1,...,s_{i-1}$ as a suffix and there are at least 3 blocks, then denote $v$ as \\ $LZNodeMapper.get\_node\_by\_idx(C[i][j].idx-1)$ and add index \\ $v.position - i + 1$ to the result. + \end{enumerate} + \end{enumerate} +\end{enumerate} + +During that process we mark every used node and check if it was not reused. This procedure can be done in $\bigO(m^3)$ time. Unfortunately, there are some edge cases which has to be considered within that procedure. 
Implementation that cover all of that edge cases can be seen below: + +\begin{minted}[xleftmargin=20pt, linenos]{python} +def _contains_within_three_or_more_blocks(lz_index : _LZIndex, s, m): + used, exist, arr = _prepare_structures_for_third_case(lz_index, s, m) + for i in range(1, m+1): + for j in range(i, m+1): + if exist[i][j] is None or used[i][j] is True: + continue + start_idx = exist[i][j].idx + current_idx = start_idx + current_end = j + while current_end < m and (current_idx + 1) in arr[current_end+1]: + current_idx = current_idx + 1 + used[current_end + 1][arr[current_end + 1][current_idx]] = True + current_end = arr[current_end + 1][current_idx] + size = current_idx - start_idx + 1 + if i > 1: + size = size + 1 + if current_end < m: + size = size + 1 + if size < 3 or (current_end != m and + exist[current_end+1][m] is None): + continue + if (lz_index.lz_trie.size > current_idx + 1 and + (current_end == m or (exist[current_end+1][m].left_rank <= + lz_index.lz_node_mapper.get_node_by_idx(current_idx+1).rank <= + exist[current_end+1][m].right_rank ))): + if i == 1: + yield lz_index.lz_node_mapper.get_node_by_idx(start_idx).position + continue + if start_idx == 1: + continue + current_node = lz_index.lz_node_mapper.get_node_by_idx(start_idx-1) + prev = i - 1 + while (prev > 0 and current_node.parent is not None and + s[prev] in current_node.parent.children and + current_node.parent.children[s[prev]] == current_node): + prev = prev - 1 + current_node = current_node.parent + if prev == 0: + node = lz_index.lz_node_mapper.get_node_by_idx(start_idx) + yield node.position - i + 1 +\end{minted} + +By summing the complexities of all cases we get the total running time $\bigO(m^3 + (m + R) \cdot \log (w))$, where $R$ is the number of occurrences of $s$ in $t$. Note that some operations on LZTrie and RevLZTrie use dictionaries, e.g. for $\texttt{children(x)}$, which in the worst case may require more than $\bigO(1)$ operations. 
If we want to use data structures with deterministic guarantees, such as a balanced BST, the total complexity will be $\bigO(m^3 \cdot \log (|\mathcal{A}|) + (m + R)\cdot \log (w))$. \ No newline at end of file diff --git a/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ3.tex b/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ3.tex new file mode 100644 index 0000000..e69de29 diff --git a/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ4.tex b/text/fm_lz_index/tex/chapters/LZ-Index/Sections/LZ4.tex new file mode 100644 index 0000000..e69de29 diff --git a/text/fm_lz_index/tex/config.tex b/text/fm_lz_index/tex/config.tex new file mode 100644 index 0000000..a41cd84 --- /dev/null +++ b/text/fm_lz_index/tex/config.tex @@ -0,0 +1,28 @@ +\newcommand{\myName}{Mateusz\xspace} +\newcommand{\mySurname}{Hurkała\xspace} +\newcommand{\myUniversity}{Jagiellonian University\xspace} +\newcommand{\myDepartament}{Department of Theoretical Computer Science\xspace} +\newcommand{\myLocation}{Kraków\xspace} +\newcommand{\currentYear}{2024\xspace} +\newcommand{\myTitle}{Implementation and comparison of LZ-Index and FM-Index algorithms \xspace} +\newcommand{\myTutor}{D.Sc. 
Krzysztof Turowski\xspace} +\newcommand{\bigO}{\mathcal{O}} + + + +\graphicspath{ {images/} } +\makeatletter +\newtheorem{theorem}{Theorem} +\newtheorem{observation}{Observation} +\newtheorem{lemma}{Lemma} +\newtheorem{problem}{Problem} +\newtheorem{definition}{Definition} +\crefname{observation}{Observation}{Observations} + +\pagestyle{fancyplain} +\fancyhf{} +\fancyfoot[R]{\thepage} +\renewcommand{\headrulewidth}{0pt} +\setlength{\headheight}{14pt} + +\author{\myName \mySurname} \ No newline at end of file diff --git a/text/fm_lz_index/tex/main.tex b/text/fm_lz_index/tex/main.tex new file mode 100644 index 0000000..348221e --- /dev/null +++ b/text/fm_lz_index/tex/main.tex @@ -0,0 +1,23 @@ +\documentclass[11pt]{report} +\usepackage{packages} +\bibliography{references} + +\begin{document} +\input{config} +\input{titlepage} +\graphicspath{{./images/}} +\tableofcontents + +\setstretch{1.1} + +\pagenumbering{arabic} +\input{chapters/Intro/Introduction} +\input{chapters/FM-Index/FMIndex} +\input{chapters/LZ-Index/LZIndex} +\input{chapters/Comparation/Compare} + + +\singlespacing +\printbibliography + +\end{document} diff --git a/text/fm_lz_index/tex/packages.sty b/text/fm_lz_index/tex/packages.sty new file mode 100644 index 0000000..271d596 --- /dev/null +++ b/text/fm_lz_index/tex/packages.sty @@ -0,0 +1,44 @@ + +%\usepackage[polish]{babel} +\usepackage[T1]{fontenc} +\usepackage[utf8]{inputenc} +\usepackage{graphicx} +\usepackage[hypcap=false]{caption} +\usepackage{subcaption} +\usepackage{float} +\usepackage[width=150mm,top=30mm,bottom=25mm,bindingoffset=6mm]{geometry} +\usepackage{fancyhdr} +\usepackage{setspace} +\usepackage{algorithm} +\usepackage{algpseudocode} +\usepackage{svg} +\usepackage{booktabs} +\usepackage{tabulary} +\usepackage[section]{placeins} +\usepackage{amsmath} +\usepackage{amsthm} +\usepackage{xspace} +\usepackage{minted} +\usepackage{hyperref} +\usepackage{colortbl} +\usepackage{tikz} +\usepackage{color} +\usepackage{amsfonts} 
+\usepackage{cleveref} +\usepackage{csquotes} +\usetikzlibrary{matrix,positioning,arrows.meta,arrows} +\tikzset{ +mymat/.style={ + matrix of math nodes, + text height=2.5ex, + text depth=0.75ex, + text width=3.25ex, + align=center, + column sep=-\pgflinewidth + }, +mymats/.style={ + mymat, + nodes={draw,fill=#1} + } +} +\usepackage[backend=biber,style=alphabetic,sorting=ynt]{biblatex} \ No newline at end of file diff --git a/text/fm_lz_index/tex/references.bib b/text/fm_lz_index/tex/references.bib new file mode 100644 index 0000000..bcdb039 --- /dev/null +++ b/text/fm_lz_index/tex/references.bib @@ -0,0 +1,260 @@ +@article{KMP, + author = {Knuth, Donald and Morris Jr., James and Pratt, Vaughan}, + title = {Fast Pattern Matching in Strings}, + journal = {SIAM Journal on Computing}, + volume = {6}, + number = {2}, + pages = {323-350}, + year = {1977}, + doi = {10.1137/0206024}, +} + +@article{SuffixArray, + title={Suffix arrays: a new method for on-line string searches}, + author={Manber, Udi and Myers, Gene}, + journal={SIAM Journal on Computing}, + volume={22}, + number={5}, + pages={935--948}, + year={1993} +} + +@inproceedings{KMR, + author = {Karp, Richard and Miller, Raymond and Rosenberg, Arnold}, + title = {Rapid identification of repeated patterns in strings, trees and arrays}, + booktitle = {STOC '72: Proceedings of the 4th Annual ACM Symposium on Theory of Computing}, + year = {1972}, + pages = {125-136} +} + +@inproceedings{KS, + author = {Kärkkäinen, Juha and Sanders, Peter}, + title = {Simple Linear Work Suffix Array Construction}, + booktitle = {Automata, Languages and Programming}, + editor = {Baeten, Jos and Lenstra, Jan Karel and Parrow, Joachim and Woeginger, Gerhard}, + year = {2003}, + series = {Lecture Notes in Computer Science}, + volume = {2719}, + doi = {10.1007/3-540-45061-0_73}, + URL = {https://doi.org/10.1007/3-540-45061-0_73} +} + +@inproceedings{SuffixArrayImage, + author = {Li, Guanhua and Wu, Yijian and Roy, Chanchal and Sun, 
Jun and Peng, Xin and Zhan, Nanjie and Hu, Bin and Ma, Jingyi}, + title = {SAGA: Efficient and Large-Scale Detection of Near-Miss Clones with GPU Acceleration}, + booktitle = {IEEE International Conference on Software Analysis, Evolution and Reengineering (SANER)}, + pages = {272-283}, + year = {2020}, + doi = {10.1109/SANER48275.2020.9054832}, +} + +@article{Chazelle, + title={A Functional Approach to Data Structures and Its Use in Multidimensional Searching}, + author={Bernard Chazelle}, + journal={SIAM Journal on Computing}, + year={1988}, + volume={17}, + number={3}, + pages={427-462}, + url={https://api.semanticscholar.org/CorpusID:3209902} +} + +@book{cormenSufixArray, + title={Introduction to Algorithms, fourth edition}, + author={Cormen, Thomas and Leiserson, Charles and Rivest, Ron and Stein, Clifford}, + url={https://books.google.pl/books?id=HOJyzgEACAAJ}, + year={2022}, + publisher={MIT Press} +} + +@article{twoWayAlgo, +author = {Crochemore, Maxime and Perrin, Dominique}, +title = {Two-way string-matching}, +year = {1991}, +volume = {38}, +number = {3}, +doi = {10.1145/116825.116845}, +journal = {Journal of the ACM}, +pages = {650–674}, +} +@article{rabinKarp, + doi = {10.1147/rd.312.0249}, + year = {1987}, + volume = {31}, + number = {2}, + pages = {249--260}, + author = {Richard Karp and Michael Rabin}, + title = {Efficient randomized pattern-matching algorithms}, + journal = {{IBM} Journal of Research and Development} +} + +@article{radixSort, +author = {Davis, Ian}, +year = {1992}, +month = {12}, +pages = {636-642}, +title = {A Fast Radix Sort}, +volume = {35}, +number={6}, +journal={The Computer Journal}, +doi = {10.1093/comjnl/35.6.636} +} + +@article{LS, +title = {Faster suffix sorting}, +journal = {Theoretical Computer Science}, +volume = {387}, +number = {3}, +pages = {258-272}, +year = {2007}, +doi = {10.1016/j.tcs.2007.07.017}, +author = {N. 
Jesper Larsson and Kunihiko Sadakane}, +} + +@inproceedings{SA-IS, + author={Nong, Ge and Zhang, Sen and Chan, Wai Hong}, + booktitle={2009 Data Compression Conference}, + title={Linear Suffix Array Construction by Almost Pure Induced-Sorting}, + year={2009}, + pages={193-202}, + doi={10.1109/DCC.2009.42} +} +@inproceedings{suffixTree, +title = "Optimal suffix tree construction with large alphabets", +author = "Martin Farach", +year = "1997", +pages = "137-143", +booktitle = "Proceedings 38th Annual Symposium on Foundations of Computer Science" +} + +@article{stringSearchingUsage, +author = {Kapil, Kumar and Soni, Rohit and Vyas, Amit and Sinhal, Dr. Amit}, +year = {2014}, +journal = {International Journal Of Engineering And Computer Science}, +pages = {2319-7242}, +title = {Importance of String Matching in Real World Problems}, +volume = {3}, +number={6} +} + +@InProceedings{RMQ1, +author="Bender, Michael and Farach-Colton, Mart{\'i}n", +editor="Gonnet, Gaston and Viola, Alfredo", +title="The LCA Problem Revisited", +booktitle="LATIN 2000: Theoretical Informatics", +year="2000", +publisher="Springer", +address="Berlin, Heidelberg", +pages="88-94" +} + +@misc{SegmentTreeRMQ, + author = "Richard Zhan", + title = "Segment Trees", + month = "11", + year = "2019", + note = "Lecture notes", + url = "https://activities.tjhsst.edu/sct/lectures/1920/2019_11_15_Segment_Trees.pdf" +} + +@misc{entropyProof, + author = "Michel Goemans", + month = "04", + year = "2015", + url = "https://math.mit.edu/~goemans/18310S15/lempel-ziv-notes.pdf", + title = "MIT lecture notes about LZ compressions" +} + +@ARTICLE{LZ78, + author={Lempel, Abraham and Ziv, Jacob}, + journal={IEEE Transactions on Information Theory}, + title={Compression of individual sequences via variable-rate coding}, + year={1978}, + volume={24}, + number={5}, + pages={530-536} +} + +@ARTICLE{LZW, + author={Welch, Terry}, + journal={Computer}, + title={A Technique for High-Performance Data Compression}, + year={1984}, + 
volume={17}, + number={6}, + pages={8-19}, + doi={10.1109/MC.1984.1659158} +} + +@article{LZIndex, + title = {Indexing text using the Ziv–Lempel trie}, + journal = {Journal of Discrete Algorithms}, + volume = {2}, + number = {1}, + pages = {87-114}, + year = {2004}, + doi = {10.1016/S1570-8667(03)00066-2}, + author = {Gonzalo Navarro} +} + + +@article{FMIndex, + author = {Ferragina, Paolo and Manzini, Giovanni}, + title = {Indexing compressed text}, + year = {2005}, + volume = {52}, + number = {4}, + doi = {10.1145/1082036.1082039}, + journal = {Journal of the ACM}, + pages = {552–581} +} + +@article{stringSearchingDef, + author = {Baeza-Yates, Ricardo}, + year = {1989}, + pages = {34-58}, + title = {Algorithms for string searching}, + volume = {23}, + number = {3-4}, + journal = {ACM SIGIR Forum}, + doi = {10.1145/74697.74700} +} + +@article{missMatch, +author = {Nicolae, Marius and Rajasekaran, Sanguthevar}, +year = {2015}, +title = {On String Matching with Mismatches}, +volume = {8}, +number = {2}, +pages = {248-270}, +journal = {Algorithms}, +doi = {10.3390/a8020248} +} + +@incollection{citation-key, +author = {Crochemore, Maxime and Lecroq, Thierry}, +year = {2016}, +title = {Multiple String Matching}, +publisher = {Springer}, +pages = {1378–1382}, +booktitle = {Encyclopedia of Algorithms}, +editor = {Kao, Ming-Yang} +} + +@article{wildcardsMatch, +author = {Clifford, Peter and Clifford, Raphael}, +year = {2007}, +pages = {53-54}, +title = {Simple deterministic wildcard matching}, +volume = {101}, +number = {2}, +journal = {Information Processing Letters}, +doi = {10.1016/j.ipl.2006.08.002} +} + +@book{Gusfield1997AlgorithmsOS, + title={Algorithms on Strings, Trees, and Sequences}, + author={Dan Gusfield}, + year={1997}, + publisher={Cambridge University Press} +} \ No newline at end of file diff --git a/text/fm_lz_index/tex/titlepage.tex b/text/fm_lz_index/tex/titlepage.tex new file mode 100644 index 0000000..4694970 --- /dev/null +++ 
b/text/fm_lz_index/tex/titlepage.tex @@ -0,0 +1,29 @@ +\begin{titlepage} + \begin{center} + \vspace*{1cm} + + \Huge + \textbf{\myTitle} + + \vspace{1cm} + \LARGE + + \textbf{\myName \mySurname} + + \vfill + + Bachelor thesis + + \Large + + Tutor: \myTutor + + \vspace{1.8cm} + + \textbf{\myUniversity}\\ + \myDepartament\\ + \myLocation \currentYear + + \end{center} + +\end{titlepage} \ No newline at end of file