Skip to content

Commit

Permalink
Bump version to 0.9.0 Update README with precomputation details
Browse files Browse the repository at this point in the history
Refactor tests for precomputation
  • Loading branch information
Buba98 committed Dec 27, 2024
1 parent 8a52e6b commit 30b01fe
Show file tree
Hide file tree
Showing 11 changed files with 230 additions and 199 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@ assert '¢' in result
assert '£' in result
```

## Precomputation

For optimization purposes, the library precomputes the strings of the elements in the regex pattern when they do not repeat indefinitely. To disable this feature (for example, to reduce the time taken by the first call to `next()`), set `precompute=False` when creating the `RegexEnumerator`.

```python
from regex_enumerator import RegexEnumerator

re = RegexEnumerator(r'a[0-9]b', precompute=False)
```

## How it works

This library works by parsing the regex pattern into a tree structure. Once parsed, it performs a breadth-first search (BFS) on the tree to generate all matching strings. This ensures it does not get stuck on unbounded quantifiers for character classes or groups.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name='regex_enumerator',
version='0.8.5',
version='0.9.0',
packages=find_packages(include=['regex_enumerator', 'regex_enumerator.*']),
description='Enumerate all strings that match a given regex',
author='Vincenzo Greco',
Expand Down
45 changes: 23 additions & 22 deletions tests/test_alternative.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,69 +3,70 @@


def test_two_alternatives():
regexEnumerator = RegexEnumerator(r'a|b')
regex = r'a|b'
possibilities = ['a', 'b']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_alternatives_with_quantifier_on_second_option():
regexEnumerator = RegexEnumerator(r'a|b*')
regex = r'a|b*'
possibilities = ['a', '', 'b', 'bb', 'bbb', 'bbbb', 'bbbbb']

f_infinite(regexEnumerator, possibilities)
f_infinite(regex, possibilities)


def test_alternatives_with_quantifier_plus_on_first_option():
regexEnumerator = RegexEnumerator(r'a+|b')
regex = r'a+|b'
possibilities = ['b', 'a', 'aa', 'aaa', 'aaaa', 'aaaaa']

f_infinite(regexEnumerator, possibilities)
f_infinite(regex, possibilities)


def test_multiple_alternatives():
regexEnumerator = RegexEnumerator(r'a|b|c')
regex = r'a|b|c'
possibilities = ['a', 'b', 'c']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_alternative_with_literal_and_character_class():
regexEnumerator = RegexEnumerator(r'a|[b-d]')
regex = r'a|[b-d]'
possibilities = ['a', 'b', 'c', 'd']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_alternative_with_character_class_and_literal():
regexEnumerator = RegexEnumerator(r'[a-c]{ 0}|d')
regex = r'[a-c]{ 0}|d'
possibilities = ['', 'd']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_alternation_with_character_classes_and_literals():
regexEnumerator = RegexEnumerator(r'(a|[0-2])')
regex = r'(a|[0-2])'
possibilities = ['a', '0', '1', '2']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_nested_alternation():
regexEnumerator = RegexEnumerator(r'((a|b)|c)')
regex = r'((a|b)|c)'
possibilities = ['a', 'b', 'c']
f_finite(regexEnumerator, possibilities)

f_finite(regex, possibilities)


def test_alternation_with_grouping():
regexEnumerator = RegexEnumerator(r'(a(b|c)d|x)')
regex = r'(a(b|c)d|x)'
possibilities = ['abd', 'acd', 'x']

f_finite(regexEnumerator, possibilities)

f_finite(regex, possibilities)


def test_same_alternative_twice():
regexEnumerator = RegexEnumerator(r'a{1,2}|a{1,2}')
regex = r'a{1,2}|a{1,2}'
possibilities = ['a', 'aa']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)
44 changes: 22 additions & 22 deletions tests/test_backreference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,81 +3,81 @@


def test_backreference():
regexEnumerator = RegexEnumerator(r'(a)\1')
regex = r'(a)\1'
possibilities = ['aa']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_backreference_with_group_quantifier():
regexEnumerator = RegexEnumerator(r'(a)+\1')
regex = r'(a)+\1'
possibilities = ['aa' * i for i in range(1, 6)]

f_infinite(regexEnumerator, possibilities)
f_infinite(regex, possibilities)


def test_backreference_with_quantifier():
regexEnumerator = RegexEnumerator(r'(a)\1+')
regex = r'(a)\1+'
possibilities = ['a' * i + 'a' for i in range(1, 6)]

f_infinite(regexEnumerator, possibilities)
f_infinite(regex, possibilities)


def test_backreference_with_named_group():
regexEnumerator = RegexEnumerator(r'(?<name>[a-b])\k<name>')
regex = r'(?<name>[a-b])\k<name>'
possibilities = ['aa', 'bb']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_backreference_with_named_group_and_quantifier():
regexEnumerator = RegexEnumerator(r'(?<name>[a-b])\k<name>{1, 2}')
regex = r'(?<name>[a-b])\k<name>{1, 2}'
possibilities = ['aa', 'bb', 'aaa', 'bbb']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_zero_width_backreference():
regexEnumerator = RegexEnumerator(r'(a)?\1{0}')
regex = r'(a)?\1{0}'
possibilities = ['a', '']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_10_backreference():
regexEnumerator = RegexEnumerator(r'(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)\10')
regex = r'(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)\10'
possibilities = ['abcdefghijj']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_multiple_backreferences():
regexEnumerator = RegexEnumerator(r'(a)(b)\2\1')
regex = r'(a)(b)\2\1'
possibilities = ['abba']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_backreference_with_mismatch():
regexEnumerator = RegexEnumerator(r'(a)(b)\1')
regex = r'(a)(b)\1'
possibilities = ['aba']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_named_group_with_backreference():
regexEnumerator = RegexEnumerator(r'(?<letter>[ab])\k<letter>')
regex = r'(?<letter>[ab])\k<letter>'
possibilities = [
'aa', 'bb'
]

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_named_group_infinite_repetition_with_backreference():
regexEnumerator = RegexEnumerator(r'(?<letter>[ab])+\k<letter>')
regex = r'(?<letter>[ab])+\k<letter>'
possibilities = [
'aa', 'bb', 'abab', 'baba', 'aaaa', 'bbbb'
]

f_infinite(regexEnumerator, possibilities)
f_infinite(regex, possibilities)
78 changes: 40 additions & 38 deletions tests/test_char_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,129 +3,131 @@


def test_single_character_class():
regexEnumerator = RegexEnumerator(r'[a]')
regex = r'[a]'
possibilities = ['a']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_character_class_with_two_literals():
regexEnumerator = RegexEnumerator(r'[ab]')
regex = r'[ab]'
possibilities = ['a', 'b']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_character_class_with_zero_or_more_quantifier():
regexEnumerator = RegexEnumerator(r'[a]*')
regex = r'[a]*'
possibilities = ['', 'a', 'aa', 'aaa', 'aaaa', 'aaaaa']

f_infinite(regexEnumerator, possibilities)
f_infinite(regex, possibilities)


def test_range_character_class():
regexEnumerator = RegexEnumerator(r'[a-c]')
regex = r'[a-c]'
possibilities = ['a', 'b', 'c']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_range_character_class_with_repetition():
regexEnumerator = RegexEnumerator(r'[a-c]{1,2}')
regex = r'[a-c]{1,2}'
possibilities = ['a', 'b', 'c', 'aa', 'ab',
'ac', 'ba', 'bb', 'bc', 'ca', 'cb', 'cc']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_range_character_class_with_zero_repetition():
regexEnumerator = RegexEnumerator(r'[a-c]{0}')
regex = r'[a-c]{0}'
possibilities = ['']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_range_character_class_with_one_or_more_quantifier():
regexEnumerator = RegexEnumerator(r'[a-b]+')
regex = r'[a-b]+'
possibilities = ['a', 'b', 'aa', 'ab', 'ba', 'bb', 'aaa',
'aab', 'aba', 'abb', 'baa', 'bab', 'bba', 'bbb']

f_infinite(regexEnumerator, possibilities)
f_infinite(regex, possibilities)


def test_two_ranges_with_optional_quantifier():
regexEnumerator = RegexEnumerator(r'[a-cf-g]?')
regex = r'[a-cf-g]?'
possibilities = ['', 'a', 'b', 'c', 'f', 'g']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_literal_in_character_class():
regexEnumerator = RegexEnumerator(r'[.]')
regex = r'[.]'
possibilities = ['.']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_negated_character_class():
regexEnumerator = RegexEnumerator(r'[^a]')
regex = r'[^a]'
possibilities = [chr(i) for i in range(32, 127) if chr(i) != 'a']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_character_class_with_escaped_special_char_at_start():
regexEnumerator = RegexEnumerator(r'[\]-a]')
regex = r'[\]-a]'
possibilities = [chr(i) for i in range(93, 98)]

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_character_class_with_escaped_special_char_at_end():
regexEnumerator = RegexEnumerator(r'[Z-\]]')
regex = r'[Z-\]]'
possibilities = [chr(i) for i in range(90, 94)]

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_character_class_with_escape_sequence():
regexEnumerator = RegexEnumerator(r'[\d]')
regex = r'[\d]'
possibilities = [str(i) for i in range(10)]

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_incomplete_range_character_class():
regexEnumerator = RegexEnumerator(r'[a-]')
regex = r'[a-]'
possibilities = ['a', '-']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_2_ranges():
regexEnumerator = RegexEnumerator(r'[1a-crf-g3]')
regex = r'[1a-crf-g3]'
possibilities = ['1', 'a', 'b', 'c', 'f', 'g', 'r', '3']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_unicode_character_class():
regexEnumerator = RegexEnumerator(r'[à-å]')
regex = r'[à-å]'
possibilities = ['à', 'á', 'â', 'ã', 'ä', 'å']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities)


def test_additional_charset():
regexEnumerator = RegexEnumerator(
r'[^\w\d\s]', additional_charset=['γ', 'β', 'α'])
regex = r'[^\w\d\s]'
additional_charset = ['γ', 'β', 'α']
possibilities = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':',
';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~', 'α', 'β', 'γ']

f_finite(regexEnumerator, possibilities)
f_finite(regex, possibilities, additional_charset)


def test_charclass_with_quantifier_from_0():
regexEnumerator = RegexEnumerator(r'[b-d]{0,2}')
possibilities = ['', 'b', 'c', 'd', 'bb', 'bc', 'bd', 'cb', 'cc', 'cd', 'db', 'dc', 'dd']
regex = r'[b-d]{0,2}'
possibilities = ['', 'b', 'c', 'd', 'bb', 'bc',
'bd', 'cb', 'cc', 'cd', 'db', 'dc', 'dd']

f_finite(regexEnumerator, set(possibilities))
f_finite(regex, set(possibilities))
Loading

0 comments on commit 30b01fe

Please sign in to comment.