diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 76916c9..68a25af 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,7 +10,7 @@ jobs: unittest: strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] # format: 3.7, 3.8, 3.9 + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12"] # format: 3.7, 3.8, 3.9 platform: [ubuntu-latest, macos-latest, windows-latest] fail-fast: false runs-on: ${{ matrix.platform }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 9eab022..f37e984 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,14 @@ # envolved Changelog +## 1.5.0 +### Removed +* `envolved` no longer supports python 3.7 +### Added +* `FindIterCollectionParser` +### Fixed +* `CollectionParser`'s `opener` and `closer` arguments now correctly handle matches that would be split by the delimiter +* `CollectionParser`'s `closer` argument now correctly handles overlapping matches +* `CollectionParser`'s `closer` argument is now faster when using non-regex matches +* `CollectionParser.pair_wise_delimited` will now be more memory efficient when using a mapping `value_type` ## 1.4.0 ### Deprecated * this is the last release to support python 3.7 diff --git a/docs/string_parsing.rst b/docs/string_parsing.rst index 86a1d28..82edf7b 100644 --- a/docs/string_parsing.rst +++ b/docs/string_parsing.rst @@ -75,9 +75,10 @@ Utility Parsers :param delimiter: The delimiter string or pattern to split the string on. :param inner_parser: The parser to use to parse the elements of the collection. Note this parser is treated the same an an EnvVar type, so :ref:`string_parsing:Special parsers` apply. - :param output_type: The type to use to aggregate the parsed items to a collection defaults to list. + :param output_type: The type to use to aggregate the parsed items to a collection. Defaults to list. :param opener: If set, specifies a string or pattern that should be at the beginning of the delimited string. 
- :param closer: If set, specifies a string or pattern that should be at the end of the delimited string. + :param closer: If set, specifies a string or pattern that should be at the end of the delimited string. Note that providing + a pattern will slow down the parsing process. :param strip: Whether or not to strip whitespaces from the beginning and end of each item. .. code-block:: @@ -140,6 +141,36 @@ Utility Parsers assert server_params_ev.get() == {"host": "localhost", "port": 8080, "is_ssl": False} +.. class:: FindIterCollectionParser(element_pattern: typing.Pattern, element_func: collections.abc.Callable[[re.Match], E], \ + output_type: collections.abc.Callable[[collections.abc.Iterator[E]], G] = list, \ + opener: str | typing.Pattern = '', closer: str | typing.Pattern = '') + + A parser to translate a string to a collection of values by splitting the string to contiguous elements that match + a regex pattern. This parser is useful for parsing strings that have a repeating, complex structure, or in cases where + a :class:`naive split ` would split the string incorrectly. + + :param element_pattern: A regex pattern to find the elements in the string. + :param element_func: A function that takes a regex match object and returns an element. + :param output_type: The type to use to aggregate the parsed items to a collection. Defaults to list. + :param opener: If set, specifies a string or pattern that should be at the beginning of the string. + :param closer: If set, specifies a string or pattern that should be at the end of the string. Note that providing + a pattern will slow down the parsing process. + + .. code-block:: + :caption: Using FindIterCollectionParser to parse a string of comma-separated groups of numbers. 
+ + def parse_group(match: re.Match) -> set[int]: + return {int(x) for x in match.group(1).split(',')} + + groups_ev = env_var("GROUPS", type=FindIterCollectionParser( + re.compile(r"{([,\d]+)},?"), + parse_group + )) + + os.environ["GROUPS"] = "{1,2,3},{4,5,6},{7,8,9}" + + assert groups_ev.get() == [{1, 2, 3}, {4, 5, 6}, {7, 8, 9}] + .. class:: MatchParser(cases: collections.abc.Iterable[tuple[typing.Pattern[str] | str, T]] | \ collections.abc.Mapping[str, T] | type[enum.Enum], fallback: T = ...) diff --git a/envolved/_version.py b/envolved/_version.py index 3e8d9f9..5b60188 100644 --- a/envolved/_version.py +++ b/envolved/_version.py @@ -1 +1 @@ -__version__ = "1.4.0" +__version__ = "1.5.0" diff --git a/envolved/envparser.py b/envolved/envparser.py index b357030..6f03004 100644 --- a/envolved/envparser.py +++ b/envolved/envparser.py @@ -120,50 +120,12 @@ def get(self, case_sensitive: bool, key: str) -> str: return ret -class NonAuditingEnvParser(ReloadingEnvParser): - def get(self, case_sensitive: bool, key: str) -> str: - if case_sensitive: - return getenv_unsafe(key) - - def out_of_date() -> str: - self.reload() - return get_case_insensitive(retry_allowed=False) - - lowered = key.lower() - - def get_case_insensitive(retry_allowed: bool) -> str: - if retry_allowed and lowered not in self.environ_case_insensitive: - # if a retry is allowed, and no candidates are available, we need to retry - return out_of_date() - candidates = self.environ_case_insensitive[lowered] - if key in candidates: - preferred_key = key - elif retry_allowed and has_env(key): - # key is not a candidate, but it is in the env - return out_of_date() - elif len(candidates) == 1: - (preferred_key,) = candidates - elif retry_allowed: - return out_of_date() - else: - raise CaseInsensitiveAmbiguityError(candidates) - ret = getenv(preferred_key) - if ret is None: - assert retry_allowed - return out_of_date() - return ret - - return get_case_insensitive(retry_allowed=True) - - EnvParser: 
Type[BaseEnvParser] if name == "nt": # in windows, all env vars are uppercase EnvParser = CaseInsensitiveEnvParser -elif sys.version_info >= (3, 8): # adding audit hooks is only supported in python 3.8+ - EnvParser = AuditingEnvParser else: - EnvParser = NonAuditingEnvParser + EnvParser = AuditingEnvParser env_parser = EnvParser() diff --git a/envolved/parsers.py b/envolved/parsers.py index 801f1ba..b2cfdf8 100644 --- a/envolved/parsers.py +++ b/envolved/parsers.py @@ -2,7 +2,6 @@ import re from enum import Enum, auto -from functools import lru_cache from itertools import chain from sys import version_info from typing import ( @@ -138,6 +137,68 @@ def _duplicate_avoiding_dict(pairs: Iterator[Tuple[K, V]]) -> Dict[K, V]: return ret +def strip_opener_idx(x: str, opener: Pattern[str]) -> int: + opener_match = opener.match(x) + if not opener_match: + raise ValueError("position 0, expected opener") + return opener_match.end() + + +def strip_closer_idx(x: str, closer: Needle, pos: int) -> int: + if isinstance(closer, str): + if len(closer) + pos > len(x) or not x.endswith(closer): + raise ValueError("expected string to end in closer") + return len(x) - len(closer) + else: + assert isinstance(closer, Pattern) + # now we have a problem, as the standard re module doesn't support reverse matches + closer_matches = closer.finditer(x, pos) + closer_match = None + for closer_match in closer_matches: # noqa: B007 + # we iterate to find the last match + pass + if not closer_match: + raise ValueError("expected string to end in closer") + else: + while closer_match.end() != len(x): + # finditer could have missed an overlapping match, if there is an overlapping match + # it will be found after the start of the last match (but before its end) + closer_match = closer.search(x, closer_match.start() + 1) + # if there is a match, it's an overlapping match, but it doesn't necessarily end at + # the end of the string + if not closer_match: + raise ValueError("expected string to end in 
closer") + return closer_match.start() + + +def strip_opener_and_closer(x: str, opener: Pattern[str], closer: Needle) -> str: + start_idx = strip_opener_idx(x, opener) + end_idx = strip_closer_idx(x, closer, start_idx) + + if start_idx != 0 or end_idx != len(x): + return x[start_idx:end_idx] + return x + + +def value_parser_func(value_type: Union[ParserInput[V], Mapping[K, ParserInput[V]]]) -> Callable[[K], Parser[V]]: + if isinstance(value_type, Mapping): + value_parsers = {k: parser(v) for k, v in value_type.items()} + + def get_value_parser(key: K) -> Parser[V]: + try: + return value_parsers[key] + except KeyError: + # in case the mapping has a default value or the like + return parser(value_type[key]) + else: + _value_parser = parser(value_type) + + def get_value_parser(key: K) -> Parser[V]: + return _value_parser + + return get_value_parser + + class CollectionParser(Generic[G, E]): """ A parser that splits a string by a delimiter, and parses each part individually. @@ -149,45 +210,20 @@ def __init__( inner_parser: ParserInput[E], output_type: Callable[[Iterator[E]], G] = list, # type: ignore[assignment] opener: Needle = empty_pattern, - closer: Needle = empty_pattern, + closer: Needle = "", *, strip: bool = True, ): - """ - :param delimiter: The delimiter to split by. - :param inner_parser: The inner parser to apply to each element. - :param output_type: The aggregator function of all the parsed elements. - :param opener: Optional opener that must be present at the start of the string. - :param closer: Optional closer that must be present at the end of the string. 
- """ self.delimiter_pattern = needle_to_pattern(delimiter) self.inner_parser = parser(inner_parser) self.output_type = output_type self.opener_pattern = needle_to_pattern(opener) - self.closer_pattern = needle_to_pattern(closer) + self.closer = closer self.strip = strip def __call__(self, x: str) -> G: - opener_match = self.opener_pattern.match(x) - if not opener_match: - raise ValueError("position 0, expected opener") - x = x[opener_match.end() :] - raw_elements = self.delimiter_pattern.split(x) - closer_matches = self.closer_pattern.finditer(raw_elements[-1]) - - closer_match = None - for closer_match in closer_matches: # noqa: B007 - pass - if not closer_match: - raise ValueError("expected string to end in closer") - elif closer_match.end() != len(raw_elements[-1]): - raise ValueError( - "expected closer to match end of string, got unexpected suffix: " - + raw_elements[-1][closer_match.end() :] - ) - - raw_elements[-1] = raw_elements[-1][: closer_match.start()] - raw_items = iter(raw_elements) + x = strip_opener_and_closer(x, self.opener_pattern, self.closer) + raw_items = iter(self.delimiter_pattern.split(x)) if self.strip: raw_items = (r.strip() for r in raw_items) elements = (self.inner_parser(r) for r in raw_items) @@ -201,36 +237,14 @@ def pair_wise_delimited( key_type: ParserInput[K], value_type: Union[ParserInput[V], Mapping[K, ParserInput[V]]], output_type: Callable[[Iterator[Tuple[K, V]]], G] = _duplicate_avoiding_dict, # type: ignore[assignment] - *, key_first: bool = True, strip_keys: bool = True, strip_values: bool = True, **kwargs: Any, ) -> Parser[G]: - """ - Create a collectionParser that aggregates to key-value pairs. - :param pair_delimiter: The separator between different key-value pairs. - :param key_value_delimiter: The separator between each key and value. - :param key_type: The parser for key elements. - :param value_type: The parser for value elements. Can also be a mapping, parsing each key under a different - parser. 
- :param output_type: The tuple aggregator function. Defaults to a duplicate-checking dict. - :param key_first: If set to false, will evaluate the part behind the key-value separator as a value. - :param kwargs: forwarded to `CollectionParser.__init__` - """ key_value_delimiter = needle_to_pattern(key_value_delimiter) key_parser = parser(key_type) - get_value_parser: Callable[[K], Parser] - if isinstance(value_type, Mapping): - - @lru_cache(None) - def get_value_parser(key: K) -> Parser[V]: - return parser(value_type[key]) - else: - _value_parser = parser(value_type) - - def get_value_parser(key: K) -> Parser[V]: - return _value_parser + get_value_parser = value_parser_func(value_type) def combined_parser(s: str) -> Tuple[K, V]: split = key_value_delimiter.split(s, maxsplit=2) @@ -250,6 +264,38 @@ def combined_parser(s: str) -> Tuple[K, V]: return cls(pair_delimiter, combined_parser, output_type, **kwargs) # type: ignore[arg-type] +def find_iter_contingient(x: str, pattern: Pattern[str]) -> Iterator[re.Match[str]]: + start_idx = 0 + while start_idx < len(x): + match = pattern.match(x, start_idx) + if match is None: + raise ValueError(f"could not match pattern {pattern} at position {start_idx}") + start_idx = match.end() + yield match + + +class FindIterCollectionParser(Generic[G, E]): + def __init__( + self, + element_pattern: Pattern[str], + element_func: Callable[[re.Match[str]], E], + output_type: Callable[[Iterator[E]], G] = list, # type: ignore[assignment] + opener: Needle = empty_pattern, + closer: Needle = "", + ): + self.prefix_pattern = element_pattern + self.element_func = element_func + self.output_type = output_type + self.opener_pattern = needle_to_pattern(opener) + self.closer = closer + + def __call__(self, x: str) -> G: + x = strip_opener_and_closer(x, self.opener_pattern, self.closer) + raw_matches = find_iter_contingient(x, self.prefix_pattern) + elements = (self.element_func(r) for r in raw_matches) + return self.output_type(elements) + + class 
NoFallback(Enum): no_fallback = auto() diff --git a/pyproject.toml b/pyproject.toml index b28b38a..d81abee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "envolved" -version = "1.4.0" +version = "1.5.0" description = "" authors = ["ben avrahami "] license = "MIT" @@ -12,11 +12,8 @@ packages = [ ] [tool.poetry.dependencies] -python = "^3.7" -typing-extensions = [ - {version="<4.8.0", python=">=3.7, <3.8"}, - {version="*", python=">=3.8"}, -] +python = "^3.8" +typing-extensions = "*" [tool.poetry.group.dev.dependencies] pytest = "*" @@ -39,7 +36,7 @@ build-backend = "poetry.masonry.api" [tool.ruff] -target-version = "py37" +target-version = "py38" line-length = 120 output-format = "full" [tool.ruff.lint] @@ -108,3 +105,7 @@ keep-runtime-typing = true "PTH", # use pathlib "PERF", # performance anti-patterns ] + +"type_checking/**" = [ + "INP001", # implicit namespace packages +] \ No newline at end of file diff --git a/scripts/test_type_hinting.sh b/scripts/test_type_hinting.sh new file mode 100644 index 0000000..5021a42 --- /dev/null +++ b/scripts/test_type_hinting.sh @@ -0,0 +1 @@ +python -m mypy --show-error-codes --check-untyped-defs type_checking \ No newline at end of file diff --git a/tests/unittests/test_parsers.py b/tests/unittests/test_parsers.py index e8b7047..db789e7 100644 --- a/tests/unittests/test_parsers.py +++ b/tests/unittests/test_parsers.py @@ -1,4 +1,6 @@ import re +from collections import defaultdict +from dataclasses import dataclass from enum import Enum from typing import List @@ -6,7 +8,15 @@ from pydantic.v1 import BaseModel as BaseModel1 from pytest import mark, raises -from envolved.parsers import BoolParser, CollectionParser, LookupParser, MatchParser, complex_parser, parser +from envolved.parsers import ( + BoolParser, + CollectionParser, + FindIterCollectionParser, + LookupParser, + MatchParser, + complex_parser, + parser, +) def test_complex(): @@ -81,6 +91,13 @@ def 
test_mapping_different_val_types(): assert p("a=hello world;b=true;c=3") == {"a": "hello world", "b": True, "c": 3} +def test_mapping_different_val_types_with_missing(): + val_dict = defaultdict(lambda: str) + val_dict.update({"b": bool, "c": int}) + p = CollectionParser.pair_wise_delimited(";", "=", str, val_dict) + assert p("a=hello world;b=true;c=3") == {"a": "hello world", "b": True, "c": 3} + + def test_mapping_vfirst(): p = CollectionParser.pair_wise_delimited(";", "=", int, str, key_first=False) assert p("a=1;b=2;c=3") == {1: "a", 2: "b", 3: "c"} @@ -263,3 +280,26 @@ def test_typeadapter(): t = TypeAdapter(List[int]) p = parser(t) assert p("[1,2,3]") == [1, 2, 3] + + +@mark.parametrize("closer", ["];]", re.compile(r"\];\]")]) +def test_delimited_boundries_collections(closer): + assert CollectionParser(";", str, opener="[;[", closer=closer)("[;[a;b;c];]") == ["a", "b", "c"] + + +def test_finditer_parser(): + p = FindIterCollectionParser(re.compile(r"\d+(?:\s*)"), lambda m: int(m[0])) + assert p("1 2 3 4") == [1, 2, 3, 4] + + +def test_finditer_parser_complex(): + @dataclass + class Node: + name: str + values: List[int] + + values_parser = CollectionParser(";", int, opener="(", closer=")") + p = FindIterCollectionParser( + re.compile(r"(\w+)(?:\s*)(\(.*?\))?;?"), lambda m: Node(m[1], values_parser(m[2]) if m[2] else []) + ) + assert p("a(1;2;3);b(4;5;6)") == [Node("a", [1, 2, 3]), Node("b", [4, 5, 6])] diff --git a/tests/unittests/test_parsers_utils.py b/tests/unittests/test_parsers_utils.py new file mode 100644 index 0000000..2b91722 --- /dev/null +++ b/tests/unittests/test_parsers_utils.py @@ -0,0 +1,35 @@ +import re + +from pytest import mark, raises + +from envolved.parsers import strip_opener_and_closer + + +@mark.parametrize("closer", ["]", re.compile(r"\]")]) +def test_strip_bounds(closer): + assert strip_opener_and_closer("[abca]", re.compile(r"\["), closer) == "abca" + + +@mark.parametrize("x", ["[aabc]", "[aabcaaaa]", "[bc]"]) +def 
test_strip_bounds_dyn(x): + assert strip_opener_and_closer(x, re.compile(r"\[a*"), re.compile(r"a*\]")) == "bc" + + +def test_strip_bounds_overlapping_closer(): + assert strip_opener_and_closer("fababa", re.compile(""), re.compile("aba")) == "fab" + + +def test_strip_no_closer(): + with raises(ValueError): + strip_opener_and_closer("ab", re.compile("a"), re.compile("c")) + + +def test_strip_closer_not_at_end(): + with raises(ValueError): + strip_opener_and_closer("abf", re.compile("a"), re.compile("b")) + + +@mark.parametrize("closer", ["a]", re.compile(r"a\]")]) +def test_strip_no_double_strip(closer): + with raises(ValueError): + strip_opener_and_closer("[a]", re.compile(r"\[a"), closer) diff --git a/type_checking/env_var.py b/type_checking/env_var.py new file mode 100644 index 0000000..776cc3d --- /dev/null +++ b/type_checking/env_var.py @@ -0,0 +1,29 @@ +from collections.abc import AsyncIterator +from contextlib import AbstractAsyncContextManager, asynccontextmanager, nullcontext + +from envolved import EnvVar, env_var, inferred_env_var +from envolved.describe import exclude_from_description + +number_ev = env_var("NUMBER", type=int) +i: int = number_ev.get() + +ignored_number_ev = exclude_from_description(env_var("IGNORED_NUMBER", type=int)) +j: int = ignored_number_ev.get() + + +# test contravariance +@asynccontextmanager +async def cont(a: int, b: str) -> AsyncIterator[int]: + yield a + + +base_ev: EnvVar[AbstractAsyncContextManager[int | None]] = exclude_from_description( + env_var("SEQ_", type=cont, args={"a": inferred_env_var(), "b": inferred_env_var()}) +) +seq_ev = base_ev.with_prefix("SEQ_") +seq_ev.default = nullcontext() + + +async def test_cont() -> int | None: + async with seq_ev.get() as t: + return t