Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v0.6.1 #203

Merged
merged 26 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
5cb86fa
🚧 (model) tweak main exports
simonwoerpel Feb 26, 2024
0b4ec71
Bump sqlalchemy from 2.0.27 to 2.0.28
dependabot[bot] Mar 7, 2024
1abd509
Bump nomenklatura from 3.10.4 to 3.10.5
dependabot[bot] Mar 7, 2024
aec3142
Bump redis from 5.0.1 to 5.0.3
dependabot[bot] Mar 11, 2024
db09ce1
Bump mypy from 1.8.0 to 1.9.0
dependabot[bot] Mar 12, 2024
918b4fb
Bump pydantic from 2.6.2 to 2.6.4
dependabot[bot] Mar 12, 2024
92fa56a
Bump pytest from 8.0.1 to 8.1.1
dependabot[bot] Mar 12, 2024
5fc719e
Bump moto from 5.0.2 to 5.0.3
dependabot[bot] Mar 12, 2024
47a341f
Merge pull request #202 from investigativedata/dependabot/pip/develop…
simonwoerpel Mar 13, 2024
bb6d30c
Merge pull request #201 from investigativedata/dependabot/pip/develop…
simonwoerpel Mar 13, 2024
cb8dda9
Merge pull request #200 from investigativedata/dependabot/pip/develop…
simonwoerpel Mar 13, 2024
425a484
Merge pull request #199 from investigativedata/dependabot/pip/develop…
simonwoerpel Mar 13, 2024
f949f92
Merge pull request #198 from investigativedata/dependabot/pip/develop…
simonwoerpel Mar 13, 2024
3c4bfdd
Merge pull request #197 from investigativedata/dependabot/pip/develop…
simonwoerpel Mar 13, 2024
496e362
Merge pull request #196 from investigativedata/dependabot/pip/develop…
simonwoerpel Mar 13, 2024
718d6e5
Bump anystore from 0.0.6 to 0.1.0
dependabot[bot] Mar 13, 2024
67b2c2f
Merge pull request #204 from investigativedata/dependabot/pip/develop…
simonwoerpel Mar 13, 2024
c3e2d4f
💥 (util) rename some functions
simonwoerpel Mar 13, 2024
24d39e1
✅ (test) fix test_model
simonwoerpel Mar 13, 2024
8bb552e
✨ (store) load file uri as MemoryStore
simonwoerpel Mar 13, 2024
180ed7e
✅ tweak test_aggregate
simonwoerpel Mar 13, 2024
3204db9
🐛 (store) this was dumb
simonwoerpel Mar 14, 2024
916c855
⚡️ (tests) Drop large test dataset and rely on external s3 tests
simonwoerpel Mar 14, 2024
548d28b
🐛 Numeric aggregations
simonwoerpel Mar 14, 2024
215c803
⬆️ poetry dependencies
simonwoerpel Mar 14, 2024
0b8dea9
🔖 Bump version: 0.6.0 → 0.6.1
simonwoerpel Mar 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.0
current_version = 0.6.1
commit = True
tag = True
message = 🔖 Bump version: {current_version} → {new_version}
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.6.0
0.6.1
2 changes: 1 addition & 1 deletion ftmq/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ftmq.query import Query

__version__ = "0.6.0"
__version__ = "0.6.1"
__all__ = ["Query"]
20 changes: 5 additions & 15 deletions ftmq/aggregations.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,20 @@
import statistics
from collections import defaultdict
from functools import cache
from typing import Any, Generator, Iterable, TypeAlias

from anystore.util import clean_dict
from banal import ensure_list
from followthemoney.schema import Schema
from followthemoney.types import registry
from pydantic import BaseModel

from ftmq.enums import Aggregations, Fields, Properties
from ftmq.types import CE, CEGenerator
from ftmq.util import to_numeric
from ftmq.util import prop_is_numeric, to_numeric

Value: TypeAlias = int | float | str
Values: TypeAlias = list[Value]


@cache
def get_is_numeric(schema: Schema, prop: str) -> bool:
prop = schema.get(prop)
if prop is not None:
return prop.type == registry.number
return False


class Aggregation(BaseModel):
prop: Properties | Fields
func: Aggregations
Expand Down Expand Up @@ -71,7 +61,7 @@ def get_proxy_values(
yield from proxy.get(prop, quiet=True)

def collect(self, proxy: CE) -> CE:
is_numeric = get_is_numeric(proxy.schema, self.prop)
is_numeric = prop_is_numeric(proxy.schema, self.prop)
for value in self.get_proxy_values(proxy):
if is_numeric:
value = to_numeric(value)
Expand Down Expand Up @@ -118,9 +108,9 @@ def __exit__(self, *args, **kwargs) -> None:
for agg in self.aggregations:
self.result[str(agg.func)][str(agg.prop)] = agg.value
for group in agg.group_props:
self.result["groups"][str(group)][str(agg.func)][
str(agg.prop)
] = agg.groups[group]
self.result["groups"][str(group)][str(agg.func)][str(agg.prop)] = (
agg.groups[group]
)
self.result = clean_dict(self.result)

def apply(self, proxies: CEGenerator) -> CEGenerator:
Expand Down
2 changes: 1 addition & 1 deletion ftmq/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def q(
smart_write_proxies(output_uri, proxies, serialize=True, dataset=store_dataset)
if stats_uri:
stats = stats.export()
stats = orjson.dumps(stats.dict(), option=orjson.OPT_APPEND_NEWLINE)
stats = orjson.dumps(stats.model_dump(), option=orjson.OPT_APPEND_NEWLINE)
smart_write(stats_uri, stats)
if q.aggregator:
result = orjson.dumps(
Expand Down
12 changes: 10 additions & 2 deletions ftmq/model/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
from ftmq.model.coverage import Coverage
from ftmq.model.coverage import DatasetStats
from ftmq.model.dataset import Catalog, Dataset, Maintainer, Publisher, Resource
from ftmq.model.proxy import Entity

__all__ = [Catalog, Coverage, Dataset, Entity, Maintainer, Publisher, Resource]
__all__ = [
"Catalog",
"DatasetStats",
"Dataset",
"Entity",
"Maintainer",
"Publisher",
"Resource",
]
19 changes: 10 additions & 9 deletions ftmq/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
)
from ftmq.sql import Sql
from ftmq.types import CEGenerator
from ftmq.util import parse_comparator, parse_unknown_filters
from ftmq.util import (
parse_comparator,
parse_unknown_filters,
prop_is_numeric,
to_numeric,
)

Q = TypeVar("Q", bound="Query")
Slice = TypeVar("Slice", bound=slice)
Expand All @@ -33,9 +38,10 @@ def __init__(self, values: Iterable[str], ascending: bool | None = True) -> None
def apply(self, proxy: CE) -> tuple[str]:
values = tuple()
for v in self.values:
p_values = proxy.get(v, quiet=True)
if p_values is not None:
values = values + (tuple(p_values))
p_values = proxy.get(v, quiet=True) or []
if prop_is_numeric(proxy.schema, v):
p_values = map(to_numeric, p_values)
values = values + (tuple(p_values))
return values

def apply_iter(self, proxies: CEGenerator) -> CEGenerator:
Expand Down Expand Up @@ -301,8 +307,3 @@ def apply_iter(self, proxies: CEGenerator) -> CEGenerator:
self.aggregator = self.get_aggregator()
proxies = self.aggregator.apply(proxies)
yield from proxies

def apply_aggregations(self, proxies: CEGenerator) -> Aggregator:
aggregator = self.get_aggregator()
[x for x in aggregator.apply(proxies)]
return aggregator
3 changes: 2 additions & 1 deletion ftmq/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,11 @@ def _sorted_statements(self) -> Select:
value = self.table.c.value
if PropertyTypesMap[prop].value == registry.number:
value = func.cast(self.table.c.value, NUMERIC)
group_func = func.min if self.q.sort.ascending else func.max
inner = (
select(
self.table.c.canonical_id,
func.group_concat(value).label("sortable_value"),
group_func(value).label("sortable_value"),
)
.where(
and_(
Expand Down
4 changes: 2 additions & 2 deletions ftmq/store/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def aggregations(self, query: Q) -> AggregatorResult | None:
key = f"agg-{hash(query)}"
if key in self._cache:
return self._cache[key]
aggregator = query.apply_aggregations(self.entities(query))
res = dict(aggregator.result)
_ = [x for x in self.entities(query)]
res = dict(query.aggregator.result)
self._cache[key] = res
return res
22 changes: 15 additions & 7 deletions ftmq/util.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import re
from collections.abc import Generator, Iterable
from functools import cache, lru_cache
from typing import Any
from typing import Any, Generator

import pycountry
from banal import ensure_list
from followthemoney.schema import Schema
from followthemoney.types import registry
from followthemoney.util import make_entity_id, sanitize_text
from nomenklatura.dataset import Dataset
Expand Down Expand Up @@ -72,7 +72,7 @@ def make_proxy(data: dict[str, Any], dataset: str | Dataset | None = None) -> CE
return proxy


def get_statements(proxy: CE, *datasets: Iterable[str]) -> SGenerator:
def get_statements(proxy: CE, *datasets: str) -> SGenerator:
datasets = datasets or ["default"]
for dataset in datasets:
# FIXME
Expand Down Expand Up @@ -177,7 +177,7 @@ def clean_name(value: Any) -> str | None:


@lru_cache(1024)
def fingerprint(value: Any) -> str | None:
def make_fingerprint(value: Any) -> str | None:
"""
Create a stable but simplified string or None from input that can be used
to generate ids (to mimic `fingerprints.generate` which is unstable for IDs
Expand All @@ -190,10 +190,18 @@ def fingerprint(value: Any) -> str | None:


@lru_cache(1024)
def string_id(value: Any) -> str | None:
def make_string_id(value: Any) -> str | None:
return make_entity_id(clean_name(value))


@lru_cache(1024)
def fingerprint_id(value: Any) -> str | None:
return make_entity_id(fingerprint(value))
def make_fingerprint_id(value: Any) -> str | None:
return make_entity_id(make_fingerprint(value))


@cache
def prop_is_numeric(schema: Schema, prop: str) -> bool:
prop = schema.get(prop)
if prop is not None:
return prop.type == registry.number
return False
Loading