v0.7.1 #366

Merged · 15 commits · Jan 3, 2025
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.7.0
+current_version = 0.7.1
 commit = True
 tag = True
 message = 🔖 Bump version: {current_version} → {new_version}
4 changes: 3 additions & 1 deletion .github/workflows/python.yml
@@ -25,6 +25,8 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install poetry
         run: curl -sSL https://install.python-poetry.org | python3 -
+      - name: Install leveldb
+        run: sudo apt-get install pkg-config libleveldb-dev
       - name: Configure poetry
         run: poetry config virtualenvs.in-project true
       - name: set PY
@@ -43,7 +45,7 @@ jobs:
           path: ~/.cache/pre-commit
           key: pre-commit-${{ runner.os }}-${{ env.PY }}-${{ hashFiles('.pre-commit-config.yaml') }}
       - name: Install dependencies
-        run: poetry install --with dev
+        run: poetry install --with dev --all-extras
       - name: Run pre-commit hooks
         run: poetry run pre-commit run
       - name: Lint with flake8
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -74,4 +74,4 @@ repos:
       - id: poetry-lock
         args: ["--no-update"]
       - id: poetry-export
-        args: ["--dev", "-f", "requirements.txt", "-o", "requirements.txt"]
+        args: ["-f", "requirements.txt", "-o", "requirements.txt"]
1 change: 0 additions & 1 deletion Dockerfile
@@ -4,7 +4,6 @@ RUN apt-get -qq update && apt-get -qq -y upgrade
RUN apt-get install -qq -y pkg-config libicu-dev libleveldb-dev
RUN apt-get -qq -y autoremove && apt-get clean

RUN pip install --no-cache-dir -q -U pip setuptools
RUN pip install --no-cache-dir -q --no-binary=:pyicu: pyicu

COPY ftmq /src/ftmq
2 changes: 1 addition & 1 deletion Makefile
@@ -1,7 +1,7 @@
 all: clean install test
 
 install:
-	poetry install --with dev
+	poetry install --with dev --all-extras
 
 lint:
 	poetry run flake8 ftmq --count --select=E9,F63,F7,F82 --show-source --statistics
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-0.7.0
+0.7.1
2 changes: 1 addition & 1 deletion ftmq/__init__.py
@@ -1,4 +1,4 @@
 from ftmq.query import Query
 
-__version__ = "0.7.0"
+__version__ = "0.7.1"
 __all__ = ["Query"]
11 changes: 5 additions & 6 deletions ftmq/model/dataset.py
@@ -1,11 +1,10 @@
 from datetime import datetime
 from typing import Iterable, Literal, Self, TypeVar
 
-from nomenklatura.dataset.catalog import DataCatalog as NKCatalog
 from nomenklatura.dataset.dataset import Dataset as NKDataset
 from normality import slugify
-from pantomime.types import FTM
 from pydantic import AnyUrl, HttpUrl
+from rigour.mime.types import FTM
 
 from ftmq.enums import Categories, Frequencies
 from ftmq.model.coverage import Coverage, DatasetStats, Schemata
@@ -83,6 +82,7 @@ class Dataset(BaseModel):
     aleph_url: HttpUrl | None = None
     tags: list[str] | None = []
     content_type: ContentType | None = "structured"
+    total_file_size: int | None = 0
 
     git_repo: AnyUrl | None = None
     uri: str | None = None
@@ -143,16 +143,15 @@ def get(self, name: str) -> Dataset | None:
 
     def get_scope(self) -> NKDataset:
         # FIXME clarify
-        return NKDataset(
-            NKCatalog(
-                NKDataset, {"datasets": [make_dataset(n).to_dict() for n in self.names]}
-            ),
+        ds = NKDataset(
             {
                 "name": slugify(self.name),
                 "title": self.name.title(),
                 "children": self.names,
             },
         )
+        ds.children = {make_dataset(n) for n in self.names}
+        return ds
 
     def iterate(self) -> CEGenerator:
         for dataset in self.datasets:
18 changes: 10 additions & 8 deletions ftmq/model/proxy.py
@@ -1,13 +1,14 @@
-from typing import Any, Iterable, Self, TypeAlias, TypeVar, Union
+from typing import Any, Iterable, Self, Sequence, TypeAlias, TypeVar, Union
 
 from followthemoney.types import registry
+from nomenklatura.publish.names import pick_caption
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 
 from ftmq.types import CE
-from ftmq.util import make_proxy
+from ftmq.util import make_proxy, must_str
 
 EntityProp = TypeVar("EntityProp", bound="Entity")
-Properties: TypeAlias = dict[str, list[Union[str, EntityProp]]]
+Properties: TypeAlias = dict[str, Sequence[Union[str, EntityProp]]]
 
 
 class Entity(BaseModel):
@@ -24,14 +25,16 @@ class Entity(BaseModel):
     def from_proxy(cls, entity: CE, adjacents: Iterable[CE] | None = None) -> Self:
         properties = dict(entity.properties)
         if adjacents:
-            adjacents = {e.id: Entity.from_proxy(e) for e in adjacents}
+            adjacents_: dict[str, Entity] = {
+                must_str(e.id): Entity.from_proxy(e) for e in adjacents
+            }
             for prop in entity.iterprops():
                 if prop.type == registry.entity:
                     properties[prop.name] = [
-                        adjacents.get(i, i) for i in entity.get(prop)
+                        adjacents_.get(i, i) for i in entity.get(prop)
                     ]
         return cls(
-            id=entity.id,
+            id=must_str(entity.id),
             caption=entity.caption,
             schema=entity.schema.name,
             properties=properties,
@@ -46,6 +49,5 @@ def to_proxy(self) -> CE:
     @classmethod
     def get_caption(cls, data: Any) -> Any:
         if data.get("caption") is None:
-            proxy = make_proxy(data)
-            data["caption"] = proxy.caption
+            data["caption"] = pick_caption(make_proxy(data))
         return data
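
The proxy.py changes above route entity IDs through the new `must_str()` helper and derive captions via nomenklatura's `pick_caption`. A minimal usage sketch of the resulting behaviour (the sample record and property values are invented for illustration, not taken from this diff):

```python
from ftmq.model.proxy import Entity
from ftmq.util import make_proxy

# make_proxy() builds a CompositeEntity from a plain dict (see ftmq/util.py)
proxy = make_proxy(
    {"id": "p1", "schema": "Person", "properties": {"name": ["Jane Doe"]}}
)

# ids now pass through must_str(), so a proxy without a usable id raises
# ValueError instead of silently producing an Entity with id=None
entity = Entity.from_proxy(proxy)
print(entity.id, entity.caption)
```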
45 changes: 4 additions & 41 deletions ftmq/query.py
@@ -6,7 +6,7 @@
 from nomenklatura.entity import CE
 
 from ftmq.aggregations import Aggregation, Aggregator
-from ftmq.enums import Aggregations, Comparators, Properties
+from ftmq.enums import Aggregations, Properties
 from ftmq.exceptions import ValidationError
 from ftmq.filters import (
     FILTERS,
@@ -56,24 +56,15 @@ def serialize(self) -> list[str]:
 
 
 class Query:
-    DEFAULT_SEARCH_PROPS = (
-        Properties["name"],
-        Properties["firstName"],
-        Properties["middleName"],
-        Properties["lastName"],
-    )
-
     def __init__(
         self,
         filters: Iterable[F] | None = None,
-        search_filters: Iterable[F] | None = None,
         aggregations: Iterable[Aggregation] | None = None,
         aggregator: Aggregator | None = None,
         sort: Sort | None = None,
         slice: Slice | None = None,
     ):
         self.filters = set(ensure_list(filters))
-        self.search_filters = set(ensure_list(search_filters))
         self.aggregations = set(ensure_list(aggregations))
         self.aggregator = aggregator
         self.sort = sort
@@ -162,13 +153,6 @@ def lookups(self) -> dict[str, Any]:
         """
         return self._get_lookups(self.filters)
 
-    @property
-    def search_lookups(self) -> dict[str, Any]:
-        """
-        The current search lookups as dictionary
-        """
-        return self._get_lookups(self.search_filters)
-
     @property
     def limit(self) -> int | None:
         """
@@ -283,9 +267,6 @@ def to_dict(self) -> dict[str, Any]:
         ```
         """
         data = self.lookups
-        search_data = self.search_lookups
-        if search_data:
-            data["search"] = search_data
         if self.sort:
             data["order_by"] = self.sort.serialize()
         if self.slice:
@@ -364,14 +345,6 @@ def where(self, **lookup: Any) -> Q:
 
         return self._chain()
 
-    def search(self, q: str, props: Iterable[Properties | str] = None) -> Q:
-        # reset existing search
-        self.search_filters: set[F] = set()
-        props = props or self.DEFAULT_SEARCH_PROPS
-        for prop in props:
-            self.search_filters.add(PropertyFilter(prop, q, Comparators.ilike))
-        return self._chain()
-
     def order_by(self, *values: Iterable[str], ascending: bool | None = True) -> Q:
         """
         Add or update the current sorting.
@@ -401,23 +374,13 @@ def aggregate(
     def get_aggregator(self) -> Aggregator:
         return Aggregator(aggregations=self.aggregations)
 
-    def apply_filter(self, proxy: CE) -> bool:
-        if not self.filters:
-            return True
-        return all(f.apply(proxy) for f in self.filters)
-
-    def apply_search(self, proxy: CE) -> bool:
-        if not self.search_filters:
-            return True
-        return any(f.apply(proxy) for f in self.search_filters)
-
     def apply(self, proxy: CE) -> bool:
         """
         Test if a proxy matches the current `Query` instance.
         """
-        if self.apply_filter(proxy):
-            return self.apply_search(proxy)
-        return False
+        if not self.filters:
+            return True
+        return all(f.apply(proxy) for f in self.filters)
 
     def apply_iter(self, proxies: CEGenerator) -> CEGenerator:
         """
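With the search API removed, `Query.apply()` reduces to the conjunction of the remaining filters. A rough sketch of what is left (the `where()` keywords and test data are illustrative assumptions, not part of this diff):

```python
from ftmq import Query
from ftmq.util import make_proxy

q = Query().where(schema="Person", country="de")  # assumed lookup keywords
proxy = make_proxy(
    {
        "id": "p1",
        "schema": "Person",
        "properties": {"name": ["Jane Doe"], "country": ["de"]},
    }
)

# apply() now simply checks all(f.apply(proxy) for f in q.filters)
if q.apply(proxy):
    print("proxy matches every filter")
```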
24 changes: 1 addition & 23 deletions ftmq/sql.py
@@ -117,26 +117,9 @@ def clause(self) -> BooleanClauseList:
             )
         return and_(*clauses)
 
-    @cached_property
-    def search_clause(self) -> BooleanClauseList | None:
-        if not self.q.search_filters:
-            return
-        return or_(
-            and_(
-                self.table.c.prop == f.key,
-                self.get_expression(self.table.c.value, f),
-            )
-            for f in self.q.search_filters
-        )
-
     @cached_property
     def canonical_ids(self) -> Select:
         q = select(self.table.c.canonical_id.distinct()).where(self.clause)
-        if self.q.search_filters:
-            search_ids = select(self.table.c.canonical_id.distinct()).where(
-                self.search_clause
-            )
-            q = q.where(self.table.c.canonical_id.in_(search_ids))
         if self.q.sort is None:
             q = q.limit(self.q.limit).offset(self.q.offset)
         return q
@@ -148,12 +131,7 @@ def all_canonical_ids(self) -> Select:
     @cached_property
     def _unsorted_statements(self) -> Select:
         where = self.clause
-        if (
-            self.q.properties
-            or self.q.reversed
-            or self.q.search_filters
-            or self.q.limit
-        ):
+        if self.q.properties or self.q.reversed or self.q.limit:
             where = self.table.c.canonical_id.in_(self.canonical_ids)
         return select(self.table).where(where).order_by(self.table.c.canonical_id)
 
30 changes: 25 additions & 5 deletions ftmq/util.py
@@ -4,6 +4,7 @@
 
 import pycountry
 from banal import ensure_list, is_listish
+from followthemoney.proxy import E, EntityProxy
 from followthemoney.schema import Schema
 from followthemoney.types import registry
 from followthemoney.util import make_entity_id, sanitize_text
@@ -95,6 +96,14 @@ def make_proxy(data: dict[str, Any], dataset: str | Dataset | None = None) -> CE
     return proxy
 
 
+def ensure_proxy(data: dict[str, Any] | CE | E) -> CompositeEntity:
+    if isinstance(data, CompositeEntity):
+        return data
+    if isinstance(data, EntityProxy):
+        data = data.to_full_dict()
+    return make_proxy(data)
+
+
 def get_statements(proxy: CE, *datasets: str) -> SGenerator:
     """
     Get statements from a `nomenklatura.entity.CompositeEntity` with multiple
@@ -119,7 +128,7 @@ def get_statements(proxy: CE, *datasets: str) -> SGenerator:
 
 
 @cache
-def get_country_name(alpha2: str) -> str:
+def get_country_name(code: str) -> str:
     """
     Get the (english) country name for the given 2-letter iso code via
     [pycountry](https://pypi.org/project/pycountry/)
@@ -129,21 +138,25 @@ def get_country_name(alpha2: str) -> str:
         "Germany"
         >>> get_country_name("xx")
         "xx"
+        >>> get_country_name("gb") == get_country_name("uk")
+        True # United Kingdom
 
     Args:
         alpha2: Two-letter iso code, case insensitive
 
     Returns:
         Either the country name for a valid code or the code as fallback.
     """
-    alpha2 = alpha2.lower()
+    code_clean = get_country_code(code)
+    if code_clean is None:
+        code_clean = code.lower()
     try:
-        country = pycountry.countries.get(alpha_2=alpha2)
+        country = pycountry.countries.get(alpha_2=code_clean)
         if country is not None:
             return country.name
     except (LookupError, AttributeError):
-        return alpha2
-    return alpha2
+        return code
+    return code_clean
 
 
 @lru_cache(1024)
@@ -485,3 +498,10 @@ def get_featured_proxy(proxy: CE) -> CE:
     for prop in proxy.schema.featured:
         featured.add(prop, proxy.get(prop))
     return featured
+
+
+def must_str(value: Any) -> str:
+    value = clean_string(value)
+    if not value:
+        raise ValueError(f"Value invalid: `{value}`")
+    return value
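
The new `ensure_proxy()` and `must_str()` helpers and the more forgiving `get_country_name()` can be exercised roughly as follows (a sketch based on the docstrings above; the sample entity is invented):

```python
from ftmq.util import ensure_proxy, get_country_name, must_str

# get_country_name() now normalises the input via get_country_code() first,
# so "uk" and "gb" resolve to the same name, and unknown codes fall back
# to the cleaned code itself
print(get_country_name("de"))  # Germany
print(get_country_name("xx"))  # xx
print(get_country_name("uk") == get_country_name("gb"))  # True

# must_str() guards against empty or None values
must_str(" some-id ")  # returns the cleaned string
# must_str(None)       # would raise ValueError

# ensure_proxy() upgrades dicts (or plain EntityProxy objects) to a CompositeEntity
proxy = ensure_proxy(
    {"id": "p1", "schema": "Person", "properties": {"name": ["Jane Doe"]}}
)
```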
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
 {
   "name": "@investigativedata/ftmq",
-  "version": "0.7.0",
+  "version": "0.7.1",
   "description": "javascript interface for ftmq",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",