From 5046c669068bdeb06b509b228225e3503ea35018 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20W=C3=B6rpel?=
Date: Mon, 21 Oct 2024 09:46:09 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20(sync/memorious)=20More=20verbos?=
 =?UTF-8?q?e=20runtime=20status?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 leakrfc/cli.py            |  3 ++-
 leakrfc/sync/memorious.py | 15 +++++++++++++--
 tests/test_sync.py        |  8 ++++++--
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/leakrfc/cli.py b/leakrfc/cli.py
index 974f82b..01fb09a 100644
--- a/leakrfc/cli.py
+++ b/leakrfc/cli.py
@@ -220,7 +220,8 @@ def cli_sync_memorious(
         key_func = get_file_name_templ_func(key_template)
     else:
         key_func = None
-    import_memorious(dataset, uri, key_func)
+    res = import_memorious(dataset, uri, key_func)
+    write_obj(res, "-")
 
 
 @sync.command("aleph")
diff --git a/leakrfc/sync/memorious.py b/leakrfc/sync/memorious.py
index c10f4f6..7509c52 100644
--- a/leakrfc/sync/memorious.py
+++ b/leakrfc/sync/memorious.py
@@ -16,6 +16,7 @@
 from anystore.store import get_store
 from anystore.types import StrGenerator, Uri
 from anystore.util import make_data_checksum
+from anystore.worker import WorkerStatus
 
 from leakrfc.archive import DatasetArchive
 from leakrfc.archive.cache import get_cache
@@ -36,6 +37,12 @@ def make_cache_key(self: "MemoriousWorker", key: str) -> str | None:
     return f"memorious/sync/{host}/{self.dataset.name}/{key}"
 
 
+class MemoriousStatus(WorkerStatus):
+    added: int = 0
+    skipped: int = 0
+    not_found: int = 0
+
+
 class MemoriousWorker(DatasetWorker):
     def __init__(
         self, uri: Uri, key_func: Callable | None = None, *args, **kwargs
@@ -43,6 +50,7 @@ def __init__(
         super().__init__(*args, **kwargs)
         self.memorious = get_store(uri, serialization_mode="raw")
         self.key_func = key_func or get_file_key
+        self.status_model = MemoriousStatus
 
     def get_tasks(self) -> StrGenerator:
         yield from self.memorious.iterate_keys(glob="*.json")
@@ -56,11 +64,13 @@ def handle_task(self, task: str) -> None:
                 store=self.memorious,
                 file=file,
             )
+            self.count(added=1)
         else:
             self.log_info(
                 f"Skipping already existing `{file.key}` ...",
                 store=self.memorious.uri,
             )
+            self.count(skipped=1)
 
     @anycache(store=get_cache(), key_func=make_cache_key, model=OriginalFile)
     def load_memorious(self, key: str) -> OriginalFile | None:
@@ -68,6 +78,7 @@ def load_memorious(self, key: str) -> OriginalFile | None:
         content_hash = data.pop("content_hash", None)
         if content_hash is None:
             log.warning(f"No content hash for `{key}`", store=self.memorious.uri)
+            self.count(not_found=1)
         elif data.get("_file_name") is None:
             log.warning(f"No original file for `{key}`", store=self.memorious.uri)
         else:
@@ -89,10 +100,10 @@ def done(self) -> None:
 
 def import_memorious(
     dataset: DatasetArchive, uri: Uri, key_func: Callable | None = None
-) -> None:
+) -> MemoriousStatus:
     worker = MemoriousWorker(uri, key_func, dataset=dataset)
     worker.log_info(f"Starting memorious import from `{worker.memorious.uri}` ...")
-    worker.run()
+    return worker.run()
 
 
 def get_file_key(data: dict[str, Any]) -> str:
diff --git a/tests/test_sync.py b/tests/test_sync.py
index 5f2bf4e..118ad69 100644
--- a/tests/test_sync.py
+++ b/tests/test_sync.py
@@ -23,7 +23,9 @@ def test_sync_memorious(fixtures_path, tmp_path):
     assert file.extra["title"] == "Home - BishopAccountability.org"
     assert file.mimetype == "image/jpeg"
 
-    import_memorious(dataset, fixtures_path / "memorious/")
+    res = import_memorious(dataset, fixtures_path / "memorious/")
+    assert res.added == 1
+    assert res.skipped == 0
     archived_file = next(dataset.iter_files())
     assert archived_file.name == file.name
     assert archived_file.key == file.key
@@ -31,7 +33,9 @@
     assert dataset.exists_hash(file.content_hash)
 
     # now cached
-    import_memorious(dataset, fixtures_path / "memorious/")
+    res = import_memorious(dataset, fixtures_path / "memorious/")
+    assert res.added == 0
+    assert res.skipped == 1
 
     # custom file key (path) method
     def get_key(data):
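
A minimal usage sketch (not part of the patch) of the status object that import_memorious now returns. It assumes an already-constructed DatasetArchive instance; this diff does not show how to obtain one, and the source path is illustrative:

    # Sketch only: consume the MemoriousStatus returned by import_memorious.
    # `dataset` is assumed to be an existing leakrfc DatasetArchive instance;
    # constructing it is outside the scope of this patch.
    from leakrfc.sync.memorious import import_memorious

    res = import_memorious(dataset, "./memorious-data/")  # hypothetical path
    print(f"added={res.added} skipped={res.skipped} not_found={res.not_found}")

The same counters drive the CLI output: cli_sync_memorious passes the returned status to write_obj(res, "-"), so the totals land on stdout instead of being discarded as before.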