🚧 (sync/memorious) More verbose runtime status
simonwoerpel committed Oct 21, 2024
1 parent dc2e52b commit 5046c66
Showing 3 changed files with 21 additions and 5 deletions.
leakrfc/cli.py (3 changes: 2 additions & 1 deletion)
```diff
@@ -220,7 +220,8 @@ def cli_sync_memorious(
         key_func = get_file_name_templ_func(key_template)
     else:
         key_func = None
-    import_memorious(dataset, uri, key_func)
+    res = import_memorious(dataset, uri, key_func)
+    write_obj(res, "-")
 
 
 @sync.command("aleph")
```
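Note: `write_obj(res, "-")` presumably serializes the returned status to stdout (`"-"` read as the conventional stdout target); the `write_obj` helper is not part of this diff, so its exact output format is an assumption.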
leakrfc/sync/memorious.py (15 changes: 13 additions & 2 deletions)
```diff
@@ -16,6 +16,7 @@
 from anystore.store import get_store
 from anystore.types import StrGenerator, Uri
 from anystore.util import make_data_checksum
+from anystore.worker import WorkerStatus
 
 from leakrfc.archive import DatasetArchive
 from leakrfc.archive.cache import get_cache
@@ -36,13 +37,20 @@ def make_cache_key(self: "MemoriousWorker", key: str) -> str | None:
     return f"memorious/sync/{host}/{self.dataset.name}/{key}"
 
 
+class MemoriousStatus(WorkerStatus):
+    added: int = 0
+    skipped: int = 0
+    not_found: int = 0
+
+
 class MemoriousWorker(DatasetWorker):
     def __init__(
         self, uri: Uri, key_func: Callable | None = None, *args, **kwargs
     ) -> None:
         super().__init__(*args, **kwargs)
         self.memorious = get_store(uri, serialization_mode="raw")
         self.key_func = key_func or get_file_key
+        self.status_model = MemoriousStatus
 
     def get_tasks(self) -> StrGenerator:
         yield from self.memorious.iterate_keys(glob="*.json")
@@ -56,18 +64,21 @@ def handle_task(self, task: str) -> None:
                 store=self.memorious,
                 file=file,
             )
+            self.count(added=1)
         else:
             self.log_info(
                 f"Skipping already existing `{file.key}` ...",
                 store=self.memorious.uri,
             )
+            self.count(skipped=1)
 
     @anycache(store=get_cache(), key_func=make_cache_key, model=OriginalFile)
     def load_memorious(self, key: str) -> OriginalFile | None:
         data = self.memorious.get(key, serialization_mode="json")
         content_hash = data.pop("content_hash", None)
         if content_hash is None:
             log.warning(f"No content hash for `{key}`", store=self.memorious.uri)
+            self.count(not_found=1)
         elif data.get("_file_name") is None:
             log.warning(f"No original file for `{key}`", store=self.memorious.uri)
         else:
@@ -89,10 +100,10 @@ def done(self) -> None:
 
 def import_memorious(
     dataset: DatasetArchive, uri: Uri, key_func: Callable | None = None
-) -> None:
+) -> MemoriousStatus:
     worker = MemoriousWorker(uri, key_func, dataset=dataset)
     worker.log_info(f"Starting memorious import from `{worker.memorious.uri}` ...")
-    worker.run()
+    return worker.run()
 
 
 def get_file_key(data: dict[str, Any]) -> str:
```
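The new `MemoriousStatus` model plus the `count()` calls are the heart of the change. A minimal, self-contained sketch of the assumed pattern (a pydantic `BaseModel` stands in for `anystore.worker.WorkerStatus`, and `count()` is assumed to increment the named counter fields; neither implementation is shown in this diff):

```python
from pydantic import BaseModel


# Stand-in for anystore.worker.WorkerStatus (assumption: a pydantic-style
# model with integer counters; the real base class is not shown here).
class WorkerStatus(BaseModel):
    pass


class MemoriousStatus(WorkerStatus):
    added: int = 0
    skipped: int = 0
    not_found: int = 0


# Stand-in for the relevant bits of DatasetWorker (hypothetical mechanics).
class Worker:
    status_model = MemoriousStatus

    def __init__(self) -> None:
        self.status = self.status_model()

    def count(self, **kwargs: int) -> None:
        # Assumed semantics of count(): bump each named counter by its value.
        for field, value in kwargs.items():
            setattr(self.status, field, getattr(self.status, field) + value)


worker = Worker()
worker.count(added=1)
worker.count(skipped=1)
worker.count(added=1)
print(worker.status.model_dump())  # {'added': 2, 'skipped': 1, 'not_found': 0}
```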
tests/test_sync.py (8 changes: 6 additions & 2 deletions)
```diff
@@ -23,15 +23,19 @@ def test_sync_memorious(fixtures_path, tmp_path):
     assert file.extra["title"] == "Home - BishopAccountability.org"
     assert file.mimetype == "image/jpeg"
 
-    import_memorious(dataset, fixtures_path / "memorious/")
+    res = import_memorious(dataset, fixtures_path / "memorious/")
+    assert res.added == 1
+    assert res.skipped == 0
     archived_file = next(dataset.iter_files())
     assert archived_file.name == file.name
     assert archived_file.key == file.key
 
     assert dataset.exists_hash(file.content_hash)
 
     # now cached
-    import_memorious(dataset, fixtures_path / "memorious/")
+    res = import_memorious(dataset, fixtures_path / "memorious/")
+    assert res.added == 0
+    assert res.skipped == 1
 
     # custom file key (path) method
     def get_key(data):
```
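Outside the test suite, the same counters are now available to any caller of `import_memorious`. A hedged usage sketch (only the return type and counter fields come from this diff; the `DatasetArchive` setup is elided):

```python
from leakrfc.archive import DatasetArchive  # imported as in memorious.py above
from leakrfc.sync.memorious import import_memorious


def sync_and_report(dataset: DatasetArchive, uri: str) -> None:
    # import_memorious now returns a MemoriousStatus instead of None
    res = import_memorious(dataset, uri)
    if res.not_found:
        print(f"{res.not_found} memorious entries had no content hash")
    print(f"added={res.added} skipped={res.skipped}")
```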
