From d6db0f94991516ab1081f38804934127cb6a5188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20W=C3=B6rpel?= Date: Thu, 16 Jan 2025 22:06:34 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=A9=B9=20(sync/aleph)=20Respect=20existin?= =?UTF-8?q?g=20source=20url=20in=20metadata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- leakrfc/sync/aleph.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/leakrfc/sync/aleph.py b/leakrfc/sync/aleph.py index 1461f6d..9401453 100644 --- a/leakrfc/sync/aleph.py +++ b/leakrfc/sync/aleph.py @@ -9,7 +9,9 @@ from anystore import anycache from anystore.io import logged_items +from anystore.types import SDict from anystore.worker import WorkerStatus +from banal import ensure_dict from leakrfc.archive.cache import get_cache from leakrfc.archive.dataset import DatasetArchive @@ -39,6 +41,16 @@ def make_current_version_cache_key(self: "AlephUploadWorker") -> str: return aleph.make_aleph_cache_key(self, version) +def get_source_url(data: SDict) -> str | None: + url = data.get("source_url") + if url: + return url + url = ensure_dict(data.get("extra")).get("source_url") + if url: + return url + return data.get("url") + + class AlephUploadStatus(WorkerStatus): uploaded: int = 0 folders_created: int = 0 @@ -107,7 +119,7 @@ def handle_task(self, task: File) -> dict[str, Any]: foreign_id=self.foreign_id, ) metadata = {**task.extra, "file_name": task.name, "foreign_id": task.key} - metadata["source_url"] = metadata.get("url") + metadata["source_url"] = get_source_url(metadata) parent = self.get_parent(task.key, self.prefix) if parent: metadata["parent"] = parent