Skip to content

Commit

Permalink
🩹 (sync/aleph) Respect existing source url in metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
simonwoerpel committed Jan 16, 2025
1 parent dcea6c4 commit d6db0f9
Showing 1 changed file with 13 additions and 1 deletion.
14 changes: 13 additions & 1 deletion leakrfc/sync/aleph.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

from anystore import anycache
from anystore.io import logged_items
from anystore.types import SDict
from anystore.worker import WorkerStatus
from banal import ensure_dict

from leakrfc.archive.cache import get_cache
from leakrfc.archive.dataset import DatasetArchive
Expand Down Expand Up @@ -39,6 +41,16 @@ def make_current_version_cache_key(self: "AlephUploadWorker") -> str:
return aleph.make_aleph_cache_key(self, version)


def get_source_url(data: SDict) -> str | None:
    """Pick the source url to attach to a file's metadata.

    Candidates are tried in order and the first truthy one wins:

    1. ``data["source_url"]``
    2. ``data["extra"]["source_url"]``
    3. ``data["url"]``

    The last candidate is returned as-is, so a missing or empty ``url``
    yields that falsy value (``None`` when the key is absent).
    """
    # `or` short-circuits, so the nested "extra" lookup only runs when the
    # top-level key is missing or empty. `ensure_dict` is used so that a
    # missing "extra" value does not break the chained `.get()` --
    # presumably it coerces `None` to an empty dict; confirm against banal.
    return (
        data.get("source_url")
        or ensure_dict(data.get("extra")).get("source_url")
        or data.get("url")
    )


class AlephUploadStatus(WorkerStatus):
uploaded: int = 0
folders_created: int = 0
Expand Down Expand Up @@ -107,7 +119,7 @@ def handle_task(self, task: File) -> dict[str, Any]:
foreign_id=self.foreign_id,
)
metadata = {**task.extra, "file_name": task.name, "foreign_id": task.key}
metadata["source_url"] = metadata.get("url")
metadata["source_url"] = get_source_url(metadata)
parent = self.get_parent(task.key, self.prefix)
if parent:
metadata["parent"] = parent
Expand Down

0 comments on commit d6db0f9

Please sign in to comment.