Skip to content

Commit

Permalink
Merge pull request #633 from bioimage-io/cache_cover_images
Browse files Browse the repository at this point in the history
Cache cover images
  • Loading branch information
FynnBe authored Sep 12, 2023
2 parents 2e29ed1 + bc4a149 commit 4b1dfbb
Show file tree
Hide file tree
Showing 9 changed files with 196 additions and 26 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/auto_update_main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -144,15 +144,17 @@ jobs:
lxml
requests
typer
- name: generate collection rdf
pooch
pillow
- name: generate collection rdf and thumbnails
shell: bash -l {0}
run: python scripts/generate_collection_rdf.py
run: python scripts/generate_collection_rdf_and_thumbnails.py
- name: Upload preview of collection.json
if: github.event_name == 'pull_request'
uses: actions/upload-artifact@v3
with:
name: preview-collection-json
path: dist/collection.json
path: dist # /collection.json include all thumbnails in preview for now; todo: preview only collection.json again
retention-days: 90
- name: Deploy collection.json to gh-pages 🚀
if: github.event_name != 'pull_request'
Expand Down
70 changes: 70 additions & 0 deletions download_counts_offsets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{
"10.5281/zenodo.8324706": -2,
"10.5281/zenodo.8260660": -83,
"10.5281/zenodo.5914248": -3764,
"10.5281/zenodo.8142283": -204,
"10.5281/zenodo.7274275": -4761,
"10.5281/zenodo.8064806": -245,
"10.5281/zenodo.7872357": -460,
"10.5281/zenodo.6334583": -956,
"10.5281/zenodo.6334383": -1289,
"10.5281/zenodo.6334881": -1257,
"10.5281/zenodo.7786492": -2256,
"10.5281/zenodo.7781877": -585,
"10.5281/zenodo.7261974": -1235,
"10.5281/zenodo.7772662": -602,
"10.5281/zenodo.6383429": -7681,
"10.5281/zenodo.6384845": -9603,
"10.5281/zenodo.6334793": -2403,
"10.5281/zenodo.6346511": -12167,
"10.5281/zenodo.6334777": -1558,
"10.5281/zenodo.6200635": -5016,
"10.5281/zenodo.6079314": -1506,
"10.5281/zenodo.6200999": -1240,
"10.5281/zenodo.7689187": -772,
"10.5281/zenodo.7653695": -835,
"10.5281/zenodo.7614645": -849,
"10.5281/zenodo.7634388": -892,
"10.5281/zenodo.7612115": -837,
"10.5281/zenodo.7372476": -9,
"10.5281/zenodo.7254196": -1238,
"10.5281/zenodo.7380171": -998,
"10.5281/zenodo.7385954": 6464,
"10.5281/zenodo.7380213": 1,
"10.5281/zenodo.7139022": -1286,
"10.5281/zenodo.7315440": -4383,
"10.5281/zenodo.6518890": -1900,
"10.5281/zenodo.7052800": -1362,
"10.5281/zenodo.7053390": -1365,
"10.5281/zenodo.6865412": -1456,
"10.5281/zenodo.6827058": -1539,
"10.5281/zenodo.6821147": -741,
"10.5281/zenodo.6817638": -773,
"10.5281/zenodo.6406803": -21499,
"10.5281/zenodo.6406756": -3562,
"10.5281/zenodo.6811491": -14121,
"10.5281/zenodo.6808325": -1263,
"10.5281/zenodo.6028280": -8081,
"10.5281/zenodo.5869899": -3285,
"10.5281/zenodo.5847355": -2028,
"10.5281/zenodo.5764892": -2894,
"10.5281/zenodo.5874841": -7737,
"10.5281/zenodo.6559929": -1685,
"10.5281/zenodo.6559474": -9486,
"10.5281/zenodo.6554667": -1804,
"10.5281/zenodo.5910854": -2246,
"10.5281/zenodo.6518571": -1911,
"10.5281/zenodo.6518500": -1911,
"10.5281/zenodo.6518218": -1922,
"10.5281/zenodo.6348728": -25028,
"10.5281/zenodo.6326366": -2443,
"10.5281/zenodo.6028097": -28693,
"10.5281/zenodo.5910163": -1905,
"10.5281/zenodo.5940478": -1406,
"10.5281/zenodo.5817052": -534,
"10.5281/zenodo.5749843": -2984,
"10.5281/zenodo.5874741": -31074,
"10.5281/zenodo.5744489": -5461,
"10.5281/zenodo.6348084": -23547,
"10.5281/zenodo.6338614": -26479
}
4 changes: 2 additions & 2 deletions scripts/check_validation_passed.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
def main(artifact_dir: Path = typer.Argument(..., help="folder with validation artifacts")):
"""check validation summaries in artifact folder"""
failed_val = []
for sp in sorted(artifact_dir.glob(f"**/validation_summary*.yaml"), key=os.path.getmtime):
for sp in sorted(artifact_dir.glob("**/validation_summary*.yaml"), key=os.path.getmtime):
summary = yaml.load(sp)
if isinstance(summary, dict):
summary = [summary]

for s in summary:
if s["error"]:
if s["status"] != "passed":
s["id"] = sp.stem
failed_val.append(summary)

Expand Down
38 changes: 38 additions & 0 deletions scripts/compute_download_counts_offsets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Compute per-resource download count offsets from Zenodo statistics.

For every bioimage.io record on Zenodo, derive a "desired" download count
from the reported downloaded volume divided by the record's total file
size, and store the difference to the reported unique download count as
an offset keyed by concept DOI.
"""
import json
from pathlib import Path

import requests


def main() -> None:
    """Collect offsets from all Zenodo result pages and write them to JSON."""
    download_offsets = {}
    # hard upper bound on pages; the loop breaks on the first empty/failed page
    for page in range(1, 1000):
        zenodo_request = (
            f"https://zenodo.org/api/records/?&sort=mostrecent&page={page}&size=1000&all_versions=1&keywords=bioimage.io"
        )
        r = requests.get(zenodo_request)
        if r.status_code != 200:
            print(f"Could not get zenodo records page {page}: {r.status_code}: {r.reason}")
            break

        print(f"Collecting items from zenodo: {zenodo_request}")

        hits = r.json()["hits"]["hits"]
        if not hits:
            break

        for hit in hits:
            resource_doi = hit["conceptdoi"]

            total_size = sum(f["size"] for f in hit["files"])
            if not total_size:
                # records without files would otherwise raise ZeroDivisionError
                continue

            download_count = int(hit["stats"]["unique_downloads"])
            downloaded_volume = int(hit["stats"]["version_volume"])
            # approximate "real" downloads: volume normalized by record size
            desired_count = round(downloaded_volume / total_size)

            download_offsets[resource_doi] = desired_count - download_count

    print(download_offsets)
    # NOTE(review): this writes next to the script (scripts/), while the
    # committed download_counts_offsets.json lives at the repository root and
    # is read via a cwd-relative path elsewhere -- confirm intended location.
    with (Path(__file__).parent / "download_counts_offsets.json").open("w") as f:
        json.dump(download_offsets, f)


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@
from typing import Optional

import typer
from boltons.iterutils import remap

from bioimageio.spec.shared import yaml
from utils import iterate_known_resources, load_yaml_dict, rec_sort
from boltons.iterutils import remap
from utils import deploy_thumbnails, iterate_known_resources, load_yaml_dict, rec_sort

SUMMARY_FIELDS = [
SUMMARY_FIELDS = (
"authors",
"badges",
"covers",
Expand All @@ -30,7 +29,7 @@
"type",
"versions",
"training_data",
]
)

SUMMARY_FIELDS_FROM_CONFIG_BIOIMAGEIO = [
"nickname",
Expand Down Expand Up @@ -137,12 +136,11 @@ def main(
summary["download_count"] = download_counts.get(r.resource_id, 1)

links = summary.get("links", [])
extend_links_from_test_summary(
links, gh_pages / "rdfs" / r.resource_id / version_id / "test_summary.yaml"
)
extend_links_from_test_summary(links, gh_pages / "rdfs" / r.resource_id / version_id / "test_summary.yaml")
if links:
summary["links"] = links

deploy_thumbnails(summary, dist, r.resource_id, version_id)
rdf["collection"].append(summary)
type_ = latest_version.get("type", "unknown")
n_accepted[type_] = n_accepted.get(type_, 0) + 1
Expand Down
6 changes: 3 additions & 3 deletions scripts/run_main_ci_equivalent_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,17 @@

import requests
import typer

from bare_utils import GH_API_URL, GITHUB_REPOSITORY_OWNER
from dynamic_validation import main as dynamic_validation_script
from generate_collection_rdf import main as generate_collection_rdf_script
from prepare_to_deploy import main as prepare_to_deploy_script
from static_validation import main as static_validation_script
from update_external_resources import main as update_external_resources_script
from update_partner_resources import main as update_partner_resources_script
from update_rdfs import main as update_rdfs_script
from utils import iterate_over_gh_matrix

from scripts.generate_collection_rdf_and_thumbnails import main as generate_collection_rdf_and_thumbnails_script


def download_from_gh(owner: str, repo: str, branch: str, folder: Path):
r = requests.get(
Expand Down Expand Up @@ -161,7 +161,7 @@ def main(always_continue: bool = True, skip_update_external: bool = True, with_s
##################
# build-collection
##################
generate_collection_rdf_script()
generate_collection_rdf_and_thumbnails_script()

fake_deploy(dist, gh_pages)
if pending["retrigger"]:
Expand Down
4 changes: 2 additions & 2 deletions scripts/static_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,10 +212,10 @@ def main(
static_summary_path = dist / resource_id / version_id / "validation_summary_static.yaml"
static_summary_path.parent.mkdir(parents=True, exist_ok=True)
yaml.dump(static_summary, static_summary_path)
if not static_summary["error"]:
if static_summary["status"] == "passed":
# validate rdf using the latest format version
latest_static_summary = validate(rdf_path, update_format=True)
if not latest_static_summary["error"]:
if latest_static_summary["status"] == "passed":
rd = load_raw_resource_description(rdf_path, update_to_format="latest")
assert isinstance(rd, RDF_Base)
dynamic_test_cases += prepare_dynamic_test_cases(rd, resource_id, version_id, dist)
Expand Down
12 changes: 8 additions & 4 deletions scripts/update_external_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import requests
import typer

from bare_utils import set_gh_actions_outputs
from utils import ADJECTIVES, ANIMALS, enforce_block_style_resource, get_animal_nickname, split_animal_nickname, yaml

Expand Down Expand Up @@ -145,7 +144,7 @@ def update_from_zenodo(
updated_resources: DefaultDict[str, List[Dict[str, Union[str, datetime]]]],
ignore_status_5xx: bool,
):
download_counts = {}
download_counts: Dict[str, int] = {}
for page in range(1, 1000):
zenodo_request = f"https://zenodo.org/api/records/?&sort=mostrecent&page={page}&size=1000&all_versions=1&keywords=bioimage.io"
r = requests.get(zenodo_request)
Expand All @@ -160,8 +159,8 @@ def update_from_zenodo(
break

for hit in hits:
resource_doi = hit["conceptdoi"]
doi = hit["doi"] # "version" doi
resource_doi: str = hit["conceptdoi"]
doi: str = hit["doi"] # "version" doi
created = datetime.fromisoformat(hit["created"]).replace(tzinfo=None)
assert isinstance(created, datetime), created
resource_path = collection / resource_doi / "resource.yaml"
Expand Down Expand Up @@ -228,6 +227,11 @@ def update_from_zenodo(
assert isinstance(resource, dict)
update_with_new_version(new_version, resource_doi, rdf, updated_resources)

with Path("download_counts_offsets.json").open() as f:
download_counts_offsets = json.load(f)

download_counts = {k: v + download_counts_offsets.get(k, 0) for k, v in download_counts.items()}

dist.mkdir(parents=True, exist_ok=True)
with (dist / "download_counts.json").open("w", encoding="utf-8") as f:
json.dump(download_counts, f, indent=2, sort_keys=True)
Expand Down
66 changes: 62 additions & 4 deletions scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import dataclasses
import json
import pathlib
import shutil
import warnings
from hashlib import sha256
from itertools import product
Expand All @@ -10,9 +11,6 @@

import numpy
import requests
from marshmallow import missing
from ruamel.yaml import YAML, comments

from bare_utils import DEPLOYED_BASE_URL, GH_API_URL
from bioimageio.spec import (
load_raw_resource_description,
Expand All @@ -21,6 +19,7 @@
from bioimageio.spec.collection.v0_2.raw_nodes import Collection
from bioimageio.spec.collection.v0_2.utils import resolve_collection_entries
from bioimageio.spec.partner.utils import enrich_partial_rdf_with_imjoy_plugin
from ruamel.yaml import YAML, comments


# todo: use MyYAML from bioimageio.spec. see comment below
Expand Down Expand Up @@ -268,7 +267,6 @@ def write_rdfs_for_resource(resource: dict, dist: Path, only_for_version_id: Opt
rdf["rdf_source"] = f"{DEPLOYED_BASE_URL}/rdfs/{resource_id}/{version_id}/rdf.yaml"

rdf.pop("root_path", None)
assert missing not in rdf.values(), rdf

# sort rdf to avoid random diffs
rdf = rec_sort(rdf)
Expand Down Expand Up @@ -385,3 +383,63 @@ def load_yaml_dict(path: Path, raise_missing_keys: Sequence[str]) -> Optional[Di
raise KeyError(f"Expected missing keys {missing} in {path}")

return data


def downsize_image(image_path: Path, dist: Path, size: Tuple[int, int]):
    """Write a downsized PNG thumbnail of *image_path* into *dist*.

    The image is shrunk (preserving aspect ratio) to fit within *size* and
    saved as ``<stem>.png``.  If the image cannot be processed -- including
    when Pillow itself is not installed -- the original file is copied to
    *dist* unchanged (best effort) and a warning is emitted instead.

    Returns the path of the written thumbnail (or of the copied original).
    """
    output_path = dist / f"{image_path.stem}.png"
    try:
        # import inside the try: a missing Pillow installation degrades to
        # the copy fallback below instead of crashing the whole build
        from PIL import Image

        with Image.open(image_path) as img:
            img.thumbnail(size)
            img.save(output_path, "PNG")
    except Exception as e:
        warnings.warn(str(e))
        # fall back to copying the unmodified image under its original name
        output_path = output_path.with_name(image_path.name)
        shutil.copy(image_path, output_path)

    return output_path


def deploy_thumbnails(rdf_like: Dict[str, Any], dist: Path, resource_id: str, version_id: str) -> None:
    """Cache cover images and zenodo badge icons as deployed thumbnails.

    Downloads each cover/icon, downsizes it into
    ``dist/rdfs/<resource_id>/<version_id>/`` and rewrites the corresponding
    URL in *rdf_like* (mutated in place) to point below DEPLOYED_BASE_URL.
    Malformed entries and URLs that already point at DEPLOYED_BASE_URL are
    left untouched; failed downloads only produce a warning.
    """
    import pooch

    dist /= f"rdfs/{resource_id}/{version_id}"
    dist.mkdir(exist_ok=True, parents=True)
    deployed_prefix = f"{DEPLOYED_BASE_URL}/rdfs/{resource_id}/{version_id}"

    covers = rdf_like.get("covers")
    if isinstance(covers, list):
        for idx, cover_url in enumerate(covers):
            # skip invalid entries and covers that were cached previously
            if not isinstance(cover_url, str) or cover_url.startswith(DEPLOYED_BASE_URL):
                continue

            try:
                downloaded_cover = Path(pooch.retrieve(cover_url, None))  # type: ignore
            except Exception as error:
                warnings.warn(str(error))
            else:
                resized_cover = downsize_image(downloaded_cover, dist, size=(600, 340))
                rdf_like["covers"][idx] = f"{deployed_prefix}/{resized_cover.name}"

    badges = rdf_like.get("badges")
    if isinstance(badges, list):
        for idx, badge in enumerate(badges):
            if not isinstance(badge, dict):
                continue

            icon = badge.get("icon")
            # only cache badges stored on zenodo
            if not isinstance(icon, str) or not icon.startswith("https://zenodo.org/api/files/"):
                continue

            try:
                downloaded_icon = Path(pooch.retrieve(icon, None, path=dist))  # type: ignore
            except Exception as error:
                warnings.warn(str(error))
            else:
                resized_icon = downsize_image(downloaded_icon, dist, size=(320, 320))
                rdf_like["badges"][idx]["icon"] = f"{deployed_prefix}/{resized_icon.name}"

0 comments on commit 4b1dfbb

Please sign in to comment.