From 9b465e4799de438ad81f0202af98395a0c75c37e Mon Sep 17 00:00:00 2001 From: fynnbe Date: Tue, 12 Sep 2023 10:36:21 +0200 Subject: [PATCH 1/4] more explicity status checks --- scripts/check_validation_passed.py | 4 ++-- scripts/static_validation.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/check_validation_passed.py b/scripts/check_validation_passed.py index bc5fa7c21f..a321226402 100644 --- a/scripts/check_validation_passed.py +++ b/scripts/check_validation_passed.py @@ -10,13 +10,13 @@ def main(artifact_dir: Path = typer.Argument(..., help="folder with validation artifacts")): """check validation summaries in artifact folder""" failed_val = [] - for sp in sorted(artifact_dir.glob(f"**/validation_summary*.yaml"), key=os.path.getmtime): + for sp in sorted(artifact_dir.glob("**/validation_summary*.yaml"), key=os.path.getmtime): summary = yaml.load(sp) if isinstance(summary, dict): summary = [summary] for s in summary: - if s["error"]: + if s["status"] != "passed": s["id"] = sp.stem failed_val.append(summary) diff --git a/scripts/static_validation.py b/scripts/static_validation.py index 7c158f8e00..102f3ee7a5 100644 --- a/scripts/static_validation.py +++ b/scripts/static_validation.py @@ -212,10 +212,10 @@ def main( static_summary_path = dist / resource_id / version_id / "validation_summary_static.yaml" static_summary_path.parent.mkdir(parents=True, exist_ok=True) yaml.dump(static_summary, static_summary_path) - if not static_summary["error"]: + if static_summary["status"] == "passed": # validate rdf using the latest format version latest_static_summary = validate(rdf_path, update_format=True) - if not latest_static_summary["error"]: + if latest_static_summary["status"] == "passed": rd = load_raw_resource_description(rdf_path, update_to_format="latest") assert isinstance(rd, RDF_Base) dynamic_test_cases += prepare_dynamic_test_cases(rd, resource_id, version_id, dist) From 3ea41e565cdf5cddf209a057b86c588e1154b6b2 Mon Sep 17 
00:00:00 2001 From: fynnbe Date: Tue, 12 Sep 2023 14:16:34 +0200 Subject: [PATCH 2/4] use thumbnails in generated collection --- .github/workflows/auto_update_main.yaml | 6 +- ...generate_collection_rdf_and_thumbnails.py} | 14 ++-- scripts/run_main_ci_equivalent_local.py | 6 +- scripts/utils.py | 66 +++++++++++++++++-- 4 files changed, 74 insertions(+), 18 deletions(-) rename scripts/{generate_collection_rdf.py => generate_collection_rdf_and_thumbnails.py} (95%) diff --git a/.github/workflows/auto_update_main.yaml b/.github/workflows/auto_update_main.yaml index 49b7207c59..c5776acd96 100644 --- a/.github/workflows/auto_update_main.yaml +++ b/.github/workflows/auto_update_main.yaml @@ -144,15 +144,15 @@ jobs: lxml requests typer - - name: generate collection rdf + - name: generate collection rdf and thumbnails shell: bash -l {0} - run: python scripts/generate_collection_rdf.py + run: python scripts/generate_collection_rdf_and_thumbnails.py - name: Upload preview of collection.json if: github.event_name == 'pull_request' uses: actions/upload-artifact@v3 with: name: preview-collection-json - path: dist/collection.json + path: dist # /collection.json include all thumbnails in preview for now; todo: preview only collection.json again retention-days: 90 - name: Deploy collection.json to gh-pages 🚀 if: github.event_name != 'pull_request' diff --git a/scripts/generate_collection_rdf.py b/scripts/generate_collection_rdf_and_thumbnails.py similarity index 95% rename from scripts/generate_collection_rdf.py rename to scripts/generate_collection_rdf_and_thumbnails.py index 95b75efde7..9940230cbc 100644 --- a/scripts/generate_collection_rdf.py +++ b/scripts/generate_collection_rdf_and_thumbnails.py @@ -7,12 +7,11 @@ from typing import Optional import typer -from boltons.iterutils import remap - from bioimageio.spec.shared import yaml -from utils import iterate_known_resources, load_yaml_dict, rec_sort +from boltons.iterutils import remap +from utils import deploy_thumbnails, 
iterate_known_resources, load_yaml_dict, rec_sort -SUMMARY_FIELDS = [ +SUMMARY_FIELDS = ( "authors", "badges", "covers", @@ -30,7 +29,7 @@ "type", "versions", "training_data", -] +) SUMMARY_FIELDS_FROM_CONFIG_BIOIMAGEIO = [ "nickname", @@ -137,12 +136,11 @@ def main( summary["download_count"] = download_counts.get(r.resource_id, 1) links = summary.get("links", []) - extend_links_from_test_summary( - links, gh_pages / "rdfs" / r.resource_id / version_id / "test_summary.yaml" - ) + extend_links_from_test_summary(links, gh_pages / "rdfs" / r.resource_id / version_id / "test_summary.yaml") if links: summary["links"] = links + deploy_thumbnails(summary, dist, r.resource_id, version_id) rdf["collection"].append(summary) type_ = latest_version.get("type", "unknown") n_accepted[type_] = n_accepted.get(type_, 0) + 1 diff --git a/scripts/run_main_ci_equivalent_local.py b/scripts/run_main_ci_equivalent_local.py index 0c7fe91bd4..93d25c28e5 100644 --- a/scripts/run_main_ci_equivalent_local.py +++ b/scripts/run_main_ci_equivalent_local.py @@ -9,10 +9,8 @@ import requests import typer - from bare_utils import GH_API_URL, GITHUB_REPOSITORY_OWNER from dynamic_validation import main as dynamic_validation_script -from generate_collection_rdf import main as generate_collection_rdf_script from prepare_to_deploy import main as prepare_to_deploy_script from static_validation import main as static_validation_script from update_external_resources import main as update_external_resources_script @@ -20,6 +18,8 @@ from update_rdfs import main as update_rdfs_script from utils import iterate_over_gh_matrix +from scripts.generate_collection_rdf_and_thumbnails import main as generate_collection_rdf_and_thumbnails_script + def download_from_gh(owner: str, repo: str, branch: str, folder: Path): r = requests.get( @@ -161,7 +161,7 @@ def main(always_continue: bool = True, skip_update_external: bool = True, with_s ################## # build-collection ################## - 
def downsize_image(image_path: Path, dist: Path, size: Tuple[int, int]) -> Path:
    """Write a PNG thumbnail of `image_path` into `dist`, or copy the file if that fails.

    Args:
        image_path: image file to shrink.
        dist: existing directory that receives the result.
        size: maximum (width, height), passed on to `Image.thumbnail`.

    Returns:
        Path of the resulting file inside `dist`: `<stem>.png` if a thumbnail
        was written, otherwise a copy that keeps the original file name.
    """
    output_path = dist / f"{image_path.stem}.png"
    try:
        # imported inside the `try` so that a missing/broken Pillow installation
        # degrades to the copy fallback (with a warning) instead of crashing
        from PIL import Image

        with Image.open(image_path) as img:
            img.thumbnail(size)
            img.save(output_path, "PNG")
    except Exception as e:
        # best effort: anything Pillow cannot handle (e.g. SVG) is deployed unchanged
        warnings.warn(str(e))
        output_path = output_path.with_name(image_path.name)
        try:
            shutil.copy(image_path, output_path)
        except shutil.SameFileError:
            # `image_path` already lives in `dist`; nothing to copy
            pass

    return output_path


def _deploy_thumbnail(url: str, dist: Path, size: Tuple[int, int]) -> Optional[str]:
    """Download `url` and return the file name of its thumbnail in `dist` (None if the download failed)."""
    import pooch

    try:
        # download into pooch's own cache (not `dist`) so that only the
        # thumbnail ends up in the deployment folder
        downloaded = Path(pooch.retrieve(url, None))  # type: ignore
    except Exception as e:
        warnings.warn(str(e))
        return None

    return downsize_image(downloaded, dist, size=size).name


def deploy_thumbnails(rdf_like: Dict[str, Any], dist: Path, resource_id: str, version_id: str) -> None:
    """Cache downsized covers and badge icons next to the deployed rdf and link to them.

    Thumbnails are written to `dist/rdfs/<resource_id>/<version_id>` and the
    corresponding entries of `rdf_like` are replaced **in place** by their
    deployed urls. Entries that are invalid, already deployed or cannot be
    downloaded are left untouched.

    Args:
        rdf_like: rdf/summary dict; its "covers" and "badges" entries may be updated.
        dist: root of the deployment folder.
        resource_id: id of the resource the thumbnails belong to.
        version_id: id of the resource version the thumbnails belong to.
    """
    version_dist = dist / f"rdfs/{resource_id}/{version_id}"
    version_dist.mkdir(exist_ok=True, parents=True)
    deployed_url = f"{DEPLOYED_BASE_URL}/rdfs/{resource_id}/{version_id}"

    covers = rdf_like.get("covers")
    if isinstance(covers, list):
        for i, cover_url in enumerate(covers):
            if not isinstance(cover_url, str) or cover_url.startswith(DEPLOYED_BASE_URL):
                continue  # invalid or already cached

            cover_name = _deploy_thumbnail(cover_url, version_dist, size=(600, 340))
            if cover_name is not None:
                covers[i] = f"{deployed_url}/{cover_name}"

    badges = rdf_like.get("badges")
    if isinstance(badges, list):
        for badge in badges:
            if not isinstance(badge, dict):
                continue

            icon = badge.get("icon")
            if not isinstance(icon, str) or not icon.startswith("https://zenodo.org/api/files/"):
                # only cache badges stored on zenodo
                continue

            icon_name = _deploy_thumbnail(icon, version_dist, size=(320, 320))
            if icon_name is not None:
                badge["icon"] = f"{deployed_url}/{icon_name}"
b/download_counts_offsets.json new file mode 100644 index 0000000000..066fd4b8e7 --- /dev/null +++ b/download_counts_offsets.json @@ -0,0 +1,70 @@ +{ + "10.5281/zenodo.8324706": -2, + "10.5281/zenodo.8260660": -83, + "10.5281/zenodo.5914248": -3764, + "10.5281/zenodo.8142283": -204, + "10.5281/zenodo.7274275": -4761, + "10.5281/zenodo.8064806": -245, + "10.5281/zenodo.7872357": -460, + "10.5281/zenodo.6334583": -956, + "10.5281/zenodo.6334383": -1289, + "10.5281/zenodo.6334881": -1257, + "10.5281/zenodo.7786492": -2256, + "10.5281/zenodo.7781877": -585, + "10.5281/zenodo.7261974": -1235, + "10.5281/zenodo.7772662": -602, + "10.5281/zenodo.6383429": -7681, + "10.5281/zenodo.6384845": -9603, + "10.5281/zenodo.6334793": -2403, + "10.5281/zenodo.6346511": -12167, + "10.5281/zenodo.6334777": -1558, + "10.5281/zenodo.6200635": -5016, + "10.5281/zenodo.6079314": -1506, + "10.5281/zenodo.6200999": -1240, + "10.5281/zenodo.7689187": -772, + "10.5281/zenodo.7653695": -835, + "10.5281/zenodo.7614645": -849, + "10.5281/zenodo.7634388": -892, + "10.5281/zenodo.7612115": -837, + "10.5281/zenodo.7372476": -9, + "10.5281/zenodo.7254196": -1238, + "10.5281/zenodo.7380171": -998, + "10.5281/zenodo.7385954": 6464, + "10.5281/zenodo.7380213": 1, + "10.5281/zenodo.7139022": -1286, + "10.5281/zenodo.7315440": -4383, + "10.5281/zenodo.6518890": -1900, + "10.5281/zenodo.7052800": -1362, + "10.5281/zenodo.7053390": -1365, + "10.5281/zenodo.6865412": -1456, + "10.5281/zenodo.6827058": -1539, + "10.5281/zenodo.6821147": -741, + "10.5281/zenodo.6817638": -773, + "10.5281/zenodo.6406803": -21499, + "10.5281/zenodo.6406756": -3562, + "10.5281/zenodo.6811491": -14121, + "10.5281/zenodo.6808325": -1263, + "10.5281/zenodo.6028280": -8081, + "10.5281/zenodo.5869899": -3285, + "10.5281/zenodo.5847355": -2028, + "10.5281/zenodo.5764892": -2894, + "10.5281/zenodo.5874841": -7737, + "10.5281/zenodo.6559929": -1685, + "10.5281/zenodo.6559474": -9486, + "10.5281/zenodo.6554667": -1804, + 
"""Compute download count offsets for all bioimage.io records on zenodo.

For every record the "desired" download count is estimated as the downloaded
volume divided by the summed size of the record's files. The difference to the
reported unique downloads is stored per concept doi in
`download_counts_offsets.json` in the repository root, where
`update_external_resources.py` picks it up.
"""
import json
from pathlib import Path
from typing import Any, Dict, Optional

ZENODO_RECORDS_URL = (
    "https://zenodo.org/api/records/?&sort=mostrecent&page={page}&size=1000&all_versions=1&keywords=bioimage.io"
)
REQUEST_TIMEOUT = 60  # seconds; without a timeout `requests.get` may block forever

# repository root (this script lives in `scripts/`): the offsets file is
# committed and read from there
OUTPUT_PATH = Path(__file__).resolve().parent.parent / "download_counts_offsets.json"


def compute_download_count_offset(hit: Dict[str, Any]) -> Optional[int]:
    """Return the download count offset for a zenodo record `hit`.

    Returns:
        `round(version_volume / total file size) - unique_downloads`, or None
        if the record lists no file sizes (the ratio is undefined then).
    """
    total_size = sum(f["size"] for f in hit.get("files") or ())
    if not total_size:
        return None

    stats = hit["stats"]
    download_count = int(stats["unique_downloads"])
    downloaded_volume = int(stats["version_volume"])
    desired_count = round(downloaded_volume / total_size)
    return desired_count - download_count


def collect_download_offsets() -> Dict[str, int]:
    """Page through the zenodo records and collect one offset per concept doi."""
    # imported here to keep the pure offset computation importable without `requests`
    import requests

    download_offsets: Dict[str, int] = {}
    for page in range(1, 1000):
        zenodo_request = ZENODO_RECORDS_URL.format(page=page)
        r = requests.get(zenodo_request, timeout=REQUEST_TIMEOUT)
        if r.status_code != 200:
            print(f"Could not get zenodo records page {page}: {r.status_code}: {r.reason}")
            break

        print(f"Collecting items from zenodo: {zenodo_request}")

        hits = r.json()["hits"]["hits"]
        if not hits:
            break

        for hit in hits:
            offset = compute_download_count_offset(hit)
            if offset is None:
                print(f"Skipping record without file sizes: {hit.get('doi')}")
                continue

            # NOTE(review): every version of a resource writes to the same concept
            # doi key, so the last processed version wins -- confirm this is intended
            download_offsets[hit["conceptdoi"]] = offset

    return download_offsets


def main() -> None:
    """Collect the offsets and write them to `OUTPUT_PATH`."""
    download_offsets = collect_download_offsets()
    print(download_offsets)
    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
        json.dump(download_offsets, f)


if __name__ == "__main__":
    main()
dca2fe7406..5200eb07f3 100644 --- a/scripts/update_external_resources.py +++ b/scripts/update_external_resources.py @@ -9,7 +9,6 @@ import requests import typer - from bare_utils import set_gh_actions_outputs from utils import ADJECTIVES, ANIMALS, enforce_block_style_resource, get_animal_nickname, split_animal_nickname, yaml @@ -145,7 +144,7 @@ def update_from_zenodo( updated_resources: DefaultDict[str, List[Dict[str, Union[str, datetime]]]], ignore_status_5xx: bool, ): - download_counts = {} + download_counts: Dict[str, int] = {} for page in range(1, 1000): zenodo_request = f"https://zenodo.org/api/records/?&sort=mostrecent&page={page}&size=1000&all_versions=1&keywords=bioimage.io" r = requests.get(zenodo_request) @@ -160,8 +159,8 @@ def update_from_zenodo( break for hit in hits: - resource_doi = hit["conceptdoi"] - doi = hit["doi"] # "version" doi + resource_doi: str = hit["conceptdoi"] + doi: str = hit["doi"] # "version" doi created = datetime.fromisoformat(hit["created"]).replace(tzinfo=None) assert isinstance(created, datetime), created resource_path = collection / resource_doi / "resource.yaml" @@ -228,6 +227,11 @@ def update_from_zenodo( assert isinstance(resource, dict) update_with_new_version(new_version, resource_doi, rdf, updated_resources) + with Path("download_counts_offsets.json").open() as f: + download_counts_offsets = json.load(f) + + download_counts = {k: v + download_counts_offsets.get(k, 0) for k, v in download_counts.items()} + dist.mkdir(parents=True, exist_ok=True) with (dist / "download_counts.json").open("w", encoding="utf-8") as f: json.dump(download_counts, f, indent=2, sort_keys=True) From bc4a149e8a9ae8155112e62363dac46076e88336 Mon Sep 17 00:00:00 2001 From: fynnbe Date: Tue, 12 Sep 2023 14:53:04 +0200 Subject: [PATCH 4/4] add script deps --- .github/workflows/auto_update_main.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/auto_update_main.yaml b/.github/workflows/auto_update_main.yaml index 
c5776acd96..a7e0f90ce8 100644 --- a/.github/workflows/auto_update_main.yaml +++ b/.github/workflows/auto_update_main.yaml @@ -144,6 +144,8 @@ jobs: lxml requests typer + pooch + pillow - name: generate collection rdf and thumbnails shell: bash -l {0} run: python scripts/generate_collection_rdf_and_thumbnails.py