Skip to content

Commit

Permalink
feat: Bucket operations using storage token (#200)
Browse files Browse the repository at this point in the history
* First pass

* Add a storage session class to be used as a context manager when using the storage token with the bucket

* Use the storage session in the Hub client to fetch the Parquet file

* Zarr FS store does not play nice with PyArrow FS, so revert to less efficient fsspec implementation. Use storage token to get Zarr content, when present.

* After PyArrow's filesystem failed to work the way I needed to with Zarr's FSStore, I implemented a S3 store for Zarr, without intermediary library.

* Update doc

* Some cleanup and bug fixing

* Review feedback

* Rebase fixups

* Further review feedback

* Linting and formatting

* Build errors
  • Loading branch information
jstlaurent authored Sep 26, 2024
1 parent e5370b7 commit 23fd61e
Show file tree
Hide file tree
Showing 29 changed files with 975 additions and 331 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -136,5 +136,6 @@ rever/
# VS Code
.vscode/

# Generated requirements.txt
# Generated requirements.txt and uv lock file
requirements.txt
uv.lock
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
::: polaris.hub.external_auth_client.ExternalAuthClient
::: polaris.hub.external_client.ExternalAuthClient
options:
merge_init_into_class: true
filters: ["!create_authorization_url", "!fetch_token"]
Expand Down
10 changes: 10 additions & 0 deletions docs/api/hub.storage.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
::: polaris.hub.storage.StorageSession
options:
merge_init_into_class: true

---

::: polaris.hub.storage.S3Store
options:
merge_init_into_class: true
---
6 changes: 5 additions & 1 deletion env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ dependencies:
- pydantic-settings >=2
- fsspec
- yaspin
- typing-extensions >=4.12.0
- boto3 >=1.35.0


# Hub client
- authlib
Expand Down Expand Up @@ -45,6 +48,7 @@ dependencies:
- ruff
- jupyterlab
- ipywidgets
- moto >=5.0.0

# Doc
- mkdocs
Expand All @@ -58,4 +62,4 @@ dependencies:
- mike >=1.0.0

- pip:
- fastpdb
- fastpdb
8 changes: 4 additions & 4 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ nav:
- Competition Evaluation: api/competition.evaluation.md
- Hub:
- Client: api/hub.client.md
- External Auth Client: api/hub.external_auth_client.md
- External Auth Client: api/hub.external_client.md
- PolarisFileSystem: api/hub.polarisfs.md
- Additional:
- Dataset Factory: api/factory.md
Expand Down Expand Up @@ -122,9 +122,9 @@ plugins:
- mkdocs-jupyter:
execute: False
remove_tag_config:
remove_cell_tags: [remove_cell]
remove_all_outputs_tags: [remove_output]
remove_input_tags: [remove_input]
remove_cell_tags: [ remove_cell ]
remove_all_outputs_tags: [ remove_output ]
remove_input_tags: [ remove_input ]

- mike:
version_selector: true
Expand Down
78 changes: 50 additions & 28 deletions polaris/_artifact.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json
from typing import Dict, Optional, Union
from typing import ClassVar

import fsspec
from loguru import logger
Expand All @@ -13,10 +13,11 @@
field_validator,
)
from pydantic.alias_generators import to_camel
from typing_extensions import Self

import polaris as po
from polaris.utils.misc import sluggify
from polaris.utils.types import HubOwner, SlugCompatibleStringType
import polaris
from polaris.utils.misc import slugify
from polaris.utils.types import ArtifactUrn, HubOwner, SlugCompatibleStringType, SlugStringType


class BaseArtifactModel(BaseModel):
Expand All @@ -29,30 +30,47 @@ class BaseArtifactModel(BaseModel):
Only when uploading to the Hub, some of the attributes are required.
Attributes:
name: A slug-compatible name for the dataset.
Together with the owner, this is used by the Hub to uniquely identify the benchmark.
description: A beginner-friendly, short description of the dataset.
tags: A list of tags to categorize the benchmark by. This is used by the hub to search over benchmarks.
name: A slug-compatible name for the artifact.
Together with the owner, this is used by the Hub to uniquely identify the artifact.
description: A beginner-friendly, short description of the artifact.
tags: A list of tags to categorize the artifact by. This is used by the hub to search over artifacts.
user_attributes: A dict with additional, textual user attributes.
owner: A slug-compatible name for the owner of the dataset.
If the dataset comes from the Polaris Hub, this is the associated owner (organization or user).
Together with the name, this is used by the Hub to uniquely identify the benchmark.
owner: A slug-compatible name for the owner of the artifact.
If the artifact comes from the Polaris Hub, this is the associated owner (organization or user).
Together with the name, this is used by the Hub to uniquely identify the artifact.
polaris_version: The version of the Polaris library that was used to create the artifact.
"""

_artifact_type: ClassVar[str]

model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, arbitrary_types_allowed=True)

name: Optional[SlugCompatibleStringType] = None
# Model attributes
name: SlugCompatibleStringType | None = None
description: str = ""
tags: list[str] = Field(default_factory=list)
user_attributes: Dict[str, str] = Field(default_factory=dict)
owner: Optional[HubOwner] = None
polaris_version: str = po.__version__
user_attributes: dict[str, str] = Field(default_factory=dict)
owner: HubOwner | None = None
polaris_version: str = polaris.__version__

@computed_field
@property
def artifact_id(self) -> Optional[str]:
return f"{self.owner}/{sluggify(self.name)}" if self.owner and self.name else None
def slug(self) -> SlugStringType | None:
return slugify(self.name) if self.name else None

@computed_field
@property
def artifact_id(self) -> str | None:
if self.owner and self.slug:
return f"{self.owner}/{self.slug}"
return None

@computed_field
@property
def urn(self) -> ArtifactUrn | None:
if self.owner and self.slug:
return self.urn_for(self.owner, self.slug)
return None

@field_validator("polaris_version")
@classmethod
Expand All @@ -61,7 +79,7 @@ def _validate_version(cls, value: str) -> str:
# Make sure it is a valid semantic version
Version(value)

current_version = po.__version__
current_version = polaris.__version__
if value != current_version:
logger.info(
f"The version of Polaris that was used to create the artifact ({value}) is different "
Expand All @@ -71,31 +89,35 @@ def _validate_version(cls, value: str) -> str:

@field_validator("owner", mode="before")
@classmethod
def _validate_owner(cls, value: Union[str, HubOwner, None]):
def _validate_owner(cls, value: str | HubOwner | None):
if isinstance(value, str):
return HubOwner(slug=value)
return value

@field_serializer("owner")
def _serialize_owner(self, value: HubOwner) -> Union[str, None]:
def _serialize_owner(self, value: HubOwner) -> str | None:
return value.slug if value else None

@classmethod
def from_json(cls, path: str):
"""Loads a benchmark from a JSON file.
def from_json(cls, path: str) -> Self:
"""Loads an artifact from a JSON file.
Args:
path: Loads a benchmark specification from a JSON file.
path: Path to a JSON file containing the artifact definition.
"""
with fsspec.open(path, "r") as f:
data = json.load(f)
return cls.model_validate(data)
return cls.model_validate(data)

def to_json(self, path: str):
"""Saves the benchmark to a JSON file.
def to_json(self, path: str) -> None:
"""Saves an artifact to a JSON file.
Args:
path: Saves the benchmark specification to a JSON file.
path: Path to save the artifact definition as JSON.
"""
with fsspec.open(path, "w") as f:
json.dump(self.model_dump(), f)
f.write(self.model_dump_json())

@classmethod
def urn_for(cls, owner: str | HubOwner, name: str) -> ArtifactUrn:
return f"urn:polaris:{cls._artifact_type}:{owner}:{slugify(name)}"
6 changes: 4 additions & 2 deletions polaris/benchmark/_base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
from hashlib import md5
from itertools import chain
from typing import Any, Callable, Optional, Union
from typing import Any, Callable, Optional, TypeAlias, Union

import fsspec
import numpy as np
Expand Down Expand Up @@ -36,7 +36,7 @@
TaskType,
)

ColumnsType = Union[str, list[str]]
ColumnsType: TypeAlias = str | list[str]


class BenchmarkSpecification(BaseArtifactModel, ChecksumMixin):
Expand Down Expand Up @@ -95,6 +95,8 @@ class BenchmarkSpecification(BaseArtifactModel, ChecksumMixin):
For additional meta-data attributes, see the [`BaseArtifactModel`][polaris._artifact.BaseArtifactModel] class.
"""

_artifact_type = "benchmark"

# Public attributes
# Data
dataset: Union[DatasetV1, CompetitionDataset, str, dict[str, Any]]
Expand Down
4 changes: 3 additions & 1 deletion polaris/competition/_competition.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Optional

from polaris.benchmark import BenchmarkSpecification
from polaris.evaluate._results import CompetitionPredictions
from polaris.evaluate import CompetitionPredictions
from polaris.hub.settings import PolarisHubSettings
from polaris.utils.types import HubOwner

Expand All @@ -18,6 +18,8 @@ class CompetitionSpecification(BenchmarkSpecification):
end_time: The time at which the competition ends and is no longer interactable.
"""

_artifact_type = "competition"

# Additional properties specific to Competitions
owner: HubOwner
start_time: datetime | None = None
Expand Down
3 changes: 1 addition & 2 deletions polaris/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from polaris.dataset._column import ColumnAnnotation, KnownContentType, Modality
from polaris.dataset._competition_dataset import CompetitionDataset
from polaris.dataset._dataset import DatasetV1
from polaris.dataset._dataset import DatasetV1 as Dataset
from polaris.dataset._dataset import DatasetV1, DatasetV1 as Dataset
from polaris.dataset._factory import DatasetFactory, create_dataset_from_file, create_dataset_from_files
from polaris.dataset._subset import Subset

Expand Down
Loading

0 comments on commit 23fd61e

Please sign in to comment.