diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 6be6ba83..952bd64b 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -22,7 +22,7 @@ from polaris.benchmark._task import PredictiveTaskSpecificationMixin from polaris.dataset import DatasetV1, Subset from polaris.dataset._base import BaseDataset -from polaris.evaluate import BenchmarkResults +from polaris.evaluate import BenchmarkResultsV1 from polaris.evaluate.utils import evaluate_benchmark from polaris.hub.settings import PolarisHubSettings from polaris.mixins import ChecksumMixin @@ -151,64 +151,6 @@ def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subs featurization_fn=featurization_fn, ) - def evaluate( - self, - y_pred: IncomingPredictionsType | None = None, - y_prob: IncomingPredictionsType | None = None, - ) -> BenchmarkResults: - """Execute the evaluation protocol for the benchmark, given a set of predictions. - - info: What about `y_true`? - Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just - the predictions. This reduces the chance of accidentally using the test targets during training. - - For this method, we make the following assumptions: - - 1. There can be one or multiple test set(s); - 2. There can be one or multiple target(s); - 3. The metrics are _constant_ across test sets; - 4. The metrics are _constant_ across targets; - 5. There can be metrics which measure across tasks. - - Args: - y_pred: The predictions for the test set, as NumPy arrays. - If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys. - If there are multiple test sets, the predictions should be further wrapped in a dictionary - with the test subset labels as keys. - y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the - number of tasks and test sets. - - Returns: - A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. - - Examples: - 1. For regression benchmarks: - pred_scores = your_model.predict_score(molecules) # predict continuous score values - benchmark.evaluate(y_pred=pred_scores) - 2. For classification benchmarks: - - If `roc_auc` and `pr_auc` are in the metric list, both class probabilities and label predictions are required: - pred_probs = your_model.predict_proba(molecules) # predict probablities - pred_labels = your_model.predict_labels(molecules) # predict class labels - benchmark.evaluate(y_pred=pred_labels, y_prob=pred_probs) - - Otherwise: - benchmark.evaluate(y_pred=pred_labels) - """ - - # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves. - y_true = self._get_test_sets(hide_targets=False) - - scores = evaluate_benchmark( - target_cols=list(self.target_cols), - test_set_labels=self.test_set_labels, - test_set_sizes=self.test_set_sizes, - metrics=self.metrics, - y_true=y_true, - y_pred=y_pred, - y_prob=y_prob, - ) - - return BenchmarkResults(results=scores, benchmark_artifact_id=self.artifact_id) - def upload_to_hub( self, settings: PolarisHubSettings | None = None, @@ -423,6 +365,64 @@ def n_classes(self) -> dict[str, int]: if self.target_types.get(target) == TargetType.CLASSIFICATION } + def evaluate( + self, + y_pred: IncomingPredictionsType | None = None, + y_prob: IncomingPredictionsType | None = None, + ) -> BenchmarkResultsV1: + """Execute the evaluation protocol for the benchmark, given a set of predictions. 
+
+        info: What about `y_true`?
+        Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just
+        the predictions. This reduces the chance of accidentally using the test targets during training.
+
+        For this method, we make the following assumptions:
+
+        1. There can be one or multiple test set(s);
+        2. There can be one or multiple target(s);
+        3. The metrics are _constant_ across test sets;
+        4. The metrics are _constant_ across targets;
+        5. There can be metrics which measure across tasks.
+
+        Args:
+            y_pred: The predictions for the test set, as NumPy arrays.
+                If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
+                If there are multiple test sets, the predictions should be further wrapped in a dictionary
+                with the test subset labels as keys.
+            y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the
+                number of tasks and test sets.
+
+        Returns:
+            A `BenchmarkResultsV1` object. This object can be directly submitted to the Polaris Hub.
+
+        Examples:
+            1. For regression benchmarks:
+                pred_scores = your_model.predict_score(molecules) # predict continuous score values
+                benchmark.evaluate(y_pred=pred_scores)
+            2. For classification benchmarks:
+                - If `roc_auc` and `pr_auc` are in the metric list, both class probabilities and label predictions are required:
+                    pred_probs = your_model.predict_proba(molecules) # predict probabilities
+                    pred_labels = your_model.predict_labels(molecules) # predict class labels
+                    benchmark.evaluate(y_pred=pred_labels, y_prob=pred_probs)
+                - Otherwise:
+                    benchmark.evaluate(y_pred=pred_labels)
+        """
+
+        # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
+ y_true = self._get_test_sets(hide_targets=False) + + scores = evaluate_benchmark( + target_cols=list(self.target_cols), + test_set_labels=self.test_set_labels, + test_set_sizes=self.test_set_sizes, + metrics=self.metrics, + y_true=y_true, + y_pred=y_pred, + y_prob=y_prob, + ) + + return BenchmarkResultsV1(results=scores, benchmark_artifact_id=self.artifact_id) + def __eq__(self, other) -> bool: if not isinstance(other, BenchmarkSpecification): return False diff --git a/polaris/evaluate/__init__.py b/polaris/evaluate/__init__.py index 67423f63..a9e87520 100644 --- a/polaris/evaluate/__init__.py +++ b/polaris/evaluate/__init__.py @@ -1,19 +1,30 @@ -from polaris.evaluate._metadata import ResultsMetadata +from polaris.evaluate._metadata import ResultsMetadataV1, ResultsMetadataV2 +from polaris.evaluate._metadata import ResultsMetadataV1 as ResultsMetadata from polaris.evaluate._metric import Metric, MetricInfo from polaris.evaluate._predictions import BenchmarkPredictions, CompetitionPredictions from polaris.evaluate._results import ( - BenchmarkResults, + BenchmarkResultsV1 as BenchmarkResults, + BenchmarkResultsV1, + BenchmarkResultsV2, CompetitionResults, - EvaluationResult, + EvaluationResultV1, + EvaluationResultV1 as EvaluationResult, + EvaluationResultV2, ) from polaris.evaluate.utils import evaluate_benchmark __all__ = [ "ResultsMetadata", + "ResultsMetadataV1", + "ResultsMetadataV2", "Metric", "MetricInfo", "EvaluationResult", + "EvaluationResultV1", + "EvaluationResultV2", "BenchmarkResults", + "BenchmarkResultsV1", + "BenchmarkResultsV2", "CompetitionResults", "evaluate_benchmark", "CompetitionPredictions", diff --git a/polaris/evaluate/_metadata.py b/polaris/evaluate/_metadata.py index 8dd35b21..3c3b43c7 100644 --- a/polaris/evaluate/_metadata.py +++ b/polaris/evaluate/_metadata.py @@ -5,10 +5,11 @@ from polaris._artifact import BaseArtifactModel from polaris.utils.dict2html import dict2html from polaris.utils.types import HttpUrlString, HubUser +from polaris.model import Model -class ResultsMetadata(BaseArtifactModel): - """Base class for evaluation results +class ResultsMetadataV1(BaseArtifactModel): + """V1 implementation of evaluation results without model field support Attributes: github_url: The URL to the code repository that was used to generate these results. @@ -32,3 +33,27 @@ def _repr_html_(self) -> str: def __repr__(self): return self.model_dump_json(indent=2) + + +class ResultsMetadataV2(BaseArtifactModel): + """V2 implementation of evaluation results with model field replacing URLs + + Attributes: + model: The model that was used to generate these results. + contributors: The users that are credited for these results. + + For additional meta-data attributes, see the base classes. 
+    """
+
+    # Additional meta-data
+    model: Model | None = Field(None, exclude=True)
+    contributors: list[HubUser] = Field(default_factory=list)
+
+    # Private attributes
+    _created_at: datetime = PrivateAttr(default_factory=datetime.now)
+
+    def _repr_html_(self) -> str:
+        return dict2html(self.model_dump())
+
+    def __repr__(self):
+        return self.model_dump_json(indent=2)
diff --git a/polaris/evaluate/_predictions.py b/polaris/evaluate/_predictions.py
index 472f9984..7f451a2d 100644
--- a/polaris/evaluate/_predictions.py
+++ b/polaris/evaluate/_predictions.py
@@ -12,7 +12,7 @@
 )
 from typing_extensions import Self
 
-from polaris.evaluate import ResultsMetadata
+from polaris.evaluate import ResultsMetadataV1
 from polaris.utils.misc import convert_lists_to_arrays
 from polaris.utils.types import (
     HttpUrlString,
@@ -249,7 +249,7 @@ def __len__(self) -> int:
         return self.get_size()
 
 
-class CompetitionPredictions(BenchmarkPredictions, ResultsMetadata):
+class CompetitionPredictions(BenchmarkPredictions, ResultsMetadataV1):
     """
     Predictions for competition benchmarks.
 
diff --git a/polaris/evaluate/_results.py b/polaris/evaluate/_results.py
index d6ea4434..c3ce090d 100644
--- a/polaris/evaluate/_results.py
+++ b/polaris/evaluate/_results.py
@@ -13,7 +13,7 @@
 from pydantic.alias_generators import to_camel
 
 from polaris.model import Model
-from polaris.evaluate import ResultsMetadata
+from polaris.evaluate import ResultsMetadataV1, ResultsMetadataV2
 from polaris.utils.errors import InvalidResultError
 from polaris.utils.misc import slugify
 from polaris.utils.types import (
@@ -41,8 +41,8 @@ class ResultRecords(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
 
-class EvaluationResult(ResultsMetadata):
-    """Class for saving evaluation results
+class BaseEvaluationResult:
+    """Base class for saving evaluation results
 
     The actual results are saved in the `results` field using the following tabular format:
 
@@ -134,8 +134,20 @@ def _serialize_results(self, value: pd.DataFrame) -> list[ResultRecords]:
         return serialized
 
 
-class BenchmarkResults(EvaluationResult):
-    """Class specific to results for standard benchmarks.
+class EvaluationResultV1(ResultsMetadataV1, BaseEvaluationResult):
+    """V1 implementation of evaluation results without model field support"""
+
+    pass
+
+
+class EvaluationResultV2(ResultsMetadataV2, BaseEvaluationResult):
+    """V2 implementation of evaluation results with model field replacing URLs"""
+
+    pass
+
+
+class BaseBenchmarkResults:
+    """Base class for results of standard benchmarks.
 
     This object is returned by [`BenchmarkSpecification.evaluate`][polaris.benchmark.BenchmarkSpecification.evaluate].
     In addition to the metrics on the test set, it contains additional meta-data and logic to integrate
@@ -152,7 +164,6 @@
     benchmark_artifact_id: str | None = Field(None)
     benchmark_name: SlugCompatibleStringType | None = Field(None, deprecated=True)
     benchmark_owner: HubOwner | None = Field(None, deprecated=True)
-    model: Model | None = Field(None)
 
     @model_validator(mode="after")
     def set_benchmark_artifact_id(self):
@@ -165,7 +176,7 @@
         access: AccessType = "private",
         owner: HubOwner | str | None = None,
         **kwargs: dict,
-    ) -> "BenchmarkResults":
+    ) -> "BenchmarkResultsV1 | BenchmarkResultsV2":
         """
         Very light, convenient wrapper around the
         [`PolarisHubClient.upload_results`][polaris.hub.client.PolarisHubClient.upload_results] method.
@@ -176,7 +187,19 @@
         return client.upload_results(self, access=access, owner=owner)
 
 
-class CompetitionResults(EvaluationResult):
+class BenchmarkResultsV1(EvaluationResultV1, BaseBenchmarkResults):
+    """V1 implementation of benchmark results without model field support"""
+
+    pass
+
+
+class BenchmarkResultsV2(EvaluationResultV2, BaseBenchmarkResults):
+    """V2 implementation of benchmark results with model field replacing URLs"""
+
+    pass
+
+
+class CompetitionResults(EvaluationResultV1):
     """Class specific to results for competition benchmarks.
 
     This object is returned by [`CompetitionSpecification.evaluate`][polaris.competition.CompetitionSpecification.evaluate].
diff --git a/polaris/experimental/_benchmark_v2.py b/polaris/experimental/_benchmark_v2.py
index 7ea916db..4695349b 100644
--- a/polaris/experimental/_benchmark_v2.py
+++ b/polaris/experimental/_benchmark_v2.py
@@ -4,6 +4,10 @@
 from typing_extensions import Self
 
 from polaris.benchmark import BenchmarkSpecification
+from polaris.evaluate.utils import evaluate_benchmark
+from polaris.utils.types import IncomingPredictionsType
+
+from polaris.evaluate import BenchmarkResultsV2
 from polaris.dataset import DatasetV2, Subset
 from polaris.experimental._split_v2 import SplitSpecificationV2Mixin
 from polaris.utils.errors import InvalidBenchmarkError
@@ -92,3 +96,61 @@ def get_train_test_split(
         )
         test = self._get_test_sets(hide_targets=True, featurization_fn=featurization_fn)
         return train, test
+
+    def evaluate(
+        self,
+        y_pred: IncomingPredictionsType | None = None,
+        y_prob: IncomingPredictionsType | None = None,
+    ) -> BenchmarkResultsV2:
+        """Execute the evaluation protocol for the benchmark, given a set of predictions.
+
+        info: What about `y_true`?
+        Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just
+        the predictions. This reduces the chance of accidentally using the test targets during training.
+
+        For this method, we make the following assumptions:
+
+        1. There can be one or multiple test set(s);
+        2. There can be one or multiple target(s);
+        3. The metrics are _constant_ across test sets;
+        4. The metrics are _constant_ across targets;
+        5. There can be metrics which measure across tasks.
+
+        Args:
+            y_pred: The predictions for the test set, as NumPy arrays.
+                If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
+                If there are multiple test sets, the predictions should be further wrapped in a dictionary
+                with the test subset labels as keys.
+            y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the
+                number of tasks and test sets.
+
+        Returns:
+            A `BenchmarkResultsV2` object. This object can be directly submitted to the Polaris Hub.
+
+        Examples:
+            1. For regression benchmarks:
+                pred_scores = your_model.predict_score(molecules) # predict continuous score values
+                benchmark.evaluate(y_pred=pred_scores)
+            2. For classification benchmarks:
+                - If `roc_auc` and `pr_auc` are in the metric list, both class probabilities and label predictions are required:
+                    pred_probs = your_model.predict_proba(molecules) # predict probabilities
+                    pred_labels = your_model.predict_labels(molecules) # predict class labels
+                    benchmark.evaluate(y_pred=pred_labels, y_prob=pred_probs)
+                - Otherwise:
+                    benchmark.evaluate(y_pred=pred_labels)
+        """
+
+        # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
+ y_true = self._get_test_sets(hide_targets=False) + + scores = evaluate_benchmark( + target_cols=list(self.target_cols), + test_set_labels=self.test_set_labels, + test_set_sizes=self.test_set_sizes, + metrics=self.metrics, + y_true=y_true, + y_pred=y_pred, + y_prob=y_prob, + ) + + return BenchmarkResultsV2(results=scores, benchmark_artifact_id=self.artifact_id) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 83be8efa..2bba717b 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -892,7 +892,7 @@ def submit_competition_predictions( return response def get_model(self, artifact_id: str) -> Model: - url = f"/v1/model/{artifact_id}" + url = f"/v2/model/{artifact_id}" response = self._base_request_to_hub(url=url, method="GET") response_data = response.json() @@ -928,7 +928,7 @@ def upload_model( # Make a request to the hub response = self._base_request_to_hub( - url="/v1/model", method="POST", json={"access": access, **model_json} + url="/v2/model", method="POST", json={"access": access, **model_json} ) # Inform the user about where to find their newly created artifact. diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index c3822905..9195efb9 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -13,7 +13,7 @@ ) from polaris.dataset import DatasetV1 from polaris.evaluate._metric import DEFAULT_METRICS, Metric -from polaris.evaluate._results import BenchmarkResults +from polaris.evaluate._results import BenchmarkResultsV1 from polaris.utils.types import HubOwner @@ -27,7 +27,7 @@ def test_result_to_json(tmp_path: Path, test_user_owner: HubOwner): } ) - result = BenchmarkResults( + result = BenchmarkResultsV1( name="test", description="Lorem ipsum!", tags=["test"], @@ -44,7 +44,7 @@ def test_result_to_json(tmp_path: Path, test_user_owner: HubOwner): path = str(tmp_path / "result.json") result.to_json(path) - BenchmarkResults.from_json(path) + BenchmarkResultsV1.from_json(path) assert po.__version__ == result.polaris_version
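Note, not part of the patch: a minimal usage sketch of the renamed V1 result classes, assuming only what the diff above shows (the `evaluate` signature, the `BenchmarkResults` alias re-exported from `polaris/evaluate/__init__.py`, and `upload_to_hub`). The benchmark slug, the result name, and the zero-filled predictions are illustrative placeholders, not values from this patch.

import numpy as np
import polaris as po

from polaris.evaluate import BenchmarkResults, BenchmarkResultsV1

# Load any existing benchmark; the slug below is a placeholder.
benchmark = po.load_benchmark("org/some-benchmark")
train, test = benchmark.get_train_test_split()

# Stand-in predictions: a single array for a single-target benchmark, or a dict
# keyed by target / test-set labels as described in the evaluate docstring above.
y_pred = np.zeros(len(test))

results = benchmark.evaluate(y_pred=y_pred)
assert isinstance(results, BenchmarkResultsV1)
assert BenchmarkResults is BenchmarkResultsV1  # backwards-compatible alias

# Metadata fields inherited from ResultsMetadataV1 (github_url, paper_url, contributors, ...)
results.name = "my-first-result"
results.github_url = "https://github.com/org/repo"
# results.upload_to_hub(owner="org", access="private")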
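A second sketch, under the same caveats, for the V2 path added in `polaris/experimental/_benchmark_v2.py`: `evaluate` returns a `BenchmarkResultsV2`, whose `ResultsMetadataV2` base carries a `model` reference (excluded from serialization) instead of the github/paper URL fields. The slug, the `Model(name=...)` arguments, and the commented-out Hub calls are assumptions for illustration only.

import numpy as np
import polaris as po

from polaris.evaluate import BenchmarkResultsV2
from polaris.model import Model

# Assumes the loaded artifact is a V2 benchmark backed by a DatasetV2.
benchmark_v2 = po.load_benchmark("org/some-benchmark-v2")
train, test = benchmark_v2.get_train_test_split()

results_v2 = benchmark_v2.evaluate(y_pred=np.zeros(len(test)))  # stand-in predictions
assert isinstance(results_v2, BenchmarkResultsV2)

# Attach a model artifact instead of github_url/paper_url; the field is declared with
# exclude=True, so it is not serialized alongside the results themselves.
results_v2.model = Model(name="my-model")

# Alternatively, reference a model already on the Hub; get_model/upload_model now hit
# the /v2/model endpoints per the hub/client.py change above.
# from polaris.hub.client import PolarisHubClient
# with PolarisHubClient() as client:
#     results_v2.model = client.get_model("org/my-model")
# results_v2.upload_to_hub(owner="org")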