
Commit

Apply feedback wip
mercuryseries committed Feb 7, 2025
1 parent 8e03a92 commit dded7f3
Showing 8 changed files with 200 additions and 79 deletions.
118 changes: 59 additions & 59 deletions polaris/benchmark/_base.py
@@ -22,7 +22,7 @@
from polaris.benchmark._task import PredictiveTaskSpecificationMixin
from polaris.dataset import DatasetV1, Subset
from polaris.dataset._base import BaseDataset
from polaris.evaluate import BenchmarkResults
from polaris.evaluate import BenchmarkResultsV1
from polaris.evaluate.utils import evaluate_benchmark
from polaris.hub.settings import PolarisHubSettings
from polaris.mixins import ChecksumMixin
@@ -151,64 +151,6 @@ def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subs
featurization_fn=featurization_fn,
)

def evaluate(
self,
y_pred: IncomingPredictionsType | None = None,
y_prob: IncomingPredictionsType | None = None,
) -> BenchmarkResults:
"""Execute the evaluation protocol for the benchmark, given a set of predictions.
info: What about `y_true`?
Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just
the predictions. This reduces the chance of accidentally using the test targets during training.
For this method, we make the following assumptions:
1. There can be one or multiple test set(s);
2. There can be one or multiple target(s);
3. The metrics are _constant_ across test sets;
4. The metrics are _constant_ across targets;
5. There can be metrics which measure across tasks.
Args:
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
If there are multiple test sets, the predictions should be further wrapped in a dictionary
with the test subset labels as keys.
y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the
number of tasks and test sets.
Returns:
A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub.
Examples:
1. For regression benchmarks:
pred_scores = your_model.predict_score(molecules) # predict continuous score values
benchmark.evaluate(y_pred=pred_scores)
2. For classification benchmarks:
- If `roc_auc` and `pr_auc` are in the metric list, both class probabilities and label predictions are required:
pred_probs = your_model.predict_proba(molecules) # predict probablities
pred_labels = your_model.predict_labels(molecules) # predict class labels
benchmark.evaluate(y_pred=pred_labels, y_prob=pred_probs)
- Otherwise:
benchmark.evaluate(y_pred=pred_labels)
"""

# Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
y_true = self._get_test_sets(hide_targets=False)

scores = evaluate_benchmark(
target_cols=list(self.target_cols),
test_set_labels=self.test_set_labels,
test_set_sizes=self.test_set_sizes,
metrics=self.metrics,
y_true=y_true,
y_pred=y_pred,
y_prob=y_prob,
)

return BenchmarkResults(results=scores, benchmark_artifact_id=self.artifact_id)

def upload_to_hub(
self,
settings: PolarisHubSettings | None = None,
@@ -423,6 +365,64 @@ def n_classes(self) -> dict[str, int]:
if self.target_types.get(target) == TargetType.CLASSIFICATION
}

def evaluate(
self,
y_pred: IncomingPredictionsType | None = None,
y_prob: IncomingPredictionsType | None = None,
) -> BenchmarkResultsV1:
"""Execute the evaluation protocol for the benchmark, given a set of predictions.
info: What about `y_true`?
Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just
the predictions. This reduces the chance of accidentally using the test targets during training.
For this method, we make the following assumptions:
1. There can be one or multiple test set(s);
2. There can be one or multiple target(s);
3. The metrics are _constant_ across test sets;
4. The metrics are _constant_ across targets;
5. There can be metrics which measure across tasks.
Args:
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
If there are multiple test sets, the predictions should be further wrapped in a dictionary
with the test subset labels as keys.
y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the
number of tasks and test sets.
Returns:
A `BenchmarkResultsV1` object. This object can be directly submitted to the Polaris Hub.
Examples:
1. For regression benchmarks:
pred_scores = your_model.predict_score(molecules) # predict continuous score values
benchmark.evaluate(y_pred=pred_scores)
2. For classification benchmarks:
- If `roc_auc` and `pr_auc` are in the metric list, both class probabilities and label predictions are required:
pred_probs = your_model.predict_proba(molecules) # predict probabilities
pred_labels = your_model.predict_labels(molecules) # predict class labels
benchmark.evaluate(y_pred=pred_labels, y_prob=pred_probs)
- Otherwise:
benchmark.evaluate(y_pred=pred_labels)
"""

# Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
y_true = self._get_test_sets(hide_targets=False)

scores = evaluate_benchmark(
target_cols=list(self.target_cols),
test_set_labels=self.test_set_labels,
test_set_sizes=self.test_set_sizes,
metrics=self.metrics,
y_true=y_true,
y_pred=y_pred,
y_prob=y_prob,
)

return BenchmarkResultsV1(results=scores, benchmark_artifact_id=self.artifact_id)

def __eq__(self, other) -> bool:
if not isinstance(other, BenchmarkSpecification):
return False
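The dictionary wrapping described in the `evaluate` docstring above is easiest to see in code. Below is a minimal, illustrative sketch (not part of this commit) for a hypothetical benchmark with multiple targets and multiple test sets; the slug and labels are placeholders.

import numpy as np
import polaris as po

# Hypothetical multi-target, multi-test-set benchmark; the slug is a placeholder.
benchmark = po.load_benchmark("my-org/example-multitask-benchmark")
train, test = benchmark.get_train_test_split()

# With multiple test sets, `test` is a dictionary of Subset objects keyed by
# test-set label. Predictions mirror that nesting: test-set label first, then
# target label, with NumPy arrays as leaves.
y_pred = {
    test_label: {
        target: np.random.rand(len(subset))  # placeholder model output
        for target in benchmark.target_cols
    }
    for test_label, subset in test.items()
}

results = benchmark.evaluate(y_pred=y_pred)  # returns a BenchmarkResultsV1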
17 changes: 14 additions & 3 deletions polaris/evaluate/__init__.py
@@ -1,19 +1,30 @@
from polaris.evaluate._metadata import ResultsMetadata
from polaris.evaluate._metadata import ResultsMetadataV1, ResultsMetadataV2
from polaris.evaluate._metadata import ResultsMetadataV1 as ResultsMetadata
from polaris.evaluate._metric import Metric, MetricInfo
from polaris.evaluate._predictions import BenchmarkPredictions, CompetitionPredictions
from polaris.evaluate._results import (
BenchmarkResults,
BenchmarkResultsV1 as BenchmarkResults,
BenchmarkResultsV1,
BenchmarkResultsV2,
CompetitionResults,
EvaluationResult,
EvaluationResultV1,
EvaluationResultV1 as EvaluationResult,
EvaluationResultV2,
)
from polaris.evaluate.utils import evaluate_benchmark

__all__ = [
"ResultsMetadata",
"ResultsMetadataV1",
"ResultsMetadataV2",
"Metric",
"MetricInfo",
"EvaluationResult",
"EvaluationResultV1",
"EvaluationResultV2",
"BenchmarkResults",
"BenchmarkResultsV1",
"BenchmarkResultsV2",
"CompetitionResults",
"evaluate_benchmark",
"CompetitionPredictions",
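Because the hunk above re-exports the V1 classes under their old unversioned names, existing imports keep resolving to the same classes. A short sketch of what that aliasing implies, assuming the exports land exactly as shown:

from polaris.evaluate import (
    BenchmarkResults,    # re-exported alias of BenchmarkResultsV1
    BenchmarkResultsV1,
    BenchmarkResultsV2,
    EvaluationResult,    # re-exported alias of EvaluationResultV1
    EvaluationResultV1,
    ResultsMetadata,     # re-exported alias of ResultsMetadataV1
    ResultsMetadataV1,
)

# Downstream code that imports the unversioned names is unaffected,
# because those names still point at the V1 classes.
assert BenchmarkResults is BenchmarkResultsV1
assert EvaluationResult is EvaluationResultV1
assert ResultsMetadata is ResultsMetadataV1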
29 changes: 27 additions & 2 deletions polaris/evaluate/_metadata.py
@@ -5,10 +5,11 @@
from polaris._artifact import BaseArtifactModel
from polaris.utils.dict2html import dict2html
from polaris.utils.types import HttpUrlString, HubUser
from polaris.model import Model


class ResultsMetadata(BaseArtifactModel):
"""Base class for evaluation results
class ResultsMetadataV1(BaseArtifactModel):
"""V1 implementation of evaluation results without model field support
Attributes:
github_url: The URL to the code repository that was used to generate these results.
@@ -32,3 +33,27 @@ def _repr_html_(self) -> str:

def __repr__(self):
return self.model_dump_json(indent=2)


class ResultsMetadataV2(BaseArtifactModel):
"""V2 implementation of evaluation results with model field replacing URLs
Attributes:
model: The model that was used to generate these results.
contributors: The users that are credited for these results.
For additional meta-data attributes, see the base classes.
"""

# Additional meta-data
model: Model | None = Field(None, exclude=True)
contributors: list[HubUser] = Field(default_factory=list)

# Private attributes
_created_at: datetime = PrivateAttr(default_factory=datetime.now)

def _repr_html_(self) -> str:
return dict2html(self.model_dump())

def __repr__(self):
return self.model_dump_json(indent=2)
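Note the `exclude=True` on the new `model` field: the model reference is kept on the object but dropped from serialized output. A standalone pydantic sketch of that behaviour, using hypothetical stand-in classes rather than the polaris ones:

from pydantic import BaseModel, Field

# Stand-in for `polaris.model.Model`, used only for this sketch.
class FakeModelRef(BaseModel):
    name: str

# Mirrors the ResultsMetadataV2 field pattern shown above.
class FakeResultsMetadataV2(BaseModel):
    model: FakeModelRef | None = Field(None, exclude=True)
    contributors: list[str] = Field(default_factory=list)

meta = FakeResultsMetadataV2(model=FakeModelRef(name="my-gnn"), contributors=["alice"])
print(meta.model_dump())  # {'contributors': ['alice']} -- the excluded field is omitted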
4 changes: 2 additions & 2 deletions polaris/evaluate/_predictions.py
@@ -12,7 +12,7 @@
)
from typing_extensions import Self

from polaris.evaluate import ResultsMetadata
from polaris.evaluate import ResultsMetadataV1
from polaris.utils.misc import convert_lists_to_arrays
from polaris.utils.types import (
HttpUrlString,
@@ -249,7 +249,7 @@ def __len__(self) -> int:
return self.get_size()


class CompetitionPredictions(BenchmarkPredictions, ResultsMetadata):
class CompetitionPredictions(BenchmarkPredictions, ResultsMetadataV1):
"""
Predictions for competition benchmarks.
39 changes: 31 additions & 8 deletions polaris/evaluate/_results.py
@@ -13,7 +13,7 @@
from pydantic.alias_generators import to_camel
from polaris.model import Model

from polaris.evaluate import ResultsMetadata
from polaris.evaluate import ResultsMetadataV1, ResultsMetadataV2
from polaris.utils.errors import InvalidResultError
from polaris.utils.misc import slugify
from polaris.utils.types import (
@@ -41,8 +41,8 @@ class ResultRecords(BaseModel):
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)


class EvaluationResult(ResultsMetadata):
"""Class for saving evaluation results
class BaseEvaluationResult:
"""Base class for saving evaluation results
The actual results are saved in the `results` field using the following tabular format:
@@ -134,8 +134,20 @@ def _serialize_results(self, value: pd.DataFrame) -> list[ResultRecords]:
return serialized


class BenchmarkResults(EvaluationResult):
"""Class specific to results for standard benchmarks.
class EvaluationResultV1(ResultsMetadataV1, BaseEvaluationResult):
"""V1 implementation of evaluation results without model field support"""

pass


class EvaluationResultV2(ResultsMetadataV2, BaseEvaluationResult):
"""V2 implementation of evaluation results with model field replacing URLs"""

pass


class BaseBenchmarkResults:
"""Base class for results of standard benchmarks.
This object is returned by [`BenchmarkSpecification.evaluate`][polaris.benchmark.BenchmarkSpecification.evaluate].
In addition to the metrics on the test set, it contains additional meta-data and logic to integrate
@@ -152,7 +164,6 @@ class BenchmarkResults(EvaluationResult):
benchmark_artifact_id: str | None = Field(None)
benchmark_name: SlugCompatibleStringType | None = Field(None, deprecated=True)
benchmark_owner: HubOwner | None = Field(None, deprecated=True)
model: Model | None = Field(None)

@model_validator(mode="after")
def set_benchmark_artifact_id(self):
@@ -165,7 +176,7 @@ def upload_to_hub(
access: AccessType = "private",
owner: HubOwner | str | None = None,
**kwargs: dict,
) -> "BenchmarkResults":
) -> "BenchmarkResultsV1" | "BenchmarkResultsV2":
"""
Very light, convenient wrapper around the
[`PolarisHubClient.upload_results`][polaris.hub.client.PolarisHubClient.upload_results] method.
@@ -176,7 +187,19 @@
return client.upload_results(self, access=access, owner=owner)


class CompetitionResults(EvaluationResult):
class BenchmarkResultsV1(EvaluationResultV1, BaseBenchmarkResults):
"""V1 implementation of benchmark results without model field support"""

pass


class BenchmarkResultsV2(EvaluationResultV2, BaseBenchmarkResults):
"""V2 implementation of benchmark results with model field replacing URLs"""

pass


class CompetitionResults(EvaluationResultV1):
"""Class specific to results for competition benchmarks.
This object is returned by [`CompetitionSpecification.evaluate`][polaris.competition.CompetitionSpecification.evaluate].
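The refactor above splits the shared result logic into plain `Base*` classes and composes each versioned class from a metadata model plus that base. A simplified, standalone sketch of the composition pattern, with hypothetical names rather than the actual polaris classes:

from pydantic import BaseModel

# Plain mixin holding behaviour shared by every result version.
class SharedResultLogic:
    def describe(self) -> str:
        # Assumes the concrete pydantic class defines a `results` field.
        return f"{type(self).__name__}: {len(self.results)} result row(s)"

class MetadataV1(BaseModel):
    github_url: str | None = None  # V1 keeps URL-style provenance fields

class MetadataV2(BaseModel):
    model: str | None = None  # V2 swaps the URLs for a model reference

class ResultV1(MetadataV1, SharedResultLogic):
    results: list[dict] = []

class ResultV2(MetadataV2, SharedResultLogic):
    results: list[dict] = []

r = ResultV2(model="my-gnn", results=[{"metric": "mae", "score": 0.31}])
print(r.describe())  # ResultV2: 1 result row(s)

Keeping the shared behaviour in one mixin means each version only declares its own metadata fields, which is the same idea the commit applies to `EvaluationResult` and `BenchmarkResults`.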
62 changes: 62 additions & 0 deletions polaris/experimental/_benchmark_v2.py
@@ -4,6 +4,10 @@
from typing_extensions import Self

from polaris.benchmark import BenchmarkSpecification
from polaris.evaluate.utils import evaluate_benchmark
from polaris.utils.types import IncomingPredictionsType

from polaris.evaluate import BenchmarkResultsV2
from polaris.dataset import DatasetV2, Subset
from polaris.experimental._split_v2 import SplitSpecificationV2Mixin
from polaris.utils.errors import InvalidBenchmarkError
@@ -92,3 +96,61 @@ def get_train_test_split(
)
test = self._get_test_sets(hide_targets=True, featurization_fn=featurization_fn)
return train, test

def evaluate(
self,
y_pred: IncomingPredictionsType | None = None,
y_prob: IncomingPredictionsType | None = None,
) -> BenchmarkResultsV2:
"""Execute the evaluation protocol for the benchmark, given a set of predictions.
info: What about `y_true`?
Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just
the predictions. This reduces the chance of accidentally using the test targets during training.
For this method, we make the following assumptions:
1. There can be one or multiple test set(s);
2. There can be one or multiple target(s);
3. The metrics are _constant_ across test sets;
4. The metrics are _constant_ across targets;
5. There can be metrics which measure across tasks.
Args:
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
If there are multiple test sets, the predictions should be further wrapped in a dictionary
with the test subset labels as keys.
y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the
number of tasks and test sets.
Returns:
A `BenchmarkResultsV2` object. This object can be directly submitted to the Polaris Hub.
Examples:
1. For regression benchmarks:
pred_scores = your_model.predict_score(molecules) # predict continuous score values
benchmark.evaluate(y_pred=pred_scores)
2. For classification benchmarks:
- If `roc_auc` and `pr_auc` are in the metric list, both class probabilities and label predictions are required:
pred_probs = your_model.predict_proba(molecules) # predict probabilities
pred_labels = your_model.predict_labels(molecules) # predict class labels
benchmark.evaluate(y_pred=pred_labels, y_prob=pred_probs)
- Otherwise:
benchmark.evaluate(y_pred=pred_labels)
"""

# Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
y_true = self._get_test_sets(hide_targets=False)

scores = evaluate_benchmark(
target_cols=list(self.target_cols),
test_set_labels=self.test_set_labels,
test_set_sizes=self.test_set_sizes,
metrics=self.metrics,
y_true=y_true,
y_pred=y_pred,
y_prob=y_prob,
)

return BenchmarkResultsV2(results=scores, benchmark_artifact_id=self.artifact_id)
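The V2 path mirrors the V1 one: callers pass only their predictions and the ground truth is pulled from the benchmark itself. An illustrative sketch of the end-to-end flow, assuming a hypothetical single-target, single-test-set benchmark that the Hub serves as a V2 specification:

import numpy as np
import polaris as po

# Hypothetical V2 benchmark slug, used for illustration only.
benchmark = po.load_benchmark("my-org/example-benchmark-v2")
train, test = benchmark.get_train_test_split()

# Placeholder predictions; with one target and one test set a bare array suffices.
y_pred = np.random.rand(len(test))

results = benchmark.evaluate(y_pred=y_pred)  # returns a BenchmarkResultsV2
results.upload_to_hub(owner="my-org", access="private")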
