
Commit

Apply feedback wip
mercuryseries committed Feb 7, 2025
1 parent 8e03a92 commit dded7f3
Showing 8 changed files with 200 additions and 79 deletions.
118 changes: 59 additions & 59 deletions polaris/benchmark/_base.py
@@ -22,7 +22,7 @@
from polaris.benchmark._task import PredictiveTaskSpecificationMixin
from polaris.dataset import DatasetV1, Subset
from polaris.dataset._base import BaseDataset
from polaris.evaluate import BenchmarkResults
from polaris.evaluate import BenchmarkResultsV1
from polaris.evaluate.utils import evaluate_benchmark
from polaris.hub.settings import PolarisHubSettings
from polaris.mixins import ChecksumMixin
@@ -151,64 +151,6 @@ def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subs
featurization_fn=featurization_fn,
)

def evaluate(
self,
y_pred: IncomingPredictionsType | None = None,
y_prob: IncomingPredictionsType | None = None,
) -> BenchmarkResults:
"""Execute the evaluation protocol for the benchmark, given a set of predictions.
info: What about `y_true`?
Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just
the predictions. This reduces the chance of accidentally using the test targets during training.
For this method, we make the following assumptions:
1. There can be one or multiple test set(s);
2. There can be one or multiple target(s);
3. The metrics are _constant_ across test sets;
4. The metrics are _constant_ across targets;
5. There can be metrics which measure across tasks.
Args:
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
If there are multiple test sets, the predictions should be further wrapped in a dictionary
with the test subset labels as keys.
y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the
number of tasks and test sets.
Returns:
A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub.
Examples:
1. For regression benchmarks:
pred_scores = your_model.predict_score(molecules) # predict continuous score values
benchmark.evaluate(y_pred=pred_scores)
2. For classification benchmarks:
- If `roc_auc` and `pr_auc` are in the metric list, both class probabilities and label predictions are required:
pred_probs = your_model.predict_proba(molecules) # predict probablities
pred_labels = your_model.predict_labels(molecules) # predict class labels
benchmark.evaluate(y_pred=pred_labels, y_prob=pred_probs)
- Otherwise:
benchmark.evaluate(y_pred=pred_labels)
"""

# Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
y_true = self._get_test_sets(hide_targets=False)

scores = evaluate_benchmark(
target_cols=list(self.target_cols),
test_set_labels=self.test_set_labels,
test_set_sizes=self.test_set_sizes,
metrics=self.metrics,
y_true=y_true,
y_pred=y_pred,
y_prob=y_prob,
)

return BenchmarkResults(results=scores, benchmark_artifact_id=self.artifact_id)

def upload_to_hub(
self,
settings: PolarisHubSettings | None = None,
@@ -423,6 +365,64 @@ def n_classes(self) -> dict[str, int]:
if self.target_types.get(target) == TargetType.CLASSIFICATION
}

def evaluate(
self,
y_pred: IncomingPredictionsType | None = None,
y_prob: IncomingPredictionsType | None = None,
) -> BenchmarkResultsV1:
"""Execute the evaluation protocol for the benchmark, given a set of predictions.
info: What about `y_true`?
Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just
the predictions. This reduces the chance of accidentally using the test targets during training.
For this method, we make the following assumptions:
1. There can be one or multiple test set(s);
2. There can be one or multiple target(s);
3. The metrics are _constant_ across test sets;
4. The metrics are _constant_ across targets;
5. There can be metrics which measure across tasks.
Args:
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
If there are multiple test sets, the predictions should be further wrapped in a dictionary
with the test subset labels as keys.
y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the
number of tasks and test sets.
Returns:
A `BenchmarkResultsV1` object. This object can be directly submitted to the Polaris Hub.
Examples:
1. For regression benchmarks:
pred_scores = your_model.predict_score(molecules) # predict continuous score values
benchmark.evaluate(y_pred=pred_scores)
2. For classification benchmarks:
- If `roc_auc` and `pr_auc` are in the metric list, both class probabilities and label predictions are required:
pred_probs = your_model.predict_proba(molecules) # predict probabilities
pred_labels = your_model.predict_labels(molecules) # predict class labels
benchmark.evaluate(y_pred=pred_labels, y_prob=pred_probs)
- Otherwise:
benchmark.evaluate(y_pred=pred_labels)
"""

# Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
y_true = self._get_test_sets(hide_targets=False)

scores = evaluate_benchmark(
target_cols=list(self.target_cols),
test_set_labels=self.test_set_labels,
test_set_sizes=self.test_set_sizes,
metrics=self.metrics,
y_true=y_true,
y_pred=y_pred,
y_prob=y_prob,
)

return BenchmarkResultsV1(results=scores, benchmark_artifact_id=self.artifact_id)

def __eq__(self, other) -> bool:
if not isinstance(other, BenchmarkSpecification):
return False
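The dictionary wrapping described in the `evaluate` docstring above is easiest to see in code. Below is a minimal, illustrative sketch (not part of this commit) for a hypothetical benchmark with multiple targets and multiple test sets; the slug and labels are placeholders.

import numpy as np
import polaris as po

# Hypothetical multi-target, multi-test-set benchmark; the slug is a placeholder.
benchmark = po.load_benchmark("my-org/example-multitask-benchmark")
train, test = benchmark.get_train_test_split()

# With multiple test sets, `test` is a dictionary of Subset objects keyed by
# test-set label. Predictions mirror that nesting: test-set label first, then
# target label, with NumPy arrays as leaves.
y_pred = {
    test_label: {
        target: np.random.rand(len(subset))  # placeholder model output
        for target in benchmark.target_cols
    }
    for test_label, subset in test.items()
}

results = benchmark.evaluate(y_pred=y_pred)  # returns a BenchmarkResultsV1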
17 changes: 14 additions & 3 deletions polaris/evaluate/__init__.py
@@ -1,19 +1,30 @@
from polaris.evaluate._metadata import ResultsMetadata
from polaris.evaluate._metadata import ResultsMetadataV1, ResultsMetadataV2
from polaris.evaluate._metadata import ResultsMetadataV1 as ResultsMetadata
from polaris.evaluate._metric import Metric, MetricInfo
from polaris.evaluate._predictions import BenchmarkPredictions, CompetitionPredictions
from polaris.evaluate._results import (
BenchmarkResults,
BenchmarkResultsV1 as BenchmarkResults,
BenchmarkResultsV1,
BenchmarkResultsV2,
CompetitionResults,
EvaluationResult,
EvaluationResultV1,
EvaluationResultV1 as EvaluationResult,
EvaluationResultV2,
)
from polaris.evaluate.utils import evaluate_benchmark

__all__ = [
"ResultsMetadata",
"ResultsMetadataV1",
"ResultsMetadataV2",
"Metric",
"MetricInfo",
"EvaluationResult",
"EvaluationResultV1",
"EvaluationResultV2",
"BenchmarkResults",
"BenchmarkResultsV1",
"BenchmarkResultsV2",
"CompetitionResults",
"evaluate_benchmark",
"CompetitionPredictions",
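Because the hunk above re-exports the V1 classes under their old unversioned names, existing imports keep resolving to the same classes. A short sketch of what that aliasing implies, assuming the exports land exactly as shown:

from polaris.evaluate import (
    BenchmarkResults,    # re-exported alias of BenchmarkResultsV1
    BenchmarkResultsV1,
    BenchmarkResultsV2,
    EvaluationResult,    # re-exported alias of EvaluationResultV1
    EvaluationResultV1,
    ResultsMetadata,     # re-exported alias of ResultsMetadataV1
    ResultsMetadataV1,
)

# Downstream code that imports the unversioned names is unaffected,
# because those names still point at the V1 classes.
assert BenchmarkResults is BenchmarkResultsV1
assert EvaluationResult is EvaluationResultV1
assert ResultsMetadata is ResultsMetadataV1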
29 changes: 27 additions & 2 deletions polaris/evaluate/_metadata.py
@@ -5,10 +5,11 @@
from polaris._artifact import BaseArtifactModel
from polaris.utils.dict2html import dict2html
from polaris.utils.types import HttpUrlString, HubUser
from polaris.model import Model


class ResultsMetadata(BaseArtifactModel):
"""Base class for evaluation results
class ResultsMetadataV1(BaseArtifactModel):
"""V1 implementation of evaluation results without model field support
Attributes:
github_url: The URL to the code repository that was used to generate these results.
@@ -32,3 +33,27 @@ def _repr_html_(self) -> str:

def __repr__(self):
return self.model_dump_json(indent=2)


class ResultsMetadataV2(BaseArtifactModel):
"""V2 implementation of evaluation results with model field replacing URLs
Attributes:
model: The model that was used to generate these results.
contributors: The users that are credited for these results.
For additional meta-data attributes, see the base classes.
"""

# Additional meta-data
model: Model | None = Field(None, exclude=True)
contributors: list[HubUser] = Field(default_factory=list)

# Private attributes
_created_at: datetime = PrivateAttr(default_factory=datetime.now)

def _repr_html_(self) -> str:
return dict2html(self.model_dump())

def __repr__(self):
return self.model_dump_json(indent=2)
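Note the `exclude=True` on the new `model` field: the model reference is kept on the object but dropped from serialized output. A standalone pydantic sketch of that behaviour, using hypothetical stand-in classes rather than the polaris ones:

from pydantic import BaseModel, Field

# Stand-in for `polaris.model.Model`, used only for this sketch.
class FakeModelRef(BaseModel):
    name: str

# Mirrors the ResultsMetadataV2 field pattern shown above.
class FakeResultsMetadataV2(BaseModel):
    model: FakeModelRef | None = Field(None, exclude=True)
    contributors: list[str] = Field(default_factory=list)

meta = FakeResultsMetadataV2(model=FakeModelRef(name="my-gnn"), contributors=["alice"])
print(meta.model_dump())  # {'contributors': ['alice']} -- the excluded field is omitted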
4 changes: 2 additions & 2 deletions polaris/evaluate/_predictions.py
@@ -12,7 +12,7 @@
)
from typing_extensions import Self

from polaris.evaluate import ResultsMetadata
from polaris.evaluate import ResultsMetadataV1
from polaris.utils.misc import convert_lists_to_arrays
from polaris.utils.types import (
HttpUrlString,
@@ -249,7 +249,7 @@ def __len__(self) -> int:
return self.get_size()


class CompetitionPredictions(BenchmarkPredictions, ResultsMetadata):
class CompetitionPredictions(BenchmarkPredictions, ResultsMetadataV1):
"""
Predictions for competition benchmarks.
39 changes: 31 additions & 8 deletions polaris/evaluate/_results.py
@@ -13,7 +13,7 @@
from pydantic.alias_generators import to_camel
from polaris.model import Model

from polaris.evaluate import ResultsMetadata
from polaris.evaluate import ResultsMetadataV1, ResultsMetadataV2
from polaris.utils.errors import InvalidResultError
from polaris.utils.misc import slugify
from polaris.utils.types import (
@@ -41,8 +41,8 @@ class ResultRecords(BaseModel):
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)


class EvaluationResult(ResultsMetadata):
"""Class for saving evaluation results
class BaseEvaluationResult:
"""Base class for saving evaluation results
The actual results are saved in the `results` field using the following tabular format:
@@ -134,8 +134,20 @@ def _serialize_results(self, value: pd.DataFrame) -> list[ResultRecords]:
return serialized


class BenchmarkResults(EvaluationResult):
"""Class specific to results for standard benchmarks.
class EvaluationResultV1(ResultsMetadataV1, BaseEvaluationResult):
"""V1 implementation of evaluation results without model field support"""

pass


class EvaluationResultV2(ResultsMetadataV2, BaseEvaluationResult):
"""V2 implementation of evaluation results with model field replacing URLs"""

pass


class BaseBenchmarkResults:
"""Base class for results of standard benchmarks.
This object is returned by [`BenchmarkSpecification.evaluate`][polaris.benchmark.BenchmarkSpecification.evaluate].
In addition to the metrics on the test set, it contains additional meta-data and logic to integrate
@@ -152,7 +164,6 @@ class BenchmarkResults(EvaluationResult):
benchmark_artifact_id: str | None = Field(None)
benchmark_name: SlugCompatibleStringType | None = Field(None, deprecated=True)
benchmark_owner: HubOwner | None = Field(None, deprecated=True)
model: Model | None = Field(None)

@model_validator(mode="after")
def set_benchmark_artifact_id(self):
@@ -165,7 +176,7 @@ def upload_to_hub(
access: AccessType = "private",
owner: HubOwner | str | None = None,
**kwargs: dict,
) -> "BenchmarkResults":
) -> "BenchmarkResultsV1" | "BenchmarkResultsV2":
"""
Very light, convenient wrapper around the
[`PolarisHubClient.upload_results`][polaris.hub.client.PolarisHubClient.upload_results] method.
@@ -176,7 +187,19 @@
return client.upload_results(self, access=access, owner=owner)


class CompetitionResults(EvaluationResult):
class BenchmarkResultsV1(EvaluationResultV1, BaseBenchmarkResults):
"""V1 implementation of benchmark results without model field support"""

pass


class BenchmarkResultsV2(EvaluationResultV2, BaseBenchmarkResults):
"""V2 implementation of benchmark results with model field replacing URLs"""

pass


class CompetitionResults(EvaluationResultV1):
"""Class specific to results for competition benchmarks.
This object is returned by [`CompetitionSpecification.evaluate`][polaris.competition.CompetitionSpecification.evaluate].
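The refactor above splits the shared result logic into plain `Base*` classes and composes each versioned class from a metadata model plus that base. A simplified, standalone sketch of the composition pattern, with hypothetical names rather than the actual polaris classes:

from pydantic import BaseModel

# Plain mixin holding behaviour shared by every result version.
class SharedResultLogic:
    def describe(self) -> str:
        # Assumes the concrete pydantic class defines a `results` field.
        return f"{type(self).__name__}: {len(self.results)} result row(s)"

class MetadataV1(BaseModel):
    github_url: str | None = None  # V1 keeps URL-style provenance fields

class MetadataV2(BaseModel):
    model: str | None = None  # V2 swaps the URLs for a model reference

class ResultV1(MetadataV1, SharedResultLogic):
    results: list[dict] = []

class ResultV2(MetadataV2, SharedResultLogic):
    results: list[dict] = []

r = ResultV2(model="my-gnn", results=[{"metric": "mae", "score": 0.31}])
print(r.describe())  # ResultV2: 1 result row(s)

Keeping the shared behaviour in one mixin means each version only declares its own metadata fields, which is the same idea the commit applies to `EvaluationResult` and `BenchmarkResults`.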
62 changes: 62 additions & 0 deletions polaris/experimental/_benchmark_v2.py
@@ -4,6 +4,10 @@
from typing_extensions import Self

from polaris.benchmark import BenchmarkSpecification
from polaris.evaluate.utils import evaluate_benchmark
from polaris.utils.types import IncomingPredictionsType

from polaris.evaluate import BenchmarkResultsV2
from polaris.dataset import DatasetV2, Subset
from polaris.experimental._split_v2 import SplitSpecificationV2Mixin
from polaris.utils.errors import InvalidBenchmarkError
@@ -92,3 +96,61 @@ def get_train_test_split(
)
test = self._get_test_sets(hide_targets=True, featurization_fn=featurization_fn)
return train, test

def evaluate(
self,
y_pred: IncomingPredictionsType | None = None,
y_prob: IncomingPredictionsType | None = None,
) -> BenchmarkResultsV2:
"""Execute the evaluation protocol for the benchmark, given a set of predictions.
info: What about `y_true`?
Contrary to other frameworks that you might be familiar with, we opted for a signature that includes just
the predictions. This reduces the chance of accidentally using the test targets during training.
For this method, we make the following assumptions:
1. There can be one or multiple test set(s);
2. There can be one or multiple target(s);
3. The metrics are _constant_ across test sets;
4. The metrics are _constant_ across targets;
5. There can be metrics which measure across tasks.
Args:
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
If there are multiple test sets, the predictions should be further wrapped in a dictionary
with the test subset labels as keys.
y_prob: The predicted probabilities for the test set, formatted similarly to predictions, based on the
number of tasks and test sets.
Returns:
A `BenchmarkResultsV2` object. This object can be directly submitted to the Polaris Hub.
Examples:
1. For regression benchmarks:
pred_scores = your_model.predict_score(molecules) # predict continuous score values
benchmark.evaluate(y_pred=pred_scores)
2. For classification benchmarks:
- If `roc_auc` and `pr_auc` are in the metric list, both class probabilities and label predictions are required:
pred_probs = your_model.predict_proba(molecules) # predict probabilities
pred_labels = your_model.predict_labels(molecules) # predict class labels
benchmark.evaluate(y_pred=pred_labels, y_prob=pred_probs)
- Otherwise:
benchmark.evaluate(y_pred=pred_labels)
"""

# Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
y_true = self._get_test_sets(hide_targets=False)

scores = evaluate_benchmark(
target_cols=list(self.target_cols),
test_set_labels=self.test_set_labels,
test_set_sizes=self.test_set_sizes,
metrics=self.metrics,
y_true=y_true,
y_pred=y_pred,
y_prob=y_prob,
)

return BenchmarkResultsV2(results=scores, benchmark_artifact_id=self.artifact_id)
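The V2 path mirrors the V1 one: callers pass only their predictions and the ground truth is pulled from the benchmark itself. An illustrative sketch of the end-to-end flow, assuming a hypothetical single-target, single-test-set benchmark that the Hub serves as a V2 specification:

import numpy as np
import polaris as po

# Hypothetical V2 benchmark slug, used for illustration only.
benchmark = po.load_benchmark("my-org/example-benchmark-v2")
train, test = benchmark.get_train_test_split()

# Placeholder predictions; with one target and one test set a bare array suffices.
y_pred = np.random.rand(len(test))

results = benchmark.evaluate(y_pred=y_pred)  # returns a BenchmarkResultsV2
results.upload_to_hub(owner="my-org", access="private")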
