From f17cbb88b720eaca198c08bde94a9d363a93d66a Mon Sep 17 00:00:00 2001
From: Jialiang Xu
Date: Tue, 21 Jan 2025 22:10:32 -0800
Subject: [PATCH 1/3] update wildbench score calculation to use multiple annotators

---
 .../annotation/wildbench_annotator.py       | 67 ++++++++++++-------
 .../benchmark/metrics/wildbench_metrics.py  |  5 +-
 2 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/src/helm/benchmark/annotation/wildbench_annotator.py b/src/helm/benchmark/annotation/wildbench_annotator.py
index f74d873eb5..83d8fdbad6 100644
--- a/src/helm/benchmark/annotation/wildbench_annotator.py
+++ b/src/helm/benchmark/annotation/wildbench_annotator.py
@@ -1,9 +1,11 @@
 import re
 from typing import Any
 from importlib.resources import files
+from typing import Dict
 
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
+from helm.benchmark.annotation.model_as_judge import _AnnotatorModelInfo
 from helm.clients.auto_client import AutoClient
 from helm.common.request import Request
 
@@ -38,28 +40,47 @@ def annotate(self, request_state: RequestState) -> Any:
             .replace("{$model_output}", model_output_text)
             .replace("{$checklist}", "\n".join(request_state.instance.extra_data["checklist"]))
         )
 
-        annotator_request = Request(
-            model="openai/gpt-4o-2024-05-13",
-            model_deployment="openai/gpt-4o-2024-05-13",
-            prompt=annotator_prompt,
-            temperature=0.0,
-            max_tokens=2000,
-        )
-        annotator_response = self._auto_client.make_request(annotator_request)
-        if not annotator_response.success:
-            raise Exception(f"Annotation request failed: {annotator_response.error}")
-        assert len(annotator_response.completions) == 1
-        annotator_response_text = annotator_response.completions[0].text
-        annotator_response_parts = self._pattern.search(annotator_response_text)
-        if not annotator_response_parts:
-            raise ValueError(f"Malformed annotator response: {annotator_response_text}")
-        strengths = annotator_response_parts[1].strip()
-        weaknesses = annotator_response_parts[2].strip()
-        score_text = annotator_response_parts[3].strip().strip('"')
-        try:
-            score = float(score_text)
-        except ValueError:
-            raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}")
+        SHORT_NAME_TO_MODEL_INFO: Dict[str, _AnnotatorModelInfo] = {
+            "gpt": _AnnotatorModelInfo(model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"),
+            "llama": _AnnotatorModelInfo(
+                model_name="meta/llama-3.1-405b-instruct-turbo", model_deployment="together/llama-3.1-405b-instruct-turbo"
+            ),
+            "claude": _AnnotatorModelInfo(
+                model_name="anthropic/claude-3-5-sonnet-20241022", model_deployment="anthropic/claude-3-5-sonnet-20241022"
+            ),
+        }
+        all_strengths = []
+        all_weaknesses = []
+        all_scores = []
+        for annotator_model in SHORT_NAME_TO_MODEL_INFO:
+            annotator_model_info = SHORT_NAME_TO_MODEL_INFO[annotator_model]
+            annotator_request = Request(
+                model=annotator_model_info.model_name,
+                model_deployment=annotator_model_info.model_deployment,
+                prompt=annotator_prompt,
+                temperature=0.0,
+                max_tokens=2000,
+            )
+            annotator_response = self._auto_client.make_request(annotator_request)
+            if not annotator_response.success:
+                continue  # skip this annotator if the request failed
+            assert len(annotator_response.completions) == 1
+            annotator_response_text = annotator_response.completions[0].text
+            annotator_response_parts = self._pattern.search(annotator_response_text)
+            if not annotator_response_parts:
+                continue  # skip this annotator if the response is malformed
+
+            strengths = annotator_response_parts[1].strip()
+            weaknesses = annotator_response_parts[2].strip()
+            score_text = annotator_response_parts[3].strip().strip('"')
+            try:
+                score = float(score_text)
+            except ValueError:
+                continue  # skip this annotator if the score is not a number
+
+            all_strengths.append(strengths)
+            all_weaknesses.append(weaknesses)
+            all_scores.append(score)
 
-        return {"strengths": strengths, "weaknesses": weaknesses, "score": score}
+        return {"strengths": all_strengths, "weaknesses": all_weaknesses, "score": all_scores}
diff --git a/src/helm/benchmark/metrics/wildbench_metrics.py b/src/helm/benchmark/metrics/wildbench_metrics.py
index b3deb766b1..5ef69e992c 100644
--- a/src/helm/benchmark/metrics/wildbench_metrics.py
+++ b/src/helm/benchmark/metrics/wildbench_metrics.py
@@ -19,7 +19,10 @@ def evaluate_generation(
         eval_cache_path: str,
     ) -> List[Stat]:
         assert request_state.annotations
-        score = request_state.annotations["wildbench"]["score"]
+        all_scores = request_state.annotations["wildbench"]["score"]
+        if len(all_scores) == 0:
+            raise ValueError("Could not compute WB Score because all annotators failed.")
+        score = sum(score) / len(score)
         score_rescaled = (score - 1) / 9
         return [
             Stat(MetricName("wildbench_score")).add(score),

From 6113ebc78df48d2a3e7a52f9aa2ca93b16eeb60e Mon Sep 17 00:00:00 2001
From: Jialiang Xu
Date: Tue, 21 Jan 2025 22:14:15 -0800
Subject: [PATCH 2/3] formatting

---
 .../benchmark/annotation/wildbench_annotator.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/helm/benchmark/annotation/wildbench_annotator.py b/src/helm/benchmark/annotation/wildbench_annotator.py
index 83d8fdbad6..1bd8361e22 100644
--- a/src/helm/benchmark/annotation/wildbench_annotator.py
+++ b/src/helm/benchmark/annotation/wildbench_annotator.py
@@ -42,12 +42,16 @@ def annotate(self, request_state: RequestState) -> Any:
         )
 
         SHORT_NAME_TO_MODEL_INFO: Dict[str, _AnnotatorModelInfo] = {
-            "gpt": _AnnotatorModelInfo(model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"),
+            "gpt": _AnnotatorModelInfo(
+                model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"
+            ),
             "llama": _AnnotatorModelInfo(
-                model_name="meta/llama-3.1-405b-instruct-turbo", model_deployment="together/llama-3.1-405b-instruct-turbo"
+                model_name="meta/llama-3.1-405b-instruct-turbo",
+                model_deployment="together/llama-3.1-405b-instruct-turbo",
             ),
             "claude": _AnnotatorModelInfo(
-                model_name="anthropic/claude-3-5-sonnet-20241022", model_deployment="anthropic/claude-3-5-sonnet-20241022"
+                model_name="anthropic/claude-3-5-sonnet-20241022",
+                model_deployment="anthropic/claude-3-5-sonnet-20241022",
             ),
         }
         all_strengths = []
@@ -69,7 +73,7 @@ def annotate(self, request_state: RequestState) -> Any:
             annotator_response_text = annotator_response.completions[0].text
             annotator_response_parts = self._pattern.search(annotator_response_text)
             if not annotator_response_parts:
-                continue  # skip this annotator if the response is malformed
+                continue  # skip this annotator if the response is malformed
 
             strengths = annotator_response_parts[1].strip()
             weaknesses = annotator_response_parts[2].strip()
@@ -78,7 +82,7 @@ def annotate(self, request_state: RequestState) -> Any:
                 score = float(score_text)
             except ValueError:
                 continue  # skip this annotator if the score is not a number
-
+
             all_strengths.append(strengths)
             all_weaknesses.append(weaknesses)
             all_scores.append(score)

From 679778c971014a0d6e5ce20b9575eddee8f5b22b Mon Sep 17 00:00:00 2001
From: Jialiang Xu
Date: Tue, 21 Jan 2025 22:34:24 -0800
Subject: [PATCH 3/3] minor fix

---
 src/helm/benchmark/metrics/wildbench_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/metrics/wildbench_metrics.py b/src/helm/benchmark/metrics/wildbench_metrics.py
index 5ef69e992c..cde95bd689 100644
--- a/src/helm/benchmark/metrics/wildbench_metrics.py
+++ b/src/helm/benchmark/metrics/wildbench_metrics.py
@@ -22,7 +22,7 @@ def evaluate_generation(
         all_scores = request_state.annotations["wildbench"]["score"]
         if len(all_scores) == 0:
             raise ValueError("Could not compute WB Score because all annotators failed.")
-        score = sum(score) / len(score)
+        score = sum(all_scores) / len(all_scores)
         score_rescaled = (score - 1) / 9
         return [
             Stat(MetricName("wildbench_score")).add(score),