From f17cbb88b720eaca198c08bde94a9d363a93d66a Mon Sep 17 00:00:00 2001
From: Jialiang Xu
Date: Tue, 21 Jan 2025 22:10:32 -0800
Subject: [PATCH 1/3] update wildbench score calculation to use multiple annotators

---
 .../annotation/wildbench_annotator.py       | 67 ++++++++++++-------
 .../benchmark/metrics/wildbench_metrics.py  |  5 +-
 2 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/src/helm/benchmark/annotation/wildbench_annotator.py b/src/helm/benchmark/annotation/wildbench_annotator.py
index f74d873eb5..83d8fdbad6 100644
--- a/src/helm/benchmark/annotation/wildbench_annotator.py
+++ b/src/helm/benchmark/annotation/wildbench_annotator.py
@@ -1,9 +1,11 @@
 import re
 from typing import Any
 from importlib.resources import files
+from typing import Dict
 
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
+from helm.benchmark.annotation.model_as_judge import _AnnotatorModelInfo
 from helm.clients.auto_client import AutoClient
 from helm.common.request import Request
 
@@ -38,28 +40,47 @@ def annotate(self, request_state: RequestState) -> Any:
             .replace("{$model_output}", model_output_text)
             .replace("{$checklist}", "\n".join(request_state.instance.extra_data["checklist"]))
         )
 
-        annotator_request = Request(
-            model="openai/gpt-4o-2024-05-13",
-            model_deployment="openai/gpt-4o-2024-05-13",
-            prompt=annotator_prompt,
-            temperature=0.0,
-            max_tokens=2000,
-        )
-        annotator_response = self._auto_client.make_request(annotator_request)
-        if not annotator_response.success:
-            raise Exception(f"Annotation request failed: {annotator_response.error}")
-        assert len(annotator_response.completions) == 1
-        annotator_response_text = annotator_response.completions[0].text
-        annotator_response_parts = self._pattern.search(annotator_response_text)
-        if not annotator_response_parts:
-            raise ValueError(f"Malformed annotator response: {annotator_response_text}")
-        strengths = annotator_response_parts[1].strip()
-        weaknesses = annotator_response_parts[2].strip()
-        score_text = annotator_response_parts[3].strip().strip('"')
-        try:
-            score = float(score_text)
-        except ValueError:
-            raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}")
+        SHORT_NAME_TO_MODEL_INFO: Dict[str, _AnnotatorModelInfo] = {
+            "gpt": _AnnotatorModelInfo(model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"),
+            "llama": _AnnotatorModelInfo(
+                model_name="meta/llama-3.1-405b-instruct-turbo", model_deployment="together/llama-3.1-405b-instruct-turbo"
+            ),
+            "claude": _AnnotatorModelInfo(
+                model_name="anthropic/claude-3-5-sonnet-20241022", model_deployment="anthropic/claude-3-5-sonnet-20241022"
+            ),
+        }
+        all_strengths = []
+        all_weaknesses = []
+        all_scores = []
+        for annotator_model in SHORT_NAME_TO_MODEL_INFO:
+            annotator_model_info = SHORT_NAME_TO_MODEL_INFO[annotator_model]
+            annotator_request = Request(
+                model=annotator_model_info.model_name,
+                model_deployment=annotator_model_info.model_deployment,
+                prompt=annotator_prompt,
+                temperature=0.0,
+                max_tokens=2000,
+            )
+            annotator_response = self._auto_client.make_request(annotator_request)
+            if not annotator_response.success:
+                continue  # skip this annotator if the request failed
+            assert len(annotator_response.completions) == 1
+            annotator_response_text = annotator_response.completions[0].text
+            annotator_response_parts = self._pattern.search(annotator_response_text)
+            if not annotator_response_parts:
+                continue  # skip this annotator if the response is malformed
+
+            strengths = annotator_response_parts[1].strip()
+            weaknesses = annotator_response_parts[2].strip()
+            score_text = annotator_response_parts[3].strip().strip('"')
+            try:
+                score = float(score_text)
+            except ValueError:
+                continue  # skip this annotator if the score is not a number
+
+            all_strengths.append(strengths)
+            all_weaknesses.append(weaknesses)
+            all_scores.append(score)
 
-        return {"strengths": strengths, "weaknesses": weaknesses, "score": score}
+        return {"strengths": all_strengths, "weaknesses": all_weaknesses, "score": all_scores}
diff --git a/src/helm/benchmark/metrics/wildbench_metrics.py b/src/helm/benchmark/metrics/wildbench_metrics.py
index b3deb766b1..5ef69e992c 100644
--- a/src/helm/benchmark/metrics/wildbench_metrics.py
+++ b/src/helm/benchmark/metrics/wildbench_metrics.py
@@ -19,7 +19,10 @@ def evaluate_generation(
         eval_cache_path: str,
     ) -> List[Stat]:
         assert request_state.annotations
-        score = request_state.annotations["wildbench"]["score"]
+        all_scores = request_state.annotations["wildbench"]["score"]
+        if len(all_scores) == 0:
+            raise ValueError("Could not compute WB Score because all annotators failed.")
+        score = sum(score) / len(score)
         score_rescaled = (score - 1) / 9
         return [
             Stat(MetricName("wildbench_score")).add(score),

From 6113ebc78df48d2a3e7a52f9aa2ca93b16eeb60e Mon Sep 17 00:00:00 2001
From: Jialiang Xu
Date: Tue, 21 Jan 2025 22:14:15 -0800
Subject: [PATCH 2/3] formatting

---
 .../benchmark/annotation/wildbench_annotator.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/helm/benchmark/annotation/wildbench_annotator.py b/src/helm/benchmark/annotation/wildbench_annotator.py
index 83d8fdbad6..1bd8361e22 100644
--- a/src/helm/benchmark/annotation/wildbench_annotator.py
+++ b/src/helm/benchmark/annotation/wildbench_annotator.py
@@ -42,12 +42,16 @@ def annotate(self, request_state: RequestState) -> Any:
         )
 
         SHORT_NAME_TO_MODEL_INFO: Dict[str, _AnnotatorModelInfo] = {
-            "gpt": _AnnotatorModelInfo(model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"),
+            "gpt": _AnnotatorModelInfo(
+                model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"
+            ),
             "llama": _AnnotatorModelInfo(
-                model_name="meta/llama-3.1-405b-instruct-turbo", model_deployment="together/llama-3.1-405b-instruct-turbo"
+                model_name="meta/llama-3.1-405b-instruct-turbo",
+                model_deployment="together/llama-3.1-405b-instruct-turbo",
             ),
             "claude": _AnnotatorModelInfo(
-                model_name="anthropic/claude-3-5-sonnet-20241022", model_deployment="anthropic/claude-3-5-sonnet-20241022"
+                model_name="anthropic/claude-3-5-sonnet-20241022",
+                model_deployment="anthropic/claude-3-5-sonnet-20241022",
             ),
         }
         all_strengths = []
@@ -69,7 +73,7 @@ def annotate(self, request_state: RequestState) -> Any:
             annotator_response_text = annotator_response.completions[0].text
             annotator_response_parts = self._pattern.search(annotator_response_text)
             if not annotator_response_parts:
-                continue  # skip this annotator if the response is malformed
+                continue  # skip this annotator if the response is malformed
 
             strengths = annotator_response_parts[1].strip()
             weaknesses = annotator_response_parts[2].strip()
@@ -78,7 +82,7 @@ def annotate(self, request_state: RequestState) -> Any:
                 score = float(score_text)
             except ValueError:
                 continue  # skip this annotator if the score is not a number
-
+
             all_strengths.append(strengths)
             all_weaknesses.append(weaknesses)
             all_scores.append(score)

From 679778c971014a0d6e5ce20b9575eddee8f5b22b Mon Sep 17 00:00:00 2001
From: Jialiang Xu
Date: Tue, 21 Jan 2025 22:34:24 -0800
Subject: [PATCH 3/3] minor fix

---
 src/helm/benchmark/metrics/wildbench_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/metrics/wildbench_metrics.py b/src/helm/benchmark/metrics/wildbench_metrics.py
index 5ef69e992c..cde95bd689 100644
--- a/src/helm/benchmark/metrics/wildbench_metrics.py
+++ b/src/helm/benchmark/metrics/wildbench_metrics.py
@@ -22,7 +22,7 @@ def evaluate_generation(
         all_scores = request_state.annotations["wildbench"]["score"]
         if len(all_scores) == 0:
             raise ValueError("Could not compute WB Score because all annotators failed.")
-        score = sum(score) / len(score)
+        score = sum(all_scores) / len(all_scores)
         score_rescaled = (score - 1) / 9
         return [
             Stat(MetricName("wildbench_score")).add(score),