From 6d93e4f454bdeefc5d3d38cd8a4cf2c21d72651f Mon Sep 17 00:00:00 2001
From: Morgan McGuire <morganmcg1@users.noreply.github.com>
Date: Thu, 16 Jan 2025 13:31:23 +0100
Subject: [PATCH] Eval: pass experiment name to wandbot call; update eval
 config import

---
 src/wandbot/evaluation/eval/async_main.py     | 2 +-
 src/wandbot/evaluation/weave_eval/eval.py     | 9 +++++----
 src/wandbot/evaluation/weave_eval/log_data.py | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/wandbot/evaluation/eval/async_main.py b/src/wandbot/evaluation/eval/async_main.py
index d219d5c..57d71d0 100644
--- a/src/wandbot/evaluation/eval/async_main.py
+++ b/src/wandbot/evaluation/eval/async_main.py
@@ -12,7 +12,7 @@
 from tqdm import tqdm
 
 import wandb
-from wandbot.evaluation.config import EvalConfig
+from wandbot.evaluation.eval_config import EvalConfig
 from wandbot.evaluation.eval.correctness import (
     CORRECTNESS_EVAL_TEMPLATE,
     WandbCorrectnessEvaluator,
diff --git a/src/wandbot/evaluation/weave_eval/eval.py b/src/wandbot/evaluation/weave_eval/eval.py
index 32a0324..8391748 100644
--- a/src/wandbot/evaluation/weave_eval/eval.py
+++ b/src/wandbot/evaluation/weave_eval/eval.py
@@ -87,8 +87,8 @@ def parse_text_to_json(text):
 
 
 @weave.op
-async def get_record(question: str, language: str = "en") -> dict:
-    response = await get_answer(question, language=language)
+async def get_record(question: str, application: str = "api-eval", language: str = "en") -> dict:
+    response = await get_answer(question, application, language=language)
     response = json.loads(response)
     
     # Return default values if response is empty or missing fields
@@ -120,10 +120,11 @@ async def get_record(question: str, language: str = "en") -> dict:
 
 class WandbotModel(weave.Model):
     language: str = "en"
+    application: str = "api-eval"
 
     @weave.op
     async def predict(self, question: str) -> dict:
-        prediction = await get_record(question, language=self.language)
+        prediction = await get_record(question, application=self.application, language=self.language)
         return prediction
 
 @weave.op
@@ -183,7 +184,7 @@ def main():
     ]
     logger.info("Number of evaluation samples: %s", len(question_rows))
 
-    wandbot = WandbotModel(language=config.lang)
+    wandbot = WandbotModel(language=config.lang, application=config.experiment_name)
 
     wandbot_evaluator = Evaluation(
         name=config.evaluation_name,
diff --git a/src/wandbot/evaluation/weave_eval/log_data.py b/src/wandbot/evaluation/weave_eval/log_data.py
index d52bd4b..57d8769 100644
--- a/src/wandbot/evaluation/weave_eval/log_data.py
+++ b/src/wandbot/evaluation/weave_eval/log_data.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from weave import Dataset
 
-from wandbot.evaluation.config import EvalConfig
+from wandbot.evaluation.eval_config import EvalConfig
 
 config = EvalConfig()