From 6d93e4f454bdeefc5d3d38cd8a4cf2c21d72651f Mon Sep 17 00:00:00 2001 From: Morgan McGuire Date: Thu, 16 Jan 2025 13:31:23 +0100 Subject: [PATCH] Eval: pass experiment name to wandbot call; update eval config import --- src/wandbot/evaluation/eval/async_main.py | 2 +- src/wandbot/evaluation/weave_eval/eval.py | 9 +++++---- src/wandbot/evaluation/weave_eval/log_data.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/wandbot/evaluation/eval/async_main.py b/src/wandbot/evaluation/eval/async_main.py index d219d5c..57d71d0 100644 --- a/src/wandbot/evaluation/eval/async_main.py +++ b/src/wandbot/evaluation/eval/async_main.py @@ -12,7 +12,7 @@ from tqdm import tqdm import wandb -from wandbot.evaluation.config import EvalConfig +from wandbot.evaluation.eval_config import EvalConfig from wandbot.evaluation.eval.correctness import ( CORRECTNESS_EVAL_TEMPLATE, WandbCorrectnessEvaluator, diff --git a/src/wandbot/evaluation/weave_eval/eval.py b/src/wandbot/evaluation/weave_eval/eval.py index 32a0324..8391748 100644 --- a/src/wandbot/evaluation/weave_eval/eval.py +++ b/src/wandbot/evaluation/weave_eval/eval.py @@ -87,8 +87,8 @@ def parse_text_to_json(text): @weave.op -async def get_record(question: str, language: str = "en") -> dict: - response = await get_answer(question, language=language) +async def get_record(question: str, application: str = "api-eval", language: str = "en") -> dict: + response = await get_answer(question, application, language=language) response = json.loads(response) # Return default values if response is empty or missing fields @@ -120,10 +120,11 @@ async def get_record(question: str, language: str = "en") -> dict: class WandbotModel(weave.Model): language: str = "en" + application: str = "api-eval" @weave.op async def predict(self, question: str) -> dict: - prediction = await get_record(question, language=self.language) + prediction = await get_record(question, application=self.application, language=self.language) return prediction @weave.op @@ -183,7 +184,7 @@ def main(): ] logger.info("Number of evaluation samples: %s", len(question_rows)) - wandbot = WandbotModel(language=config.lang) + wandbot = WandbotModel(language=config.lang, application=config.experiment_name) wandbot_evaluator = Evaluation( name=config.evaluation_name, diff --git a/src/wandbot/evaluation/weave_eval/log_data.py b/src/wandbot/evaluation/weave_eval/log_data.py index d52bd4b..57d8769 100644 --- a/src/wandbot/evaluation/weave_eval/log_data.py +++ b/src/wandbot/evaluation/weave_eval/log_data.py @@ -7,7 +7,7 @@ import pandas as pd from weave import Dataset -from wandbot.evaluation.config import EvalConfig +from wandbot.evaluation.eval_config import EvalConfig config = EvalConfig()