From ad020efc14224b32bcb94532fa7f928e0960883f Mon Sep 17 00:00:00 2001
From: Ali Maredia
Date: Mon, 17 Jun 2024 05:28:00 +0000
Subject: [PATCH] start to incorporate gen_judgment.py into library

Signed-off-by: Ali Maredia
---
 src/instructlab/eval/gen_api_answer.py |   1 -
 src/instructlab/eval/gen_judgment.py   | 102 +++++++++----------
 src/instructlab/eval/mtbench.py        |  14 ++--
 test_mt_bench.py                       |   8 +-
 4 files changed, 47 insertions(+), 78 deletions(-)

diff --git a/src/instructlab/eval/gen_api_answer.py b/src/instructlab/eval/gen_api_answer.py
index 986314e..e02d059 100644
--- a/src/instructlab/eval/gen_api_answer.py
+++ b/src/instructlab/eval/gen_api_answer.py
@@ -101,7 +101,6 @@ def run(
     parallel=1,
     openai_api_base=None):
 
-    os.environ(
     if openai_api_base is not None:
         openai.api_base = openai_api_base
 
diff --git a/src/instructlab/eval/gen_judgment.py b/src/instructlab/eval/gen_judgment.py
index 6e26f35..8b8fefb 100644
--- a/src/instructlab/eval/gen_judgment.py
+++ b/src/instructlab/eval/gen_judgment.py
@@ -2,7 +2,6 @@
 Usage:
 python gen_judgment.py --model-list [LIST-OF-MODEL-ID] --parallel [num-concurrent-api-call] --mode [single|pairwise-baseline|pairwise-all]
 """
-import argparse
 from concurrent.futures import ThreadPoolExecutor
 import json
 
@@ -166,55 +165,21 @@ def make_judge_single(judge_model, judge_prompts):
     )
     return judges
 
+def run(
+    bench_name="mt_bench",
+    judge_file="data/judge_prompts.jsonl",
+    judge_model="gpt-4",
+    baseline_model=None,
+    mode="single",
+    model_list=None,
+    parallel=1,
+    first_n=None,
+    yes=True,
+    batch=True):
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--bench-name",
-        type=str,
-        default="mt_bench",
-        help="The name of the benchmark question set.",
-    )
-    parser.add_argument(
-        "--judge-file",
-        type=str,
-        default="data/judge_prompts.jsonl",
-        help="The file of judge prompts.",
-    )
-    parser.add_argument("--judge-model", type=str, default="gpt-4")
-    parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo")
-    parser.add_argument(
-        "--mode",
-        type=str,
-        default="single",
-        choices=["pairwise-baseline", "pairwise-all", "single"],
-        help=(
-            "Evaluation mode. "
-            "`pairwise-baseline` runs pairwise comparision against a baseline. "
-            "`pairwise-all` runs pairwise comparision between all pairs. "
-            "`single` runs single answer grading."
-        ),
-    )
-    parser.add_argument(
-        "--model-list",
-        type=str,
-        nargs="+",
-        default=None,
-        help="A list of models to be evaluated",
-    )
-    parser.add_argument(
-        "--parallel", type=int, default=1, help="The number of concurrent API calls."
-    )
-    parser.add_argument(
-        "--first-n", type=int, help="A debug option. Only run the first `n` judgments."
-    )
-    parser.add_argument("--yes", action="store_true")
-    parser.add_argument("--batch", action="store_true")
-    args = parser.parse_args()
-
-    question_file = f"data/{args.bench_name}/question.jsonl"
-    answer_dir = f"data/{args.bench_name}/model_answer/instructlab"
-    ref_answer_dir = f"data/{args.bench_name}/reference_answer"
+    question_file = f"data/{bench_name}/question.jsonl"
+    answer_dir = f"data/{bench_name}/model_answer/instructlab"
+    ref_answer_dir = f"data/{bench_name}/reference_answer"
 
     # Load questions
     questions = load_questions(question_file, None, None)
 
@@ -224,36 +189,36 @@ def make_judge_single(judge_model, judge_prompts):
     ref_answers = load_model_answers(ref_answer_dir)
 
     # Load judge
-    judge_prompts = load_judge_prompts(args.judge_file)
+    judge_prompts = load_judge_prompts(judge_file)
 
-    if args.first_n:
-        questions = questions[: args.first_n]
+    if first_n:
+        questions = questions[: first_n]
 
-    if args.model_list is None:
+    if model_list is None:
         models = get_model_list(answer_dir)
     else:
-        models = args.model_list
+        models = model_list
 
-    if args.mode == "single":
-        judges = make_judge_single(args.judge_model, judge_prompts)
-        play_a_match_func = play_a_match_single_batch if args.batch else play_a_match_single
+    if mode == "single":
+        judges = make_judge_single(judge_model, judge_prompts)
+        play_a_match_func = play_a_match_single_batch if batch else play_a_match_single
         output_file = (
-            f"data/{args.bench_name}/model_judgment/{args.judge_model}_single.jsonl"
+            f"data/{bench_name}/model_judgment/{judge_model}_single.jsonl"
         )
         make_match_func = make_match_single
         baseline_model = None
     else:
-        judges = make_judge_pairwise(args.judge_model, judge_prompts)
+        judges = make_judge_pairwise(judge_model, judge_prompts)
         play_a_match_func = play_a_match_pair
         output_file = (
-            f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl"
+            f"data/{bench_name}/model_judgment/{judge_model}_pair.jsonl"
         )
-        if args.mode == "pairwise-all":
+        if mode == "pairwise-all":
             make_match_func = make_match_all_pairs
             baseline_model = None
         else:
             make_match_func = make_match
-            baseline_model = args.baseline_model
+            # baseline_model keeps the value passed to run()
 
     check_data(questions, model_answers, ref_answers, models, judges)
 
@@ -292,9 +257,9 @@ def make_judge_single(judge_model, judge_prompts):
         )
 
     match_stat = {}
-    match_stat["bench_name"] = args.bench_name
-    match_stat["mode"] = args.mode
-    match_stat["judge"] = args.judge_model
+    match_stat["bench_name"] = bench_name
+    match_stat["mode"] = mode
+    match_stat["judge"] = judge_model
     match_stat["baseline"] = baseline_model
     match_stat["model_list"] = models
     match_stat["total_num_questions"] = len(questions)
@@ -304,11 +269,11 @@
     # Show match stats and prompt enter to continue
     print("Stats:")
     print(json.dumps(match_stat, indent=4))
-    if not args.yes and not args.batch:
+    if not yes and not batch:
        input("Press Enter to confirm...")
 
     # Play matches
-    if args.parallel == 1:
+    if parallel == 1:
         for match in tqdm(matches):
             play_a_match_func(match, output_file=output_file)
     else:
@@ -319,8 +284,9 @@ def play_a_match_wrapper(match):
         np.random.seed(0)
         np.random.shuffle(matches)
 
-        with ThreadPoolExecutor(args.parallel) as executor:
+        with ThreadPoolExecutor(parallel) as executor:
             for match in tqdm(
                 executor.map(play_a_match_wrapper, matches), total=len(matches)
             ):
                 pass
+    return output_file

diff --git a/src/instructlab/eval/mtbench.py b/src/instructlab/eval/mtbench.py
index cba6934..4e7de5c 100644
--- a/src/instructlab/eval/mtbench.py
+++ b/src/instructlab/eval/mtbench.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
+import os
 
 # Local
 from .evaluator import Evaluator
-#from .gen_api_answer import reorg_answer_file
 import instructlab.eval.gen_api_answer as gen_api_answer
+import instructlab.eval.gen_judgment as gen_judgment
 
 
 class MT_Bench_Evaluator(Evaluator):
@@ -19,10 +20,12 @@ def __init__(self, server_url: str) -> None:
     def gen_answers(self, answer_file, server_url) -> str:
         """
         Asks questions to model, returns path to answers"""
+        os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
         gen_api_answer.run(answer_file=answer_file, model_name="instructlab/granite-7b-lab", openai_api_base=server_url)
         return answer_file
 
-    def judge_answers(self) -> tuple:
+    #def judge_answers(self, judge_endpoint) -> tuple:
+    def judge_answers(self, judge_endpoint) -> str:
         """
         Runs MT-Bench judgement
 
@@ -30,9 +33,10 @@ def judge_answers(self) -> tuple:
             overall_score  MT-Bench score for the overall model evaluation
             qa_pairs       Question and answer pairs from the evaluation
         """
-        overall_score: float = 0.0
-        qa_pairs: list[tuple] = []
-        return overall_score, qa_pairs
+        os.environ['OPENAI_API_BASE'] = judge_endpoint
+        os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
+        output_file = gen_judgment.run(parallel=40)
+        return output_file
 
 
 class PR_Bench_Evaluator(Evaluator):
diff --git a/test_mt_bench.py b/test_mt_bench.py
index cc501f5..d1c51e3 100644
--- a/test_mt_bench.py
+++ b/test_mt_bench.py
@@ -1,7 +1,7 @@
 from instructlab.eval.mtbench import MT_Bench_Evaluator
 
 mt_bench = MT_Bench_Evaluator(server_url="http://localhost:8000")
-path = mt_bench.gen_answers("test-answers.jsonl", "http://localhost:8000/v1")
-print(path)
-payload = mt_bench.judge_answers()
-print(payload)
+#path = mt_bench.gen_answers("test-answers.jsonl", "http://localhost:8000/v1")
+#print(path)
+output_file = mt_bench.judge_answers("http://localhost:8000/v1")
+print(output_file)
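
With the patch applied, the new gen_judgment.run() entry point can also be driven
directly rather than through MT_Bench_Evaluator.judge_answers(). The sketch below
mirrors what judge_answers() does; the endpoint URL and the presence of the
data/mt_bench question, answer, and judge-prompt files are assumptions, not
something the patch itself guarantees.

    # Sketch only: call the library entry point directly.
    import os

    import instructlab.eval.gen_judgment as gen_judgment

    # judge_answers() sets these before delegating to run(); an OpenAI-compatible
    # judge server is assumed to be listening at this URL.
    os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1"
    os.environ["OPENAI_API_KEY"] = "NO_API_KEY"

    # The former argparse flags are now keyword arguments with the same defaults.
    output_file = gen_judgment.run(bench_name="mt_bench", mode="single", parallel=40)
    print(output_file)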