From ad020efc14224b32bcb94532fa7f928e0960883f Mon Sep 17 00:00:00 2001
From: Ali Maredia
Date: Mon, 17 Jun 2024 05:28:00 +0000
Subject: [PATCH] start to incorporate gen_judgment.py into library

Signed-off-by: Ali Maredia
---
 src/instructlab/eval/gen_api_answer.py |   1 -
 src/instructlab/eval/gen_judgment.py   | 102 +++++++++----------
 src/instructlab/eval/mtbench.py        |  14 ++--
 test_mt_bench.py                       |   8 +-
 4 files changed, 47 insertions(+), 78 deletions(-)

diff --git a/src/instructlab/eval/gen_api_answer.py b/src/instructlab/eval/gen_api_answer.py
index 986314e..e02d059 100644
--- a/src/instructlab/eval/gen_api_answer.py
+++ b/src/instructlab/eval/gen_api_answer.py
@@ -101,7 +101,6 @@ def run(
     parallel=1,
     openai_api_base=None):
 
-    os.environ(
     if openai_api_base is not None:
         openai.api_base = openai_api_base
 
diff --git a/src/instructlab/eval/gen_judgment.py b/src/instructlab/eval/gen_judgment.py
index 6e26f35..8b8fefb 100644
--- a/src/instructlab/eval/gen_judgment.py
+++ b/src/instructlab/eval/gen_judgment.py
@@ -2,7 +2,6 @@
 Usage:
 python gen_judgment.py --model-list [LIST-OF-MODEL-ID] --parallel [num-concurrent-api-call] --mode [single|pairwise-baseline|pairwise-all]
 """
-import argparse
 from concurrent.futures import ThreadPoolExecutor
 import json
 
@@ -166,55 +165,21 @@ def make_judge_single(judge_model, judge_prompts):
     )
     return judges
 
+def run(
+    bench_name="mt_bench",
+    judge_file="data/judge_prompts.jsonl",
+    judge_model="gpt-4",
+    baseline_model=None,
+    mode="single",
+    model_list=None,
+    parallel=1,
+    first_n=None,
+    yes=True,
+    batch=True):
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--bench-name",
-        type=str,
-        default="mt_bench",
-        help="The name of the benchmark question set.",
-    )
-    parser.add_argument(
-        "--judge-file",
-        type=str,
-        default="data/judge_prompts.jsonl",
-        help="The file of judge prompts.",
-    )
-    parser.add_argument("--judge-model", type=str, default="gpt-4")
-    parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo")
-    parser.add_argument(
-        "--mode",
-        type=str,
-        default="single",
-        choices=["pairwise-baseline", "pairwise-all", "single"],
-        help=(
-            "Evaluation mode. "
-            "`pairwise-baseline` runs pairwise comparision against a baseline. "
-            "`pairwise-all` runs pairwise comparision between all pairs. "
-            "`single` runs single answer grading."
-        ),
-    )
-    parser.add_argument(
-        "--model-list",
-        type=str,
-        nargs="+",
-        default=None,
-        help="A list of models to be evaluated",
-    )
-    parser.add_argument(
-        "--parallel", type=int, default=1, help="The number of concurrent API calls."
-    )
-    parser.add_argument(
-        "--first-n", type=int, help="A debug option. Only run the first `n` judgments."
-    )
-    parser.add_argument("--yes", action="store_true")
-    parser.add_argument("--batch", action="store_true")
-    args = parser.parse_args()
-
-    question_file = f"data/{args.bench_name}/question.jsonl"
-    answer_dir = f"data/{args.bench_name}/model_answer/instructlab"
-    ref_answer_dir = f"data/{args.bench_name}/reference_answer"
+    question_file = f"data/{bench_name}/question.jsonl"
+    answer_dir = f"data/{bench_name}/model_answer/instructlab"
+    ref_answer_dir = f"data/{bench_name}/reference_answer"
 
     # Load questions
     questions = load_questions(question_file, None, None)
 
@@ -224,36 +189,36 @@ def make_judge_single(judge_model, judge_prompts):
     ref_answers = load_model_answers(ref_answer_dir)
 
     # Load judge
-    judge_prompts = load_judge_prompts(args.judge_file)
+    judge_prompts = load_judge_prompts(judge_file)
 
-    if args.first_n:
-        questions = questions[: args.first_n]
+    if first_n:
+        questions = questions[: first_n]
 
-    if args.model_list is None:
+    if model_list is None:
         models = get_model_list(answer_dir)
     else:
-        models = args.model_list
+        models = model_list
 
-    if args.mode == "single":
-        judges = make_judge_single(args.judge_model, judge_prompts)
-        play_a_match_func = play_a_match_single_batch if args.batch else play_a_match_single
+    if mode == "single":
+        judges = make_judge_single(judge_model, judge_prompts)
+        play_a_match_func = play_a_match_single_batch if batch else play_a_match_single
         output_file = (
-            f"data/{args.bench_name}/model_judgment/{args.judge_model}_single.jsonl"
+            f"data/{bench_name}/model_judgment/{judge_model}_single.jsonl"
         )
         make_match_func = make_match_single
         baseline_model = None
     else:
-        judges = make_judge_pairwise(args.judge_model, judge_prompts)
+        judges = make_judge_pairwise(judge_model, judge_prompts)
         play_a_match_func = play_a_match_pair
         output_file = (
-            f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl"
+            f"data/{bench_name}/model_judgment/{judge_model}_pair.jsonl"
         )
-        if args.mode == "pairwise-all":
+        if mode == "pairwise-all":
             make_match_func = make_match_all_pairs
             baseline_model = None
         else:
             make_match_func = make_match
-            baseline_model = args.baseline_model
+            # baseline_model keeps the value passed to run()
 
     check_data(questions, model_answers, ref_answers, models, judges)
 
@@ -292,9 +257,9 @@ def make_judge_single(judge_model, judge_prompts):
         )
 
     match_stat = {}
-    match_stat["bench_name"] = args.bench_name
-    match_stat["mode"] = args.mode
-    match_stat["judge"] = args.judge_model
+    match_stat["bench_name"] = bench_name
+    match_stat["mode"] = mode
+    match_stat["judge"] = judge_model
     match_stat["baseline"] = baseline_model
     match_stat["model_list"] = models
     match_stat["total_num_questions"] = len(questions)
@@ -304,11 +269,11 @@
     # Show match stats and prompt enter to continue
     print("Stats:")
     print(json.dumps(match_stat, indent=4))
-    if not args.yes and not args.batch:
+    if not yes and not batch:
        input("Press Enter to confirm...")
 
     # Play matches
-    if args.parallel == 1:
+    if parallel == 1:
         for match in tqdm(matches):
             play_a_match_func(match, output_file=output_file)
     else:
@@ -319,8 +284,9 @@ def play_a_match_wrapper(match):
         np.random.seed(0)
         np.random.shuffle(matches)
 
-        with ThreadPoolExecutor(args.parallel) as executor:
+        with ThreadPoolExecutor(parallel) as executor:
             for match in tqdm(
                 executor.map(play_a_match_wrapper, matches), total=len(matches)
             ):
                 pass
+    return output_file

diff --git a/src/instructlab/eval/mtbench.py b/src/instructlab/eval/mtbench.py
index cba6934..4e7de5c 100644
--- a/src/instructlab/eval/mtbench.py
+++ b/src/instructlab/eval/mtbench.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
+import os
 
 # Local
 from .evaluator import Evaluator
-#from .gen_api_answer import reorg_answer_file
 import instructlab.eval.gen_api_answer as gen_api_answer
+import instructlab.eval.gen_judgment as gen_judgment
 
 
 class MT_Bench_Evaluator(Evaluator):
@@ -19,10 +20,12 @@ def __init__(self, server_url: str) -> None:
     def gen_answers(self, answer_file, server_url) -> str:
         """
         Asks questions to model, returns path to answers"""
+        os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
         gen_api_answer.run(answer_file=answer_file, model_name="instructlab/granite-7b-lab", openai_api_base=server_url)
         return answer_file
 
-    def judge_answers(self) -> tuple:
+    #def judge_answers(self, judge_endpoint) -> tuple:
+    def judge_answers(self, judge_endpoint) -> str:
         """
         Runs MT-Bench judgement
 
@@ -30,9 +33,10 @@ def judge_answers(self) -> tuple:
             overall_score  MT-Bench score for the overall model evaluation
             qa_pairs       Question and answer pairs from the evaluation
         """
-        overall_score: float = 0.0
-        qa_pairs: list[tuple] = []
-        return overall_score, qa_pairs
+        os.environ['OPENAI_API_BASE'] = judge_endpoint
+        os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
+        output_file = gen_judgment.run(parallel=40)
+        return output_file
 
 
 class PR_Bench_Evaluator(Evaluator):
diff --git a/test_mt_bench.py b/test_mt_bench.py
index cc501f5..d1c51e3 100644
--- a/test_mt_bench.py
+++ b/test_mt_bench.py
@@ -1,7 +1,7 @@
 from instructlab.eval.mtbench import MT_Bench_Evaluator
 
 mt_bench = MT_Bench_Evaluator(server_url="http://localhost:8000")
-path = mt_bench.gen_answers("test-answers.jsonl", "http://localhost:8000/v1")
-print(path)
-payload = mt_bench.judge_answers()
-print(payload)
+#path = mt_bench.gen_answers("test-answers.jsonl", "http://localhost:8000/v1")
+#print(path)
+output_file = mt_bench.judge_answers("http://localhost:8000/v1")
+print(output_file)
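
With the patch applied, the new gen_judgment.run() entry point can also be driven
directly rather than through MT_Bench_Evaluator.judge_answers(). The sketch below
mirrors what judge_answers() does; the endpoint URL and the presence of the
data/mt_bench question, answer, and judge-prompt files are assumptions, not
something the patch itself guarantees.

    # Sketch only: call the library entry point directly.
    import os

    import instructlab.eval.gen_judgment as gen_judgment

    # judge_answers() sets these before delegating to run(); an OpenAI-compatible
    # judge server is assumed to be listening at this URL.
    os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1"
    os.environ["OPENAI_API_KEY"] = "NO_API_KEY"

    # The former argparse flags are now keyword arguments with the same defaults.
    output_file = gen_judgment.run(bench_name="mt_bench", mode="single", parallel=40)
    print(output_file)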