start to incorporate gen_judgement.py into library
Signed-off-by: Ali Maredia <[email protected]>
alimaredia authored and danmcp committed Jun 17, 2024
1 parent cf2ea02 commit ad020ef
Showing 4 changed files with 47 additions and 78 deletions.
src/instructlab/eval/gen_api_answer.py (1 change: 0 additions & 1 deletion)

@@ -101,7 +101,6 @@ def run(
     parallel=1,
     openai_api_base=None):
 
-    os.environ(
     if openai_api_base is not None:
         openai.api_base = openai_api_base
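For context on the deletion above: the stray `os.environ(` was an unfinished line, and the surviving branch is how callers point the legacy openai client at a local server. A minimal sketch of driving answer generation through the library, using only names that appear elsewhere in this commit (the endpoint URL is illustrative, and the placeholder key is an assumption carried over from mtbench.py below, where local servers are not expected to validate it):

import os

import instructlab.eval.gen_api_answer as gen_api_answer

# Placeholder key: mtbench.py sets the same value before calling run(),
# on the assumption that an OpenAI-compatible local server ignores it.
os.environ['OPENAI_API_KEY'] = "NO_API_KEY"

# run() redirects the openai client to the local server via openai.api_base.
gen_api_answer.run(
    answer_file="test-answers.jsonl",
    model_name="instructlab/granite-7b-lab",
    openai_api_base="http://localhost:8000/v1",
)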
src/instructlab/eval/gen_judgment.py (102 changes: 34 additions & 68 deletions)
@@ -2,7 +2,6 @@
 Usage:
 python gen_judgment.py --model-list [LIST-OF-MODEL-ID] --parallel [num-concurrent-api-call] --mode [single|pairwise-baseline|pairwise-all]
 """
-import argparse
 from concurrent.futures import ThreadPoolExecutor
 import json
@@ -166,55 +165,21 @@ def make_judge_single(judge_model, judge_prompts):
     )
     return judges
 
+def run(
+    bench_name="mt_bench",
+    judge_file="data/judge_prompts.jsonl",
+    judge_model="gpt-4",
+    baseline_model=None,
+    mode="single",
+    model_list=None,
+    parallel=1,
+    first_n=None,
+    yes=True,
+    batch=True):
+
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--bench-name",
-        type=str,
-        default="mt_bench",
-        help="The name of the benchmark question set.",
-    )
-    parser.add_argument(
-        "--judge-file",
-        type=str,
-        default="data/judge_prompts.jsonl",
-        help="The file of judge prompts.",
-    )
-    parser.add_argument("--judge-model", type=str, default="gpt-4")
-    parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo")
-    parser.add_argument(
-        "--mode",
-        type=str,
-        default="single",
-        choices=["pairwise-baseline", "pairwise-all", "single"],
-        help=(
-            "Evaluation mode. "
-            "`pairwise-baseline` runs pairwise comparision against a baseline. "
-            "`pairwise-all` runs pairwise comparision between all pairs. "
-            "`single` runs single answer grading."
-        ),
-    )
-    parser.add_argument(
-        "--model-list",
-        type=str,
-        nargs="+",
-        default=None,
-        help="A list of models to be evaluated",
-    )
-    parser.add_argument(
-        "--parallel", type=int, default=1, help="The number of concurrent API calls."
-    )
-    parser.add_argument(
-        "--first-n", type=int, help="A debug option. Only run the first `n` judgments."
-    )
-    parser.add_argument("--yes", action="store_true")
-    parser.add_argument("--batch", action="store_true")
-    args = parser.parse_args()
-
-    question_file = f"data/{args.bench_name}/question.jsonl"
-    answer_dir = f"data/{args.bench_name}/model_answer/instructlab"
-    ref_answer_dir = f"data/{args.bench_name}/reference_answer"
+    question_file = f"data/{bench_name}/question.jsonl"
+    answer_dir = f"data/{bench_name}/model_answer/instructlab"
+    ref_answer_dir = f"data/{bench_name}/reference_answer"
 
     # Load questions
     questions = load_questions(question_file, None, None)
@@ -224,36 +189,36 @@ def make_judge_single(judge_model, judge_prompts):
     ref_answers = load_model_answers(ref_answer_dir)
 
     # Load judge
-    judge_prompts = load_judge_prompts(args.judge_file)
+    judge_prompts = load_judge_prompts(judge_file)
 
-    if args.first_n:
-        questions = questions[: args.first_n]
+    if first_n:
+        questions = questions[: first_n]
 
-    if args.model_list is None:
+    if model_list is None:
         models = get_model_list(answer_dir)
     else:
-        models = args.model_list
+        models = model_list
 
-    if args.mode == "single":
-        judges = make_judge_single(args.judge_model, judge_prompts)
-        play_a_match_func = play_a_match_single_batch if args.batch else play_a_match_single
+    if mode == "single":
+        judges = make_judge_single(judge_model, judge_prompts)
+        play_a_match_func = play_a_match_single_batch if batch else play_a_match_single
         output_file = (
-            f"data/{args.bench_name}/model_judgment/{args.judge_model}_single.jsonl"
+            f"data/{bench_name}/model_judgment/{judge_model}_single.jsonl"
         )
         make_match_func = make_match_single
         baseline_model = None
     else:
-        judges = make_judge_pairwise(args.judge_model, judge_prompts)
+        judges = make_judge_pairwise(judge_model, judge_prompts)
         play_a_match_func = play_a_match_pair
         output_file = (
-            f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl"
+            f"data/{bench_name}/model_judgment/{judge_model}_pair.jsonl"
         )
-        if args.mode == "pairwise-all":
+        if mode == "pairwise-all":
             make_match_func = make_match_all_pairs
             baseline_model = None
         else:
             make_match_func = make_match
-            baseline_model = args.baseline_model
+            baseline_model = baseline_model
 
     check_data(questions, model_answers, ref_answers, models, judges)
@@ -292,9 +257,9 @@ def make_judge_single(judge_model, judge_prompts):
         )
 
     match_stat = {}
-    match_stat["bench_name"] = args.bench_name
-    match_stat["mode"] = args.mode
-    match_stat["judge"] = args.judge_model
+    match_stat["bench_name"] = bench_name
+    match_stat["mode"] = mode
+    match_stat["judge"] = judge_model
     match_stat["baseline"] = baseline_model
     match_stat["model_list"] = models
     match_stat["total_num_questions"] = len(questions)
@@ -304,11 +269,11 @@ def make_judge_single(judge_model, judge_prompts):
     # Show match stats and prompt enter to continue
     print("Stats:")
     print(json.dumps(match_stat, indent=4))
-    if not args.yes and not args.batch:
+    if not yes and not batch:
         input("Press Enter to confirm...")
 
     # Play matches
-    if args.parallel == 1:
+    if parallel == 1:
         for match in tqdm(matches):
             play_a_match_func(match, output_file=output_file)
     else:
@@ -319,8 +284,9 @@ def play_a_match_wrapper(match):
         np.random.seed(0)
         np.random.shuffle(matches)
 
-        with ThreadPoolExecutor(args.parallel) as executor:
+        with ThreadPoolExecutor(parallel) as executor:
             for match in tqdm(
                 executor.map(play_a_match_wrapper, matches), total=len(matches)
             ):
                 pass
+    return output_file
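The net effect of this file's changes is that judgment can be driven from Python instead of the CLI. A minimal sketch of calling the new entry point with its own defaults (the module is imported under the gen_judgement spelling used by mtbench.py below; nothing beyond the diff above is guaranteed):

import instructlab.eval.gen_judgement as gen_judgement

# Defaults: mode="single" with batch=True selects play_a_match_single_batch,
# and yes=True skips the "Press Enter to confirm" prompt. With model_list=None,
# models are discovered from data/mt_bench/model_answer/instructlab.
output_file = gen_judgement.run(
    bench_name="mt_bench",
    judge_model="gpt-4",
    parallel=1,
)

# run() now returns the path it wrote, e.g.
# data/mt_bench/model_judgment/gpt-4_single.jsonl
print(output_file)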
src/instructlab/eval/mtbench.py (14 changes: 9 additions & 5 deletions)
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
+import os
 
 # Local
 from .evaluator import Evaluator
-#from .gen_api_answer import reorg_answer_file
 import instructlab.eval.gen_api_answer as gen_api_answer
+import instructlab.eval.gen_judgement as gen_judgement
 
 
 class MT_Bench_Evaluator(Evaluator):
@@ -19,20 +20,23 @@ def __init__(self, server_url: str) -> None:
 
     def gen_answers(self, answer_file, server_url) -> str:
         """ Asks questions to model, returns path to answers"""
+        os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
         gen_api_answer.run(answer_file=answer_file, model_name="instructlab/granite-7b-lab", openai_api_base=server_url)
         return answer_file
 
-    def judge_answers(self) -> tuple:
-    #def judge_answers(self, judge_endpoint) -> tuple:
+    def judge_answers(self, judge_endpoint) -> str:
         """
         Runs MT-Bench judgement
         Returns:
             overall_score  MT-Bench score for the overall model evaluation
             qa_pairs       Question and answer pairs from the evaluation
         """
-        overall_score: float = 0.0
-        qa_pairs: list[tuple] = []
-        return overall_score, qa_pairs
+        os.environ['OPENAI_API_BASE'] = judge_endpoint
+        os.environ['OPENAI_API_KEY'] = "NO_API_KEY"
+        output_file = gen_judgement.run(parallel=40)
+        return output_file
 
 
 class PR_Bench_Evaluator(Evaluator):
test_mt_bench.py (8 changes: 4 additions & 4 deletions)
@@ -1,7 +1,7 @@
 from instructlab.eval.mtbench import MT_Bench_Evaluator
 
 mt_bench = MT_Bench_Evaluator(server_url="http://localhost:8000")
-path = mt_bench.gen_answers("test-answers.jsonl", "http://localhost:8000/v1")
-print(path)
-payload = mt_bench.judge_answers()
-print(payload)
+#path = mt_bench.gen_answers("test-answers.jsonl", "http://localhost:8000/v1")
+#print(path)
+output_file = mt_bench.judge_answers("http://localhost:8000/v1")
+print(output_file)
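As committed, the smoke test assumes answers already exist under data/mt_bench/model_answer/instructlab (the generation call is commented out) and that an OpenAI-compatible judge endpoint is listening on localhost:8000/v1. A sketch of the full two-step flow with generation re-enabled, using only names from the diffs above (the URLs are whatever the local server actually exposes):

from instructlab.eval.mtbench import MT_Bench_Evaluator

mt_bench = MT_Bench_Evaluator(server_url="http://localhost:8000")

# Step 1: ask the candidate model the benchmark questions;
# returns the path to the answer file.
answer_file = mt_bench.gen_answers("test-answers.jsonl", "http://localhost:8000/v1")

# Step 2: judge the stored answers; returns the path to the judgment file.
judgment_file = mt_bench.judge_answers("http://localhost:8000/v1")
print(answer_file, judgment_file)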
