Implement MMLU_Evaluator.run() #10

Merged · 7 commits · Jun 25, 2024
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,3 +9,4 @@ transformers
 accelerate
 pandas
 pandas-stubs
+lm-eval>=0.4.2
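
The new dependency pinned above can be pulled in locally with pip when trying out this branch:

    pip install 'lm-eval>=0.4.2'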
65 changes: 52 additions & 13 deletions src/instructlab/eval/mmlu.py
@@ -1,24 +1,39 @@
 # SPDX-License-Identifier: Apache-2.0

-# Local
-from .evaluator import Evaluator
+# Standard
+import os

+# Third Party
+from lm_eval.evaluator import simple_evaluate  # type: ignore

-class MMLU_Evaluator(Evaluator):
+# First Party
+from instructlab.eval.evaluator import Evaluator
+
+
+class MMLUEvaluator(Evaluator):
"""
Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)

Attributes:
model_path absolute path to or name of a huggingface model
         tasks        list of tasks for MMLU to test the model with
+        model_dtype  dtype of model when served
         few_shots    number of few-shot examples
         batch_size   batch size for evaluation
     """

     def __init__(
-        self, model_path, tasks: list[str], few_shots: int = 2, batch_size: int = 5
+        self,
+        model_path,
+        tasks: list[str],
+        model_dtype="bfloat16",
+        few_shots: int = 2,
+        batch_size: int = 5,
     ) -> None:
         super().__init__()
         self.model_path = model_path
         self.tasks = tasks
+        self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size

@@ -30,18 +45,42 @@ def run(self) -> tuple:
             overall_score       MMLU score for the overall model evaluation
             individual_scores   Individual MMLU score for each task
         """
-        individual_scores: dict[str, float] = {}
-        overall_score: float = 0.0
+        # TODO: make this a parameter for class?
+        os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+        individual_scores: dict = {}
+        agg_score: float = 0.0
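+        # NOTE: lm-eval parses model_args as a comma-separated key=value string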
+        model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
+
+        mmlu_output = simple_evaluate(
+            model="hf",
+            model_args=model_args,
+            tasks=self.tasks,
+            num_fewshot=self.few_shots,
+            batch_size=self.batch_size,
+        )
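+        # mmlu_output["results"] maps each task name to its metric dict,
+        # e.g. {"acc,none": 0.57, "acc_stderr,none": 0.01}; consumed below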

+        results = mmlu_output["results"]
+
+        for task in self.tasks:
+            mmlu_res = results[task]
+            agg_score += float(mmlu_res["acc,none"])
+            individual_scores[task] = {}
+            individual_scores[task]["score"] = float(mmlu_res["acc,none"])
+            individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])
+
+        overall_score = float(agg_score / len(self.tasks))
         return overall_score, individual_scores
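
As a sanity check for reviewers, here is a minimal usage sketch of the new class; the model id and task names are illustrative placeholders, not values from this PR:

    from instructlab.eval.mmlu import MMLUEvaluator

    # Hypothetical model and MMLU subtasks, for illustration only
    evaluator = MMLUEvaluator(
        model_path="instructlab/granite-7b-lab",
        tasks=["mmlu_abstract_algebra", "mmlu_anatomy"],
    )
    overall_score, individual_scores = evaluator.run()
    print(f"overall MMLU: {overall_score:.3f}")
    for task, scores in individual_scores.items():
        print(task, scores["score"], scores["stderr"])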


-class PR_MMLU_Evaluator(Evaluator):
+class MMLUBranchEvaluator(Evaluator):
     """
-    Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)
+    Child class of an Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch)

     Attributes:
-        sdg_path    path where all the PR MMLU tasks are stored
-        task        group name that is shared by all the PR MMLU tasks
+        model_path  absolute path to or name of a huggingface model
+        sdg_path    path where all the MMLUBranch tasks are stored
+        task        group name that is shared by all the MMLUBranch tasks
         few_shots   number of few-shot examples
         batch_size  batch size for evaluation
     """
@@ -62,11 +101,11 @@ def __init__(

     def run(self) -> tuple:
         """
-        Runs PR MMLU evaluation
+        Runs MMLUBranch evaluation

         Returns:
-            overall_score       PR MMLU score for the overall model evaluation
-            individual_scores   Individual PR MMLU scores for each task
+            overall_score       MMLUBranch score for the overall model evaluation
+            individual_scores   Individual MMLUBranch scores for each task
             qa_pairs            Question and answer pairs from the evaluation
         """
         individual_scores: dict[str, float] = {}
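
The MMLUBranchEvaluator constructor and the rest of run() are collapsed in the diff above; judging only from the documented attributes, a hypothetical instantiation would look like the sketch below (argument names inferred from the docstring, all values placeholders):

    from instructlab.eval.mmlu import MMLUBranchEvaluator

    # Hypothetical values; sdg_path and task are placeholders inferred from the docstring
    branch_evaluator = MMLUBranchEvaluator(
        model_path="instructlab/granite-7b-lab",
        sdg_path="/path/to/generated/tasks",
        task="mmlu_pr_branch",
    )
    overall_score, individual_scores, qa_pairs = branch_evaluator.run()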