From ab664b8bdfda84e609e6c5d37223b620fc7bb1d2 Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Thu, 12 Dec 2024 04:18:54 +0000
Subject: [PATCH 1/3] feat: allow MMLU to pass `system_prompt` to lm_eval

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 .pylintrc                    |  3 ++-
 .spellcheck-en-custom.txt    |  2 ++
 CHANGELOG.md                 |  4 ++++
 src/instructlab/eval/mmlu.py | 30 +++++++++++++++++++++++-------
 4 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 01a605d..14b3eb1 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -448,7 +448,8 @@ disable=raw-checker-failed,
         pointless-statement,
         wrong-import-order,
         line-too-long,
-        dangerous-default-value
+        dangerous-default-value,
+        too-many-instance-attributes
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
index 5967218..3251d44 100644
--- a/.spellcheck-en-custom.txt
+++ b/.spellcheck-en-custom.txt
@@ -26,3 +26,5 @@ TODO
 tox
 venv
 vllm
+barebones
+LM
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a897297..755516c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.4.2
+
+* Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood; otherwise, it passes the model a barebones prompt.
+
 ## 0.4
 
 * Added ability to specify a custom http client to MT-Bench
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index f893b66..8637ad4 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -102,6 +102,7 @@ class AbstractMMLUEvaluator(Evaluator):
         few_shots   number of examples
         batch_size  batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
         device      PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        system_prompt   system prompt to be used when applying the chat template
     """
 
     def __init__(
@@ -113,8 +114,10 @@ def __init__(
         few_shots: int = 5,
         batch_size: Optional[Union[int, str]] = "auto",
         device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
+        system_prompt: Optional[str] = None,
     ) -> None:
         self.model_path = model_path
+        self.system_prompt = system_prompt
         self.tasks_dir = tasks_dir
         self.tasks = tasks
         self.model_dtype = model_dtype
@@ -168,6 +171,7 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
             if not os.access(self.tasks_dir, os.R_OK):
                 raise InvalidTasksDirError(self.tasks_dir)
             tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
+        should_apply_chat_template = self.system_prompt is not None
         mmlu_output = self._simple_evaluate_with_error_handling(
             model=model,
             model_args=model_args,
@@ -176,6 +180,8 @@
             batch_size=self.batch_size,
             device=self.device,
             task_manager=tm,
+            system_instruction=self.system_prompt,
+            apply_chat_template=should_apply_chat_template,
         )
         results = mmlu_output["results"]
         return results
@@ -213,12 +219,13 @@ class MMLUEvaluator(AbstractMMLUEvaluator):
     Evaluator for Massive Multitask Language Understanding (MMLU)
 
     Attributes:
-        model_path  absolute path to or name of a huggingface model
-        tasks       list of tasks for MMLU to test the model with
-        model_dtype dtype of model when served
-        few_shots   number of examples
-        batch_size  batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
-        device      PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        model_path      absolute path to or name of a huggingface model
+        tasks           list of tasks for MMLU to test the model with
+        model_dtype     dtype of model when served
+        few_shots       number of examples
+        batch_size      batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
+        device          PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        system_prompt   system prompt to be used when applying the chat template
     """
 
     name = "mmlu"
@@ -231,9 +238,17 @@ def __init__(
         few_shots: int = 5,
         batch_size: Optional[Union[int, str]] = "auto",
         device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
+        system_prompt: Optional[str] = None,
     ) -> None:
         super().__init__(
-            model_path, None, tasks, model_dtype, few_shots, batch_size, device
+            model_path,
+            None,
+            tasks,
+            model_dtype,
+            few_shots,
+            batch_size,
+            device,
+            system_prompt=system_prompt,
         )
 
 
@@ -243,6 +258,7 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
 
     Attributes:
         model_path      absolute path to or name of a huggingface model
+        system_prompt   system prompt to be used when applying the chat template
        tasks_dir       path where the .jsonl and _task.yaml files for the branches being evaluated are stored
        tasks           group name that is shared by all the MMLUBranch tasks
        model_dtype     dtype of model when served

From fd78adf2eea4168cf46679b176788a2965b9e676 Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Fri, 13 Dec 2024 04:57:23 +0000
Subject: [PATCH 2/3] chore: update tests to include system prompt in MMLU evals

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 scripts/test_mmlu.py |  8 +++++++-
 tests/test_mmlu.py   | 11 +++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/scripts/test_mmlu.py b/scripts/test_mmlu.py
index 2db46c0..fd10f27 100755
--- a/scripts/test_mmlu.py
+++ b/scripts/test_mmlu.py
@@ -1,13 +1,19 @@
 # First Party
 from instructlab.eval.mmlu import MMLUEvaluator
 
+SYSTEM_PROMPT = """I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant."""
+
 
 def test_minimal_mmlu():
     print("===> Executing 'test_minimal_mmlu'...")
     try:
         model_path = "instructlab/granite-7b-lab"
         tasks = ["mmlu_anatomy", "mmlu_astronomy"]
-        mmlu = MMLUEvaluator(model_path=model_path, tasks=tasks)
+        mmlu = MMLUEvaluator(
+            model_path=model_path,
+            tasks=tasks,
+            system_prompt=SYSTEM_PROMPT,
+        )
         overall_score, individual_scores = mmlu.run()
         print(overall_score)
         print(individual_scores)
diff --git a/tests/test_mmlu.py b/tests/test_mmlu.py
index bdf4f90..2cc0c79 100644
--- a/tests/test_mmlu.py
+++ b/tests/test_mmlu.py
@@ -48,7 +48,10 @@ def test_mmlu_branch(eval_mock):
     tasks_dir = f"{os.path.dirname(os.path.realpath(__file__))}/testdata/sdg"
     tasks = ["mmlu_pr"]
     mmlu = MMLUBranchEvaluator(
-        model_path=MODEL_EXAMPLE, tasks_dir=tasks_dir, tasks=tasks
+        model_path=MODEL_EXAMPLE,
+        tasks_dir=tasks_dir,
+        tasks=tasks,
+        system_prompt="You are an intelligent AI language model.",
     )
     overall_score, individual_scores = mmlu.run()
 
@@ -62,7 +65,11 @@
 )
 def test_mmlu(eval_mock):
     tasks = ["mmlu_anatomy", "mmlu_astronomy", "mmlu_algebra"]
-    mmlu = MMLUEvaluator(model_path=MODEL_EXAMPLE, tasks=tasks)
+    mmlu = MMLUEvaluator(
+        model_path=MODEL_EXAMPLE,
+        tasks=tasks,
+        system_prompt="You are an intelligent AI language model.",
+    )
     overall_score, individual_scores = mmlu.run()
 
     eval_mock.assert_called()

From ad12276878dfa691599e37aa5e18d6c1bd7f4afd Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Fri, 13 Dec 2024 13:45:13 -0500
Subject: [PATCH 3/3] feat: introduce `results` attribute on MMLU evaluator

In order to test the validity of our MMLU results or get information on
prior runs, we need to be able to access the full set of results from the
lm_eval.evaluator.simple_evaluate API. This commit provides that ability by
adding a results attribute on the MMLUEvaluator class and storing the
results there.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 CHANGELOG.md                 |  1 +
 scripts/test_mmlu.py         | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/instructlab/eval/mmlu.py | 60 ++++++++++++++++++++++++++++++++++++++++++------------------
 3 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 755516c..1ba0513 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
 ## 0.4.2
 
 * Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood; otherwise, it passes the model a barebones prompt.
+* Adds an `extra_args` parameter to the `.run` method of all MMLU-based evaluators, allowing consumers to pass additional arguments directly through to the `lm_eval.evaluator.simple_evaluate` function.
 
 ## 0.4
 
diff --git a/scripts/test_mmlu.py b/scripts/test_mmlu.py
index fd10f27..a6035df 100755
--- a/scripts/test_mmlu.py
+++ b/scripts/test_mmlu.py
@@ -1,9 +1,41 @@
+# Standard
+from typing import Dict, List, Tuple, TypedDict
+
 # First Party
 from instructlab.eval.mmlu import MMLUEvaluator
 
 SYSTEM_PROMPT = """I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant."""
 
 
+class MMLUSample(TypedDict):
+    """
+    Example of a single sample returned from lm_eval when running MMLU.
+    This is not a comprehensive type, just the subset of fields we care about for this test.
+    """
+
+    # Arguments is the list of (prompt, answer) pairs passed to MMLU as few-shot samples.
+    # They will not be present with few_shot=0
+    arguments: List[Tuple[str, str]]
+
+
+def all_samples_contain_system_prompt(
+    samples: Dict[str, List[MMLUSample]], prompt: str
+) -> bool:
+    """
+    Given a mapping of evaluation --> list of results, validates that all few-shot examples
+    included the system prompt
+    """
+    for topic, samples_set in samples.items():
+        for sample in samples_set:
+            for mmlu_prompt, _ in sample["arguments"]:
+                if prompt not in mmlu_prompt:
+                    # we are looking for the exact system prompt, so no need to normalize to lowercase
+                    print(f"found a sample in the '{topic}' MMLU topic set without the system prompt")
+                    return False
+
+    return True
+
+
 def test_minimal_mmlu():
     print("===> Executing 'test_minimal_mmlu'...")
     try:
@@ -14,9 +46,28 @@ def test_minimal_mmlu():
             tasks=tasks,
             system_prompt=SYSTEM_PROMPT,
         )
-        overall_score, individual_scores = mmlu.run()
+        overall_score, individual_scores = mmlu.run(
+            extra_args={"log_samples": True, "write_out": True}
+        )
+        samples = mmlu.results["samples"]
+
         print(overall_score)
         print(individual_scores)
+
+        # we need n-shots > 1 to be able to validate the inclusion of the system prompt
+        eligible_samples = {
+            topic: samples[topic]
+            for topic, shot in mmlu.results["n-shot"].items()
+            if shot > 1
+        }
+        if eligible_samples:
+            if not all_samples_contain_system_prompt(eligible_samples, SYSTEM_PROMPT):
+                return False
+        else:
+            print(
+                "MMLU was run in zero-shot mode, cannot confirm that system prompt was included, skipping check..."
+            )
+
     except Exception as exc:
         print(f"'test_minimal_mmlu' failed: {exc}")
         return False
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index 8637ad4..1aaf462 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -7,12 +7,12 @@
 """
 
 # Standard
-from typing import Optional, Union
+from typing import Any, Dict, Optional, Union
 import os
 
 # Third Party
-from lm_eval.evaluator import simple_evaluate  # type: ignore
-from lm_eval.tasks import TaskManager  # type: ignore
+from lm_eval.evaluator import simple_evaluate
+from lm_eval.tasks import TaskManager
 import torch
 
 # First Party
@@ -103,6 +103,7 @@ class AbstractMMLUEvaluator(Evaluator):
         batch_size  batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
         device      PyTorch device (e.g. "cpu" or "cuda:0") for running models
         system_prompt   system prompt to be used when applying the chat template
+        results     full output from the `lm_eval.evaluator.simple_evaluate` function after MMLU has run.
     """
 
     def __init__(
@@ -124,18 +125,33 @@ def __init__(
         self.few_shots = few_shots
         self.batch_size = batch_size
         self.device = device
+        self._results = None
 
-    def run(self, server_url: str | None = None) -> tuple:
+    @property
+    def results(self) -> Dict[str, Any] | None:
+        """
+        Returns the results of the last MMLU evaluation, if one has taken place.
+
+        Returns:
+            Dict[str, Any] | None: The output from `lm_eval.evaluator.simple_evaluate`
+        """
+        return self._results
+
+    def run(
+        self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None
+    ) -> tuple:
         """
         Runs evaluation
 
         Attributes
             server_url  Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
+            extra_args  Dictionary containing any extra arguments to be passed into the `lm_eval.evaluator.simple_evaluate` function
 
         Returns:
             overall_score       Average score for the task group
             individual_scores   Individual scores for each task in the task group
         """
+        extra_args = {} if not extra_args else extra_args
         logger.debug(locals())
 
         # TODO: make this a parameter for class?
@@ -156,7 +172,10 @@ def run(self, server_url: str | None = None) -> tuple:
 
         return overall_score, individual_scores
 
-    def _run_mmlu(self, server_url: str | None = None) -> dict:
+    def _run_mmlu(
+        self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None
+    ) -> dict:
+        extra_args = {} if not extra_args else extra_args
         if server_url is not None:
             # Requires lm_eval >= 0.4.4
             model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface"
@@ -172,19 +191,24 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
                 raise InvalidTasksDirError(self.tasks_dir)
             tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
         should_apply_chat_template = self.system_prompt is not None
-        mmlu_output = self._simple_evaluate_with_error_handling(
-            model=model,
-            model_args=model_args,
-            tasks=self.tasks,
-            num_fewshot=self.few_shots,
-            batch_size=self.batch_size,
-            device=self.device,
-            task_manager=tm,
-            system_instruction=self.system_prompt,
-            apply_chat_template=should_apply_chat_template,
-        )
-        results = mmlu_output["results"]
-        return results
+
+        # configure the args here so users can override them as necessary
+        simple_evaluate_kwargs = {
+            "model": model,
+            "model_args": model_args,
+            "tasks": self.tasks,
+            "num_fewshot": self.few_shots,
+            "batch_size": self.batch_size,
+            "device": self.device,
+            "task_manager": tm,
+            "system_instruction": self.system_prompt,
+            "apply_chat_template": should_apply_chat_template,
+        }
+        simple_evaluate_kwargs.update(extra_args)
+
+        results = self._simple_evaluate_with_error_handling(**simple_evaluate_kwargs)
+        self._results = results
+        return results["results"]
 
     # This method converts general errors from simple_evaluate
     # into a more user-understandable error
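
A minimal usage sketch of the API added by these three patches, for reviewers (not part of the patch series itself, and it assumes the patches are applied; the model path, task names, and system prompt are illustrative placeholders rather than required values):

    # First Party
    from instructlab.eval.mmlu import MMLUEvaluator

    # Illustrative values; any Hugging Face model path and MMLU task names can be used here.
    evaluator = MMLUEvaluator(
        model_path="instructlab/granite-7b-lab",
        tasks=["mmlu_anatomy", "mmlu_astronomy"],
        system_prompt="You are an intelligent AI language model.",
    )

    # extra_args is forwarded to lm_eval.evaluator.simple_evaluate; log_samples and
    # write_out ask lm_eval to retain per-sample records, as in the test script above.
    overall_score, individual_scores = evaluator.run(
        extra_args={"log_samples": True, "write_out": True}
    )
    print(overall_score)
    print(individual_scores)

    # The full simple_evaluate output of the most recent run is kept on the evaluator.
    if evaluator.results is not None:
        print(list(evaluator.results["results"].keys()))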