feat: introduce results attribute on MMLU evaluator
To test the validity of our MMLU results or inspect prior runs, we need access to the full set
of results returned by the lm_eval.evaluator.simple_evaluate API. This commit provides that
ability by adding a results attribute to the MMLUEvaluator class and storing the evaluation
output there.

Signed-off-by: Oleg S <[email protected]>
RobotSail committed Dec 13, 2024
1 parent fd78adf commit 7b4fe25
Showing 2 changed files with 47 additions and 16 deletions.
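
For orientation, a minimal usage sketch of the new attribute. This is a hypothetical construction: MMLUEvaluator is assumed to be the concrete subclass of AbstractMMLUEvaluator, and the model path, task name, and prompt below are illustrative only, not part of this commit.

    from instructlab.eval.mmlu import MMLUEvaluator

    SYSTEM_PROMPT = "You are a helpful AI assistant."  # illustrative prompt

    mmlu = MMLUEvaluator(
        model_path="instructlab/granite-7b-lab",  # illustrative model
        tasks=["mmlu_abstract_algebra"],          # illustrative task subset
        system_prompt=SYSTEM_PROMPT,
    )

    # extra_args is forwarded to lm_eval.evaluator.simple_evaluate;
    # log_samples=True asks lm_eval to keep per-sample records in its output
    overall_score, individual_scores = mmlu.run(extra_args={"log_samples": True})

    # new in this commit: the full simple_evaluate output is retained on the evaluator
    full_output = mmlu.results
    print(full_output["results"])  # per-task metrics, as returned by simple_evaluate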
8 changes: 7 additions & 1 deletion scripts/test_mmlu.py
@@ -14,7 +14,13 @@ def test_minimal_mmlu():
tasks=tasks,
system_prompt=SYSTEM_PROMPT,
)
overall_score, individual_scores = mmlu.run()
overall_score, individual_scores = mmlu.run(
extra_args={
"log_samples": True,
}
)
samples = mmlu.results["samples"]
print(samples)
print(overall_score)
print(individual_scores)
except Exception as exc:
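For context on the test change above: with log_samples=True, lm_eval includes a "samples" entry in its output, keyed by task name. A small sketch of how those records might be inspected; the exact per-record schema is defined by lm_eval, not by this commit.

    samples = mmlu.results["samples"]  # available because log_samples=True was passed
    for task_name, records in samples.items():
        # each entry is a list of per-document records produced by lm_eval
        print(f"{task_name}: {len(records)} evaluated documents")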
55 changes: 40 additions & 15 deletions src/instructlab/eval/mmlu.py
@@ -7,7 +7,7 @@
"""

# Standard
from typing import Optional, Union
from typing import Optional, Union, Dict, Any
import os

# Third Party
@@ -103,6 +103,7 @@ class AbstractMMLUEvaluator(Evaluator):
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times.
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
system_prompt system prompt to be used when applying the chat template
results full output from the `lm_eval.evaluator.simple_evaluate` function after MMLU has run.
"""

def __init__(
@@ -124,18 +125,33 @@ def __init__(
self.few_shots = few_shots
self.batch_size = batch_size
self.device = device
self._results = None

def run(self, server_url: str | None = None) -> tuple:
@property
def results(self) -> Dict[str, Any] | None:
"""
Returns the results of the last MMLU evaluation, if one has taken place.
Returns:
Dict[str, Any] | None: The output from `lm_eval.evaluator.simple_evaluate`
"""
return self._results

def run(
self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None
) -> tuple:
"""
Runs evaluation
Attributes
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
extra_args Dictionary of any extra arguments to be passed into the `lm_eval.evaluator.simple_evaluate` function.
Returns:
overall_score Average score for the task group
individual_scores Individual scores for each task in the task group
"""
extra_args = {} if not extra_args else extra_args

logger.debug(locals())

# TODO: make this a parameter for class?
@@ -156,7 +172,10 @@ def run(self, server_url: str | None = None) -> tuple:

return overall_score, individual_scores

def _run_mmlu(self, server_url: str | None = None) -> dict:
def _run_mmlu(
self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None
) -> dict:
extra_args = {} if not extra_args else extra_args

if server_url is not None:
# Requires lm_eval >= 0.4.4
model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface"
@@ -172,19 +191,25 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
raise InvalidTasksDirError(self.tasks_dir)
tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
should_apply_chat_template = self.system_prompt is not None
mmlu_output = self._simple_evaluate_with_error_handling(
model=model,
model_args=model_args,
tasks=self.tasks,
num_fewshot=self.few_shots,
batch_size=self.batch_size,
device=self.device,
task_manager=tm,
system_instruction=self.system_prompt,
apply_chat_template=should_apply_chat_template,

# configure the args here so users can override them as necessary
simple_evaluate_kwargs = {
"model": model,
"model_args": model_args,
"tasks": self.tasks,
"num_fewshot": self.few_shots,
"batch_size": self.batch_size,
"device": self.device,
"task_manager": tm,
"system_instruction": self.system_prompt,
"apply_chat_template": should_apply_chat_template,
}
simple_evaluate_kwargs.update(extra_args)

self._results = self._simple_evaluate_with_error_handling(
**simple_evaluate_kwargs
)
results = mmlu_output["results"]
return results
return self.results["results"]

# This method converts general errors from simple_evaluate
# into a more user-understandable error
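The kwargs merge in _run_mmlu above means any key supplied via extra_args replaces the corresponding default before simple_evaluate is called. A standalone sketch of that dict-update behavior, with made-up values:

    defaults = {
        "num_fewshot": 5,
        "batch_size": "auto",
        "apply_chat_template": False,
    }
    overrides = {"num_fewshot": 0, "log_samples": True}  # caller-supplied extra_args

    merged = dict(defaults)
    merged.update(overrides)  # keys from overrides win; new keys are added
    print(merged)
    # {'num_fewshot': 0, 'batch_size': 'auto', 'apply_chat_template': False, 'log_samples': True}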
