diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f56e2c6..8d605493 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## v0.7.0 + +### Features + +* Ability to configure skills recipe. This is managed through the `skills_recipe_path` option, which specifies the file path to a YAML file. This feature is helpful for adjusting the sample size. + ## v0.6.1 ### Fixes diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index e6ca8675..2239aacd 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -604,6 +604,7 @@ def __init__( sys_prompt, num_procs, auxiliary_inst=None, + skills_recipe_path=None, ): # HACK(osilkin): This is used to upsample the knowledge dataset when the **pre-computed** skills dataset # far exceeds the size of our knowledge samples. This will be removed in the future @@ -616,19 +617,32 @@ def __init__( self.num_procs = num_procs self.auxiliary_inst = auxiliary_inst - self.knowledge_recipe = self._load_default_recipe("knowledge.yaml") - self.skills_recipe = self._load_default_recipe("skills.yaml") + self.knowledge_recipe = self._load_recipe("knowledge.yaml") + self.skills_recipe = self._load_recipe("skills.yaml", skills_recipe_path) self.output_file_knowledge_recipe = f"knowledge_recipe_{date_suffix}.yaml" self.output_file_skills_recipe = f"skills_recipe_{date_suffix}.yaml" self.output_file_mixed_knowledge = f"knowledge_train_msgs_{date_suffix}.jsonl" self.output_file_mixed_skills = f"skills_train_msgs_{date_suffix}.jsonl" - def _load_default_recipe(self, yaml_basename): + def _load_recipe(self, yaml_basename, recipe_path=None): """ - Load a default system recipe from e.g. /usr/share/instructlab/sdg/default_data_recipes - if it exists, otherwise return an empty recipe. + Load a recipe from the specified file path or fallback to the default recipe. If the default + recipe is not present in any of the data directories, an empty recipe is returned. + + Args: + yaml_basename (str): The base name of the default recipe YAML file. + recipe_path (str, optional): Full path to a custom recipe file. Defaults to None. + + Returns: + Recipe: Loaded recipe object. + + Raises: + GenerateException: If the provided path is a directory or does not exist. """ + if recipe_path: + _validate_yaml_file_path(recipe_path) + return Recipe(recipe_path=recipe_path, sys_prompt=self.sys_prompt) for d in self.data_dirs: default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename) if os.path.exists(default_recipe_path): @@ -750,3 +764,24 @@ def generate(self): self.output_file_skills_recipe, self.output_file_mixed_skills, ) + + +def _validate_yaml_file_path(file_path): + try: + # Existing checks + with open(file_path, "r", encoding="utf-8") as f: + yaml.safe_load(f) # Ensure valid YAML format + except IsADirectoryError as e: + raise GenerateException( + f"Recipe path is a directory, not a file: {file_path}" + ) from e + except PermissionError as e: + raise GenerateException( + f"Permission denied for recipe file: {file_path}" + ) from e + except FileNotFoundError as e: + raise GenerateException(f"Recipe file not found: {file_path}") from e + except yaml.YAMLError as e: + raise GenerateException( + f"Invalid YAML format in recipe file: {file_path}" + ) from e diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index aac007b7..7ea8fc3c 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -271,6 +271,7 @@ def _mixer_init( date_suffix, knowledge_auxiliary_inst, system_prompt, + skills_recipe_path=None, ): data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) @@ -282,6 +283,7 @@ def _mixer_init( system_prompt, ctx.dataset_num_procs, knowledge_auxiliary_inst, + skills_recipe_path, ) @@ -308,6 +310,7 @@ def generate_data( batch_size: Optional[int] = None, checkpoint_dir: Optional[str] = None, max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS, + skills_recipe_path: Optional[str] = None, ) -> None: """Generate data for training and testing a model. @@ -322,6 +325,7 @@ def generate_data( or an absolute path to a directory containing the pipeline YAML files. We expect three files to be present in this directory: "knowledge.yaml", "freeform_skills.yaml", and "grounded_skills.yaml". + skills_recipe_path: Path to a YAML file containing the skills recipe. """ generate_start = time.time() @@ -390,6 +394,7 @@ def generate_data( date_suffix, knowledge_pipe.auxiliary_inst, system_prompt, + skills_recipe_path, ) if console_output: diff --git a/tests/test_datamixing.py b/tests/test_datamixing.py index 2b2c4f34..272888f0 100644 --- a/tests/test_datamixing.py +++ b/tests/test_datamixing.py @@ -8,12 +8,16 @@ from importlib import resources from unittest.mock import patch import os +import pathlib # Third Party from datasets import Dataset +import pytest +import yaml # First Party from instructlab.sdg.datamixing import DataMixer, Recipe, _add_extra_contexts_to_samples +from instructlab.sdg.utils import GenerateException # We mock out the actual things that use num_procs anyway, but just # for a consistent value in the tests... @@ -55,6 +59,79 @@ def test_datamixer_can_load_default_recipes(): assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl" +@pytest.mark.parametrize( + "skills_recipe_case", ["directory", "non_existent", "invalid_yaml", "valid"] +) +def test_datamixer_can_load_specifix_recipe_file( + tmp_path: pathlib.Path, skills_recipe_case +): + """ + Test that DataMixer can load a given recipe file by pointing + it at a simple set of test recipe file. + This test checks when the skills_recipe_path is a directory, + does not exist, or is a valid file. + """ + date_suffix = "2024-07-25T15_52_10" + prompt = "You are a useful AI assistant." + + if skills_recipe_case == "valid": + skills_recipe_path = tmp_path / "skills_recipe_path.yaml" + skills_recipe_path.write_text(""" + datasets: + - path: "test/skills.jsonl" + sampling_size: 1.0 + """) + elif skills_recipe_case == "directory": + skills_recipe_path = tmp_path / "skills_recipe_dir" + skills_recipe_path.mkdir() + elif skills_recipe_case == "invalid_yaml": + skills_recipe_path = tmp_path / "invalid_yaml_recipe.yaml" + skills_recipe_path.write_text("[invalid_yaml") + else: # non_existent + skills_recipe_path = tmp_path / "non_existent_recipe.yaml" + + if skills_recipe_case == "valid": + mixer = DataMixer( + [TEST_DATA_DIR], + TEST_DATA_DIR, + date_suffix, + prompt, + TEST_NUM_PROCS, + skills_recipe_path=str(skills_recipe_path), + ) + assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl" + elif skills_recipe_case == "directory": + with pytest.raises(GenerateException, match="Recipe path is a directory"): + DataMixer( + [TEST_DATA_DIR], + TEST_DATA_DIR, + date_suffix, + prompt, + TEST_NUM_PROCS, + skills_recipe_path=str(skills_recipe_path), + ) + elif skills_recipe_case == "invalid_yaml": + with pytest.raises(GenerateException, match="Invalid YAML format"): + DataMixer( + [TEST_DATA_DIR], + TEST_DATA_DIR, + date_suffix, + prompt, + TEST_NUM_PROCS, + skills_recipe_path=str(skills_recipe_path), + ) + else: # non_existent + with pytest.raises(GenerateException, match="Recipe file not found"): + DataMixer( + [TEST_DATA_DIR], + TEST_DATA_DIR, + date_suffix, + prompt, + TEST_NUM_PROCS, + skills_recipe_path=str(skills_recipe_path), + ) + + def test_recipe_init_with_empty_params_adds_dataset(): """ Test that an empty-initialized recipe can add datasets