Skip to content

Commit

Permalink
feat: allow configurable skills recipe
Browse files Browse the repository at this point in the history
Introduced the ability to specify a custom skills recipe file via the
`skills_recipe_path` option. This enhancement enables users to
fine-tune data mixing processes by providing a tailored YAML recipe.
Practical use cases include adjusting sampling sizes or customizing
dataset definitions.

Co-authored-by: Michael Clifford <[email protected]>
Signed-off-by: Sébastien Han <[email protected]>
  • Loading branch information
leseb and MichaelClifford committed Dec 9, 2024
1 parent f7c35d7 commit 55e85c0
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 5 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## v0.7.0

### Features

* Ability to configure skills recipe. This is managed through the `skills_recipe_path` option, which specifies the file path to a YAML file. This feature is helpful for adjusting the sample size.

## v0.6.1

### Fixes
Expand Down
45 changes: 40 additions & 5 deletions src/instructlab/sdg/datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ def __init__(
sys_prompt,
num_procs,
auxiliary_inst=None,
skills_recipe_path=None,
):
# HACK(osilkin): This is used to upsample the knowledge dataset when the **pre-computed** skills dataset
# far exceeds the size of our knowledge samples. This will be removed in the future
Expand All @@ -616,19 +617,32 @@ def __init__(
self.num_procs = num_procs
self.auxiliary_inst = auxiliary_inst

self.knowledge_recipe = self._load_default_recipe("knowledge.yaml")
self.skills_recipe = self._load_default_recipe("skills.yaml")
self.knowledge_recipe = self._load_recipe("knowledge.yaml")
self.skills_recipe = self._load_recipe("skills.yaml", skills_recipe_path)

self.output_file_knowledge_recipe = f"knowledge_recipe_{date_suffix}.yaml"
self.output_file_skills_recipe = f"skills_recipe_{date_suffix}.yaml"
self.output_file_mixed_knowledge = f"knowledge_train_msgs_{date_suffix}.jsonl"
self.output_file_mixed_skills = f"skills_train_msgs_{date_suffix}.jsonl"

def _load_default_recipe(self, yaml_basename):
def _load_recipe(self, yaml_basename, recipe_path=None):
"""
Load a default system recipe from e.g. /usr/share/instructlab/sdg/default_data_recipes
if it exists, otherwise return an empty recipe.
Load a recipe from the specified file path or fallback to the default recipe. If the default
recipe is not present in any of the data directories, an empty recipe is returned.
Args:
yaml_basename (str): The base name of the default recipe YAML file.
recipe_path (str, optional): Full path to a custom recipe file. Defaults to None.
Returns:
Recipe: Loaded recipe object.
Raises:
GenerateException: If the provided path is a directory or does not exist.
"""
if recipe_path:
_validate_yaml_file_path(recipe_path)
return Recipe(recipe_path=recipe_path, sys_prompt=self.sys_prompt)
for d in self.data_dirs:
default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
if os.path.exists(default_recipe_path):
Expand Down Expand Up @@ -750,3 +764,24 @@ def generate(self):
self.output_file_skills_recipe,
self.output_file_mixed_skills,
)


def _validate_yaml_file_path(file_path):
try:
# Existing checks
with open(file_path, "r", encoding="utf-8") as f:
yaml.safe_load(f) # Ensure valid YAML format
except IsADirectoryError as e:
raise GenerateException(
f"Recipe path is a directory, not a file: {file_path}"
) from e
except PermissionError as e:
raise GenerateException(
f"Permission denied for recipe file: {file_path}"
) from e
except FileNotFoundError as e:
raise GenerateException(f"Recipe file not found: {file_path}") from e
except yaml.YAMLError as e:
raise GenerateException(
f"Invalid YAML format in recipe file: {file_path}"
) from e
5 changes: 5 additions & 0 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ def _mixer_init(
date_suffix,
knowledge_auxiliary_inst,
system_prompt,
skills_recipe_path=None,
):
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
Expand All @@ -282,6 +283,7 @@ def _mixer_init(
system_prompt,
ctx.dataset_num_procs,
knowledge_auxiliary_inst,
skills_recipe_path,
)


Expand All @@ -308,6 +310,7 @@ def generate_data(
batch_size: Optional[int] = None,
checkpoint_dir: Optional[str] = None,
max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
skills_recipe_path: Optional[str] = None,
) -> None:
"""Generate data for training and testing a model.
Expand All @@ -322,6 +325,7 @@ def generate_data(
or an absolute path to a directory containing the pipeline YAML files.
We expect three files to be present in this directory: "knowledge.yaml",
"freeform_skills.yaml", and "grounded_skills.yaml".
skills_recipe_path: Path to a YAML file containing the skills recipe.
"""
generate_start = time.time()

Expand Down Expand Up @@ -390,6 +394,7 @@ def generate_data(
date_suffix,
knowledge_pipe.auxiliary_inst,
system_prompt,
skills_recipe_path,
)

if console_output:
Expand Down
77 changes: 77 additions & 0 deletions tests/test_datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@
from importlib import resources
from unittest.mock import patch
import os
import pathlib

# Third Party
from datasets import Dataset
import pytest
import yaml

# First Party
from instructlab.sdg.datamixing import DataMixer, Recipe, _add_extra_contexts_to_samples
from instructlab.sdg.utils import GenerateException

# We mock out the actual things that use num_procs anyway, but just
# for a consistent value in the tests...
Expand Down Expand Up @@ -55,6 +59,79 @@ def test_datamixer_can_load_default_recipes():
assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl"


@pytest.mark.parametrize(
"skills_recipe_case", ["directory", "non_existent", "invalid_yaml", "valid"]
)
def test_datamixer_can_load_specifix_recipe_file(
tmp_path: pathlib.Path, skills_recipe_case
):
"""
Test that DataMixer can load a given recipe file by pointing
it at a simple set of test recipe file.
This test checks when the skills_recipe_path is a directory,
does not exist, or is a valid file.
"""
date_suffix = "2024-07-25T15_52_10"
prompt = "You are a useful AI assistant."

if skills_recipe_case == "valid":
skills_recipe_path = tmp_path / "skills_recipe_path.yaml"
skills_recipe_path.write_text("""
datasets:
- path: "test/skills.jsonl"
sampling_size: 1.0
""")
elif skills_recipe_case == "directory":
skills_recipe_path = tmp_path / "skills_recipe_dir"
skills_recipe_path.mkdir()
elif skills_recipe_case == "invalid_yaml":
skills_recipe_path = tmp_path / "invalid_yaml_recipe.yaml"
skills_recipe_path.write_text("[invalid_yaml")
else: # non_existent
skills_recipe_path = tmp_path / "non_existent_recipe.yaml"

if skills_recipe_case == "valid":
mixer = DataMixer(
[TEST_DATA_DIR],
TEST_DATA_DIR,
date_suffix,
prompt,
TEST_NUM_PROCS,
skills_recipe_path=str(skills_recipe_path),
)
assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl"
elif skills_recipe_case == "directory":
with pytest.raises(GenerateException, match="Recipe path is a directory"):
DataMixer(
[TEST_DATA_DIR],
TEST_DATA_DIR,
date_suffix,
prompt,
TEST_NUM_PROCS,
skills_recipe_path=str(skills_recipe_path),
)
elif skills_recipe_case == "invalid_yaml":
with pytest.raises(GenerateException, match="Invalid YAML format"):
DataMixer(
[TEST_DATA_DIR],
TEST_DATA_DIR,
date_suffix,
prompt,
TEST_NUM_PROCS,
skills_recipe_path=str(skills_recipe_path),
)
else: # non_existent
with pytest.raises(GenerateException, match="Recipe file not found"):
DataMixer(
[TEST_DATA_DIR],
TEST_DATA_DIR,
date_suffix,
prompt,
TEST_NUM_PROCS,
skills_recipe_path=str(skills_recipe_path),
)


def test_recipe_init_with_empty_params_adds_dataset():
"""
Test that an empty-initialized recipe can add datasets
Expand Down

0 comments on commit 55e85c0

Please sign in to comment.