From 823c2791135d3062411893b1c506b49f93ed5fac Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Wed, 27 Nov 2024 15:54:34 -0500
Subject: [PATCH] Ensure knowledge docs are cloned into unique dirs

Previously, we were attempting to clone multiple knowledge documents into
the same destination directory, leading to failures generating data for any
run that contained 2+ knowledge leaf nodes.

Now, we clone docs into a guaranteed unique (via `tempfile.mkdtemp`)
subdirectory per knowledge leaf node. Just using a subdirectory per leaf
node could still have led to collisions if the user ran data generation
twice within one minute, which is why this goes the extra step of using
`mkdtemp` for guaranteed uniqueness.

Fixes #404

Signed-off-by: Ben Browning
---
 src/instructlab/sdg/utils/taxonomy.py |  9 +++++-
 tests/test_generate_data.py           | 44 +++++++++++++++++----------
 2 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 00743d93..24592fe1 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -2,6 +2,7 @@
 # Standard
 from pathlib import Path
+from tempfile import mkdtemp
 from typing import Dict, List, Tuple, Union
 import glob
 import logging
@@ -257,14 +258,20 @@ def _read_taxonomy_file(
     try:
         # get seed instruction data
         tax_path = "->".join(taxonomy.path.parent.parts)
+        leaf_node_path = tax_path.replace("->", "_")
         contents = taxonomy.contents
         task_description = contents.get("task_description", None)
         domain = contents.get("domain")
         documents = contents.get("document")
         document_contents, doc_filepaths = None, None
         if documents:
+            os.makedirs(document_output_dir, exist_ok=True)
+            unique_output_dir = mkdtemp(
+                prefix=f"{leaf_node_path}_", dir=document_output_dir
+            )
             document_contents, doc_filepaths = _get_documents(
-                source=documents, document_output_dir=document_output_dir
+                source=documents,
+                document_output_dir=unique_output_dir,
             )
             logger.debug("Content from git repo fetched")
diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py
index 0d04a80f..d3f6ced5 100644
--- a/tests/test_generate_data.py
+++ b/tests/test_generate_data.py
@@ -119,10 +119,10 @@ def validate_phase_leaf_node_dataset(dataset_file_name):
     assert ds.features["messages"][0]["role"].dtype == "string"


-def validate_recipe(recipe_file_name):
+def validate_recipe(recipe_file_name, num_datasets):
     with open(recipe_file_name, encoding="utf-8") as fp:
         yaml_contents = yaml.safe_load(fp)
-        assert len(yaml_contents["datasets"]) == 1
+        assert len(yaml_contents["datasets"]) == num_datasets
         assert yaml_contents["datasets"][0]["path"].endswith(".jsonl")
         assert "sampling_size" in yaml_contents["datasets"][0]
         assert yaml_contents["metadata"]["sys_prompt"] == TEST_SYS_PROMPT
@@ -344,7 +344,7 @@ def test_generate(self):
             if name.endswith("compositional_skills_new.jsonl"):
                 validate_skill_leaf_node_dataset(matches[0])
             elif name.startswith("skills_recipe_"):
-                validate_recipe(matches[0])
+                validate_recipe(matches[0], 1)
             elif name.startswith("skills_train_msgs_"):
                 validate_mixed_dataset(matches[0])
@@ -374,13 +374,19 @@ def setUp(self):
             TEST_DATA_DIR, "test_valid_knowledge_skill.yaml"
         )
         tracked_knowledge_file = os.path.join("knowledge ", "tracked", "qna.yaml")
-        untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml")
+        # Explicitly add 2 files here to ensure multiple knowledge leaf nodes
+        # don't conflict in anything like document_output_dir for knowledge docs
+        untracked_knowledge_file1 = os.path.join("knowledge", "new1", "qna.yaml")
+        untracked_knowledge_file2 = os.path.join("knowledge", "new2", "qna.yaml")
         test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file)
         self.test_taxonomy.add_tracked(
             tracked_knowledge_file, test_valid_knowledge_skill
         )
         self.test_taxonomy.create_untracked(
-            untracked_knowledge_file, test_valid_knowledge_skill
+            untracked_knowledge_file1, test_valid_knowledge_skill
+        )
+        self.test_taxonomy.create_untracked(
+            untracked_knowledge_file2, test_valid_knowledge_skill
         )
         self.expected_test_samples = generate_test_samples(test_valid_knowledge_skill)
         self.expected_train_samples = generate_train_samples(test_valid_knowledge_skill)
@@ -412,40 +418,46 @@ def test_generate(self):
             elif name.startswith("messages_"):
                 validate_messages_dataset(matches[0], self.expected_train_samples)

-        node_p07_file = os.path.join("node_datasets_*", "knowledge_new_p07.jsonl")
-        node_p10_file = os.path.join("node_datasets_*", "knowledge_new_p10.jsonl")
+        node1_p07_file = os.path.join("node_datasets_*", "knowledge_new1_p07.jsonl")
+        node1_p10_file = os.path.join("node_datasets_*", "knowledge_new1_p10.jsonl")
+        node2_p07_file = os.path.join("node_datasets_*", "knowledge_new2_p07.jsonl")
+        node2_p10_file = os.path.join("node_datasets_*", "knowledge_new2_p10.jsonl")
         for name in [
             "skills_recipe_*.yaml",
             "skills_train_*.jsonl",
             "knowledge_recipe_*.yaml",
             "knowledge_train_msgs_*.jsonl",
-            node_p07_file,
-            node_p10_file,
+            node1_p07_file,
+            node1_p10_file,
+            node2_p07_file,
+            node2_p10_file,
         ]:
             matches = glob.glob(os.path.join(self.tmp_path, name))
             assert len(matches) == 1
-            if name.endswith("knowledge_new_p07.jsonl") or name.endswith(
-                "knowledge_new_p10.jsonl"
+            if name.endswith("knowledge_new1_p07.jsonl") or name.endswith(
+                "knowledge_new1_p10.jsonl"
             ):
                 validate_phase_leaf_node_dataset(matches[0])
             elif name.startswith("skills_recipe_") or name.startswith(
                 "knowledge_recipe_"
             ):
-                validate_recipe(matches[0])
+                validate_recipe(matches[0], 2)
             elif name.startswith("skills_train_msgs_") or name.startswith(
                 "knowledge_train_msgs_"
             ):
                 validate_mixed_dataset(matches[0])

         for name in [
-            "knowledge_new_task.yaml",
-            "mmlubench_knowledge_new.jsonl",
+            "knowledge_new1_task.yaml",
+            "mmlubench_knowledge_new1.jsonl",
+            "knowledge_new2_task.yaml",
+            "mmlubench_knowledge_new2.jsonl",
         ]:
             matches = glob.glob(os.path.join(self.tmp_path, "node_datasets_*", name))
             assert len(matches) == 1
-            if name == "knowledge_new_task.yaml":
+            if name == "knowledge_new1_task.yaml":
                 validate_lm_eval_task(matches[0])
-            elif name == "mmlubench_knowledge_new.jsonl":
+            elif name == "mmlubench_knowledge_new1.jsonl":
                 validate_mmlubench_dataset(matches[0])

     def teardown(self) -> None:
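
For context, a minimal sketch (not part of the patch itself) of the approach the `taxonomy.py` change takes: each knowledge leaf node gets its own `mkdtemp`-created subdirectory under the shared `document_output_dir`, so neither multiple leaf nodes nor back-to-back runs can collide. The helper name `unique_doc_dir` below is hypothetical; in the patch the equivalent logic lives inline in `_read_taxonomy_file`.

```python
# Illustrative sketch only: `unique_doc_dir` is a hypothetical helper, while
# `document_output_dir` and the "->"-joined tax_path mirror the patch above.
import os
from tempfile import mkdtemp


def unique_doc_dir(document_output_dir: str, tax_path: str) -> str:
    """Create and return a brand-new directory for one leaf node's documents."""
    leaf_node_path = tax_path.replace("->", "_")
    # mkdtemp requires the parent directory to exist already.
    os.makedirs(document_output_dir, exist_ok=True)
    # mkdtemp appends a random suffix, so two leaf nodes, or two runs of the
    # same leaf node, never share a destination directory.
    return mkdtemp(prefix=f"{leaf_node_path}_", dir=document_output_dir)


# Two calls for the same leaf node yield two distinct directories.
out = "/tmp/knowledge_docs"
assert unique_doc_dir(out, "knowledge->new1") != unique_doc_dir(out, "knowledge->new1")
```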