
Commit

Merge pull request #416 from bbrowning/unique-document-dir
Ensure knowledge docs are cloned into unique dirs
bbrowning authored Nov 27, 2024
2 parents eb8119c + 823c279 commit eef8bae
Showing 2 changed files with 36 additions and 17 deletions.
9 changes: 8 additions & 1 deletion src/instructlab/sdg/utils/taxonomy.py
@@ -2,6 +2,7 @@

# Standard
from pathlib import Path
from tempfile import mkdtemp
from typing import Dict, List, Tuple, Union
import glob
import logging
@@ -257,14 +258,20 @@ def _read_taxonomy_file(
try:
# get seed instruction data
tax_path = "->".join(taxonomy.path.parent.parts)
leaf_node_path = tax_path.replace("->", "_")
contents = taxonomy.contents
task_description = contents.get("task_description", None)
domain = contents.get("domain")
documents = contents.get("document")
document_contents, doc_filepaths = None, None
if documents:
os.makedirs(document_output_dir, exist_ok=True)
unique_output_dir = mkdtemp(
prefix=f"{leaf_node_path}_", dir=document_output_dir
)
document_contents, doc_filepaths = _get_documents(
source=documents, document_output_dir=document_output_dir
source=documents,
document_output_dir=unique_output_dir,
)
logger.debug("Content from git repo fetched")

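The core of the fix, shown as a small standalone sketch rather than the instructlab code itself (the parent path and leaf-node prefixes below are illustrative): tempfile.mkdtemp creates a new, uniquely named directory on every call, so knowledge leaf nodes that share the same document_output_dir can no longer write their cloned documents into the same place.

# Minimal sketch of the unique-directory behavior, assuming two leaf nodes
# named "knowledge_new1" and "knowledge_new2" (names are hypothetical).
from tempfile import mkdtemp
import os

document_output_dir = "/tmp/sdg-docs"  # shared parent dir (hypothetical path)
os.makedirs(document_output_dir, exist_ok=True)

dir1 = mkdtemp(prefix="knowledge_new1_", dir=document_output_dir)
dir2 = mkdtemp(prefix="knowledge_new2_", dir=document_output_dir)

# mkdtemp appends a random suffix, so even identical prefixes would not collide.
print(dir1)  # e.g. /tmp/sdg-docs/knowledge_new1_ab12cd34
print(dir2)  # e.g. /tmp/sdg-docs/knowledge_new2_ef56gh78
assert dir1 != dir2

The test changes below exercise exactly this: two knowledge leaf nodes are added so their document directories and generated datasets must not conflict.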
44 changes: 28 additions & 16 deletions tests/test_generate_data.py
@@ -119,10 +119,10 @@ def validate_phase_leaf_node_dataset(dataset_file_name):
assert ds.features["messages"][0]["role"].dtype == "string"


def validate_recipe(recipe_file_name):
def validate_recipe(recipe_file_name, num_datasets):
with open(recipe_file_name, encoding="utf-8") as fp:
yaml_contents = yaml.safe_load(fp)
assert len(yaml_contents["datasets"]) == 1
assert len(yaml_contents["datasets"]) == num_datasets
assert yaml_contents["datasets"][0]["path"].endswith(".jsonl")
assert "sampling_size" in yaml_contents["datasets"][0]
assert yaml_contents["metadata"]["sys_prompt"] == TEST_SYS_PROMPT
@@ -344,7 +344,7 @@ def test_generate(self):
if name.endswith("compositional_skills_new.jsonl"):
validate_skill_leaf_node_dataset(matches[0])
elif name.startswith("skills_recipe_"):
validate_recipe(matches[0])
validate_recipe(matches[0], 1)
elif name.startswith("skills_train_msgs_"):
validate_mixed_dataset(matches[0])

@@ -374,13 +374,19 @@ def setUp(self):
TEST_DATA_DIR, "test_valid_knowledge_skill.yaml"
)
tracked_knowledge_file = os.path.join("knowledge ", "tracked", "qna.yaml")
untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml")
# Explicitly add 2 files here to ensure multiple knowledge leaf nodes
# don't conflict in anything like document_output_dir for knowledge docs
untracked_knowledge_file1 = os.path.join("knowledge", "new1", "qna.yaml")
untracked_knowledge_file2 = os.path.join("knowledge", "new2", "qna.yaml")
test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file)
self.test_taxonomy.add_tracked(
tracked_knowledge_file, test_valid_knowledge_skill
)
self.test_taxonomy.create_untracked(
untracked_knowledge_file, test_valid_knowledge_skill
untracked_knowledge_file1, test_valid_knowledge_skill
)
self.test_taxonomy.create_untracked(
untracked_knowledge_file2, test_valid_knowledge_skill
)
self.expected_test_samples = generate_test_samples(test_valid_knowledge_skill)
self.expected_train_samples = generate_train_samples(test_valid_knowledge_skill)
@@ -412,40 +418,46 @@ def test_generate(self):
elif name.startswith("messages_"):
validate_messages_dataset(matches[0], self.expected_train_samples)

node_p07_file = os.path.join("node_datasets_*", "knowledge_new_p07.jsonl")
node_p10_file = os.path.join("node_datasets_*", "knowledge_new_p10.jsonl")
node1_p07_file = os.path.join("node_datasets_*", "knowledge_new1_p07.jsonl")
node1_p10_file = os.path.join("node_datasets_*", "knowledge_new1_p10.jsonl")
node2_p07_file = os.path.join("node_datasets_*", "knowledge_new2_p07.jsonl")
node2_p10_file = os.path.join("node_datasets_*", "knowledge_new2_p10.jsonl")
for name in [
"skills_recipe_*.yaml",
"skills_train_*.jsonl",
"knowledge_recipe_*.yaml",
"knowledge_train_msgs_*.jsonl",
node_p07_file,
node_p10_file,
node1_p07_file,
node1_p10_file,
node2_p07_file,
node2_p10_file,
]:
matches = glob.glob(os.path.join(self.tmp_path, name))
assert len(matches) == 1
if name.endswith("knowledge_new_p07.jsonl") or name.endswith(
"knowledge_new_p10.jsonl"
if name.endswith("knowledge_new1_p07.jsonl") or name.endswith(
"knowledge_new1_p10.jsonl"
):
validate_phase_leaf_node_dataset(matches[0])
elif name.startswith("skills_recipe_") or name.startswith(
"knowledge_recipe_"
):
validate_recipe(matches[0])
validate_recipe(matches[0], 2)
elif name.startswith("skills_train_msgs_") or name.startswith(
"knowledge_train_msgs_"
):
validate_mixed_dataset(matches[0])

for name in [
"knowledge_new_task.yaml",
"mmlubench_knowledge_new.jsonl",
"knowledge_new1_task.yaml",
"mmlubench_knowledge_new1.jsonl",
"knowledge_new2_task.yaml",
"mmlubench_knowledge_new2.jsonl",
]:
matches = glob.glob(os.path.join(self.tmp_path, "node_datasets_*", name))
assert len(matches) == 1
if name == "knowledge_new_task.yaml":
if name == "knowledge_new1_task.yaml":
validate_lm_eval_task(matches[0])
elif name == "mmlubench_knowledge_new.jsonl":
elif name == "mmlubench_knowledge_new1.jsonl":
validate_mmlubench_dataset(matches[0])

def teardown(self) -> None:
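For reference, a hedged sketch of the recipe shape the parameterized validate_recipe(recipe_file_name, num_datasets) now accepts when two knowledge leaf nodes are present. Only the keys asserted in the diff above (datasets, path, sampling_size, metadata.sys_prompt) are taken from the test; the file names, sampling sizes, and prompt text are made up for illustration.

# Illustrative only: a recipe with one "datasets" entry per knowledge leaf node,
# checked with the same assertions validate_recipe applies. Values are hypothetical.
import yaml

recipe_text = """
datasets:
  - path: node_datasets_2024-11-27/knowledge_new1_p10.jsonl
    sampling_size: 1.0
  - path: node_datasets_2024-11-27/knowledge_new2_p10.jsonl
    sampling_size: 1.0
metadata:
  sys_prompt: "example system prompt"
"""

contents = yaml.safe_load(recipe_text)
assert len(contents["datasets"]) == 2                      # num_datasets == 2
assert contents["datasets"][0]["path"].endswith(".jsonl")
assert "sampling_size" in contents["datasets"][0]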
