From 8336e42431777f9b2ed62f57c2d0edf68d429b55 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Tue, 7 Jan 2025 12:21:43 -0500 Subject: [PATCH] Reduce scope of the PR - removing CLI, less churn Signed-off-by: Ben Browning --- src/instructlab/sdg/__init__.py | 5 +- .../sdg/cli/preprocess_taxonomy.py | 82 ------- src/instructlab/sdg/generate_data.py | 189 +++++++++++++++- src/instructlab/sdg/taxonomy.py | 201 ------------------ src/instructlab/sdg/utils/logging.py | 22 -- src/instructlab/sdg/utils/taxonomy.py | 4 - tests/test_generate_data.py | 21 +- tests/test_taxonomy.py | 19 +- 8 files changed, 211 insertions(+), 332 deletions(-) delete mode 100644 src/instructlab/sdg/cli/preprocess_taxonomy.py delete mode 100644 src/instructlab/sdg/taxonomy.py delete mode 100644 src/instructlab/sdg/utils/logging.py diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py index ccef90d8..5cc9d95f 100644 --- a/src/instructlab/sdg/__init__.py +++ b/src/instructlab/sdg/__init__.py @@ -29,7 +29,7 @@ "FULL_PIPELINES_PACKAGE", "SIMPLE_PIPELINES_PACKAGE", "generate_data", - "preprocess_taxonomy", + "mix_datasets", ) # Local @@ -51,7 +51,7 @@ SelectorBlock, SetToMajorityValueBlock, ) -from .generate_data import generate_data +from .generate_data import generate_data, mix_datasets from .pipeline import ( FULL_PIPELINES_PACKAGE, SIMPLE_PIPELINES_PACKAGE, @@ -62,6 +62,5 @@ PipelineContext, ) from .registry import BlockRegistry, PromptRegistry -from .taxonomy import preprocess_taxonomy from .utils import GenerateException from .utils.taxonomy import TaxonomyReadingException diff --git a/src/instructlab/sdg/cli/preprocess_taxonomy.py b/src/instructlab/sdg/cli/preprocess_taxonomy.py deleted file mode 100644 index b7d22532..00000000 --- a/src/instructlab/sdg/cli/preprocess_taxonomy.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Standard -import os - -# First Party -from instructlab.sdg.taxonomy import ( - DEFAULT_CHUNK_WORD_COUNT, - DEFAULT_SERVER_CTX_SIZE, - DEFAULT_TAXONOMY_BASE, - preprocess_taxonomy, -) -from instructlab.sdg.utils.logging import setup_logger - -if __name__ == "__main__": - # Standard - import argparse - - parser = argparse.ArgumentParser( - description="Turn a taxonomy into json samples suitable for use as input to data generate pipelines" - ) - - # Required args - parser.add_argument( - "--output-dir", - type=str, - required=True, - help="Directory to write the processed dataset samples into", - ) - parser.add_argument( - "--taxonomy-path", - type=str, - required=True, - help="Path to your InstructLab taxonomy", - ) - - # Optional args - parser.add_argument( - "--chunk-word-count", - type=int, - default=DEFAULT_CHUNK_WORD_COUNT, - help="Number of words per document chunk", - ) - parser.add_argument( - "--log-level", - type=str, - default=os.getenv("LOG_LEVEL", "INFO"), - help="Logging level", - ) - parser.add_argument( - "--server-ctx-size", - type=int, - default=DEFAULT_SERVER_CTX_SIZE, - help="The maximum number of tokens the inference server can handle.", - ) - parser.add_argument( - "--taxonomy-base", - type=str, - default=DEFAULT_TAXONOMY_BASE, - help="Taxonomy based used to determine what has changed - defaults to 'empty' which means consider all the taxonomy files as changed and process all of them", - ) - parser.add_argument( - "--yaml-rules", - type=str, - default=None, - help="Path to custom rules file for YAML linting", - ) - - args = parser.parse_args() - setup_logger(args.log_level) - preprocess_taxonomy( - args.taxonomy_path, - 
args.output_dir, - chunk_word_count=args.chunk_word_count, - server_ctx_size=args.server_ctx_size, - taxonomy_base=args.taxonomy_base, - yaml_rules=args.yaml_rules, - ) - -""" -python -m instructlab.sdg.cli.preprocess_taxonomy --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output -""" diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 3f338ecc..9091653f 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -16,6 +16,7 @@ from datasets import Dataset from xdg_base_dirs import xdg_data_dirs, xdg_data_home import openai +import yaml # First Party from instructlab.sdg.blocks.llmblock import DEFAULT_MAX_NUM_TOKENS @@ -32,15 +33,25 @@ Pipeline, PipelineContext, ) -from instructlab.sdg.taxonomy import preprocess_taxonomy from instructlab.sdg.utils import GenerateException from instructlab.sdg.utils.json import jldump, jlload -from instructlab.sdg.utils.taxonomy import _unescape +from instructlab.sdg.utils.taxonomy import ( + leaf_node_to_samples, + read_taxonomy_leaf_nodes, +) logger = logging.getLogger(__name__) _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant." +DEFAULT_CHUNK_WORD_COUNT = 1000 +DEFAULT_TAXONOMY_BASE = "empty" +DEFAULT_SERVER_CTX_SIZE = 4096 + + +def _unescape(s): + return bytes(s, "utf-8").decode("utf-8").strip() + def _convert_to_messages(sample): """ @@ -112,6 +123,56 @@ def _gen_train_data( jldump(messages_data, output_file_messages) +def _knowledge_seed_example_to_test_data(seed_example, system_prompt): + res = [] + for i in range(3): + idx = i + 1 + user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example[f"icl_response_{idx}"]), + } + if system_prompt: + test_sample["system"] = system_prompt + res.append(test_sample) + return res + + +def _gen_test_data( + seed_examples, + output_file_test, + system_prompt, +): + """ + Generate test data in the format needed by the legacy Linux training + in instructlab/instructlab. 
+ """ + test_data = [] + for seed_example in seed_examples: + if "icl_query_1" in seed_example: + test_data.extend( + _knowledge_seed_example_to_test_data(seed_example, system_prompt) + ) + continue + + # skill seed example + + user = seed_example["seed_question"] # question + + if seed_example["leaf_node_type"] == "grounded_skill": + user += "\n" + seed_example["seed_context"] # context + + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example["seed_response"]), # answer + } + if system_prompt: + test_sample["system"] = system_prompt + test_data.append(test_sample) + + jldump(test_data, output_file_test) + + def _check_pipeline_dir(pipeline): for file in ["knowledge.yaml", "freeform_skills.yaml", "grounded_skills.yaml"]: if not os.path.exists(os.path.join(pipeline, file)): @@ -120,6 +181,31 @@ def _check_pipeline_dir(pipeline): ) +def _locate_docling_models(): + # Search for the models in User and Site data directories + data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] + data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) + + docling_model_path = None + sdg_models_path = docling_model_path + for d in data_dirs: + if os.path.exists(os.path.join(d, "models")): + sdg_models_path = os.path.join(d, "models") + break + + if sdg_models_path is not None: + try: + with open( + os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" + ) as file: + config = yaml.safe_load(file) + docling_model_path = config["models"][0]["path"] + except (FileNotFoundError, NotADirectoryError, PermissionError) as e: + logger.warning(f"unable to read docling models path from config.yaml {e}") + + return docling_model_path + + def _context_init( client: openai.OpenAI, model_family: str, @@ -217,6 +303,105 @@ def _extract_leaf_node_path_and_type(sample): return leaf_node_path, leaf_node_type +def preprocess_taxonomy( + taxonomy_dir, + output_dir, + chunk_word_count=DEFAULT_CHUNK_WORD_COUNT, # TODO: Remove chunk_word_count param + server_ctx_size=DEFAULT_SERVER_CTX_SIZE, # TODO: Remove server_ctx_size param + taxonomy_base=DEFAULT_TAXONOMY_BASE, + teacher_model_path: Optional[str] = None, + yaml_rules: Optional[str] = None, + test_output_file: Optional[str] = None, + system_prompt: Optional[str] = None, +): + """ + Preprocess a taxonomy into input samples suitable for use with + data generation pipelines. This does the following steps: + + - Determine changed leaf nodes in the taxonomy + - Retrieve knowledge documents for changed taxonomy leaf nodes + - Convert any non-markdown knowledge documents to markdown + - Write the Docling json and markdown outputs from this conversion to + disk for other processes to consume if needed. + - Chunk the converted knowledge documents to the desired chunk sizes. + - Turn the qna.yaml and knowledge documents into samples in the format + expected by the `simple` and `full` data generation pipelines shipped + in SDG. + - Write these samples to disk, with one file per taxonomy leaf node. + + Args: + taxonomy_dir: The path to the taxonomy + output_dir: Where to write the samples create for use with data generation + test_output_file: Path to write the test samples jsonl file + chunk_word_count: The target number of words per document chunk + server_ctx_size: The maximum number of tokens the inference server used + during data generation can handle + taxonomy_base: Determines how we calculate what has changed. 
This should + be a git reference or the special value of 'empty' which + means assume the entire taxonomy has changed. + teacher_model_path: Path to the teacher model on disk, which we'll use to + load its tokenizer for use with document chunking. + yaml_rules: Path to a custom YAML rules file for YAML linting. + system_prompt: System prompt to use when generating test samples + + Returns: + List[str]: The list of output sample files written to disk. + + """ + logging.info("Converting taxonomy to samples") + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True) + output_files = [] + + if not (taxonomy_dir and os.path.exists(taxonomy_dir)): + raise GenerateException(f"Error: taxonomy ({taxonomy_dir}) does not exist.") + + document_output_dir = output_dir.joinpath("documents") + docling_model_path = _locate_docling_models() + + leaf_nodes = read_taxonomy_leaf_nodes( + taxonomy_dir, taxonomy_base, yaml_rules, document_output_dir + ) + if not leaf_nodes: + raise GenerateException("Error: No new leaf nodes found in the taxonomy.") + + # TODO: This is all a temporary hack here, as we either need to + # remove, deprecate, or otherwise determine the right way to + # support test samples + all_samples = [] + for leaf_node in leaf_nodes.values(): + leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_") + samples = leaf_node_to_samples( + leaf_node, + taxonomy_dir, + server_ctx_size, + chunk_word_count, + document_output_dir, + teacher_model_path, + docling_model_path=docling_model_path, + ) + + if not samples: + raise GenerateException("Error: No samples found in leaf node.") + + logger.debug("Samples: %s", samples) + + output_file = output_dir.joinpath(f"{leaf_node_path}.jsonl") + all_samples.extend(samples) + jldump(samples, output_file) + output_files.append(str(output_file)) + + if test_output_file: + _gen_test_data( + all_samples, + test_output_file, + system_prompt, + ) + logger.debug(f"Generating test data to: {test_output_file}") + logger.info("Taxonomy converted to samples and written to %s", output_dir) + return output_files + + def generate_taxonomy( client: openai.OpenAI, input_dir: str, diff --git a/src/instructlab/sdg/taxonomy.py b/src/instructlab/sdg/taxonomy.py deleted file mode 100644 index 4b0c90b2..00000000 --- a/src/instructlab/sdg/taxonomy.py +++ /dev/null @@ -1,201 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# pylint: disable=duplicate-code - -# Standard -from pathlib import Path -from typing import Optional -import logging -import os - -# Third Party -from xdg_base_dirs import xdg_data_dirs, xdg_data_home -import yaml - -# First Party -from instructlab.sdg.utils import GenerateException -from instructlab.sdg.utils.json import jldump -from instructlab.sdg.utils.taxonomy import ( - _unescape, - leaf_node_to_samples, - read_taxonomy_leaf_nodes, -) - -logger = logging.getLogger(__name__) - -DEFAULT_CHUNK_WORD_COUNT = 1000 -DEFAULT_TAXONOMY_BASE = "empty" -DEFAULT_SERVER_CTX_SIZE = 4096 - - -def _locate_docling_models(): - # Search for the models in User and Site data directories - data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] - data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) - - docling_model_path = None - sdg_models_path = docling_model_path - for d in data_dirs: - if os.path.exists(os.path.join(d, "models")): - sdg_models_path = os.path.join(d, "models") - break - - if sdg_models_path is not None: - try: - with open( - os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" - ) as file: 
- config = yaml.safe_load(file) - docling_model_path = config["models"][0]["path"] - except (FileNotFoundError, NotADirectoryError, PermissionError) as e: - logger.warning(f"unable to read docling models path from config.yaml {e}") - - return docling_model_path - - -def _knowledge_seed_example_to_test_data(seed_example, system_prompt): - res = [] - for i in range(3): - idx = i + 1 - user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] - test_sample = { - "user": _unescape(user), - "assistant": _unescape(seed_example[f"icl_response_{idx}"]), - } - if system_prompt: - test_sample["system"] = system_prompt - res.append(test_sample) - return res - - -def _gen_test_data( - seed_examples, - output_file_test, - system_prompt, -): - """ - Generate test data in the format needed by the legacy Linux training - in instructlab/instructlab. - """ - test_data = [] - for seed_example in seed_examples: - if "icl_query_1" in seed_example: - test_data.extend( - _knowledge_seed_example_to_test_data(seed_example, system_prompt) - ) - continue - - # skill seed example - - user = seed_example["seed_question"] # question - - if seed_example["leaf_node_type"] == "grounded_skill": - user += "\n" + seed_example["seed_context"] # context - - test_sample = { - "user": _unescape(user), - "assistant": _unescape(seed_example["seed_response"]), # answer - } - if system_prompt: - test_sample["system"] = system_prompt - test_data.append(test_sample) - - jldump(test_data, output_file_test) - - -def preprocess_taxonomy( - taxonomy_dir, - output_dir, - chunk_word_count=DEFAULT_CHUNK_WORD_COUNT, # TODO: Remove chunk_word_count param - server_ctx_size=DEFAULT_SERVER_CTX_SIZE, # TODO: Remove server_ctx_size param - taxonomy_base=DEFAULT_TAXONOMY_BASE, - teacher_model_path: Optional[str] = None, - yaml_rules: Optional[str] = None, - test_output_file: Optional[str] = None, - system_prompt: Optional[str] = None, -): - """ - Preprocess a taxonomy into input samples suitable for use with - data generation pipelines. This does the following steps: - - - Determine changed leaf nodes in the taxonomy - - Retrieve knowledge documents for changed taxonomy leaf nodes - - Convert any non-markdown knowledge documents to markdown - - Write the Docling json and markdown outputs from this conversion to - disk for other processes to consume if needed. - - Chunk the converted knowledge documents to the desired chunk sizes. - - Turn the qna.yaml and knowledge documents into samples in the format - expected by the `simple` and `full` data generation pipelines shipped - in SDG. - - Write these samples to disk, with one file per taxonomy leaf node. - - Args: - taxonomy_dir: The path to the taxonomy - output_dir: Where to write the samples create for use with data generation - test_output_file: Path to write the test samples jsonl file - chunk_word_count: The target number of words per document chunk - server_ctx_size: The maximum number of tokens the inference server used - during data generation can handle - taxonomy_base: Determines how we calculate what has changed. This should - be a git reference or the special value of 'empty' which - means assume the entire taxonomy has changed. - teacher_model_path: Path to the teacher model on disk, which we'll use to - load its tokenizer for use with document chunking. - yaml_rules: Path to a custom YAML rules file for YAML linting. - system_prompt: System prompt to use when generating test samples - - Returns: - List[str]: The list of output sample files written to disk. 
- - """ - logging.info("Converting taxonomy to samples") - output_dir = Path(output_dir) - output_dir.mkdir(exist_ok=True) - output_files = [] - - if not (taxonomy_dir and os.path.exists(taxonomy_dir)): - raise GenerateException(f"Error: taxonomy ({taxonomy_dir}) does not exist.") - - document_output_dir = output_dir.joinpath("documents") - docling_model_path = _locate_docling_models() - - leaf_nodes = read_taxonomy_leaf_nodes( - taxonomy_dir, taxonomy_base, yaml_rules, document_output_dir - ) - if not leaf_nodes: - raise GenerateException("Error: No new leaf nodes found in the taxonomy.") - - # TODO: This is all a temporary hack here, as we either need to - # remove, deprecate, or otherwise determine the right way to - # support test samples - all_samples = [] - for leaf_node in leaf_nodes.values(): - leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_") - samples = leaf_node_to_samples( - leaf_node, - taxonomy_dir, - server_ctx_size, - chunk_word_count, - document_output_dir, - teacher_model_path, - docling_model_path=docling_model_path, - ) - - if not samples: - raise GenerateException("Error: No samples found in leaf node.") - - logger.debug("Samples: %s", samples) - - output_file = output_dir.joinpath(f"{leaf_node_path}.jsonl") - all_samples.extend(samples) - jldump(samples, output_file) - output_files.append(str(output_file)) - - if test_output_file: - _gen_test_data( - all_samples, - test_output_file, - system_prompt, - ) - logger.debug(f"Generating test data to: {test_output_file}") - logger.info("Taxonomy converted to samples and written to %s", output_dir) - return output_files diff --git a/src/instructlab/sdg/utils/logging.py b/src/instructlab/sdg/utils/logging.py deleted file mode 100644 index c6236f49..00000000 --- a/src/instructlab/sdg/utils/logging.py +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Standard -import logging - -# Third Party -from rich.logging import RichHandler - - -def setup_logger(level="DEBUG"): - """ - Setup a logger - ONLY to be used when running CLI commands in - SDG directly. DO NOT call this from regular library code, and only - call it from __main__ entrypoints in the instructlab.sdg.cli - package - """ - logging.basicConfig( - level=level, - format="%(message)s", - datefmt="[%X]", - handlers=[RichHandler()], - ) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 409eb198..c8c1faf6 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -491,7 +491,3 @@ def leaf_node_to_samples( samples = _skill_leaf_node_to_samples(leaf_node) samples = _enrich_metadata(samples, leaf_node) return Dataset.from_list(samples) - - -def _unescape(s): - return bytes(s, "utf-8").decode("utf-8").strip() diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index a38a76e5..cd2075ea 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -21,7 +21,12 @@ # First Party from instructlab.sdg import LLMBlock, PipelineContext -from instructlab.sdg.generate_data import _context_init, _sdg_init, generate_data +from instructlab.sdg.generate_data import ( + _context_init, + _locate_docling_models, + _sdg_init, + generate_data, +) TEST_SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant." 
@@ -562,3 +567,17 @@ def test_context_init_batch_size_optional(): batch_num_workers=32, ) assert ctx.batch_size == 20 + + +def test_locate_docling_models_config_found(testdata_path): + with patch.dict(os.environ): + os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("mock_xdg_data_dir")) + docling_model_path = _locate_docling_models() + assert docling_model_path == "/mock/docling-models" + + +def test_locate_docling_models_config_not_found(testdata_path): + with patch.dict(os.environ): + os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("nonexistent_dir")) + docling_model_path = _locate_docling_models() + assert docling_model_path is None diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index bedacaaf..0828e187 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -2,15 +2,14 @@ # Standard from typing import Any, Dict, Union -from unittest.mock import patch import os +import pathlib # Third Party import pytest import yaml # First Party -from instructlab.sdg.taxonomy import _locate_docling_models from instructlab.sdg.utils import taxonomy TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" @@ -23,7 +22,7 @@ def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: return yaml.safe_load(skills_file) -class TestUtilsTaxonomy: +class TestTaxonomy: """Test taxonomy in instructlab.sdg.utils.taxonomy.""" @pytest.fixture(autouse=True) @@ -86,17 +85,3 @@ def test_read_taxonomy_leaf_nodes( ): seed_example_exists = True assert seed_example_exists is True - - -def test_locate_docling_models_config_found(testdata_path): - with patch.dict(os.environ): - os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("mock_xdg_data_dir")) - docling_model_path = _locate_docling_models() - assert docling_model_path == "/mock/docling-models" - - -def test_locate_docling_models_config_not_found(testdata_path): - with patch.dict(os.environ): - os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("nonexistent_dir")) - docling_model_path = _locate_docling_models() - assert docling_model_path is None
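
Not part of the patch itself: a minimal usage sketch of the relocated preprocess_taxonomy() entry point, which after this change lives in instructlab.sdg.generate_data rather than the removed instructlab.sdg.taxonomy module (and is no longer re-exported from the instructlab.sdg package). The taxonomy and output paths below are hypothetical placeholders, and the keyword values shown are simply the defaults introduced in this diff.

    # Sketch only -- the paths are placeholders, not real directories.
    from instructlab.sdg.generate_data import preprocess_taxonomy

    output_files = preprocess_taxonomy(
        "/path/to/my/taxonomy",   # taxonomy_dir: an InstructLab taxonomy checkout
        "/path/to/my/output",     # output_dir: one <leaf_node>.jsonl file is written per leaf node
        taxonomy_base="empty",    # default: treat every taxonomy file as changed
        chunk_word_count=1000,    # default target words per document chunk
        server_ctx_size=4096,     # default max tokens the inference server can handle
    )
    print(output_files)           # list of the .jsonl sample files written to disk

    # Knowledge-document conversion relies on the docling models found by
    # _locate_docling_models(), which reads models/config.yaml under
    # $XDG_DATA_HOME/instructlab/sdg (or the XDG site data dirs), as exercised
    # by the tests moved into tests/test_generate_data.py above.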