From 8336e42431777f9b2ed62f57c2d0edf68d429b55 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Tue, 7 Jan 2025 12:21:43 -0500 Subject: [PATCH] Reduce scope of the PR - removing CLI, less churn Signed-off-by: Ben Browning --- src/instructlab/sdg/__init__.py | 5 +- .../sdg/cli/preprocess_taxonomy.py | 82 ------- src/instructlab/sdg/generate_data.py | 189 +++++++++++++++- src/instructlab/sdg/taxonomy.py | 201 ------------------ src/instructlab/sdg/utils/logging.py | 22 -- src/instructlab/sdg/utils/taxonomy.py | 4 - tests/test_generate_data.py | 21 +- tests/test_taxonomy.py | 19 +- 8 files changed, 211 insertions(+), 332 deletions(-) delete mode 100644 src/instructlab/sdg/cli/preprocess_taxonomy.py delete mode 100644 src/instructlab/sdg/taxonomy.py delete mode 100644 src/instructlab/sdg/utils/logging.py diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py index ccef90d8..5cc9d95f 100644 --- a/src/instructlab/sdg/__init__.py +++ b/src/instructlab/sdg/__init__.py @@ -29,7 +29,7 @@ "FULL_PIPELINES_PACKAGE", "SIMPLE_PIPELINES_PACKAGE", "generate_data", - "preprocess_taxonomy", + "mix_datasets", ) # Local @@ -51,7 +51,7 @@ SelectorBlock, SetToMajorityValueBlock, ) -from .generate_data import generate_data +from .generate_data import generate_data, mix_datasets from .pipeline import ( FULL_PIPELINES_PACKAGE, SIMPLE_PIPELINES_PACKAGE, @@ -62,6 +62,5 @@ PipelineContext, ) from .registry import BlockRegistry, PromptRegistry -from .taxonomy import preprocess_taxonomy from .utils import GenerateException from .utils.taxonomy import TaxonomyReadingException diff --git a/src/instructlab/sdg/cli/preprocess_taxonomy.py b/src/instructlab/sdg/cli/preprocess_taxonomy.py deleted file mode 100644 index b7d22532..00000000 --- a/src/instructlab/sdg/cli/preprocess_taxonomy.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Standard -import os - -# First Party -from instructlab.sdg.taxonomy import ( - DEFAULT_CHUNK_WORD_COUNT, - DEFAULT_SERVER_CTX_SIZE, - DEFAULT_TAXONOMY_BASE, - preprocess_taxonomy, -) -from instructlab.sdg.utils.logging import setup_logger - -if __name__ == "__main__": - # Standard - import argparse - - parser = argparse.ArgumentParser( - description="Turn a taxonomy into json samples suitable for use as input to data generate pipelines" - ) - - # Required args - parser.add_argument( - "--output-dir", - type=str, - required=True, - help="Directory to write the processed dataset samples into", - ) - parser.add_argument( - "--taxonomy-path", - type=str, - required=True, - help="Path to your InstructLab taxonomy", - ) - - # Optional args - parser.add_argument( - "--chunk-word-count", - type=int, - default=DEFAULT_CHUNK_WORD_COUNT, - help="Number of words per document chunk", - ) - parser.add_argument( - "--log-level", - type=str, - default=os.getenv("LOG_LEVEL", "INFO"), - help="Logging level", - ) - parser.add_argument( - "--server-ctx-size", - type=int, - default=DEFAULT_SERVER_CTX_SIZE, - help="The maximum number of tokens the inference server can handle.", - ) - parser.add_argument( - "--taxonomy-base", - type=str, - default=DEFAULT_TAXONOMY_BASE, - help="Taxonomy based used to determine what has changed - defaults to 'empty' which means consider all the taxonomy files as changed and process all of them", - ) - parser.add_argument( - "--yaml-rules", - type=str, - default=None, - help="Path to custom rules file for YAML linting", - ) - - args = parser.parse_args() - setup_logger(args.log_level) - preprocess_taxonomy( - args.taxonomy_path, - 
args.output_dir, - chunk_word_count=args.chunk_word_count, - server_ctx_size=args.server_ctx_size, - taxonomy_base=args.taxonomy_base, - yaml_rules=args.yaml_rules, - ) - -""" -python -m instructlab.sdg.cli.preprocess_taxonomy --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output -""" diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 3f338ecc..9091653f 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -16,6 +16,7 @@ from datasets import Dataset from xdg_base_dirs import xdg_data_dirs, xdg_data_home import openai +import yaml # First Party from instructlab.sdg.blocks.llmblock import DEFAULT_MAX_NUM_TOKENS @@ -32,15 +33,25 @@ Pipeline, PipelineContext, ) -from instructlab.sdg.taxonomy import preprocess_taxonomy from instructlab.sdg.utils import GenerateException from instructlab.sdg.utils.json import jldump, jlload -from instructlab.sdg.utils.taxonomy import _unescape +from instructlab.sdg.utils.taxonomy import ( + leaf_node_to_samples, + read_taxonomy_leaf_nodes, +) logger = logging.getLogger(__name__) _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant." +DEFAULT_CHUNK_WORD_COUNT = 1000 +DEFAULT_TAXONOMY_BASE = "empty" +DEFAULT_SERVER_CTX_SIZE = 4096 + + +def _unescape(s): + return bytes(s, "utf-8").decode("utf-8").strip() + def _convert_to_messages(sample): """ @@ -112,6 +123,56 @@ def _gen_train_data( jldump(messages_data, output_file_messages) +def _knowledge_seed_example_to_test_data(seed_example, system_prompt): + res = [] + for i in range(3): + idx = i + 1 + user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example[f"icl_response_{idx}"]), + } + if system_prompt: + test_sample["system"] = system_prompt + res.append(test_sample) + return res + + +def _gen_test_data( + seed_examples, + output_file_test, + system_prompt, +): + """ + Generate test data in the format needed by the legacy Linux training + in instructlab/instructlab. 
+ """ + test_data = [] + for seed_example in seed_examples: + if "icl_query_1" in seed_example: + test_data.extend( + _knowledge_seed_example_to_test_data(seed_example, system_prompt) + ) + continue + + # skill seed example + + user = seed_example["seed_question"] # question + + if seed_example["leaf_node_type"] == "grounded_skill": + user += "\n" + seed_example["seed_context"] # context + + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example["seed_response"]), # answer + } + if system_prompt: + test_sample["system"] = system_prompt + test_data.append(test_sample) + + jldump(test_data, output_file_test) + + def _check_pipeline_dir(pipeline): for file in ["knowledge.yaml", "freeform_skills.yaml", "grounded_skills.yaml"]: if not os.path.exists(os.path.join(pipeline, file)): @@ -120,6 +181,31 @@ def _check_pipeline_dir(pipeline): ) +def _locate_docling_models(): + # Search for the models in User and Site data directories + data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] + data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) + + docling_model_path = None + sdg_models_path = docling_model_path + for d in data_dirs: + if os.path.exists(os.path.join(d, "models")): + sdg_models_path = os.path.join(d, "models") + break + + if sdg_models_path is not None: + try: + with open( + os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" + ) as file: + config = yaml.safe_load(file) + docling_model_path = config["models"][0]["path"] + except (FileNotFoundError, NotADirectoryError, PermissionError) as e: + logger.warning(f"unable to read docling models path from config.yaml {e}") + + return docling_model_path + + def _context_init( client: openai.OpenAI, model_family: str, @@ -217,6 +303,105 @@ def _extract_leaf_node_path_and_type(sample): return leaf_node_path, leaf_node_type +def preprocess_taxonomy( + taxonomy_dir, + output_dir, + chunk_word_count=DEFAULT_CHUNK_WORD_COUNT, # TODO: Remove chunk_word_count param + server_ctx_size=DEFAULT_SERVER_CTX_SIZE, # TODO: Remove server_ctx_size param + taxonomy_base=DEFAULT_TAXONOMY_BASE, + teacher_model_path: Optional[str] = None, + yaml_rules: Optional[str] = None, + test_output_file: Optional[str] = None, + system_prompt: Optional[str] = None, +): + """ + Preprocess a taxonomy into input samples suitable for use with + data generation pipelines. This does the following steps: + + - Determine changed leaf nodes in the taxonomy + - Retrieve knowledge documents for changed taxonomy leaf nodes + - Convert any non-markdown knowledge documents to markdown + - Write the Docling json and markdown outputs from this conversion to + disk for other processes to consume if needed. + - Chunk the converted knowledge documents to the desired chunk sizes. + - Turn the qna.yaml and knowledge documents into samples in the format + expected by the `simple` and `full` data generation pipelines shipped + in SDG. + - Write these samples to disk, with one file per taxonomy leaf node. + + Args: + taxonomy_dir: The path to the taxonomy + output_dir: Where to write the samples create for use with data generation + test_output_file: Path to write the test samples jsonl file + chunk_word_count: The target number of words per document chunk + server_ctx_size: The maximum number of tokens the inference server used + during data generation can handle + taxonomy_base: Determines how we calculate what has changed. 
This should + be a git reference or the special value of 'empty' which + means assume the entire taxonomy has changed. + teacher_model_path: Path to the teacher model on disk, which we'll use to + load its tokenizer for use with document chunking. + yaml_rules: Path to a custom YAML rules file for YAML linting. + system_prompt: System prompt to use when generating test samples + + Returns: + List[str]: The list of output sample files written to disk. + + """ + logging.info("Converting taxonomy to samples") + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True) + output_files = [] + + if not (taxonomy_dir and os.path.exists(taxonomy_dir)): + raise GenerateException(f"Error: taxonomy ({taxonomy_dir}) does not exist.") + + document_output_dir = output_dir.joinpath("documents") + docling_model_path = _locate_docling_models() + + leaf_nodes = read_taxonomy_leaf_nodes( + taxonomy_dir, taxonomy_base, yaml_rules, document_output_dir + ) + if not leaf_nodes: + raise GenerateException("Error: No new leaf nodes found in the taxonomy.") + + # TODO: This is all a temporary hack here, as we either need to + # remove, deprecate, or otherwise determine the right way to + # support test samples + all_samples = [] + for leaf_node in leaf_nodes.values(): + leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_") + samples = leaf_node_to_samples( + leaf_node, + taxonomy_dir, + server_ctx_size, + chunk_word_count, + document_output_dir, + teacher_model_path, + docling_model_path=docling_model_path, + ) + + if not samples: + raise GenerateException("Error: No samples found in leaf node.") + + logger.debug("Samples: %s", samples) + + output_file = output_dir.joinpath(f"{leaf_node_path}.jsonl") + all_samples.extend(samples) + jldump(samples, output_file) + output_files.append(str(output_file)) + + if test_output_file: + _gen_test_data( + all_samples, + test_output_file, + system_prompt, + ) + logger.debug(f"Generating test data to: {test_output_file}") + logger.info("Taxonomy converted to samples and written to %s", output_dir) + return output_files + + def generate_taxonomy( client: openai.OpenAI, input_dir: str, diff --git a/src/instructlab/sdg/taxonomy.py b/src/instructlab/sdg/taxonomy.py deleted file mode 100644 index 4b0c90b2..00000000 --- a/src/instructlab/sdg/taxonomy.py +++ /dev/null @@ -1,201 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# pylint: disable=duplicate-code - -# Standard -from pathlib import Path -from typing import Optional -import logging -import os - -# Third Party -from xdg_base_dirs import xdg_data_dirs, xdg_data_home -import yaml - -# First Party -from instructlab.sdg.utils import GenerateException -from instructlab.sdg.utils.json import jldump -from instructlab.sdg.utils.taxonomy import ( - _unescape, - leaf_node_to_samples, - read_taxonomy_leaf_nodes, -) - -logger = logging.getLogger(__name__) - -DEFAULT_CHUNK_WORD_COUNT = 1000 -DEFAULT_TAXONOMY_BASE = "empty" -DEFAULT_SERVER_CTX_SIZE = 4096 - - -def _locate_docling_models(): - # Search for the models in User and Site data directories - data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] - data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) - - docling_model_path = None - sdg_models_path = docling_model_path - for d in data_dirs: - if os.path.exists(os.path.join(d, "models")): - sdg_models_path = os.path.join(d, "models") - break - - if sdg_models_path is not None: - try: - with open( - os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" - ) as file: 
- config = yaml.safe_load(file) - docling_model_path = config["models"][0]["path"] - except (FileNotFoundError, NotADirectoryError, PermissionError) as e: - logger.warning(f"unable to read docling models path from config.yaml {e}") - - return docling_model_path - - -def _knowledge_seed_example_to_test_data(seed_example, system_prompt): - res = [] - for i in range(3): - idx = i + 1 - user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] - test_sample = { - "user": _unescape(user), - "assistant": _unescape(seed_example[f"icl_response_{idx}"]), - } - if system_prompt: - test_sample["system"] = system_prompt - res.append(test_sample) - return res - - -def _gen_test_data( - seed_examples, - output_file_test, - system_prompt, -): - """ - Generate test data in the format needed by the legacy Linux training - in instructlab/instructlab. - """ - test_data = [] - for seed_example in seed_examples: - if "icl_query_1" in seed_example: - test_data.extend( - _knowledge_seed_example_to_test_data(seed_example, system_prompt) - ) - continue - - # skill seed example - - user = seed_example["seed_question"] # question - - if seed_example["leaf_node_type"] == "grounded_skill": - user += "\n" + seed_example["seed_context"] # context - - test_sample = { - "user": _unescape(user), - "assistant": _unescape(seed_example["seed_response"]), # answer - } - if system_prompt: - test_sample["system"] = system_prompt - test_data.append(test_sample) - - jldump(test_data, output_file_test) - - -def preprocess_taxonomy( - taxonomy_dir, - output_dir, - chunk_word_count=DEFAULT_CHUNK_WORD_COUNT, # TODO: Remove chunk_word_count param - server_ctx_size=DEFAULT_SERVER_CTX_SIZE, # TODO: Remove server_ctx_size param - taxonomy_base=DEFAULT_TAXONOMY_BASE, - teacher_model_path: Optional[str] = None, - yaml_rules: Optional[str] = None, - test_output_file: Optional[str] = None, - system_prompt: Optional[str] = None, -): - """ - Preprocess a taxonomy into input samples suitable for use with - data generation pipelines. This does the following steps: - - - Determine changed leaf nodes in the taxonomy - - Retrieve knowledge documents for changed taxonomy leaf nodes - - Convert any non-markdown knowledge documents to markdown - - Write the Docling json and markdown outputs from this conversion to - disk for other processes to consume if needed. - - Chunk the converted knowledge documents to the desired chunk sizes. - - Turn the qna.yaml and knowledge documents into samples in the format - expected by the `simple` and `full` data generation pipelines shipped - in SDG. - - Write these samples to disk, with one file per taxonomy leaf node. - - Args: - taxonomy_dir: The path to the taxonomy - output_dir: Where to write the samples create for use with data generation - test_output_file: Path to write the test samples jsonl file - chunk_word_count: The target number of words per document chunk - server_ctx_size: The maximum number of tokens the inference server used - during data generation can handle - taxonomy_base: Determines how we calculate what has changed. This should - be a git reference or the special value of 'empty' which - means assume the entire taxonomy has changed. - teacher_model_path: Path to the teacher model on disk, which we'll use to - load its tokenizer for use with document chunking. - yaml_rules: Path to a custom YAML rules file for YAML linting. - system_prompt: System prompt to use when generating test samples - - Returns: - List[str]: The list of output sample files written to disk. 
- - """ - logging.info("Converting taxonomy to samples") - output_dir = Path(output_dir) - output_dir.mkdir(exist_ok=True) - output_files = [] - - if not (taxonomy_dir and os.path.exists(taxonomy_dir)): - raise GenerateException(f"Error: taxonomy ({taxonomy_dir}) does not exist.") - - document_output_dir = output_dir.joinpath("documents") - docling_model_path = _locate_docling_models() - - leaf_nodes = read_taxonomy_leaf_nodes( - taxonomy_dir, taxonomy_base, yaml_rules, document_output_dir - ) - if not leaf_nodes: - raise GenerateException("Error: No new leaf nodes found in the taxonomy.") - - # TODO: This is all a temporary hack here, as we either need to - # remove, deprecate, or otherwise determine the right way to - # support test samples - all_samples = [] - for leaf_node in leaf_nodes.values(): - leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_") - samples = leaf_node_to_samples( - leaf_node, - taxonomy_dir, - server_ctx_size, - chunk_word_count, - document_output_dir, - teacher_model_path, - docling_model_path=docling_model_path, - ) - - if not samples: - raise GenerateException("Error: No samples found in leaf node.") - - logger.debug("Samples: %s", samples) - - output_file = output_dir.joinpath(f"{leaf_node_path}.jsonl") - all_samples.extend(samples) - jldump(samples, output_file) - output_files.append(str(output_file)) - - if test_output_file: - _gen_test_data( - all_samples, - test_output_file, - system_prompt, - ) - logger.debug(f"Generating test data to: {test_output_file}") - logger.info("Taxonomy converted to samples and written to %s", output_dir) - return output_files diff --git a/src/instructlab/sdg/utils/logging.py b/src/instructlab/sdg/utils/logging.py deleted file mode 100644 index c6236f49..00000000 --- a/src/instructlab/sdg/utils/logging.py +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Standard -import logging - -# Third Party -from rich.logging import RichHandler - - -def setup_logger(level="DEBUG"): - """ - Setup a logger - ONLY to be used when running CLI commands in - SDG directly. DO NOT call this from regular library code, and only - call it from __main__ entrypoints in the instructlab.sdg.cli - package - """ - logging.basicConfig( - level=level, - format="%(message)s", - datefmt="[%X]", - handlers=[RichHandler()], - ) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 409eb198..c8c1faf6 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -491,7 +491,3 @@ def leaf_node_to_samples( samples = _skill_leaf_node_to_samples(leaf_node) samples = _enrich_metadata(samples, leaf_node) return Dataset.from_list(samples) - - -def _unescape(s): - return bytes(s, "utf-8").decode("utf-8").strip() diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index a38a76e5..cd2075ea 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -21,7 +21,12 @@ # First Party from instructlab.sdg import LLMBlock, PipelineContext -from instructlab.sdg.generate_data import _context_init, _sdg_init, generate_data +from instructlab.sdg.generate_data import ( + _context_init, + _locate_docling_models, + _sdg_init, + generate_data, +) TEST_SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant." 
@@ -562,3 +567,17 @@ def test_context_init_batch_size_optional(): batch_num_workers=32, ) assert ctx.batch_size == 20 + + +def test_locate_docling_models_config_found(testdata_path): + with patch.dict(os.environ): + os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("mock_xdg_data_dir")) + docling_model_path = _locate_docling_models() + assert docling_model_path == "/mock/docling-models" + + +def test_locate_docling_models_config_not_found(testdata_path): + with patch.dict(os.environ): + os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("nonexistent_dir")) + docling_model_path = _locate_docling_models() + assert docling_model_path is None diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index bedacaaf..0828e187 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -2,15 +2,14 @@ # Standard from typing import Any, Dict, Union -from unittest.mock import patch import os +import pathlib # Third Party import pytest import yaml # First Party -from instructlab.sdg.taxonomy import _locate_docling_models from instructlab.sdg.utils import taxonomy TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" @@ -23,7 +22,7 @@ def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: return yaml.safe_load(skills_file) -class TestUtilsTaxonomy: +class TestTaxonomy: """Test taxonomy in instructlab.sdg.utils.taxonomy.""" @pytest.fixture(autouse=True) @@ -86,17 +85,3 @@ def test_read_taxonomy_leaf_nodes( ): seed_example_exists = True assert seed_example_exists is True - - -def test_locate_docling_models_config_found(testdata_path): - with patch.dict(os.environ): - os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("mock_xdg_data_dir")) - docling_model_path = _locate_docling_models() - assert docling_model_path == "/mock/docling-models" - - -def test_locate_docling_models_config_not_found(testdata_path): - with patch.dict(os.environ): - os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("nonexistent_dir")) - docling_model_path = _locate_docling_models() - assert docling_model_path is None
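
Not part of the patch itself: a minimal usage sketch of the relocated preprocess_taxonomy() entry point, which after this change lives in instructlab.sdg.generate_data rather than the removed instructlab.sdg.taxonomy module (and is no longer re-exported from the instructlab.sdg package). The taxonomy and output paths below are hypothetical placeholders, and the keyword values shown are simply the defaults introduced in this diff.

    # Sketch only -- the paths are placeholders, not real directories.
    from instructlab.sdg.generate_data import preprocess_taxonomy

    output_files = preprocess_taxonomy(
        "/path/to/my/taxonomy",   # taxonomy_dir: an InstructLab taxonomy checkout
        "/path/to/my/output",     # output_dir: one <leaf_node>.jsonl file is written per leaf node
        taxonomy_base="empty",    # default: treat every taxonomy file as changed
        chunk_word_count=1000,    # default target words per document chunk
        server_ctx_size=4096,     # default max tokens the inference server can handle
    )
    print(output_files)           # list of the .jsonl sample files written to disk

    # Knowledge-document conversion relies on the docling models found by
    # _locate_docling_models(), which reads models/config.yaml under
    # $XDG_DATA_HOME/instructlab/sdg (or the XDG site data dirs), as exercised
    # by the tests moved into tests/test_generate_data.py above.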