diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 533db868..badf3834 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -395,8 +395,7 @@ def generate_data( is_knowledge = False leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_") samples = leaf_node_to_samples( - leaf_node, - taxonomy, + leaf_node, # pylint: disable=duplicate-code server_ctx_size, chunk_word_count, document_output_dir, diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index d8de36de..52c15ba1 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -1,9 +1,7 @@ # Standard -from abc import ABC, abstractmethod from collections import defaultdict -from enum import Enum from pathlib import Path -from typing import DefaultDict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional import json import logging import re @@ -26,6 +24,7 @@ logger = logging.getLogger(__name__) _DEFAULT_CHUNK_OVERLAP = 100 +SUPPORTED_FILETYPES = [".pdf", ".md"] def _num_tokens_from_words(num_words) -> int: @@ -68,186 +67,55 @@ def resolve_ocr_options() -> OcrOptions: return None -class FileTypes(Enum): - MD = ".md" - PDF = ".pdf" +def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: + """Split document paths into a dict of lists based on their file extension.""" + document_dict = defaultdict(list) + for path in document_paths: + filetype = path.suffix + if filetype not in SUPPORTED_FILETYPES: + raise ValueError(f"Provided unsupported filetype {filetype}") + document_dict[filetype].append(path) -class ChunkerBase(ABC): - @abstractmethod - def chunk_documents(self): - pass - - -class DocumentChunker: - """A factory chunker class that instantiates the applicable chunker - - Currently, only Markdown and PDF are supported. For Markdown, returns - TextSplitChunker, and for PDF, returns ContextAwareChunker""" - - def __new__( - cls, - leaf_node, - taxonomy_path, - output_dir: Path, - server_ctx_size=4096, - chunk_word_count=1024, - tokenizer_model_name: Optional[str] = None, - docling_model_path: Optional[str] = None, - ): - """Insantiate the appropriate chunker for the provided document - - Args: - leaf_node: a leaf node dict containing "documents", - "filepaths", and "taxonomy_path" keys - output_dir (Path): directory where artifacts should be stored - server_ctx_size (int): Context window size of server - chunk_word_count (int): Maximum number of words to chunk a document - tokenizer_model_name (Optional[str]): name of huggingface model to get - tokenizer from - Returns: - TextSplitChunker | ContextAwareChunker: Object of the appropriate - chunker class for the provided filetype - """ - documents = leaf_node[0]["documents"] - - if not isinstance(taxonomy_path, Path): - taxonomy_path = Path(taxonomy_path) + return dict(document_dict) - if isinstance(documents, str): - documents = [documents] - logger.info( - "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input." 
- ) - elif not isinstance(documents, list): - raise TypeError( - "Expected: documents to be a list, but got {}".format(type(documents)) - ) - - filepaths = leaf_node[0]["filepaths"] - - doc_dict = cls._split_docs_by_filetype(documents, filepaths) - if len(doc_dict.keys()) > 1: - raise ValueError("Received multiple document types") - if len(doc_dict.keys()) < 1: - raise ValueError("Received no document types") - - if FileTypes.MD in doc_dict: - doc_contents = [d for d, _ in doc_dict[FileTypes.MD]] - return TextSplitChunker( - doc_contents, - server_ctx_size, - chunk_word_count, - output_dir, - ) - - if FileTypes.PDF in doc_dict: - doc_paths = [p for _, p in doc_dict[FileTypes.PDF]] - return ContextAwareChunker( - doc_paths, - filepaths, - output_dir, - chunk_word_count, - tokenizer_model_name, - docling_model_path=docling_model_path, - ) - @staticmethod - def _split_docs_by_filetype( - documents: List[str], filepaths: List[Path] - ) -> DefaultDict[FileTypes, List[Tuple[str, Path]]]: - """Separate documents into lists based on their filetype. - - Currently, only Markdown and PDF are supported. - Args: - documents (List[str]): A list of the document contents as strings - filepaths (List[Path]): Corresponding document filepaths - Returns: - DefaultDict: Dictionary with either ".md" or ".pdf" as a key. - Markdown items contain document contents, PDF items contain - paths to documents. - """ - doc_dict = defaultdict(list) - for doc, path in zip(documents, filepaths): - if path.suffix == ".md": - # append doc contents - doc_dict[FileTypes.MD].append((doc, path)) - elif path.suffix == ".pdf": - # append doc paths - doc_dict[FileTypes.PDF].append((doc, path)) - else: - raise ValueError( - f"Received document of type .{path.suffix}, which is not a supported filetype" - ) - return doc_dict - - -class TextSplitChunker(ChunkerBase): +class DocumentChunker: # pylint: disable=too-many-instance-attributes def __init__( self, - document_contents: List | str, - server_ctx_size: int, - chunk_word_count: int, + document_paths: List[Path], output_dir: Path, + tokenizer_model_name: str | Path, + docling_model_path: Optional[Path] = None, + server_ctx_size: int = 4096, + chunk_word_count: int = 1024, ): - self.document_contents = document_contents - self.server_ctx_size = server_ctx_size - self.chunk_word_count = chunk_word_count - self.output_dir = output_dir + if not document_paths: + raise ValueError("Provided empty list of documents") - def chunk_documents(self) -> List: - """Naively chunk markdown documents based on the word count provided by the user. - Returns: - List[str]: List of chunked documents. 
- """ - num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) - if num_tokens_per_doc > int(self.server_ctx_size - 1024): - raise ValueError( - "Error: {}".format( - str( - f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})" - ) - ) - ) - if self.document_contents == []: - return [] + document_dict = split_docs_by_filetype(document_paths) - chunk_size = _num_chars_from_tokens(num_tokens_per_doc) - return chunk_markdowns(self.document_contents, chunk_size) + if len(document_dict) > 1: + raise ValueError("Provided multiple document types") + # We know there is only 1 key, value pair, so we take the first + self.document_filetype, self.document_paths = next(iter(document_dict.items())) + self.docling_model_path = docling_model_path + self.converter = self._init_docling_converter() -class ContextAwareChunker(ChunkerBase): # pylint: disable=too-many-instance-attributes - def __init__( - self, - document_paths, - filepaths, - output_dir: Path, - chunk_word_count: int, - tokenizer_model_name: Optional[str], - docling_model_path=None, - ): - self.document_paths = document_paths - self.filepaths = filepaths self.output_dir = self._path_validator(output_dir) + self.server_ctx_size = server_ctx_size self.chunk_word_count = chunk_word_count self.tokenizer = self.create_tokenizer(tokenizer_model_name) - self.docling_model_path = docling_model_path - - def chunk_documents(self) -> List: - """Semantically chunk PDF documents. - Returns: - List: a list of chunks from the documents - """ + def _init_docling_converter(self): + """Initialize docling converter with filetype-specific configurations""" # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel # Third Party from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline - if self.document_paths == []: - return [] - if self.docling_model_path is None: logger.info("Docling models not found on disk, downloading models...") self.docling_model_path = StandardPdfPipeline.download_models_hf() @@ -258,17 +126,29 @@ def chunk_documents(self) -> List: artifacts_path=self.docling_model_path, do_ocr=False, ) + ocr_options = resolve_ocr_options() if ocr_options is not None: pipeline_options.do_ocr = True pipeline_options.ocr_options = ocr_options - converter = DocumentConverter( + return DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) } ) - parsed_documents = converter.convert_all(self.filepaths) + + def chunk_documents(self) -> List: + """Split a list of documents into chunks + + Returns: + List: a list of chunks from the documents + """ + + if self.document_paths == []: + return [] + + parsed_documents = self.converter.convert_all(self.document_paths) docling_artifacts_path = self.export_documents(parsed_documents) @@ -348,7 +228,7 @@ def fuse_texts( return fused_texts @staticmethod - def create_tokenizer(model_name: Optional[str]): + def create_tokenizer(model_path: str | Path): """ Create a tokenizer instance from a pre-trained model or a local directory. 
@@ -363,10 +243,8 @@ def create_tokenizer(model_name: Optional[str]): # Third Party from transformers import AutoTokenizer - if model_name is None: - raise TypeError("No model path provided") - - model_path = Path(model_name) + if not isinstance(model_path, Path): + model_path = Path(model_path) error_info_message = ( "Please run `ilab model download {download_args}` and try again" ) @@ -486,7 +364,7 @@ def get_table_page_number(self, json_book, idx): prev_page_num = book_element["prov"][0]["page"] break for book_element in json_book["main-text"][idx:]: - if "prov" in book_element: + if "prov" in book_element and book_element["prov"]: next_page_num = book_element["prov"][0]["page"] break if prev_page_num is not None and next_page_num is not None: @@ -545,8 +423,14 @@ def build_chunks_from_docling_json( current_book_page_number = self.get_table_page_number( json_book, idx ) + book_text = self.get_table(json_book, book_element["$ref"]) + elif book_element["prov"]: + current_book_page_number = book_element["prov"][0][ + "page" + ] # TODO export to function to handle empty ["prov"] + book_text = book_element["text"] else: - current_book_page_number = book_element["prov"][0]["page"] + current_book_page_number = None book_text = book_element["text"] if book_element["type"] == "subtitle-level-1": @@ -599,8 +483,6 @@ def build_chunks_from_docling_json( if book_element["type"] == "paragraph": book_text = self.add_heading_formatting(book_text) - elif book_element["type"] == "table": - book_text = self.get_table(json_book, book_element["$ref"]) if "## References" in book_text or "## Acknowledgements" in book_text: # For research papers we ignore everything after this sections break diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 24592fe1..2a23072f 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -110,6 +110,18 @@ def _get_taxonomy(repo="taxonomy"): return taxonomy_file_paths +def _string_contains_html(s: str) -> bool: + """Detect HTML tags in a string. + + We use this to catch markdown files that may contain html elements since + docling does not support this.""" + # Define a regex to detect HTML tags + html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>") + + # Check for HTML tags in the content + return bool(html_tag_pattern.search(s)) + + def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, @@ -161,6 +173,12 @@ def _get_documents( # Process Markdown files with open(file_path, "r", encoding="utf-8") as file: content = file.read() + if _string_contains_html(content): + logging.warning( + f"Provided markdown file {file_path} contains HTML contents, which is currently unsupported as a part of markdown" + "NOTE: Continuing this might affect your data generation quality." + "To get best results please format your markdown documents without the use of HTML or use a different document filetype." 
+ ) file_contents.append(content) filepaths.append(Path(file_path)) logger.info( @@ -418,20 +436,19 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset: def _knowledge_leaf_node_to_samples( leaf_node, - taxonomy_path, server_ctx_size, chunk_word_count, document_output_dir, model_name, docling_model_path=None, ): + document_paths = leaf_node[0]["filepaths"] chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=taxonomy_path, + document_paths=document_paths, output_dir=document_output_dir, + tokenizer_model_name=model_name, server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, - tokenizer_model_name=model_name, docling_model_path=docling_model_path, ) chunks = chunker.chunk_documents() @@ -457,7 +474,6 @@ def _skill_leaf_node_to_samples(leaf_node): def leaf_node_to_samples( leaf_node, - taxonomy_path, server_ctx_size, chunk_word_count, document_output_dir, @@ -468,8 +484,7 @@ def leaf_node_to_samples( return [] if leaf_node[0].get("documents"): return _knowledge_leaf_node_to_samples( - leaf_node, - taxonomy_path, + leaf_node, # pylint: disable=duplicate-code server_ctx_size, chunk_word_count, document_output_dir, diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index 217cb889..f06db504 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -8,58 +8,53 @@ # First Party from instructlab.sdg.utils.chunkers import DocumentChunker +# Constants for Test Directory and Test Documents TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "testdata") +TEST_DOCUMENTS = { + "pdf": "sample_documents/phoenix.pdf", + "md": "sample_documents/phoenix.md", +} + + +@pytest.fixture(scope="module") +def test_paths(): + """Fixture to return paths to test documents.""" + return { + doc_type: Path(os.path.join(TEST_DATA_DIR, path)) + for doc_type, path in TEST_DOCUMENTS.items() + } @pytest.fixture def tokenizer_model_name(): + """Fixture to return the path to the tokenizer model.""" return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -def test_chunk_pdf(tmp_path, tokenizer_model_name): - pdf_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.pdf")) - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "filepaths": [pdf_path], - "taxonomy_path": "knowledge", - } - ] +@pytest.mark.parametrize( + "doc_type, expected_chunks, contains_text", + [ + ("pdf", 9, "Phoenix is a minor constellation"), + ("md", 7, None), # Assuming there's no specific text to check in Markdown + ], +) +def test_chunk_documents( + tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks, contains_text +): + """ + Generalized test function for chunking documents. 
+ """ + document_path = test_paths[doc_type] chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=tmp_path, + document_paths=[document_path], output_dir=tmp_path, - server_ctx_size=4096, - chunk_word_count=500, tokenizer_model_name=tokenizer_model_name, - ) - chunks = chunker.chunk_documents() - assert len(chunks) > 9 - assert "Phoenix is a minor constellation" in chunks[0] - for chunk in chunks: - # inexact sanity-checking of chunk max length - assert len(chunk) < 2500 - - -def test_chunk_md(tmp_path, tokenizer_model_name): - markdown_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.md")) - leaf_node = [ - { - "documents": [markdown_path.read_text(encoding="utf-8")], - "filepaths": [markdown_path], - "taxonomy_path": "knowledge", - } - ] - chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=tmp_path, - output_dir=tmp_path, server_ctx_size=4096, chunk_word_count=500, - tokenizer_model_name=tokenizer_model_name, ) chunks = chunker.chunk_documents() - assert len(chunks) > 7 + assert len(chunks) > expected_chunks + if contains_text: + assert contains_text in chunks[0] for chunk in chunks: - # inexact sanity-checking of chunk max length assert len(chunk) < 2500 diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index 700758ae..42db1ac3 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -11,13 +11,7 @@ import pytest # First Party -from instructlab.sdg.utils.chunkers import ( - ContextAwareChunker, - DocumentChunker, - FileTypes, - TextSplitChunker, - resolve_ocr_options, -) +from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options # Local from .testdata import testdata @@ -35,65 +29,27 @@ def tokenizer_model_name(): return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -@pytest.mark.parametrize( - "filepaths, chunker_type", - [ - ([Path("document.md")], TextSplitChunker), - ([Path("document.pdf")], ContextAwareChunker), - ], -) -def test_chunker_factory(filepaths, chunker_type, documents_dir, tokenizer_model_name): - """Test that the DocumentChunker factory class returns the proper Chunker type""" - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "taxonomy_path": "", - "filepaths": filepaths, - } - ] - with tempfile.TemporaryDirectory() as temp_dir: - chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, - output_dir=temp_dir, - tokenizer_model_name=tokenizer_model_name, - ) - assert isinstance(chunker, chunker_type) - - -def test_chunker_factory_unsupported_filetype(documents_dir, tokenizer_model_name): +def test_init_document_chunker_unsupported_filetype( + documents_dir, tokenizer_model_name +): """Test that the DocumentChunker factory class fails when provided an unsupported document""" - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "taxonomy_path": "", - "filepaths": [Path("document.jpg")], - } - ] + document_paths = [documents_dir / "document.jpg"] with pytest.raises(ValueError): with tempfile.TemporaryDirectory() as temp_dir: _ = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, + document_paths=document_paths, output_dir=temp_dir, tokenizer_model_name=tokenizer_model_name, ) -def test_chunker_factory_empty_filetype(documents_dir): +def test_chunker_factory_empty_document_paths(tokenizer_model_name): """Test that the DocumentChunker factory class fails when provided no document""" - leaf_node = [ - { - "documents": [], - "taxonomy_path": "", - "filepaths": [], - } - ] + document_paths = [] with 
pytest.raises(ValueError): with tempfile.TemporaryDirectory() as temp_dir: _ = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, + document_paths=document_paths, output_dir=temp_dir, tokenizer_model_name=tokenizer_model_name, ) @@ -149,7 +105,7 @@ def test_resolve_ocr_options_none_found_logs_error( def test_create_tokenizer(tokenizer_model_name): - ContextAwareChunker.create_tokenizer(tokenizer_model_name) + DocumentChunker.create_tokenizer(tokenizer_model_name) @pytest.mark.parametrize( @@ -163,4 +119,4 @@ def test_create_tokenizer(tokenizer_model_name): def test_invalid_tokenizer(model_name): model_path = os.path.join(TEST_DATA_DIR, model_name) with pytest.raises(ValueError): - ContextAwareChunker.create_tokenizer(model_path) + DocumentChunker.create_tokenizer(model_path) diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index c5415636..c2968029 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -312,8 +312,10 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -395,8 +397,10 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -499,8 +503,10 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index 0828e187..0b697bd7 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -11,6 +11,7 @@ # First Party from instructlab.sdg.utils import taxonomy +from instructlab.sdg.utils.taxonomy import _string_contains_html TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" @@ -85,3 +86,14 @@ def test_read_taxonomy_leaf_nodes( ): seed_example_exists = True assert seed_example_exists is True + + @pytest.mark.parametrize( + "s, contains_html", + [ + ("hello, world!", False), + ("hello,
 <table><tr><th>Type</th><th>Epithelium</th><th>Capsule</th><th>Crypts</th><th>Location</th></tr><tr><td>Pharyngeal tonsil (also termed 'adenoid')</td><td>Ciliated pseudostratified columnar (respiratory epithelium)</td><td>Incompletely encapsulated</td><td>Small folds—sometimes described as crypts</td><td>Roof of pharynx</td></tr><tr><td></td><td>Ciliated pseudostratified columnar (respiratory epithelium)</td><td>Not encapsulated</td><td>No crypts</td><td>Roof of pharynx</td></tr><tr><td></td><td>Stratified squamous epithelium</td><td>Fully encapsulated</td><td>Multiple deep crypts</td><td>Each side of the throat at the back of the mouth</td></tr></table>", True),
+        ],
+    )
+    def test_string_contains_html(self, s, contains_html):
+        assert _string_contains_html(s) == contains_html
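
For quick reference, below is a minimal sketch of how the refactored `DocumentChunker` is constructed after this change. It mirrors the updated functional test in `tests/functional/test_chunkers.py`; the document path, output directory, and tokenizer model location are illustrative placeholders, not values from this patch.

```python
from pathlib import Path

from instructlab.sdg.utils.chunkers import DocumentChunker

# Illustrative inputs -- substitute real documents and a real tokenizer model.
# Only .pdf and .md files are supported (see SUPPORTED_FILETYPES).
document_paths = [Path("sample_documents/phoenix.pdf")]

chunker = DocumentChunker(
    document_paths=document_paths,
    output_dir=Path("/tmp/chunker-output"),   # where docling artifacts are written
    tokenizer_model_name="models/instructlab/granite-7b-lab",
    server_ctx_size=4096,     # defaults shown explicitly for clarity
    chunk_word_count=1024,
)

chunks = chunker.chunk_documents()  # returns a list of chunk strings
```

As the new constructor enforces, passing an empty `document_paths` list, mixing filetypes, or including an unsupported extension raises `ValueError` before any conversion work starts.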