diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 533db868..badf3834 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -395,8 +395,7 @@ def generate_data( is_knowledge = False leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_") samples = leaf_node_to_samples( - leaf_node, - taxonomy, + leaf_node, # pylint: disable=duplicate-code server_ctx_size, chunk_word_count, document_output_dir, diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index d8de36de..52c15ba1 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -1,9 +1,7 @@ # Standard -from abc import ABC, abstractmethod from collections import defaultdict -from enum import Enum from pathlib import Path -from typing import DefaultDict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional import json import logging import re @@ -26,6 +24,7 @@ logger = logging.getLogger(__name__) _DEFAULT_CHUNK_OVERLAP = 100 +SUPPORTED_FILETYPES = [".pdf", ".md"] def _num_tokens_from_words(num_words) -> int: @@ -68,186 +67,55 @@ def resolve_ocr_options() -> OcrOptions: return None -class FileTypes(Enum): - MD = ".md" - PDF = ".pdf" +def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: + """Split document paths into a dict of lists based on their file extension.""" + document_dict = defaultdict(list) + for path in document_paths: + filetype = path.suffix + if filetype not in SUPPORTED_FILETYPES: + raise ValueError(f"Provided unsupported filetype {filetype}") + document_dict[filetype].append(path) -class ChunkerBase(ABC): - @abstractmethod - def chunk_documents(self): - pass - - -class DocumentChunker: - """A factory chunker class that instantiates the applicable chunker - - Currently, only Markdown and PDF are supported. For Markdown, returns - TextSplitChunker, and for PDF, returns ContextAwareChunker""" - - def __new__( - cls, - leaf_node, - taxonomy_path, - output_dir: Path, - server_ctx_size=4096, - chunk_word_count=1024, - tokenizer_model_name: Optional[str] = None, - docling_model_path: Optional[str] = None, - ): - """Insantiate the appropriate chunker for the provided document - - Args: - leaf_node: a leaf node dict containing "documents", - "filepaths", and "taxonomy_path" keys - output_dir (Path): directory where artifacts should be stored - server_ctx_size (int): Context window size of server - chunk_word_count (int): Maximum number of words to chunk a document - tokenizer_model_name (Optional[str]): name of huggingface model to get - tokenizer from - Returns: - TextSplitChunker | ContextAwareChunker: Object of the appropriate - chunker class for the provided filetype - """ - documents = leaf_node[0]["documents"] - - if not isinstance(taxonomy_path, Path): - taxonomy_path = Path(taxonomy_path) + return dict(document_dict) - if isinstance(documents, str): - documents = [documents] - logger.info( - "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input." 
- ) - elif not isinstance(documents, list): - raise TypeError( - "Expected: documents to be a list, but got {}".format(type(documents)) - ) - - filepaths = leaf_node[0]["filepaths"] - - doc_dict = cls._split_docs_by_filetype(documents, filepaths) - if len(doc_dict.keys()) > 1: - raise ValueError("Received multiple document types") - if len(doc_dict.keys()) < 1: - raise ValueError("Received no document types") - - if FileTypes.MD in doc_dict: - doc_contents = [d for d, _ in doc_dict[FileTypes.MD]] - return TextSplitChunker( - doc_contents, - server_ctx_size, - chunk_word_count, - output_dir, - ) - - if FileTypes.PDF in doc_dict: - doc_paths = [p for _, p in doc_dict[FileTypes.PDF]] - return ContextAwareChunker( - doc_paths, - filepaths, - output_dir, - chunk_word_count, - tokenizer_model_name, - docling_model_path=docling_model_path, - ) - @staticmethod - def _split_docs_by_filetype( - documents: List[str], filepaths: List[Path] - ) -> DefaultDict[FileTypes, List[Tuple[str, Path]]]: - """Separate documents into lists based on their filetype. - - Currently, only Markdown and PDF are supported. - Args: - documents (List[str]): A list of the document contents as strings - filepaths (List[Path]): Corresponding document filepaths - Returns: - DefaultDict: Dictionary with either ".md" or ".pdf" as a key. - Markdown items contain document contents, PDF items contain - paths to documents. - """ - doc_dict = defaultdict(list) - for doc, path in zip(documents, filepaths): - if path.suffix == ".md": - # append doc contents - doc_dict[FileTypes.MD].append((doc, path)) - elif path.suffix == ".pdf": - # append doc paths - doc_dict[FileTypes.PDF].append((doc, path)) - else: - raise ValueError( - f"Received document of type .{path.suffix}, which is not a supported filetype" - ) - return doc_dict - - -class TextSplitChunker(ChunkerBase): +class DocumentChunker: # pylint: disable=too-many-instance-attributes def __init__( self, - document_contents: List | str, - server_ctx_size: int, - chunk_word_count: int, + document_paths: List[Path], output_dir: Path, + tokenizer_model_name: str | Path, + docling_model_path: Optional[Path] = None, + server_ctx_size: int = 4096, + chunk_word_count: int = 1024, ): - self.document_contents = document_contents - self.server_ctx_size = server_ctx_size - self.chunk_word_count = chunk_word_count - self.output_dir = output_dir + if not document_paths: + raise ValueError("Provided empty list of documents") - def chunk_documents(self) -> List: - """Naively chunk markdown documents based on the word count provided by the user. - Returns: - List[str]: List of chunked documents. 
- """ - num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) - if num_tokens_per_doc > int(self.server_ctx_size - 1024): - raise ValueError( - "Error: {}".format( - str( - f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})" - ) - ) - ) - if self.document_contents == []: - return [] + document_dict = split_docs_by_filetype(document_paths) - chunk_size = _num_chars_from_tokens(num_tokens_per_doc) - return chunk_markdowns(self.document_contents, chunk_size) + if len(document_dict) > 1: + raise ValueError("Provided multiple document types") + # We know there is only 1 key, value pair, so we take the first + self.document_filetype, self.document_paths = next(iter(document_dict.items())) + self.docling_model_path = docling_model_path + self.converter = self._init_docling_converter() -class ContextAwareChunker(ChunkerBase): # pylint: disable=too-many-instance-attributes - def __init__( - self, - document_paths, - filepaths, - output_dir: Path, - chunk_word_count: int, - tokenizer_model_name: Optional[str], - docling_model_path=None, - ): - self.document_paths = document_paths - self.filepaths = filepaths self.output_dir = self._path_validator(output_dir) + self.server_ctx_size = server_ctx_size self.chunk_word_count = chunk_word_count self.tokenizer = self.create_tokenizer(tokenizer_model_name) - self.docling_model_path = docling_model_path - - def chunk_documents(self) -> List: - """Semantically chunk PDF documents. - Returns: - List: a list of chunks from the documents - """ + def _init_docling_converter(self): + """Initialize docling converter with filetype-specific configurations""" # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel # Third Party from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline - if self.document_paths == []: - return [] - if self.docling_model_path is None: logger.info("Docling models not found on disk, downloading models...") self.docling_model_path = StandardPdfPipeline.download_models_hf() @@ -258,17 +126,29 @@ def chunk_documents(self) -> List: artifacts_path=self.docling_model_path, do_ocr=False, ) + ocr_options = resolve_ocr_options() if ocr_options is not None: pipeline_options.do_ocr = True pipeline_options.ocr_options = ocr_options - converter = DocumentConverter( + return DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) } ) - parsed_documents = converter.convert_all(self.filepaths) + + def chunk_documents(self) -> List: + """Split a list of documents into chunks + + Returns: + List: a list of chunks from the documents + """ + + if self.document_paths == []: + return [] + + parsed_documents = self.converter.convert_all(self.document_paths) docling_artifacts_path = self.export_documents(parsed_documents) @@ -348,7 +228,7 @@ def fuse_texts( return fused_texts @staticmethod - def create_tokenizer(model_name: Optional[str]): + def create_tokenizer(model_path: str | Path): """ Create a tokenizer instance from a pre-trained model or a local directory. 
@@ -363,10 +243,8 @@ def create_tokenizer(model_name: Optional[str]): # Third Party from transformers import AutoTokenizer - if model_name is None: - raise TypeError("No model path provided") - - model_path = Path(model_name) + if not isinstance(model_path, Path): + model_path = Path(model_path) error_info_message = ( "Please run `ilab model download {download_args}` and try again" ) @@ -486,7 +364,7 @@ def get_table_page_number(self, json_book, idx): prev_page_num = book_element["prov"][0]["page"] break for book_element in json_book["main-text"][idx:]: - if "prov" in book_element: + if "prov" in book_element and book_element["prov"]: next_page_num = book_element["prov"][0]["page"] break if prev_page_num is not None and next_page_num is not None: @@ -545,8 +423,14 @@ def build_chunks_from_docling_json( current_book_page_number = self.get_table_page_number( json_book, idx ) + book_text = self.get_table(json_book, book_element["$ref"]) + elif book_element["prov"]: + current_book_page_number = book_element["prov"][0][ + "page" + ] # TODO export to function to handle empty ["prov"] + book_text = book_element["text"] else: - current_book_page_number = book_element["prov"][0]["page"] + current_book_page_number = None book_text = book_element["text"] if book_element["type"] == "subtitle-level-1": @@ -599,8 +483,6 @@ def build_chunks_from_docling_json( if book_element["type"] == "paragraph": book_text = self.add_heading_formatting(book_text) - elif book_element["type"] == "table": - book_text = self.get_table(json_book, book_element["$ref"]) if "## References" in book_text or "## Acknowledgements" in book_text: # For research papers we ignore everything after this sections break diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 24592fe1..2a23072f 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -110,6 +110,18 @@ def _get_taxonomy(repo="taxonomy"): return taxonomy_file_paths +def _string_contains_html(s: str) -> bool: + """Detect HTML tags in a string. + + We use this to catch markdown files that may contain html elements since + docling does not support this.""" + # Define a regex to detect HTML tags + html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>") + + # Check for HTML tags in the content + return bool(html_tag_pattern.search(s)) + + def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, @@ -161,6 +173,12 @@ def _get_documents( # Process Markdown files with open(file_path, "r", encoding="utf-8") as file: content = file.read() + if _string_contains_html(content): + logging.warning( + f"Provided markdown file {file_path} contains HTML contents, which is currently unsupported as a part of markdown" + "NOTE: Continuing this might affect your data generation quality." + "To get best results please format your markdown documents without the use of HTML or use a different document filetype." 
+ ) file_contents.append(content) filepaths.append(Path(file_path)) logger.info( @@ -418,20 +436,19 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset: def _knowledge_leaf_node_to_samples( leaf_node, - taxonomy_path, server_ctx_size, chunk_word_count, document_output_dir, model_name, docling_model_path=None, ): + document_paths = leaf_node[0]["filepaths"] chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=taxonomy_path, + document_paths=document_paths, output_dir=document_output_dir, + tokenizer_model_name=model_name, server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, - tokenizer_model_name=model_name, docling_model_path=docling_model_path, ) chunks = chunker.chunk_documents() @@ -457,7 +474,6 @@ def _skill_leaf_node_to_samples(leaf_node): def leaf_node_to_samples( leaf_node, - taxonomy_path, server_ctx_size, chunk_word_count, document_output_dir, @@ -468,8 +484,7 @@ def leaf_node_to_samples( return [] if leaf_node[0].get("documents"): return _knowledge_leaf_node_to_samples( - leaf_node, - taxonomy_path, + leaf_node, # pylint: disable=duplicate-code server_ctx_size, chunk_word_count, document_output_dir, diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index 217cb889..f06db504 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -8,58 +8,53 @@ # First Party from instructlab.sdg.utils.chunkers import DocumentChunker +# Constants for Test Directory and Test Documents TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "testdata") +TEST_DOCUMENTS = { + "pdf": "sample_documents/phoenix.pdf", + "md": "sample_documents/phoenix.md", +} + + +@pytest.fixture(scope="module") +def test_paths(): + """Fixture to return paths to test documents.""" + return { + doc_type: Path(os.path.join(TEST_DATA_DIR, path)) + for doc_type, path in TEST_DOCUMENTS.items() + } @pytest.fixture def tokenizer_model_name(): + """Fixture to return the path to the tokenizer model.""" return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -def test_chunk_pdf(tmp_path, tokenizer_model_name): - pdf_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.pdf")) - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "filepaths": [pdf_path], - "taxonomy_path": "knowledge", - } - ] +@pytest.mark.parametrize( + "doc_type, expected_chunks, contains_text", + [ + ("pdf", 9, "Phoenix is a minor constellation"), + ("md", 7, None), # Assuming there's no specific text to check in Markdown + ], +) +def test_chunk_documents( + tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks, contains_text +): + """ + Generalized test function for chunking documents. 
+ """ + document_path = test_paths[doc_type] chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=tmp_path, + document_paths=[document_path], output_dir=tmp_path, - server_ctx_size=4096, - chunk_word_count=500, tokenizer_model_name=tokenizer_model_name, - ) - chunks = chunker.chunk_documents() - assert len(chunks) > 9 - assert "Phoenix is a minor constellation" in chunks[0] - for chunk in chunks: - # inexact sanity-checking of chunk max length - assert len(chunk) < 2500 - - -def test_chunk_md(tmp_path, tokenizer_model_name): - markdown_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.md")) - leaf_node = [ - { - "documents": [markdown_path.read_text(encoding="utf-8")], - "filepaths": [markdown_path], - "taxonomy_path": "knowledge", - } - ] - chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=tmp_path, - output_dir=tmp_path, server_ctx_size=4096, chunk_word_count=500, - tokenizer_model_name=tokenizer_model_name, ) chunks = chunker.chunk_documents() - assert len(chunks) > 7 + assert len(chunks) > expected_chunks + if contains_text: + assert contains_text in chunks[0] for chunk in chunks: - # inexact sanity-checking of chunk max length assert len(chunk) < 2500 diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index 700758ae..42db1ac3 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -11,13 +11,7 @@ import pytest # First Party -from instructlab.sdg.utils.chunkers import ( - ContextAwareChunker, - DocumentChunker, - FileTypes, - TextSplitChunker, - resolve_ocr_options, -) +from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options # Local from .testdata import testdata @@ -35,65 +29,27 @@ def tokenizer_model_name(): return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -@pytest.mark.parametrize( - "filepaths, chunker_type", - [ - ([Path("document.md")], TextSplitChunker), - ([Path("document.pdf")], ContextAwareChunker), - ], -) -def test_chunker_factory(filepaths, chunker_type, documents_dir, tokenizer_model_name): - """Test that the DocumentChunker factory class returns the proper Chunker type""" - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "taxonomy_path": "", - "filepaths": filepaths, - } - ] - with tempfile.TemporaryDirectory() as temp_dir: - chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, - output_dir=temp_dir, - tokenizer_model_name=tokenizer_model_name, - ) - assert isinstance(chunker, chunker_type) - - -def test_chunker_factory_unsupported_filetype(documents_dir, tokenizer_model_name): +def test_init_document_chunker_unsupported_filetype( + documents_dir, tokenizer_model_name +): """Test that the DocumentChunker factory class fails when provided an unsupported document""" - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "taxonomy_path": "", - "filepaths": [Path("document.jpg")], - } - ] + document_paths = [documents_dir / "document.jpg"] with pytest.raises(ValueError): with tempfile.TemporaryDirectory() as temp_dir: _ = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, + document_paths=document_paths, output_dir=temp_dir, tokenizer_model_name=tokenizer_model_name, ) -def test_chunker_factory_empty_filetype(documents_dir): +def test_chunker_factory_empty_document_paths(tokenizer_model_name): """Test that the DocumentChunker factory class fails when provided no document""" - leaf_node = [ - { - "documents": [], - "taxonomy_path": "", - "filepaths": [], - } - ] + document_paths = [] with 
pytest.raises(ValueError): with tempfile.TemporaryDirectory() as temp_dir: _ = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, + document_paths=document_paths, output_dir=temp_dir, tokenizer_model_name=tokenizer_model_name, ) @@ -149,7 +105,7 @@ def test_resolve_ocr_options_none_found_logs_error( def test_create_tokenizer(tokenizer_model_name): - ContextAwareChunker.create_tokenizer(tokenizer_model_name) + DocumentChunker.create_tokenizer(tokenizer_model_name) @pytest.mark.parametrize( @@ -163,4 +119,4 @@ def test_create_tokenizer(tokenizer_model_name): def test_invalid_tokenizer(model_name): model_path = os.path.join(TEST_DATA_DIR, model_name) with pytest.raises(ValueError): - ContextAwareChunker.create_tokenizer(model_path) + DocumentChunker.create_tokenizer(model_path) diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index c5415636..c2968029 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -312,8 +312,10 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -395,8 +397,10 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -499,8 +503,10 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index 0828e187..0b697bd7 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -11,6 +11,7 @@ # First Party from instructlab.sdg.utils import taxonomy +from instructlab.sdg.utils.taxonomy import _string_contains_html TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" @@ -85,3 +86,14 @@ def test_read_taxonomy_leaf_nodes( ): seed_example_exists = True assert seed_example_exists is True + + @pytest.mark.parametrize( + "s, contains_html", + [ + ("hello, world!", False), + ("hello,
 <table><tr><th>Type</th><th>Epithelium</th><th>Capsule</th><th>Crypts</th><th>Location</th></tr><tr><td>Pharyngeal tonsil (also termed 'adenoid')</td><td>Ciliated pseudostratified columnar (respiratory epithelium)</td><td>Incompletely encapsulated</td><td>Small folds—sometimes described as crypts</td><td>Roof of pharynx</td></tr><tr><td></td><td>Ciliated pseudostratified columnar (respiratory epithelium)</td><td>Not encapsulated</td><td>No crypts</td><td>Roof of pharynx</td></tr><tr><td></td><td>Stratified squamous epithelium</td><td>Fully encapsulated</td><td>Multiple deep crypts</td><td>Each side of the throat at the back of the mouth</td></tr></table>", True),
+        ],
+    )
+    def test_string_contains_html(self, s, contains_html):
+        assert _string_contains_html(s) == contains_html
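
For quick reference, below is a minimal sketch of how the refactored `DocumentChunker` is constructed after this change. It mirrors the updated functional test in `tests/functional/test_chunkers.py`; the document path, output directory, and tokenizer model location are illustrative placeholders, not values from this patch.

```python
from pathlib import Path

from instructlab.sdg.utils.chunkers import DocumentChunker

# Illustrative inputs -- substitute real documents and a real tokenizer model.
# Only .pdf and .md files are supported (see SUPPORTED_FILETYPES).
document_paths = [Path("sample_documents/phoenix.pdf")]

chunker = DocumentChunker(
    document_paths=document_paths,
    output_dir=Path("/tmp/chunker-output"),   # where docling artifacts are written
    tokenizer_model_name="models/instructlab/granite-7b-lab",
    server_ctx_size=4096,     # defaults shown explicitly for clarity
    chunk_word_count=1024,
)

chunks = chunker.chunk_documents()  # returns a list of chunk strings
```

As the new constructor enforces, passing an empty `document_paths` list, mixing filetypes, or including an unsupported extension raises `ValueError` before any conversion work starts.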