From c562f8c4b0b0d6239e6f785168cdda43d094d340 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Sat, 23 Nov 2024 14:32:58 -0500 Subject: [PATCH 01/11] Replace DocumentChunker factory with docling-based chunker by default Signed-off-by: Khaled Sulayman --- src/instructlab/sdg/utils/chunkers.py | 384 ++++++++++++++------------ src/instructlab/sdg/utils/taxonomy.py | 6 +- 2 files changed, 214 insertions(+), 176 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index d8de36de..b292239c 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -3,7 +3,7 @@ from collections import defaultdict from enum import Enum from pathlib import Path -from typing import DefaultDict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple import json import logging import re @@ -26,6 +26,7 @@ logger = logging.getLogger(__name__) _DEFAULT_CHUNK_OVERLAP = 100 +SUPPORTED_FILETYPES = [".pdf", ".md"] def _num_tokens_from_words(num_words) -> int: @@ -68,185 +69,211 @@ def resolve_ocr_options() -> OcrOptions: return None -class FileTypes(Enum): - MD = ".md" - PDF = ".pdf" - - -class ChunkerBase(ABC): - @abstractmethod - def chunk_documents(self): - pass - - -class DocumentChunker: - """A factory chunker class that instantiates the applicable chunker - - Currently, only Markdown and PDF are supported. For Markdown, returns - TextSplitChunker, and for PDF, returns ContextAwareChunker""" - - def __new__( - cls, - leaf_node, - taxonomy_path, - output_dir: Path, - server_ctx_size=4096, - chunk_word_count=1024, - tokenizer_model_name: Optional[str] = None, - docling_model_path: Optional[str] = None, - ): - """Insantiate the appropriate chunker for the provided document - - Args: - leaf_node: a leaf node dict containing "documents", - "filepaths", and "taxonomy_path" keys - output_dir (Path): directory where artifacts should be stored - server_ctx_size (int): Context window size of server - chunk_word_count (int): Maximum number of words to chunk a document - tokenizer_model_name (Optional[str]): name of huggingface model to get - tokenizer from - Returns: - TextSplitChunker | ContextAwareChunker: Object of the appropriate - chunker class for the provided filetype - """ - documents = leaf_node[0]["documents"] - - if not isinstance(taxonomy_path, Path): - taxonomy_path = Path(taxonomy_path) - - if isinstance(documents, str): - documents = [documents] - logger.info( - "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input." - ) - elif not isinstance(documents, list): - raise TypeError( - "Expected: documents to be a list, but got {}".format(type(documents)) - ) - - filepaths = leaf_node[0]["filepaths"] - - doc_dict = cls._split_docs_by_filetype(documents, filepaths) - if len(doc_dict.keys()) > 1: - raise ValueError("Received multiple document types") - if len(doc_dict.keys()) < 1: - raise ValueError("Received no document types") - - if FileTypes.MD in doc_dict: - doc_contents = [d for d, _ in doc_dict[FileTypes.MD]] - return TextSplitChunker( - doc_contents, - server_ctx_size, - chunk_word_count, - output_dir, - ) - - if FileTypes.PDF in doc_dict: - doc_paths = [p for _, p in doc_dict[FileTypes.PDF]] - return ContextAwareChunker( - doc_paths, - filepaths, - output_dir, - chunk_word_count, - tokenizer_model_name, - docling_model_path=docling_model_path, - ) - - @staticmethod - def _split_docs_by_filetype( - documents: List[str], filepaths: List[Path] - ) -> DefaultDict[FileTypes, List[Tuple[str, Path]]]: - """Separate documents into lists based on their filetype. - - Currently, only Markdown and PDF are supported. - Args: - documents (List[str]): A list of the document contents as strings - filepaths (List[Path]): Corresponding document filepaths - Returns: - DefaultDict: Dictionary with either ".md" or ".pdf" as a key. - Markdown items contain document contents, PDF items contain - paths to documents. - """ - doc_dict = defaultdict(list) - for doc, path in zip(documents, filepaths): - if path.suffix == ".md": - # append doc contents - doc_dict[FileTypes.MD].append((doc, path)) - elif path.suffix == ".pdf": - # append doc paths - doc_dict[FileTypes.PDF].append((doc, path)) - else: - raise ValueError( - f"Received document of type .{path.suffix}, which is not a supported filetype" - ) - return doc_dict - +def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: + """Split document paths into a dict of lists based on their file extension. + """ + document_dict = defaultdict(list) + for path in document_paths: + filetype = path.suffix + if filetype not in SUPPORTED_FILETYPES: + raise ValueError(f"Provided unsupported filetype {filetype}") + + document_dict[filetype].append(path) + + return dict(document_dict) + + + +# class DocumentChunker: +# """A factory chunker class that instantiates the applicable chunker + +# Currently, only Markdown and PDF are supported. For Markdown, returns +# TextSplitChunker, and for PDF, returns ContextAwareChunker""" + +# def __new__( +# cls, +# doc_filepaths: List[Path], +# output_dir: Path, +# server_ctx_size=4096, +# chunk_word_count=1024, +# tokenizer_model_name: Optional[str] = None, +# docling_model_path: Optional[str] = None, +# ): +# """Insantiate the appropriate chunker for the provided document + +# Args: +# leaf_node: a leaf node dict containing "documents", +# "filepaths", and "taxonomy_path" keys +# output_dir (Path): directory where artifacts should be stored +# server_ctx_size (int): Context window size of server +# chunk_word_count (int): Maximum number of words to chunk a document +# tokenizer_model_name (Optional[str]): name of huggingface model to get +# tokenizer from +# Returns: +# TextSplitChunker | ContextAwareChunker: Object of the appropriate +# chunker class for the provided filetype +# """ +# documents = leaf_node[0]["documents"] + +# if not isinstance(taxonomy_path, Path): +# taxonomy_path = Path(taxonomy_path) + +# if isinstance(documents, str): +# documents = [documents] +# logger.info( +# "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input." +# ) +# elif not isinstance(documents, list): +# raise TypeError( +# "Expected: documents to be a list, but got {}".format(type(documents)) +# ) + +# filepaths = leaf_node[0]["filepaths"] + +# doc_dict = cls._split_docs_by_filetype(documents, filepaths) +# if len(doc_dict.keys()) > 1: +# raise ValueError("Received multiple document types") +# if len(doc_dict.keys()) < 1: +# raise ValueError("Received no document types") + +# if SupportedFileTypes.MD in doc_dict: +# doc_contents = [d for d, _ in doc_dict[SupportedFileTypes.MD]] +# # return TextSplitChunker( +# # doc_contents, +# # server_ctx_size, +# # chunk_word_count, +# # output_dir, +# # ) + +# # TODO CHUNK AS MARKDOWN +# pass + +# if SupportedFileTypes.PDF in doc_dict: +# doc_paths = [p for _, p in doc_dict[SupportedFileTypes.PDF]] +# # return ContextAwareChunker( +# # doc_paths, +# # filepaths, +# # output_dir, +# # chunk_word_count, +# # tokenizer_model_name, +# # docling_model_path=docling_model_path, +# # ) + +# # TODO CHUNK AS PDF +# pass + +# @staticmethod +# def _split_docs_by_filetype( +# documents: List[str], filepaths: List[Path] +# ) -> DefaultDict[SupportedFileTypes, List[Tuple[str, Path]]]: +# """Separate documents into lists based on their filetype. + +# Currently, only Markdown and PDF are supported. +# Args: +# documents (List[str]): A list of the document contents as strings +# filepaths (List[Path]): Corresponding document filepaths +# Returns: +# DefaultDict: Dictionary with either ".md" or ".pdf" as a key. +# Markdown items contain document contents, PDF items contain +# paths to documents. +# """ +# doc_dict = defaultdict(list) +# for doc, path in zip(documents, filepaths): +# if path.suffix == ".md": +# # append doc contents +# doc_dict[SupportedFileTypes.MD].append((doc, path)) +# elif path.suffix == ".pdf": +# # append doc paths +# doc_dict[SupportedFileTypes.PDF].append((doc, path)) +# else: +# raise ValueError( +# f"Received document of type .{path.suffix}, which is not a supported filetype" +# ) +# return doc_dict + + +# class TextSplitChunker(ChunkerBase): +# def __init__( +# self, +# document_contents: List | str, +# server_ctx_size: int, +# chunk_word_count: int, +# output_dir: Path, +# ): +# self.document_contents = document_contents +# self.server_ctx_size = server_ctx_size +# self.chunk_word_count = chunk_word_count +# self.output_dir = output_dir + +# def chunk_documents(self) -> List: +# """Naively chunk markdown documents based on the word count provided by the user. +# Returns: +# List[str]: List of chunked documents. +# """ +# num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) +# if num_tokens_per_doc > int(self.server_ctx_size - 1024): +# raise ValueError( +# "Error: {}".format( +# str( +# f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})" +# ) +# ) +# ) +# if self.document_contents == []: +# return [] + +# chunk_size = _num_chars_from_tokens(num_tokens_per_doc) +# return chunk_markdowns(self.document_contents, chunk_size) + + +class DocumentChunker(): # pylint: disable=too-many-instance-attributes + + # def __new__( + # cls, + # leaf_node, + # taxonomy_path, + # output_dir: Path, + # server_ctx_size=4096, + # chunk_word_count=1024, + # tokenizer_model_name: Optional[str] = None, + # docling_model_path: Optional[str] = None, + # ): -class TextSplitChunker(ChunkerBase): def __init__( self, - document_contents: List | str, - server_ctx_size: int, - chunk_word_count: int, + document_paths: List[Path], output_dir: Path, + tokenizer_model_name: str, + docling_model_path: Optional[Path] = None, + server_ctx_size: int = 4096, + chunk_word_count: int = 1024, ): - self.document_contents = document_contents - self.server_ctx_size = server_ctx_size - self.chunk_word_count = chunk_word_count - self.output_dir = output_dir + if len(document_paths) == 0: + raise ValueError("Provided empty list of documents") - def chunk_documents(self) -> List: - """Naively chunk markdown documents based on the word count provided by the user. - Returns: - List[str]: List of chunked documents. - """ - num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) - if num_tokens_per_doc > int(self.server_ctx_size - 1024): - raise ValueError( - "Error: {}".format( - str( - f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})" - ) - ) - ) - if self.document_contents == []: - return [] + document_dict = split_docs_by_filetype(document_paths) - chunk_size = _num_chars_from_tokens(num_tokens_per_doc) - return chunk_markdowns(self.document_contents, chunk_size) + if len(document_dict) > 1: + raise ValueError("Provided multiple document types") + # We know there is only 1 key, value pair, so we take the first + self.document_filetype, self.document_paths = next(iter(document_dict.items())) + self.docling_model_path = docling_model_path + self.converter = self._init_docling_converter() -class ContextAwareChunker(ChunkerBase): # pylint: disable=too-many-instance-attributes - def __init__( - self, - document_paths, - filepaths, - output_dir: Path, - chunk_word_count: int, - tokenizer_model_name: Optional[str], - docling_model_path=None, - ): - self.document_paths = document_paths - self.filepaths = filepaths self.output_dir = self._path_validator(output_dir) + self.server_ctx_size = server_ctx_size self.chunk_word_count = chunk_word_count self.tokenizer = self.create_tokenizer(tokenizer_model_name) - self.docling_model_path = docling_model_path - - def chunk_documents(self) -> List: - """Semantically chunk PDF documents. - Returns: - List: a list of chunks from the documents + def _init_docling_converter(self): + """Initialize docling converter with filetype-specific configurations """ # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel # Third Party - from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline - - if self.document_paths == []: - return [] + from docling.document_converter import DocumentConverter, PdfFormatOption if self.docling_model_path is None: logger.info("Docling models not found on disk, downloading models...") @@ -258,17 +285,29 @@ def chunk_documents(self) -> List: artifacts_path=self.docling_model_path, do_ocr=False, ) + ocr_options = resolve_ocr_options() if ocr_options is not None: pipeline_options.do_ocr = True pipeline_options.ocr_options = ocr_options - converter = DocumentConverter( + return DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) } ) - parsed_documents = converter.convert_all(self.filepaths) + + def chunk_documents(self) -> List: + """Split a list of documents into chunks + + Returns: + List: a list of chunks from the documents + """ + + if self.document_paths == []: + return [] + + parsed_documents = self.converter.convert_all(self.document_paths) docling_artifacts_path = self.export_documents(parsed_documents) @@ -348,7 +387,7 @@ def fuse_texts( return fused_texts @staticmethod - def create_tokenizer(model_name: Optional[str]): + def create_tokenizer(model_name: str): """ Create a tokenizer instance from a pre-trained model or a local directory. @@ -363,10 +402,7 @@ def create_tokenizer(model_name: Optional[str]): # Third Party from transformers import AutoTokenizer - if model_name is None: - raise TypeError("No model path provided") - - model_path = Path(model_name) + model_path = Path(model_name) # TODO expect a path from the DocumentChunker constructor error_info_message = ( "Please run `ilab model download {download_args}` and try again" ) @@ -486,7 +522,7 @@ def get_table_page_number(self, json_book, idx): prev_page_num = book_element["prov"][0]["page"] break for book_element in json_book["main-text"][idx:]: - if "prov" in book_element: + if "prov" in book_element and book_element["prov"]: next_page_num = book_element["prov"][0]["page"] break if prev_page_num is not None and next_page_num is not None: @@ -545,8 +581,12 @@ def build_chunks_from_docling_json( current_book_page_number = self.get_table_page_number( json_book, idx ) + book_text = self.get_table(json_book, book_element["$ref"]) + elif book_element["prov"]: + current_book_page_number = book_element["prov"][0]["page"] # TODO export to function to handle empty ["prov"] + book_text = book_element["text"] else: - current_book_page_number = book_element["prov"][0]["page"] + current_book_page_number = None book_text = book_element["text"] if book_element["type"] == "subtitle-level-1": @@ -599,8 +639,6 @@ def build_chunks_from_docling_json( if book_element["type"] == "paragraph": book_text = self.add_heading_formatting(book_text) - elif book_element["type"] == "table": - book_text = self.get_table(json_book, book_element["$ref"]) if "## References" in book_text or "## Acknowledgements" in book_text: # For research papers we ignore everything after this sections break diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 24592fe1..db66f1ae 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -425,13 +425,13 @@ def _knowledge_leaf_node_to_samples( model_name, docling_model_path=None, ): + document_paths = leaf_node[0]["filepaths"] chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=taxonomy_path, + document_paths=document_paths, output_dir=document_output_dir, + tokenizer_model_name=model_name, server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, - tokenizer_model_name=model_name, docling_model_path=docling_model_path, ) chunks = chunker.chunk_documents() From feff382e8e8ca6462118b001ce93831934cddd68 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Thu, 5 Dec 2024 17:05:57 -0500 Subject: [PATCH 02/11] Modify tests to use new DocumentChunker interface Signed-off-by: Khaled Sulayman --- tests/functional/test_chunkers.py | 10 +++--- tests/test_chunkers.py | 59 +++++-------------------------- tests/test_generate_data.py | 12 +++---- 3 files changed, 18 insertions(+), 63 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index 217cb889..d364ac48 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -26,12 +26,11 @@ def test_chunk_pdf(tmp_path, tokenizer_model_name): } ] chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=tmp_path, + document_paths=[pdf_path], output_dir=tmp_path, + tokenizer_model_name=tokenizer_model_name, server_ctx_size=4096, chunk_word_count=500, - tokenizer_model_name=tokenizer_model_name, ) chunks = chunker.chunk_documents() assert len(chunks) > 9 @@ -51,12 +50,11 @@ def test_chunk_md(tmp_path, tokenizer_model_name): } ] chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=tmp_path, + document_paths=[markdown_path], output_dir=tmp_path, + tokenizer_model_name=tokenizer_model_name, server_ctx_size=4096, chunk_word_count=500, - tokenizer_model_name=tokenizer_model_name, ) chunks = chunker.chunk_documents() assert len(chunks) > 7 diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index 700758ae..ff1079d5 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -12,10 +12,7 @@ # First Party from instructlab.sdg.utils.chunkers import ( - ContextAwareChunker, DocumentChunker, - FileTypes, - TextSplitChunker, resolve_ocr_options, ) @@ -35,65 +32,25 @@ def tokenizer_model_name(): return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -@pytest.mark.parametrize( - "filepaths, chunker_type", - [ - ([Path("document.md")], TextSplitChunker), - ([Path("document.pdf")], ContextAwareChunker), - ], -) -def test_chunker_factory(filepaths, chunker_type, documents_dir, tokenizer_model_name): - """Test that the DocumentChunker factory class returns the proper Chunker type""" - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "taxonomy_path": "", - "filepaths": filepaths, - } - ] - with tempfile.TemporaryDirectory() as temp_dir: - chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, - output_dir=temp_dir, - tokenizer_model_name=tokenizer_model_name, - ) - assert isinstance(chunker, chunker_type) - - -def test_chunker_factory_unsupported_filetype(documents_dir, tokenizer_model_name): +def test_init_document_chunker_unsupported_filetype(documents_dir, tokenizer_model_name): """Test that the DocumentChunker factory class fails when provided an unsupported document""" - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "taxonomy_path": "", - "filepaths": [Path("document.jpg")], - } - ] + document_paths = [documents_dir / "document.jpg"] with pytest.raises(ValueError): with tempfile.TemporaryDirectory() as temp_dir: _ = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, + document_paths=document_paths, output_dir=temp_dir, tokenizer_model_name=tokenizer_model_name, ) -def test_chunker_factory_empty_filetype(documents_dir): +def test_chunker_factory_empty_document_paths(tokenizer_model_name): """Test that the DocumentChunker factory class fails when provided no document""" - leaf_node = [ - { - "documents": [], - "taxonomy_path": "", - "filepaths": [], - } - ] + document_paths = [] with pytest.raises(ValueError): with tempfile.TemporaryDirectory() as temp_dir: _ = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, + document_paths=document_paths, output_dir=temp_dir, tokenizer_model_name=tokenizer_model_name, ) @@ -149,7 +106,7 @@ def test_resolve_ocr_options_none_found_logs_error( def test_create_tokenizer(tokenizer_model_name): - ContextAwareChunker.create_tokenizer(tokenizer_model_name) + DocumentChunker.create_tokenizer(tokenizer_model_name) @pytest.mark.parametrize( @@ -163,4 +120,4 @@ def test_create_tokenizer(tokenizer_model_name): def test_invalid_tokenizer(model_name): model_path = os.path.join(TEST_DATA_DIR, model_name) with pytest.raises(ValueError): - ContextAwareChunker.create_tokenizer(model_path) + DocumentChunker.create_tokenizer(model_path) diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index c5415636..650df21a 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -312,8 +312,8 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -395,8 +395,8 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -499,8 +499,8 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, From 759a04d1bea2959086d60fd5955eb955d2b3dd65 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Tue, 17 Dec 2024 16:48:47 +0400 Subject: [PATCH 03/11] Check for html in markdown files and error out Signed-off-by: Khaled Sulayman --- src/instructlab/sdg/utils/taxonomy.py | 18 ++++++++++++++++++ tests/test_taxonomy.py | 12 ++++++++++++ 2 files changed, 30 insertions(+) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index db66f1ae..f965e1d9 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -110,6 +110,18 @@ def _get_taxonomy(repo="taxonomy"): return taxonomy_file_paths +def _string_contains_html(s: str) -> bool: + """Detect HTML tags in a string. + + We use this to catch markdown files that may contain html elements since + docling does not support this.""" + # Define a regex to detect HTML tags + html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>") + + # Check for HTML tags in the content + return bool(html_tag_pattern.search(s)) + + def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, @@ -161,6 +173,12 @@ def _get_documents( # Process Markdown files with open(file_path, "r", encoding="utf-8") as file: content = file.read() + if _string_contains_html(content): + raise ValueError(f"Provided markdown file {file_path} contains" + " HTML, which is currently unsupported. Please" + " format your markdown documents without the" + " use of HTML or use a different document" + " filetype.") file_contents.append(content) filepaths.append(Path(file_path)) logger.info( diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index 0828e187..32114fcc 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -11,6 +11,7 @@ # First Party from instructlab.sdg.utils import taxonomy +from instructlab.sdg.utils.taxonomy import _string_contains_html TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" @@ -85,3 +86,14 @@ def test_read_taxonomy_leaf_nodes( ): seed_example_exists = True assert seed_example_exists is True + + @pytest.mark.parametrize( + "s, contains_html", + [ + ("hello, world!", False), + ("hello,
world!
", True), + ] + ) + def test_string_contains_html(self, s, contains_html): + print(taxonomy.__dict__) + assert _string_contains_html(s) == contains_html From 8af10d4c4e5b054c94405955b0255eaeeef1eca2 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Wed, 18 Dec 2024 16:42:46 +0400 Subject: [PATCH 04/11] replace test_valid_knowledge_skill.yaml with example with no html Signed-off-by: Khaled Sulayman --- .../testdata/test_valid_knowledge_skill.yaml | 322 ++++++++++-------- 1 file changed, 172 insertions(+), 150 deletions(-) diff --git a/tests/testdata/test_valid_knowledge_skill.yaml b/tests/testdata/test_valid_knowledge_skill.yaml index 705acb41..be4c3ef9 100644 --- a/tests/testdata/test_valid_knowledge_skill.yaml +++ b/tests/testdata/test_valid_knowledge_skill.yaml @@ -1,176 +1,198 @@ -created_by: lukeinglis -domain: anatomy_tonsil version: 3 +domain: astronomy +created_by: juliadenham seed_examples: - context: | - ## Structure - Humans are born with four types of tonsils: the pharyngeal tonsil, two - tubal tonsils, two palatine tonsils, and the lingual tonsils.[1] - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + **Phoenix** is a minor [constellation](constellation "wikilink") in the + [southern sky](southern_sky "wikilink"). Named after the mythical + [phoenix](Phoenix_(mythology) "wikilink"), it was first depicted on a + celestial atlas by [Johann Bayer](Johann_Bayer "wikilink") in his 1603 + *[Uranometria](Uranometria "wikilink")*. The French explorer and + astronomer [Nicolas Louis de + Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted the brighter + stars and gave their [Bayer designations](Bayer_designation "wikilink") + in 1756. The constellation stretches from roughly −39 degrees to −57 degrees + [declination](declination "wikilink"), and from 23.5h to 2.5h of [right + ascension](right_ascension "wikilink"). The constellations Phoenix, + [Grus](Grus_(constellation) "wikilink"), + [Pavo](Pavo_(constellation) "wikilink") and [Tucana](Tucana "wikilink"), + are known as the Southern Birds. questions_and_answers: - - question: What is the location of the tubal tonsils? - answer: The location of the tubal tonsils is the roof of the pharynx. - question: | - Compare the epithelial types, encapsulation, and presence of - crypts in the pharyngeal, tubal, and palatine tonsils according to the - table provided. + What is the Phoenix constellation? answer: | - The pharyngeal tonsil features ciliated pseudostratified columnar - epithelium and is incompletely encapsulated with small folds sometimes - described as crypts. The tubal tonsils also have ciliated - pseudostratified columnar epithelium but are not encapsulated and do - not possess crypts. In contrast, the palatine tonsils are covered with - stratified squamous epithelium, are fully encapsulated, and contain - multiple deep crypts. These structural differences are indicative of - their varied anatomical locations and potentially their distinct - functions within the immune system. - - question: What type of epithelium is found in the pharyngeal tonsil? + Phoenix is a minor constellation in the southern sky. + - question: | + Who charted the Phoenix constellation? + answer: | + The Phoenix constellation was charted by french explorer and + astronomer Nicolas Louis de Lacaille. + - question: | + How far does the Phoenix constellation stretch? answer: | - The type of epithelium found in the pharyngeal tonsil is ciliated - pseudostratified columnar (respiratory epithelium). - - + The phoenix constellation stretches from roughly −39° to −57° + declination, and from 23.5h to 2.5h of right ascension. - context: | - The **tonsils** are a set of [lymphoid](Lymphatic_system "wikilink") - organs facing into the aerodigestive tract, which is known as - [Waldeyer's tonsillar ring](Waldeyer's_tonsillar_ring "wikilink") and - consists of the [adenoid tonsil](adenoid "wikilink") (or pharyngeal - tonsil), two [tubal tonsils](tubal_tonsil "wikilink"), two [palatine - tonsils](palatine_tonsil "wikilink"), and the [lingual - tonsils](lingual_tonsil "wikilink"). These organs play an important role - in the immune system. - + Phoenix was the largest of the 12 constellations established by [Petrus + Plancius](Petrus_Plancius "wikilink") from the observations of [Pieter + Dirkszoon Keyser](Pieter_Dirkszoon_Keyser "wikilink") and [Frederick de + Houtman](Frederick_de_Houtman "wikilink"). It first appeared on a 35cm + diameter celestial globe published in 1597 (or 1598) in Amsterdam by + Plancius with [Jodocus Hondius](Jodocus_Hondius "wikilink"). The first + depiction of this constellation in a celestial atlas was in [Johann + Bayer](Johann_Bayer "wikilink")'s + *[Uranometria](Uranometria "wikilink")* of 1603. De Houtman included + it in his southern star catalog the same year under the Dutch name *Den + voghel Fenicx*, "The Bird Phoenix", symbolising the + [phoenix](Phoenix_(mythology) "wikilink") of classical mythology. One + name of the brightest star [Alpha + Phoenicis](Alpha_Phoenicis "wikilink")—Ankaa—is derived from the Arabic: + العنقاء, romanized: al-‘anqā’, lit. 'the phoenix', and + was coined sometime after 1800 in relation to the constellation. questions_and_answers: - - question: What is the immune system's first line of defense? + - question: | + What is the brightest star in the Phoenix constellation + called? answer: | - The tonsils are the immune system's first line of defense against - ingested or inhaled foreign pathogens. - - question: What is Waldeyer's tonsillar ring? + Alpha Phoenicis or Ankaa is the brightest star in the Phoenix + Constellation. + - question: Where did the Phoenix constellation first appear? answer: | - Waldeyer's tonsillar ring is a set of lymphoid organs facing into the - aerodigestive tract, consisting of the adenoid tonsil, two tubal - tonsils, two palatine tonsils, and the lingual tonsils. - - question: How many tubal tonsils are part of Waldeyer's tonsillar ring? - answer: There are two tubal tonsils as part of Waldeyer's tonsillar ring. - + The Phoenix constellation first appeared on a 35-cm diameter + celestial globe published in 1597 (or 1598) in Amsterdam by + Plancius with Jodocus Hondius. + - question: | + What does "The Bird Phoenix" symbolize? + answer: | + "The Bird Phoenix" symbolizes the phoenix of classical mythology. - context: | - The palatine tonsils tend to reach their largest size in [puberty](puberty - "wikilink"), and they gradually undergo [atrophy](atrophy "wikilink") - thereafter. However, they are largest relative to the diameter of the - throat in young children. In adults, each palatine tonsil normally - measures up to 2.5 cm in length, 2.0 cm in width and 1.2 cm in - thickness.[2] - + Phoenix is a small constellation bordered by [Fornax](Fornax "wikilink") + and Sculptor to the north, Grus to the west, Tucana to the south, + touching on the corner of [Hydrus](Hydrus "wikilink") to the south, and + [Eridanus](Eridanus_(constellation) "wikilink") to the east and + southeast. The bright star [Achernar](Achernar "wikilink") is + nearby. The three-letter abbreviation for the constellation, as + adopted by the [International Astronomical + Union](International_Astronomical_Union "wikilink") in 1922, is + "Phe". The official constellation boundaries, as set by Belgian + astronomer [Eugène Delporte](Eugène_Joseph_Delporte "wikilink") in 1930, + are defined by a polygon of 10 segments. In the [equatorial coordinate + system](equatorial_coordinate_system "wikilink"), the [right + ascension](right_ascension "wikilink") coordinates of these borders lie + between 23h 26.5m and 02h 25.0m, + while the [declination](declination "wikilink") + coordinates are between −39.31° and −57.84°. This means it remains + below the horizon to anyone living north of the [40th + parallel](40th_parallel_north "wikilink") in the [Northern + Hemisphere](Northern_Hemisphere "wikilink"), and remains low in the sky + for anyone living north of the [equator](equator "wikilink"). It is most + visible from locations such as Australia and South Africa during late + [Southern Hemisphere](Southern_Hemisphere "wikilink") spring. Most + of the constellation lies within, and can be located by, forming a + triangle of the bright stars Achernar, [Fomalhaut](Fomalhaut "wikilink") + and [Beta Ceti](Beta_Ceti "wikilink")—Ankaa lies roughly in the centre + of this. questions_and_answers: - - question: When do the palatine tonsils tend to reach their largest size? - answer: The palatine tonsils tend to reach their largest size in puberty. - - question: What are the typical dimensions of an adult palatine tonsil? + - question: What are the characteristics of the Phoenix constellation? + answer: | + Phoenix is a small constellation bordered by Fornax and Sculptor to + the north, Grus to the west, Tucana to the south, touching on the + corner of Hydrus to the south, and Eridanus to the east and southeast. + The bright star Achernar is nearby. + - question: | + When is the phoenix constellation most visible? answer: | - In adults, each palatine tonsil normally measures up to 2.5 cm in - length, 2.0 cm in width, and 1.2 cm in thickness. - - question: How do the palatine tonsils change in size with age? + Phoenix is most visible from locations such as Australia and + South Africa during late Southern Hemisphere spring. + - question: | + What are the Phoenix Constellation boundaries? answer: | - The palatine tonsils tend to gradually undergo atrophy after puberty, - becoming smaller in size compared to their dimensions in young - children. - + The official constellation boundaries for Phoenix, as set by Belgian + astronomer Eugène Delporte in 1930, are defined by a polygon of 10 + segments. - context: | - The tonsils are immunocompetent organs that serve as the immune system's - first line of defense against ingested or inhaled foreign pathogens, and - as such frequently engorge with blood to assist in immune responses to - common illnesses such as the common cold. The tonsils have on their - surface specialized antigen capture cells called [microfold - cells](microfold_cell "wikilink") (M cells) that allow for the uptake of - antigens produced by pathogens. These M cells then alert the B cells and T - cells in the tonsil that a pathogen is present and an immune response is - stimulated.[3] B cells are activated and proliferate in areas called - germinal centers in the tonsil. These germinal centers are places where B - memory cells are created and [secretory antibody (IgA)](Immunoglobulin_A - "wikilink") is produced. - + Ten stars have been found to have planets to date, and four planetary + systems have been discovered with the [SuperWASP](SuperWASP "wikilink") + project. [HD 142](HD_142 "wikilink") is a yellow giant that has an + apparent magnitude of 5.7, and has a planet ([HD 142b](HD_142_b + "wikilink")) 1.36 times the mass of Jupiter which orbits every 328 days. + [HD 2039](HD_2039 "wikilink") is a yellow subgiant with an apparent + magnitude of 9.0 around 330 light years away which has a planet ([HD 2039 + b](HD_2039_b "wikilink")) six times the mass of Jupiter. [WASP-18](WASP-18 + "wikilink") is a star of magnitude 9.29 which was discovered to have a hot + Jupiter-like planet ([WASP-18b](WASP-18b "wikilink")) taking less than a + day to orbit the star. The planet is suspected to be causing WASP-18 to + appear older than it really is. [WASP-4](WASP-4 "wikilink") and + [WASP-5](WASP-5 "wikilink") are solar-type yellow stars around 1000 + light years distant and of 13th magnitude, each with a single planet + larger than Jupiter. [WASP-29](WASP-29 "wikilink") is an orange + dwarf of spectral type K4V and visual magnitude 11.3, which has a + planetary companion of similar size and mass to Saturn. The planet + completes an orbit every 3.9 days. questions_and_answers: - - question: | - What are the specialized antigen capture cells on the surface of the - tonsils called? + - question: In the Phoenix constellation, how many stars have planets? answer: | - The specialized antigen capture cells on the surface of the tonsils - are called microfold cells (M cells). - - question: What is the role of microfold cells in the tonsils? + In the Phoenix constellation, ten stars have been found to have + planets to date, and four planetary systems have been discovered + with the SuperWASP project. + - question: What is HD 142? + answer: | + HD 142 is a yellow giant that has an apparent magnitude of 5.7, and + has a planet (HD 142 b) 1.36 times the mass of Jupiter which + orbits every 328 days. + - question: | + Are WASP-4 and WASP-5 solar-type yellow stars? answer: | - Microfold cells (M cells) allow for the uptake of antigens produced by - pathogens. They alert the B cells and T cells in the tonsil that a - pathogen is present, stimulating an immune response. - - question: Where do B cells proliferate in the tonsils? - answer: B cells proliferate in areas called germinal centers in the tonsils. - + Yes, WASP-4 and WASP-5 are solar-type yellow stars around 1000 light + years distant and of 13th magnitude, each with a single planet + larger than Jupiter. - context: | - A [tonsillolith](tonsillolith "wikilink") (also known as a "tonsil stone") - is material that accumulates on the palatine tonsil. This can reach the - size of a [peppercorn](peppercorn "wikilink") and is white or cream in - color. The main substance is mostly [calcium](calcium "wikilink"), but it - has a strong unpleasant odor because of [hydrogen - sulfide](hydrogen_sulfide "wikilink") and [methyl - mercaptan](methyl_mercaptan "wikilink") and other chemicals.[6] - + The constellation does not lie on the + [galactic plane](galactic_plane "wikilink") of the Milky Way, and there + are no prominent star clusters. [NGC 625](NGC_625 "wikilink") is a dwarf + [irregular galaxy](irregular_galaxy "wikilink") of apparent magnitude 11.0 + and lying some 12.7 million light years distant. Only 24000 light years in + diameter, it is an outlying member of the [Sculptor Group](Sculptor_Group + "wikilink"). NGC 625 is thought to have been involved in a collision and + is experiencing a burst of [active star formation](Active_galactic_nucleus + "wikilink"). [NGC 37](NGC_37 "wikilink") is a + [lenticular galaxy](lenticular_galaxy "wikilink") of apparent magnitude + 14.66. It is approximately 42 [kiloparsecs](kiloparsecs "wikilink") + (137,000 [light-years](light-years "wikilink")) in diameter and about + 12.9 billion years old. [Robert's Quartet](Robert's_Quartet "wikilink") + (composed of the irregular galaxy [NGC 87](NGC_87 "wikilink"), and three + spiral galaxies [NGC 88](NGC_88 "wikilink"), [NGC 89](NGC_89 "wikilink") + and [NGC 92](NGC_92 "wikilink")) is a group of four galaxies located + around 160 million light-years away which are in the process of colliding + and merging. They are within a circle of radius of 1.6 arcmin, + corresponding to about 75,000 light-years. Located in the galaxy ESO + 243-49 is [HLX-1](HLX-1 "wikilink"), an + [intermediate-mass black hole](intermediate-mass_black_hole + "wikilink")—the first one of its kind identified. It is thought to be a + remnant of a dwarf galaxy that was absorbed in a + [collision](Interacting_galaxy "wikilink") with ESO 243-49. Before its + discovery, this class of black hole was only hypothesized. questions_and_answers: - - question: What is a tonsillolith? + - question: | + Is the Phoenix Constellation part of the Milky Way? + answer: | + The Phoenix constellation does not lie on the galactic plane of + the Milky Way, and there are no prominent star clusters. + - question: | + How many light years away is NGC 625? answer: | - A tonsillolith (tonsil stone) is material that accumulates on the - palatine tonsil, reaching the size of a peppercorn and having a white - or cream color. It contains calcium and has a strong unpleasant odor - due to hydrogen sulfide, methyl mercaptan, and other chemicals. - - question: What is the main substance found in a tonsillolith? - answer: The main substance found in a tonsillolith is mostly calcium. - - question: Why do tonsilloliths have a strong unpleasant odor? + NGC 625 is 24000 light years in diameter and is an outlying + member of the Sculptor Group. + - question: | + What is Robert's Quartet composed of? answer: | - Tonsilloliths have a strong unpleasant odor due to hydrogen sulfide, - methyl mercaptan, and other chemicals. - + Robert's Quartet is composed of the irregular galaxy NGC 87, + and three spiral galaxies NGC 88, NGC 89 and NGC 92. document_outline: | - Overview of Human tonsils, describing their types, locations, structure, - function, and clinical significance, with a specific focus on their role in - the immune system and related health issues. - + Information about the Phoenix Constellation including the + history, characteristics, and features of the stars in the constellation. document: - repo: https://github.com/luke-inglis/il-anatomy-knowledge - commit: cc7c6ca + repo: https://github.com/RedHatOfficial/rhelai-taxonomy-data + commit: c87a82eb15567f28c0a8d30025e0cd77a2150646 patterns: - - anatomy1.md + - phoenix.md From de8180b60955dfa26aef7d93e4478f68a1d7d414 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 6 Jan 2025 13:28:22 -0500 Subject: [PATCH 05/11] Update for ruff and pylint issues Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 26 ++++++++++++-------------- src/instructlab/sdg/utils/taxonomy.py | 18 ++++++++++-------- tests/test_chunkers.py | 9 ++++----- tests/test_generate_data.py | 12 +++++++++--- tests/test_taxonomy.py | 4 ++-- 5 files changed, 37 insertions(+), 32 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index b292239c..257c9cea 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -1,9 +1,7 @@ # Standard -from abc import ABC, abstractmethod from collections import defaultdict -from enum import Enum from pathlib import Path -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional import json import logging import re @@ -70,8 +68,7 @@ def resolve_ocr_options() -> OcrOptions: def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: - """Split document paths into a dict of lists based on their file extension. - """ + """Split document paths into a dict of lists based on their file extension.""" document_dict = defaultdict(list) for path in document_paths: filetype = path.suffix @@ -83,7 +80,6 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: return dict(document_dict) - # class DocumentChunker: # """A factory chunker class that instantiates the applicable chunker @@ -226,8 +222,7 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: # return chunk_markdowns(self.document_contents, chunk_size) -class DocumentChunker(): # pylint: disable=too-many-instance-attributes - +class DocumentChunker: # pylint: disable=too-many-instance-attributes # def __new__( # cls, # leaf_node, @@ -267,13 +262,12 @@ def __init__( self.tokenizer = self.create_tokenizer(tokenizer_model_name) def _init_docling_converter(self): - """Initialize docling converter with filetype-specific configurations - """ + """Initialize docling converter with filetype-specific configurations""" # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel # Third Party - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docling.document_converter import DocumentConverter, PdfFormatOption + from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline if self.docling_model_path is None: logger.info("Docling models not found on disk, downloading models...") @@ -285,7 +279,7 @@ def _init_docling_converter(self): artifacts_path=self.docling_model_path, do_ocr=False, ) - + ocr_options = resolve_ocr_options() if ocr_options is not None: pipeline_options.do_ocr = True @@ -402,7 +396,9 @@ def create_tokenizer(model_name: str): # Third Party from transformers import AutoTokenizer - model_path = Path(model_name) # TODO expect a path from the DocumentChunker constructor + model_path = Path( + model_name + ) # TODO expect a path from the DocumentChunker constructor error_info_message = ( "Please run `ilab model download {download_args}` and try again" ) @@ -583,7 +579,9 @@ def build_chunks_from_docling_json( ) book_text = self.get_table(json_book, book_element["$ref"]) elif book_element["prov"]: - current_book_page_number = book_element["prov"][0]["page"] # TODO export to function to handle empty ["prov"] + current_book_page_number = book_element["prov"][0][ + "page" + ] # TODO export to function to handle empty ["prov"] book_text = book_element["text"] else: current_book_page_number = None diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index f965e1d9..8db131d3 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -112,12 +112,12 @@ def _get_taxonomy(repo="taxonomy"): def _string_contains_html(s: str) -> bool: """Detect HTML tags in a string. - + We use this to catch markdown files that may contain html elements since docling does not support this.""" # Define a regex to detect HTML tags html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>") - + # Check for HTML tags in the content return bool(html_tag_pattern.search(s)) @@ -174,11 +174,13 @@ def _get_documents( with open(file_path, "r", encoding="utf-8") as file: content = file.read() if _string_contains_html(content): - raise ValueError(f"Provided markdown file {file_path} contains" - " HTML, which is currently unsupported. Please" - " format your markdown documents without the" - " use of HTML or use a different document" - " filetype.") + raise ValueError( + f"Provided markdown file {file_path} contains" + " HTML, which is currently unsupported. Please" + " format your markdown documents without the" + " use of HTML or use a different document" + " filetype." + ) file_contents.append(content) filepaths.append(Path(file_path)) logger.info( @@ -436,7 +438,7 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset: def _knowledge_leaf_node_to_samples( leaf_node, - taxonomy_path, + taxonomy_path, # pylint: disable=unused-argument server_ctx_size, chunk_word_count, document_output_dir, diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index ff1079d5..42db1ac3 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -11,10 +11,7 @@ import pytest # First Party -from instructlab.sdg.utils.chunkers import ( - DocumentChunker, - resolve_ocr_options, -) +from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options # Local from .testdata import testdata @@ -32,7 +29,9 @@ def tokenizer_model_name(): return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -def test_init_document_chunker_unsupported_filetype(documents_dir, tokenizer_model_name): +def test_init_document_chunker_unsupported_filetype( + documents_dir, tokenizer_model_name +): """Test that the DocumentChunker factory class fails when provided an unsupported document""" document_paths = [documents_dir / "document.jpg"] with pytest.raises(ValueError): diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index 650df21a..c2968029 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -313,7 +313,9 @@ def test_generate(self): client=MagicMock(), logger=mocked_logger, model_family="granite", - model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -396,7 +398,9 @@ def test_generate(self): client=MagicMock(), logger=mocked_logger, model_family="granite", - model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -500,7 +504,9 @@ def test_generate(self): client=MagicMock(), logger=mocked_logger, model_family="granite", - model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index 32114fcc..0b697bd7 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -86,13 +86,13 @@ def test_read_taxonomy_leaf_nodes( ): seed_example_exists = True assert seed_example_exists is True - + @pytest.mark.parametrize( "s, contains_html", [ ("hello, world!", False), ("hello,
world!
", True), - ] + ], ) def test_string_contains_html(self, s, contains_html): print(taxonomy.__dict__) From c34cc9c1b3d2fda3ecd3d2b8dfbb2f48006e47a5 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 7 Jan 2025 15:12:54 -0500 Subject: [PATCH 06/11] Update chunkers.py with minor cleanup Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 144 +------------------------- 1 file changed, 1 insertion(+), 143 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 257c9cea..5fe1f5d5 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -80,148 +80,6 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: return dict(document_dict) -# class DocumentChunker: -# """A factory chunker class that instantiates the applicable chunker - -# Currently, only Markdown and PDF are supported. For Markdown, returns -# TextSplitChunker, and for PDF, returns ContextAwareChunker""" - -# def __new__( -# cls, -# doc_filepaths: List[Path], -# output_dir: Path, -# server_ctx_size=4096, -# chunk_word_count=1024, -# tokenizer_model_name: Optional[str] = None, -# docling_model_path: Optional[str] = None, -# ): -# """Insantiate the appropriate chunker for the provided document - -# Args: -# leaf_node: a leaf node dict containing "documents", -# "filepaths", and "taxonomy_path" keys -# output_dir (Path): directory where artifacts should be stored -# server_ctx_size (int): Context window size of server -# chunk_word_count (int): Maximum number of words to chunk a document -# tokenizer_model_name (Optional[str]): name of huggingface model to get -# tokenizer from -# Returns: -# TextSplitChunker | ContextAwareChunker: Object of the appropriate -# chunker class for the provided filetype -# """ -# documents = leaf_node[0]["documents"] - -# if not isinstance(taxonomy_path, Path): -# taxonomy_path = Path(taxonomy_path) - -# if isinstance(documents, str): -# documents = [documents] -# logger.info( -# "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input." -# ) -# elif not isinstance(documents, list): -# raise TypeError( -# "Expected: documents to be a list, but got {}".format(type(documents)) -# ) - -# filepaths = leaf_node[0]["filepaths"] - -# doc_dict = cls._split_docs_by_filetype(documents, filepaths) -# if len(doc_dict.keys()) > 1: -# raise ValueError("Received multiple document types") -# if len(doc_dict.keys()) < 1: -# raise ValueError("Received no document types") - -# if SupportedFileTypes.MD in doc_dict: -# doc_contents = [d for d, _ in doc_dict[SupportedFileTypes.MD]] -# # return TextSplitChunker( -# # doc_contents, -# # server_ctx_size, -# # chunk_word_count, -# # output_dir, -# # ) - -# # TODO CHUNK AS MARKDOWN -# pass - -# if SupportedFileTypes.PDF in doc_dict: -# doc_paths = [p for _, p in doc_dict[SupportedFileTypes.PDF]] -# # return ContextAwareChunker( -# # doc_paths, -# # filepaths, -# # output_dir, -# # chunk_word_count, -# # tokenizer_model_name, -# # docling_model_path=docling_model_path, -# # ) - -# # TODO CHUNK AS PDF -# pass - -# @staticmethod -# def _split_docs_by_filetype( -# documents: List[str], filepaths: List[Path] -# ) -> DefaultDict[SupportedFileTypes, List[Tuple[str, Path]]]: -# """Separate documents into lists based on their filetype. - -# Currently, only Markdown and PDF are supported. -# Args: -# documents (List[str]): A list of the document contents as strings -# filepaths (List[Path]): Corresponding document filepaths -# Returns: -# DefaultDict: Dictionary with either ".md" or ".pdf" as a key. -# Markdown items contain document contents, PDF items contain -# paths to documents. -# """ -# doc_dict = defaultdict(list) -# for doc, path in zip(documents, filepaths): -# if path.suffix == ".md": -# # append doc contents -# doc_dict[SupportedFileTypes.MD].append((doc, path)) -# elif path.suffix == ".pdf": -# # append doc paths -# doc_dict[SupportedFileTypes.PDF].append((doc, path)) -# else: -# raise ValueError( -# f"Received document of type .{path.suffix}, which is not a supported filetype" -# ) -# return doc_dict - - -# class TextSplitChunker(ChunkerBase): -# def __init__( -# self, -# document_contents: List | str, -# server_ctx_size: int, -# chunk_word_count: int, -# output_dir: Path, -# ): -# self.document_contents = document_contents -# self.server_ctx_size = server_ctx_size -# self.chunk_word_count = chunk_word_count -# self.output_dir = output_dir - -# def chunk_documents(self) -> List: -# """Naively chunk markdown documents based on the word count provided by the user. -# Returns: -# List[str]: List of chunked documents. -# """ -# num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) -# if num_tokens_per_doc > int(self.server_ctx_size - 1024): -# raise ValueError( -# "Error: {}".format( -# str( -# f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})" -# ) -# ) -# ) -# if self.document_contents == []: -# return [] - -# chunk_size = _num_chars_from_tokens(num_tokens_per_doc) -# return chunk_markdowns(self.document_contents, chunk_size) - - class DocumentChunker: # pylint: disable=too-many-instance-attributes # def __new__( # cls, @@ -243,7 +101,7 @@ def __init__( server_ctx_size: int = 4096, chunk_word_count: int = 1024, ): - if len(document_paths) == 0: + if not document_paths: raise ValueError("Provided empty list of documents") document_dict = split_docs_by_filetype(document_paths) From cbfd47aad56304169db588fc936f042dcb5608d5 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 7 Jan 2025 16:39:58 -0500 Subject: [PATCH 07/11] Update test_chunkers to be more modular Signed-off-by: Aakanksha Duggal --- tests/functional/test_chunkers.py | 69 +++++++++++++++---------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index d364ac48..f06db504 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -8,56 +8,53 @@ # First Party from instructlab.sdg.utils.chunkers import DocumentChunker +# Constants for Test Directory and Test Documents TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "testdata") +TEST_DOCUMENTS = { + "pdf": "sample_documents/phoenix.pdf", + "md": "sample_documents/phoenix.md", +} + + +@pytest.fixture(scope="module") +def test_paths(): + """Fixture to return paths to test documents.""" + return { + doc_type: Path(os.path.join(TEST_DATA_DIR, path)) + for doc_type, path in TEST_DOCUMENTS.items() + } @pytest.fixture def tokenizer_model_name(): + """Fixture to return the path to the tokenizer model.""" return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -def test_chunk_pdf(tmp_path, tokenizer_model_name): - pdf_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.pdf")) - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "filepaths": [pdf_path], - "taxonomy_path": "knowledge", - } - ] - chunker = DocumentChunker( - document_paths=[pdf_path], - output_dir=tmp_path, - tokenizer_model_name=tokenizer_model_name, - server_ctx_size=4096, - chunk_word_count=500, - ) - chunks = chunker.chunk_documents() - assert len(chunks) > 9 - assert "Phoenix is a minor constellation" in chunks[0] - for chunk in chunks: - # inexact sanity-checking of chunk max length - assert len(chunk) < 2500 - - -def test_chunk_md(tmp_path, tokenizer_model_name): - markdown_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.md")) - leaf_node = [ - { - "documents": [markdown_path.read_text(encoding="utf-8")], - "filepaths": [markdown_path], - "taxonomy_path": "knowledge", - } - ] +@pytest.mark.parametrize( + "doc_type, expected_chunks, contains_text", + [ + ("pdf", 9, "Phoenix is a minor constellation"), + ("md", 7, None), # Assuming there's no specific text to check in Markdown + ], +) +def test_chunk_documents( + tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks, contains_text +): + """ + Generalized test function for chunking documents. + """ + document_path = test_paths[doc_type] chunker = DocumentChunker( - document_paths=[markdown_path], + document_paths=[document_path], output_dir=tmp_path, tokenizer_model_name=tokenizer_model_name, server_ctx_size=4096, chunk_word_count=500, ) chunks = chunker.chunk_documents() - assert len(chunks) > 7 + assert len(chunks) > expected_chunks + if contains_text: + assert contains_text in chunks[0] for chunk in chunks: - # inexact sanity-checking of chunk max length assert len(chunk) < 2500 From 9581c2d84b2d96d50d07d59b7d65c1d90824a27a Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 8 Jan 2025 10:38:11 -0500 Subject: [PATCH 08/11] Update HTML error to warning to avoid exiting Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/taxonomy.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 8db131d3..00a4b1ba 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -174,12 +174,10 @@ def _get_documents( with open(file_path, "r", encoding="utf-8") as file: content = file.read() if _string_contains_html(content): - raise ValueError( - f"Provided markdown file {file_path} contains" - " HTML, which is currently unsupported. Please" - " format your markdown documents without the" - " use of HTML or use a different document" - " filetype." + logging.warning( + f"Provided markdown file {file_path} contains HTML contents, which is currently unsupported as a part of markdown" + "NOTE: Continuing this might affect your data generation quality." + "To get best results please format your markdown documents without the use of HTML or use a different document filetype." ) file_contents.append(content) filepaths.append(Path(file_path)) From ed86e827c86f9cff4880db67f49cd1192c8317cd Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Mon, 13 Jan 2025 21:46:57 +0400 Subject: [PATCH 09/11] delete unused arg taxonomy_path in leaf_node_to_samples Signed-off-by: Khaled Sulayman --- src/instructlab/sdg/generate_data.py | 3 +-- src/instructlab/sdg/utils/taxonomy.py | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 533db868..badf3834 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -395,8 +395,7 @@ def generate_data( is_knowledge = False leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_") samples = leaf_node_to_samples( - leaf_node, - taxonomy, + leaf_node, # pylint: disable=duplicate-code server_ctx_size, chunk_word_count, document_output_dir, diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 00a4b1ba..2a23072f 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -436,7 +436,6 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset: def _knowledge_leaf_node_to_samples( leaf_node, - taxonomy_path, # pylint: disable=unused-argument server_ctx_size, chunk_word_count, document_output_dir, @@ -475,7 +474,6 @@ def _skill_leaf_node_to_samples(leaf_node): def leaf_node_to_samples( leaf_node, - taxonomy_path, server_ctx_size, chunk_word_count, document_output_dir, @@ -486,8 +484,7 @@ def leaf_node_to_samples( return [] if leaf_node[0].get("documents"): return _knowledge_leaf_node_to_samples( - leaf_node, - taxonomy_path, + leaf_node, # pylint: disable=duplicate-code server_ctx_size, chunk_word_count, document_output_dir, From a692d70c2479032f7a2e61019d2ed1c4ca80b7d8 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Tue, 14 Jan 2025 18:20:56 +0400 Subject: [PATCH 10/11] remove commented code in chunkers.py Signed-off-by: Khaled Sulayman --- src/instructlab/sdg/utils/chunkers.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 5fe1f5d5..1ac24d9d 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -81,17 +81,6 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: class DocumentChunker: # pylint: disable=too-many-instance-attributes - # def __new__( - # cls, - # leaf_node, - # taxonomy_path, - # output_dir: Path, - # server_ctx_size=4096, - # chunk_word_count=1024, - # tokenizer_model_name: Optional[str] = None, - # docling_model_path: Optional[str] = None, - # ): - def __init__( self, document_paths: List[Path], From e3a3e1eadeea23d13fc7cba9b102e00aa5c61a39 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Wed, 15 Jan 2025 14:32:38 +0400 Subject: [PATCH 11/11] optionally accept pathlib.Path for tokenizer_model_name Signed-off-by: Khaled Sulayman --- src/instructlab/sdg/utils/chunkers.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 1ac24d9d..52c15ba1 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -85,7 +85,7 @@ def __init__( self, document_paths: List[Path], output_dir: Path, - tokenizer_model_name: str, + tokenizer_model_name: str | Path, docling_model_path: Optional[Path] = None, server_ctx_size: int = 4096, chunk_word_count: int = 1024, @@ -228,7 +228,7 @@ def fuse_texts( return fused_texts @staticmethod - def create_tokenizer(model_name: str): + def create_tokenizer(model_path: str | Path): """ Create a tokenizer instance from a pre-trained model or a local directory. @@ -243,9 +243,8 @@ def create_tokenizer(model_name: str): # Third Party from transformers import AutoTokenizer - model_path = Path( - model_name - ) # TODO expect a path from the DocumentChunker constructor + if not isinstance(model_path, Path): + model_path = Path(model_path) error_info_message = ( "Please run `ilab model download {download_args}` and try again" )

Type

Epithelium

Capsule

Crypts

Location

Pharyngeal tonsil (also - termed "adenoid")

Ciliated - pseudostratified columnar (respiratory epithelium)

Incompletely encapsulated

Small folds—sometimes described as crypts1

Roof of pharynx

Tubal tonsils

Ciliated pseudostratified columnar (respiratory epithelium)

Not encapsulated

No crypts

Roof of pharynx

Palatine tonsils

Stratified squamous epithelium

Fully encapsulated

Multiple deep crypts

Each side of the throat at the back of the mouth