Skip to content

Commit

Permalink
Check for tokenizer in downloaded models directory
Browse files — browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
  • Loading branch information
khaledsulayman committed Nov 11, 2024
1 parent fc709af commit 2aa3aee
Showing 1 changed file with 17 additions and 10 deletions.
27 changes: 17 additions & 10 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,6 +19,7 @@
PdfFormatOption,
)
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from instructlab.model.backends.backends import is_model_gguf, is_model_safetensors

Check failure on line 22 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'instructlab.model.backends.backends' (import-error)

Check failure on line 22 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

E0611: No name 'model' in module 'instructlab' (no-name-in-module)
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from tabulate import tabulate
from transformers import AutoTokenizer
Expand Down Expand Up @@ -186,18 +187,12 @@ def __init__(
filepaths,
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
tokenizer_model_name: str,
):
self.document_paths = document_paths
self.filepaths = filepaths
self.output_dir = self._path_validator(output_dir)
self.chunk_word_count = chunk_word_count
self.tokenizer_model_name = (
tokenizer_model_name
if tokenizer_model_name is not None
else "mistralai/Mixtral-8x7B-Instruct-v0.1"
)

self.tokenizer = self.create_tokenizer(tokenizer_model_name)

def chunk_documents(self) -> List:
Expand Down Expand Up @@ -305,12 +300,24 @@ def create_tokenizer(self, model_name: str):
Returns:
AutoTokenizer: The tokenizer instance.
"""
# Third Party
import ipdb

ipdb.set_trace()

Check warning on line 306 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

C0415: Import outside toplevel (ipdb) (import-outside-toplevel)

Check failure on line 306 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'ipdb' (import-error)
model_path = Path(model_name)
try:
tokenizer = AutoTokenizer.from_pretrained(model_name)
logger.info(f"Successfully loaded tokenizer from: {model_name}")
if is_model_safetensors(model_path):
tokenizer = AutoTokenizer.from_pretrained(model_path)
elif is_model_gguf(model_path):
tokenizer = AutoTokenizer.from_pretrained(model_path.parent, gguf_file=model_path.name)
logger.info(f"Successfully loaded tokenizer from: {model_path}")
return tokenizer
except Exception as e:
logger.error(f"Failed to load tokenizer from {model_name}: {str(e)}")
logger.error(

Check failure on line 316 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

E0606: Possibly using variable 'tokenizer' before assignment (possibly-used-before-assignment)
f"Failed to load tokenizer as model was not found at {model_path}."

Check warning on line 317 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

W0612: Unused variable 'e' (unused-variable)
"Please run `ilab model download {model_name} and try again\n"
"{str(e)}"
)
raise

def get_token_count(self, text, tokenizer):
Expand Down

0 comments on commit 2aa3aee

Please sign in to comment.