Skip to content

Commit

Permalink
feat: add white space normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
Goldziher committed Feb 2, 2025
1 parent e4cc451 commit 98c77c1
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 15 deletions.
26 changes: 13 additions & 13 deletions kreuzberg/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pytesseract import TesseractError, image_to_string

from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
from kreuzberg._string import normalize_spaces
from kreuzberg._sync import run_sync
from kreuzberg.exceptions import ParsingError

Expand All @@ -40,7 +41,7 @@ def _extract_pdf_with_tesseract(file_path: Path) -> str:
images = [page.render(scale=2.0).to_pil() for page in pdf]

text = "\n".join(image_to_string(img) for img in images)
return text.strip()
return normalize_spaces(text)
except (PdfiumError, TesseractError) as e:
# TODO: add test case
raise ParsingError(
Expand All @@ -63,7 +64,7 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
try:
document = PdfDocument(file_path)
text = "\n".join(page.get_textpage().get_text_range() for page in document)
return text.strip()
return normalize_spaces(text)
except PdfiumError as e:
# TODO: add test case
raise ParsingError(
Expand All @@ -82,9 +83,9 @@ async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
The extracted text.
"""
if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
return content
return normalize_spaces(content)

return await run_sync(_extract_pdf_with_tesseract, file_path)
return normalize_spaces(await run_sync(_extract_pdf_with_tesseract, file_path))


async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
Expand All @@ -104,7 +105,9 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
encoding = encoding or detect(file_data)["encoding"] or "utf-8"
try:
return cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
return normalize_spaces(
cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
)
except RuntimeError as e:
# TODO: add test case
raise ParsingError(
Expand All @@ -128,7 +131,7 @@ async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> st
"""
ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
try:
return cast(str, await run_sync(convert_file, file_path, to="md", format=ext))
return normalize_spaces(cast(str, await run_sync(convert_file, file_path, to="md", format=ext)))
except RuntimeError as e:
raise ParsingError(
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
Expand All @@ -149,14 +152,14 @@ async def _extract_image_with_tesseract(file_path: Path | str) -> str:
The extracted content.
"""
try:
return cast(str, image_to_string(str(file_path)).strip())
return normalize_spaces(cast(str, image_to_string(str(file_path))))
except TesseractError as e:
raise ParsingError(
"Could not extract text from image file", context={"file_path": str(file_path), "error": str(e)}
) from e


async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str: # noqa: C901
async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
"""Extract text from a PPTX file.
Notes:
Expand Down Expand Up @@ -211,10 +214,7 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str: # noq
md_content += "\n" + html_table + "\n"

elif shape.has_text_frame:
if shape == title:
md_content += "# " + shape.text.lstrip() + "\n"
else:
md_content += shape.text + "\n"
md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"

md_content = md_content.strip()
if slide.has_notes_slide:
Expand All @@ -226,4 +226,4 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str: # noq

md_content = md_content.strip()

return md_content.strip()
return normalize_spaces(md_content)
12 changes: 12 additions & 0 deletions kreuzberg/_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,15 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:

# TODO: add test case
return byte_data.decode("latin-1", errors="replace")


def normalize_spaces(text: str) -> str:
"""Normalize the spaces in a string.
Args:
text: The text to sanitize.
Returns:
The sanitized text.
"""
return " ".join(text.strip().split())
10 changes: 8 additions & 2 deletions tests/extraction_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@ async def test_extract_bytes_pptx(pptx_document: Path) -> None:
result = await extract_bytes(content, POWER_POINT_MIME_TYPE)
assert result.mime_type == MARKDOWN_MIME_TYPE
assert isinstance(result.content, str)
assert result.content.strip()
assert (
"At Contoso, we empower organizations to foster collaborative thinking to further drive workplace innovation. By closing the loop and leveraging agile frameworks, we help business grow organically and foster a consumer first mindset."
in result.content
)


async def test_extract_bytes_markdown(markdown_document: Path) -> None:
Expand Down Expand Up @@ -126,7 +129,10 @@ async def test_extract_file_pptx(pptx_document: Path) -> None:
result = await extract_file(pptx_document, POWER_POINT_MIME_TYPE)
assert result.mime_type == MARKDOWN_MIME_TYPE
assert isinstance(result.content, str)
assert result.content.strip()
assert (
"At Contoso, we empower organizations to foster collaborative thinking to further drive workplace innovation. By closing the loop and leveraging agile frameworks, we help business grow organically and foster a consumer first mindset."
in result.content
)


async def test_extract_file_invalid_mime() -> None:
Expand Down

0 comments on commit 98c77c1

Please sign in to comment.