Skip to content

Commit

Permalink
chore: simplified pre-processing and removed binarization
Browse files Browse the repository at this point in the history
  • Loading branch information
Goldziher committed Feb 16, 2025
1 parent 43b9034 commit 68362e9
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 216 deletions.
43 changes: 0 additions & 43 deletions kreuzberg/_ocr_pre_processing.py

This file was deleted.

37 changes: 35 additions & 2 deletions kreuzberg/_tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@

from anyio import CapacityLimiter, create_task_group, to_process
from anyio import Path as AsyncPath
from PIL.Image import Image
from PIL import ImageOps
from PIL.Image import Image, Resampling
from PIL.Image import open as open_image

from kreuzberg._constants import DEFAULT_MAX_PROCESSES
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
from kreuzberg._ocr_pre_processing import preprocess_image
from kreuzberg._string import normalize_spaces
from kreuzberg._sync import run_sync
from kreuzberg._tmp import create_temp_file
Expand All @@ -28,6 +28,12 @@

MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5


# Resolution the input image is treated as having; denominator of the upscale
# ratio computed in resize_for_ocr.
DEFAULT_DPI: Final[int] = 72
# Resolution the image is scaled towards before OCR; numerator of that ratio.
TARGET_DPI: Final[int] = 300
# NOTE(review): the two BINARIZATION_* constants are not referenced by any code
# visible in this change — possibly left over from the removed binarization
# step; confirm there are no other users before deleting.
BINARIZATION_THRESHOLD: Final[int] = 0
BINARIZATION_MAX_VALUE: Final[int] = 255

version_ref = {"checked": False}

T = TypeVar("T", bound=Union[Image, PathLike[str], str])
Expand Down Expand Up @@ -60,6 +66,33 @@ class PSMMode(Enum):
"""Treat the image as a single character."""


def resize_for_ocr(image: Image) -> Image:
    """Upscale an image by the TARGET_DPI / DEFAULT_DPI ratio ahead of OCR.

    The input is treated as being at DEFAULT_DPI; the image's own DPI
    metadata is not consulted, so every image is scaled by the same factor.

    Args:
        image: Input Pillow image.

    Returns:
        The image resized with the Lanczos resampling filter.
    """
    ratio = TARGET_DPI / DEFAULT_DPI
    # int() truncates the scaled float dimensions to whole pixels.
    scaled_width, scaled_height = (int(side * ratio) for side in image.size)
    return image.resize((scaled_width, scaled_height), Resampling.LANCZOS)


def preprocess_image(image: Image) -> Image:
    """Prepare an image for OCR by converting to grayscale and upscaling.

    Args:
        image: Input Pillow image.

    Returns:
        The grayscale image after resize_for_ocr has been applied to it.
    """
    grayscale_image = ImageOps.grayscale(image)
    return resize_for_ocr(grayscale_image)


async def validate_tesseract_version() -> None:
"""Validate that Tesseract is installed and is version 5 or above.
Expand Down
2 changes: 0 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ dependencies = [
"charset-normalizer>=3.4.1",
"exceptiongroup>=1.2.2; python_version<'3.11'",
"html-to-markdown>=1.2.0",
"numpy>=2.0.2",
"opencv-python-headless>=4.11.0.86",
"pypdfium2>=4.30.1",
"python-calamine>=0.3.1",
"python-pptx>=1.0.2",
Expand Down
Loading

0 comments on commit 68362e9

Please sign in to comment.