chore: simplified pre-processing and removed binarization

Goldziher · Feb 16, 2025 · 68362e9 · 68362e9
1 parent 43b9034
commit 68362e9
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 216 deletions.
diff --git a/kreuzberg/_ocr_pre_processing.py b/kreuzberg/_ocr_pre_processing.py
diff --git a/kreuzberg/_tesseract.py b/kreuzberg/_tesseract.py
@@ -11,12 +11,12 @@
 
 from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
-from PIL.Image import Image
+from PIL import ImageOps
+from PIL.Image import Image, Resampling
 from PIL.Image import open as open_image
 
 from kreuzberg._constants import DEFAULT_MAX_PROCESSES
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
-from kreuzberg._ocr_pre_processing import preprocess_image
 from kreuzberg._string import normalize_spaces
 from kreuzberg._sync import run_sync
 from kreuzberg._tmp import create_temp_file
@@ -28,6 +28,12 @@
 
 MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
 
+
+DEFAULT_DPI: Final[int] = 72  # denominator of the scale factor in resize_for_ocr
+TARGET_DPI: Final[int] = 300  # numerator of the scale factor in resize_for_ocr
+BINARIZATION_THRESHOLD: Final[int] = 0  # NOTE(review): no use visible in this diff - confirm still needed
+BINARIZATION_MAX_VALUE: Final[int] = 255  # NOTE(review): no use visible in this diff - confirm still needed
+
 version_ref = {"checked": False}
 
 T = TypeVar("T", bound=Union[Image, PathLike[str], str])
@@ -60,6 +66,33 @@ class PSMMode(Enum):
     """Treat the image as a single character."""
 
 
+def resize_for_ocr(image: Image) -> Image:
+    """Resize the image by the fixed factor TARGET_DPI / DEFAULT_DPI for OCR.
+
+    Args:
+        image: Input Pillow image; its own DPI metadata is not read here.
+
+    Returns:
+        The resized image, with each dimension scaled and truncated to an int.
+    """
+    width, height = image.size
+    scale_factor = TARGET_DPI / DEFAULT_DPI  # same factor for every input
+    new_size = (int(width * scale_factor), int(height * scale_factor))
+    return image.resize(new_size, Resampling.LANCZOS)
+
+
+def preprocess_image(image: Image) -> Image:
+    """Convert the input image to grayscale, then resize it for OCR.
+
+    Args:
+        image: Input Pillow image.
+
+    Returns:
+        The grayscale image after it has been passed through resize_for_ocr.
+    """
+    return resize_for_ocr(ImageOps.grayscale(image))
+
+
 async def validate_tesseract_version() -> None:
     """Validate that Tesseract is installed and is version 5 or above.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -40,8 +40,6 @@ dependencies = [
   "charset-normalizer>=3.4.1",
   "exceptiongroup>=1.2.2; python_version<'3.11'",
   "html-to-markdown>=1.2.0",
-  "numpy>=2.0.2",
-  "opencv-python-headless>=4.11.0.86",
   "pypdfium2>=4.30.1",
   "python-calamine>=0.3.1",
   "python-pptx>=1.0.2",