chore: updated test output

Goldziher · Feb 18, 2025 · 5e347bf · 5e347bf
1 parent 68362e9
commit 5e347bf
Show file tree

Hide file tree

Showing 17 changed files with 191 additions and 134 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -59,7 +59,7 @@ jobs:
         #os: [ ubuntu-latest, macOS-latest, windows-latest ]
         #python: [" 3.9", "3.10", "3.11", "3.12", "3.13" ]
     runs-on: ${{ matrix.os }}
-    timeout-minutes: 15
+    timeout-minutes: 20
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -122,4 +122,4 @@ jobs:
           pandoc --version
 
       - name: Run Tests
-        run: uv run pytest tests -vvv -n auto --dist=loadfile --timeout 30
+        run: uv run pytest -n auto
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -43,7 +43,7 @@ repos:
         additional_dependencies:
           - tomli
   - repo: https://github.com/jsh9/pydoclint
-    rev: 0.6.0
+    rev: 0.6.2
     hooks:
       - id: pydoclint
         args:

diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,18 @@
+FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim AS base
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    g++ \
+    libpq-dev \
+    pandoc \
+    tesseract-ocr \
+    tesseract-ocr-deu \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+FROM base AS install
+WORKDIR /app/
+COPY pyproject.toml uv.lock ./
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    uv sync --verbose --frozen
+ENV PATH="/app/.venv/bin:$PATH"
diff --git a/README.md b/README.md
@@ -26,8 +26,8 @@ pip install kreuzberg
 
 Kreuzberg requires two system level dependencies:
 
-- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
+- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
+- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 4.
 
 You can install these with:
 
@@ -40,7 +40,7 @@ sudo apt-get install pandoc tesseract-ocr
 #### MacOS
 
 ```shell
-# MacOS
+#
 brew install tesseract pandoc
 ```
 
@@ -54,6 +54,7 @@ Notes:
 
 - in most distributions the tesseract-ocr package is split into multiple packages, you may need to install any language models you need aside from English separately.
 - please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
+- th
 
 ## Architecture
 

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -0,0 +1,11 @@
+services:
+  kreuzberg:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      target: install
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./kreuzberg:/src/kreuzberg:cached
+      - ./tests:/src/tests:cached
diff --git a/kreuzberg/_constants.py b/kreuzberg/_constants.py
@@ -3,4 +3,4 @@
 from multiprocessing import cpu_count
 from typing import Final
 
-DEFAULT_MAX_PROCESSES: Final[int] = max(cpu_count() // 2, 1)
+DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
diff --git a/kreuzberg/_html.py b/kreuzberg/_html.py
@@ -8,7 +8,6 @@
 from kreuzberg import ExtractionResult
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces, safe_decode
-from kreuzberg._sync import run_sync
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> Extraction
         if isinstance(file_path_or_contents, bytes)
         else await AsyncPath(file_path_or_contents).read_text()
     )
-    result = await run_sync(html_to_markdown.convert_to_markdown, content)
+    result = html_to_markdown.convert_to_markdown(content)
     return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
diff --git a/kreuzberg/_pandoc.py b/kreuzberg/_pandoc.py
@@ -6,13 +6,12 @@
 from json import JSONDecodeError, loads
 from typing import TYPE_CHECKING, Any, Final, Literal, cast
 
-from anyio import CapacityLimiter, create_task_group, to_process
+from anyio import CapacityLimiter, create_task_group, run_process, to_process
 from anyio import Path as AsyncPath
 
 from kreuzberg._constants import DEFAULT_MAX_PROCESSES
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_sync
 from kreuzberg._tmp import create_temp_file
 from kreuzberg._types import ExtractionResult, Metadata
 from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
@@ -251,10 +250,10 @@ async def _validate_pandoc_version() -> None:
             return
 
         command = ["pandoc", "--version"]
-        result = await run_sync(subprocess.run, command, capture_output=True)
+        result = await run_process(command)
         version = result.stdout.decode().split("\n")[0].split()[1]
-        if not version.startswith("3."):
-            raise MissingDependencyError("Pandoc version 3 or above is required.")
+        if not version.startswith("2."):
+            raise MissingDependencyError("Pandoc version 2 or above is required.")
 
         version_ref["checked"] = True
 

diff --git a/kreuzberg/_pdf.py b/kreuzberg/_pdf.py
@@ -67,7 +67,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
     document: pypdfium2.PdfDocument | None = None
     try:
         document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-        return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
+        return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}

diff --git a/kreuzberg/_sync.py b/kreuzberg/_sync.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
 import sys
+from collections.abc import Awaitable
 from functools import partial
 from typing import TYPE_CHECKING, TypeVar, cast
 
+from anyio import create_task_group
 from anyio.to_thread import run_sync as any_io_run_sync
 
 if TYPE_CHECKING:  # pragma: no cover
@@ -31,3 +33,43 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
     """
     handler = partial(sync_fn, **kwargs)
     return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+
+
+async def run_taskgroup(*coroutines: Callable[[], Awaitable[T]]) -> list[T]:
+    """Run zero-argument async callables concurrently and collect their results.
+
+    Each positional argument is called with no arguments inside its own task and
+    the awaitable it returns is awaited there.
+
+    Args:
+        coroutines: Zero-argument callables that each return an awaitable.
+
+    Returns:
+        The awaited results, positioned to match the order of the arguments.
+    """
+    # Pre-size the list so each task can store its result at its own index.
+    results = cast(list[T], [None] * len(coroutines))
+
+    async def run_task(index: int, task: Callable[[], Awaitable[T]]) -> None:
+        results[index] = await task()
+
+    async with create_task_group() as tg:
+        for i, coro in enumerate(coroutines):
+            tg.start_soon(run_task, i, coro)
+
+    return results
+
+
+async def run_taskgroup_batched(*coroutines: Callable[[], Awaitable[T]], batch_size: int) -> list[T]:
+    """Run zero-argument async callables in consecutive batches.
+
+    The arguments are split into slices of at most ``batch_size`` items. Each slice
+    is passed to ``run_taskgroup`` and awaited before the next slice is started.
+
+    Args:
+        coroutines: Zero-argument callables that each return an awaitable.
+        batch_size: The maximum number of callables handed to ``run_taskgroup`` at once.
+
+    Returns:
+        The results of every batch, concatenated in batch order.
+    """
+    results: list[T] = []
+
+    # NOTE(review): batch_size is used as the range step, so 0 raises ValueError and a
+    # negative value produces an empty range (an empty list is returned) - confirm
+    # that callers always pass a positive value.
+    for i in range(0, len(coroutines), batch_size):
+        batch = coroutines[i : i + batch_size]
+        results.extend(await run_taskgroup(*batch))
+
+    return results