Skip to content

Commit

Permalink
chore: updated test output
Browse files Browse the repository at this point in the history
  • Loading branch information
Goldziher committed Feb 18, 2025
1 parent 68362e9 commit 5e347bf
Show file tree
Hide file tree
Showing 17 changed files with 191 additions and 134 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ jobs:
#os: [ ubuntu-latest, macOS-latest, windows-latest ]
#python: [" 3.9", "3.10", "3.11", "3.12", "3.13" ]
runs-on: ${{ matrix.os }}
timeout-minutes: 15
timeout-minutes: 20
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down Expand Up @@ -122,4 +122,4 @@ jobs:
pandoc --version
- name: Run Tests
run: uv run pytest tests -vvv -n auto --dist=loadfile --timeout 30
run: uv run pytest -n auto
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ repos:
additional_dependencies:
- tomli
- repo: https://github.com/jsh9/pydoclint
rev: 0.6.0
rev: 0.6.2
hooks:
- id: pydoclint
args:
Expand Down
18 changes: 18 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Base stage: uv's Python 3.13 "bookworm-slim" image plus the system packages installed below.
FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim AS base
# Install build tooling, libpq-dev, pandoc and tesseract-ocr (plus the "deu" tesseract package,
# presumably the German language data - confirm) in one layer, then clear the apt caches and
# package lists in that same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
g++ \
libpq-dev \
pandoc \
tesseract-ocr \
tesseract-ocr-deu \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# Install stage: sync the project's Python dependencies from pyproject.toml / uv.lock under /app/.
FROM base AS install
WORKDIR /app/
COPY pyproject.toml uv.lock ./
# The uv cache directory is a cache mount, so it is reused across builds.
# NOTE(review): pyproject.toml and uv.lock are both COPY'd above and bind-mounted here;
# one of the two looks redundant - confirm which is intended.
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --verbose --frozen
# Put /app/.venv/bin first on PATH so executables from that directory are found first.
ENV PATH="/app/.venv/bin:$PATH"
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ pip install kreuzberg

Kreuzberg requires two system level dependencies:

- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 4.

You can install these with:

Expand All @@ -40,7 +40,7 @@ sudo apt-get install pandoc tesseract-ocr
#### MacOS

```shell
# MacOS
#
brew install tesseract pandoc
```

Expand All @@ -54,6 +54,7 @@ Notes:

- In most distributions the tesseract-ocr package is split into multiple packages; you may need to separately install any language models you need other than English.
- Please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
- th

## Architecture

Expand Down
11 changes: 11 additions & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Compose definition for a single "kreuzberg" service built from the local Dockerfile.
services:
kreuzberg:
build:
context: .
dockerfile: Dockerfile
# Build only up to the Dockerfile stage named "install".
target: install
ports:
# host:container
- "8000:8000"
volumes:
# Mount the local package and test sources into the container.
# NOTE(review): the Dockerfile sets WORKDIR /app/, but these paths mount under /src - confirm
# that this is the intended location.
- ./kreuzberg:/src/kreuzberg:cached
- ./tests:/src/tests:cached
2 changes: 1 addition & 1 deletion kreuzberg/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from multiprocessing import cpu_count
from typing import Final

DEFAULT_MAX_PROCESSES: Final[int] = max(cpu_count() // 2, 1)
DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
3 changes: 1 addition & 2 deletions kreuzberg/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from kreuzberg import ExtractionResult
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
from kreuzberg._string import normalize_spaces, safe_decode
from kreuzberg._sync import run_sync

if TYPE_CHECKING:
from pathlib import Path
Expand All @@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> Extraction
if isinstance(file_path_or_contents, bytes)
else await AsyncPath(file_path_or_contents).read_text()
)
result = await run_sync(html_to_markdown.convert_to_markdown, content)
result = html_to_markdown.convert_to_markdown(content)
return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
9 changes: 4 additions & 5 deletions kreuzberg/_pandoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@
from json import JSONDecodeError, loads
from typing import TYPE_CHECKING, Any, Final, Literal, cast

from anyio import CapacityLimiter, create_task_group, to_process
from anyio import CapacityLimiter, create_task_group, run_process, to_process
from anyio import Path as AsyncPath

from kreuzberg._constants import DEFAULT_MAX_PROCESSES
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
from kreuzberg._string import normalize_spaces
from kreuzberg._sync import run_sync
from kreuzberg._tmp import create_temp_file
from kreuzberg._types import ExtractionResult, Metadata
from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
Expand Down Expand Up @@ -251,10 +250,10 @@ async def _validate_pandoc_version() -> None:
return

command = ["pandoc", "--version"]
result = await run_sync(subprocess.run, command, capture_output=True)
result = await run_process(command)
version = result.stdout.decode().split("\n")[0].split()[1]
if not version.startswith("3."):
raise MissingDependencyError("Pandoc version 3 or above is required.")
if not version.startswith("2."):
raise MissingDependencyError("Pandoc version 2 or above is required.")

version_ref["checked"] = True

Expand Down
2 changes: 1 addition & 1 deletion kreuzberg/_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
document: pypdfium2.PdfDocument | None = None
try:
document = await run_sync(pypdfium2.PdfDocument, str(input_file))
return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
except pypdfium2.PdfiumError as e:
raise ParsingError(
"Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
Expand Down
42 changes: 42 additions & 0 deletions kreuzberg/_sync.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from __future__ import annotations

import sys
from collections.abc import Awaitable
from functools import partial
from typing import TYPE_CHECKING, TypeVar, cast

from anyio import create_task_group
from anyio.to_thread import run_sync as any_io_run_sync

if TYPE_CHECKING: # pragma: no cover
Expand Down Expand Up @@ -31,3 +33,43 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
"""
handler = partial(sync_fn, **kwargs)
return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]


async def run_taskgroup(*coroutines: Callable[[], Awaitable[T]]) -> list[T]:
    """Await several coroutine factories inside a single task group.

    Args:
        coroutines: Zero-argument callables; each one is called once and the
            awaitable it returns is awaited.

    Returns:
        The awaited values, positioned to match the order of the arguments.
    """
    # Pre-sized so every task can write its own slot by index, whatever order tasks finish in.
    outputs = cast(list[T], [None for _ in coroutines])

    async def store_result(slot: int, factory: Callable[[], Awaitable[T]]) -> None:
        outputs[slot] = await factory()

    async with create_task_group() as task_group:
        for slot, factory in enumerate(coroutines):
            task_group.start_soon(store_result, slot, factory)

    return outputs


async def run_taskgroup_batched(*coroutines: Callable[[], Awaitable[T]], batch_size: int) -> list[T]:
    """Run coroutine factories concurrently, one fixed-size batch at a time.

    The callables are sliced into consecutive groups of at most ``batch_size``.
    Each group is passed to ``run_taskgroup`` and awaited before the next group
    is started.

    Args:
        coroutines: Zero-argument callables, each returning an awaitable.
        batch_size: Maximum number of callables per batch; must be at least 1.

    Returns:
        The results of all batches, concatenated in argument order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step makes range() empty, which would silently discard every
        # task and return []; a zero step already raises ValueError inside range().
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results: list[T] = []

    for start in range(0, len(coroutines), batch_size):
        results.extend(await run_taskgroup(*coroutines[start : start + batch_size]))

    return results
Loading

0 comments on commit 5e347bf

Please sign in to comment.