Skip to content

Commit

Permalink
chore: updated windows support
Browse files: browse the repository at this point in the history
  • Loading branch information
Goldziher committed Feb 9, 2025
1 parent a09a20a commit 1cf8079
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 21 deletions.
32 changes: 20 additions & 12 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,11 @@ jobs:

- name: Execute Pre-Commit
run: uv run pre-commit run --show-diff-on-failure --color=always --all-files

test:
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
os: [ ubuntu-latest, windows-latest, macos-latest ]
runs-on: ${{ matrix.os }}
timeout-minutes: 5
steps:
- name: Checkout
uses: actions/checkout@v4
Expand All @@ -54,23 +52,33 @@ jobs:
with:
enable-cache: true

- name: Set up Python
- name: Install Python
uses: actions/setup-python@v5
with:
python-version-file: "pyproject.toml"

- name: Install System Dependencies
- name: Install Dependencies
run: uv sync --all-extras --dev

# Windows only: install Tesseract and Pandoc with Chocolatey, then make their
# install directories visible to the steps that follow.
#
# A directory is added to the search path of *subsequent* steps by appending it
# to the file named by GITHUB_PATH. Redirecting to `$env:PATH` instead would
# try to write to a file whose name is the current PATH value and would leave
# the search path unchanged.
# NOTE(review): the two install directories are assumed to be the Chocolatey
# package defaults — confirm against the runner image.
- name: Install System Dependencies (Windows)
  if: runner.os == 'Windows'
  run: |
    choco install -y tesseract pandoc
    "C:\Program Files\Tesseract-OCR" | Out-File -FilePath $env:GITHUB_PATH -Append
    "C:\Program Files\Pandoc" | Out-File -FilePath $env:GITHUB_PATH -Append
# Linux only: install the Tesseract OCR engine and Pandoc from the distribution
# packages. `-y` answers the apt confirmation prompt so the step runs
# unattended.
- name: Install System Dependencies (Linux)
if: runner.os == 'Linux'
run: sudo apt-get install -y tesseract-ocr pandoc
- name: Install System Dependencies
if: runner.os == 'Windows'
run: choco install -y tesseract pandoc
- name: Install System Dependencies

# macOS only: install Tesseract and Pandoc with Homebrew.
- name: Install System Dependencies (macOS)
if: runner.os == 'macOS'
run: brew install tesseract pandoc

- name: Install Dependencies
run: uv sync --all-extras --dev
# Print the versions of both external tools; presumably a smoke check that
# `tesseract` and `pandoc` resolve on PATH before the tests run.
# NOTE(review): the step name says "(Windows)" but no `if:` guard is visible
# here, so it appears to run on every OS in the matrix — confirm whether the
# name or the condition is what was intended.
- name: Verify Installed Dependencies (Windows)
run: |
tesseract --version
pandoc --version
- name: Test
- name: Run Tests
run: uv run pytest tests
11 changes: 7 additions & 4 deletions kreuzberg/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ async def convert_pdf_to_images(file_path: Path) -> list[Image]:
A list of Pillow Images.
"""
try:
pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
resolved_path = str(await AsyncPath(file_path).resolve())
pdf = await run_sync(pypdfium2.PdfDocument, resolved_path)
return [page.render(scale=2.0).to_pil() for page in pdf]
except pypdfium2.PdfiumError as e:
raise ParsingError(
Expand Down Expand Up @@ -73,7 +74,8 @@ async def extract_pdf_with_pdfium2(file_path: Path) -> str:
The extracted text.
"""
try:
document = await run_sync(pypdfium2.PdfDocument, file_path)
resolved_path = str(await AsyncPath(file_path).resolve())
document = await run_sync(pypdfium2.PdfDocument, resolved_path)
text = "\n".join(page.get_textpage().get_text_bounded() for page in document)
return normalize_spaces(text)
except pypdfium2.PdfiumError as e:
Expand Down Expand Up @@ -122,7 +124,8 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str
Returns:
The extracted text.
"""
result = await process_file(file_path, mime_type=mime_type)
resolved_path = str(await AsyncPath(file_path).resolve())
result = await process_file(resolved_path, mime_type=mime_type)
return normalize_spaces(result.content)


Expand Down Expand Up @@ -215,7 +218,7 @@ async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
xlsx_file.flush()
xlsx_path = xlsx_file.name
else:
xlsx_path = str(file_path_or_contents)
xlsx_path = str(await AsyncPath(file_path_or_contents).resolve())

await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
result = await process_file(csv_file.name, mime_type="text/csv")
Expand Down
8 changes: 5 additions & 3 deletions kreuzberg/_pandoc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import json
import platform
import subprocess
from asyncio import gather
from dataclasses import dataclass
Expand Down Expand Up @@ -305,7 +306,8 @@ async def _validate_pandoc_version() -> None:
if version_ref["checked"]:
return

result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
command = ["pandoc.exe" if platform.system() == "Windows" else "pandoc", "--version"]
result = await run_sync(subprocess.run, command, capture_output=True)
version = result.stdout.decode().split("\n")[0].split()[1]
if not version.startswith("3."):
raise MissingDependencyError("Pandoc version 3 or above is required.")
Expand All @@ -322,7 +324,7 @@ async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type
with NamedTemporaryFile(suffix=".json") as metadata_file:
try:
command = [
"pandoc",
"pandoc.exe" if platform.system() == "Windows" else "pandoc",
str(input_file),
f"--from={pandoc_type}",
"--to=json",
Expand Down Expand Up @@ -357,7 +359,7 @@ async def _handle_extract_file(

with NamedTemporaryFile(suffix=".md") as output_file:
command = [
"pandoc",
"pandoc.exe" if platform.system() == "Windows" else "pandoc",
str(input_file),
f"--from={pandoc_type}",
"--to=markdown",
Expand Down
6 changes: 4 additions & 2 deletions kreuzberg/_tesseract.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import platform
import re
import subprocess
from asyncio import gather
Expand Down Expand Up @@ -186,7 +187,8 @@ async def validate_tesseract_version() -> None:
if version_ref["checked"]:
return

result = await run_sync(subprocess.run, ["tesseract", "--version"], capture_output=True)
command = ["tesseract.exe" if platform.system() == "Windows" else "tesseract", "--version"]
result = await run_sync(subprocess.run, command, capture_output=True)
version_match = re.search(r"tesseract\s+(\d+)", result.stdout.decode())
if not version_match or int(version_match.group(1)) < 5:
raise MissingDependencyError("Tesseract version 5 or above is required.")
Expand Down Expand Up @@ -218,7 +220,7 @@ async def process_file(
output_file_name = output_file.name.replace(".txt", "")
try:
command = [
"tesseract",
"tesseract.exe" if platform.system() == "Windows" else "tesseract",
str(input_file),
output_file_name,
"-l",
Expand Down

0 comments on commit 1cf8079

Please sign in to comment.