Skip to content

Commit

Permalink
feat: v1.7
Browse files (browse the repository at this point in the history)
  • Loading branch information
Goldziher committed Feb 14, 2025
1 parent cb553d5 commit a52f736
Show file tree
Hide file tree
Showing 6 changed files with 223 additions and 217 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ repos:
- id: check-case-conflict
- id: detect-private-key
- repo: https://github.com/rbubley/mirrors-prettier
-rev: "v3.4.2"
+rev: "v3.5.1"
hooks:
- id: prettier
exclude: ^tests|^.idea|^migrations|^.git
Expand All @@ -30,13 +30,13 @@ repos:
hooks:
- id: pyproject-fmt
- repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.9.3
+rev: v0.9.6
hooks:
- id: ruff
args: [--fix]
- id: ruff-format
- repo: https://github.com/codespell-project/codespell
-rev: v2.4.0
+rev: v2.4.1
hooks:
- id: codespell
exclude: ^tests|^scripts|^kreuzberg/_tesseract
Expand Down
2 changes: 1 addition & 1 deletion kreuzberg/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@ async def extract_pdf(file_path_or_contents: Path | bytes, force_ocr: bool = Fal
if isinstance(file_path_or_contents, bytes):
with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
try:
-pdf_file.write(file_path_or_contents)
file_path = Path(pdf_file.name)
+await AsyncPath(file_path).write_bytes(file_path_or_contents)

if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
return normalize_spaces(content)
Expand Down
6 changes: 3 additions & 3 deletions kreuzberg/_pandoc.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from __future__ import annotations

-import json
import subprocess
from asyncio import gather
from dataclasses import dataclass
+from json import JSONDecodeError, loads
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast

Expand Down Expand Up @@ -344,10 +344,10 @@ async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
)

-json_data = json.loads(await AsyncPath(metadata_file.name).read_text("utf-8"))
+json_data = loads(await AsyncPath(metadata_file.name).read_text("utf-8"))
return _extract_metadata(json_data)

-except (RuntimeError, OSError, json.JSONDecodeError) as e:
+except (RuntimeError, OSError, JSONDecodeError) as e:
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e

finally:
Expand Down
2 changes: 1 addition & 1 deletion kreuzberg/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type], delete=False) as temp_file:
try:
-temp_file.write(content)
+await AsyncPath(temp_file.name).write_bytes(content)
return ExtractionResult(
content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
)
Expand Down
36 changes: 18 additions & 18 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "kreuzberg"
-version = "1.6.0"
+version = "1.7.0"
description = "A text extraction library supporting PDFs, images, office documents and more"
readme = "README.md"
keywords = [
Expand Down Expand Up @@ -36,28 +36,28 @@ classifiers = [
]

dependencies = [
-"anyio>=4.8.0",
-"charset-normalizer>=3.4.1",
-"html-to-markdown>=1.2.0",
-"pypdfium2>=4.30.1",
-"python-pptx>=1.0.2",
-"typing-extensions>=4.12.2; python_version<'3.10'",
-"xlsx2csv>=0.8.4",
+"anyio>=4.8.0",
+"charset-normalizer>=3.4.1",
+"html-to-markdown>=1.2.0",
+"pypdfium2>=4.30.1",
+"python-pptx>=1.0.2",
+"typing-extensions>=4.12.2",
+"xlsx2csv>=0.8.4",
]
urls.homepage = "https://github.com/Goldziher/kreuzberg"

[dependency-groups]
dev = [
-"covdefaults>=2.3.0",
-"mypy>=1.15.0",
-"pre-commit>=4.1.0",
-"pytest>=8.3.4",
-"pytest-asyncio>=0.25.3",
-"pytest-cov>=6.0.0",
-"pytest-mock>=3.14.0",
-"pytest-timeout>=2.3.1",
-"python-dotenv>=1.0.1",
-"ruff>=0.9.5",
+"covdefaults>=2.3.0",
+"mypy>=1.15.0",
+"pre-commit>=4.1.0",
+"pytest>=8.3.4",
+"pytest-asyncio>=0.25.3",
+"pytest-cov>=6.0.0",
+"pytest-mock>=3.14.0",
+"pytest-timeout>=2.3.1",
+"python-dotenv>=1.0.1",
+"ruff>=0.9.6",
]

[tool.setuptools.packages.find]
Expand Down
Loading

0 comments on commit a52f736

Please sign in to comment.