Skip to content

Commit

Permalink
Fix Windows PermissionError caused by NamedTemporaryFile
Browse files: browse the repository at this point in the history
  • Loading branch information
Cycloctane committed Feb 13, 2025
1 parent a2ce9f7 commit 2ded553
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 55 deletions.
45 changes: 29 additions & 16 deletions kreuzberg/_extractors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import os
import re
from contextlib import suppress
from html import escape
Expand Down Expand Up @@ -95,14 +96,18 @@ async def extract_pdf(file_path_or_contents: Path | bytes, force_ocr: bool = Fal
The extracted text.
"""
if isinstance(file_path_or_contents, bytes):
with NamedTemporaryFile(suffix=".pdf") as pdf_file:
pdf_file.write(file_path_or_contents)
file_path = Path(pdf_file.name)
with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
try:
pdf_file.write(file_path_or_contents)
file_path = Path(pdf_file.name)

if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
return normalize_spaces(content)
if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
return normalize_spaces(content)

return await extract_pdf_with_tesseract(file_path)
return await extract_pdf_with_tesseract(file_path)
finally:
pdf_file.close()
os.unlink(pdf_file.name)

if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path_or_contents)):
return normalize_spaces(content)
Expand Down Expand Up @@ -221,8 +226,11 @@ async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
Raises:
ParsingError: If the XLSX file could not be parsed.
"""
try:
with NamedTemporaryFile(suffix=".xlsx") as xlsx_file, NamedTemporaryFile(suffix=".csv") as csv_file:
with (
NamedTemporaryFile(suffix=".xlsx", delete=False) as xlsx_file,
NamedTemporaryFile(suffix=".csv", delete=False) as csv_file,
):
try:
if isinstance(file_path_or_contents, bytes):
xlsx_file.write(file_path_or_contents)
xlsx_file.flush()
Expand All @@ -233,14 +241,19 @@ async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
result = await process_file(csv_file.name, mime_type="text/csv")
return normalize_spaces(result.content)
except Exception as e:
raise ParsingError(
"Could not extract text from XLSX file",
context={
"error": str(e),
"file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
},
) from e
except Exception as e:
raise ParsingError(
"Could not extract text from XLSX file",
context={
"error": str(e),
"file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
},
) from e
finally:
xlsx_file.close()
csv_file.close()
os.unlink(xlsx_file.name)
os.unlink(csv_file.name)


async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
Expand Down
73 changes: 44 additions & 29 deletions kreuzberg/_pandoc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import json
import os
import subprocess
from asyncio import gather
from dataclasses import dataclass
Expand Down Expand Up @@ -320,7 +321,7 @@ async def _validate_pandoc_version() -> None:
async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
pandoc_type = _get_pandoc_type_from_mime_type(mime_type)

with NamedTemporaryFile(suffix=".json") as metadata_file:
with NamedTemporaryFile(suffix=".json", delete=False) as metadata_file:
try:
command = [
"pandoc",
Expand Down Expand Up @@ -350,42 +351,51 @@ async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type
except (RuntimeError, OSError, json.JSONDecodeError) as e:
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e

finally:
metadata_file.close()
os.unlink(metadata_file.name)


async def _handle_extract_file(
input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
) -> str:
pandoc_type = _get_pandoc_type_from_mime_type(mime_type)

with NamedTemporaryFile(suffix=".md") as output_file:
command = [
"pandoc",
str(input_file),
f"--from={pandoc_type}",
"--to=markdown",
"--standalone",
"--wrap=preserve",
"--quiet",
"--output",
output_file.name,
]

if extra_args:
command.extend(extra_args)
with NamedTemporaryFile(suffix=".md", delete=False) as output_file:
try:
command = [
"pandoc",
str(input_file),
f"--from={pandoc_type}",
"--to=markdown",
"--standalone",
"--wrap=preserve",
"--quiet",
"--output",
output_file.name,
]

result = await run_sync(
subprocess.run,
command,
capture_output=True,
)
if extra_args:
command.extend(extra_args)

if result.returncode != 0:
raise ParsingError(
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
result = await run_sync(
subprocess.run,
command,
capture_output=True,
)

text = await AsyncPath(output_file.name).read_text()
if result.returncode != 0:
raise ParsingError(
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
)

text = await AsyncPath(output_file.name).read_text()

return normalize_spaces(text)
return normalize_spaces(text)

finally:
output_file.close()
os.unlink(output_file.name)


async def process_file(
Expand Down Expand Up @@ -428,6 +438,11 @@ async def process_content(content: bytes, *, mime_type: str, extra_args: list[st
"""
extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"

with NamedTemporaryFile(suffix=f".{extension}") as input_file:
await AsyncPath(input_file.name).write_bytes(content)
return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
with NamedTemporaryFile(suffix=f".{extension}", delete=False) as input_file:
try:
await AsyncPath(input_file.name).write_bytes(content)
return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)

finally:
input_file.close()
os.unlink(input_file.name)
20 changes: 15 additions & 5 deletions kreuzberg/_tesseract.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import os
import re
import subprocess
from asyncio import gather
Expand Down Expand Up @@ -214,10 +215,10 @@ async def process_file(
Returns:
str: Extracted text from the image.
"""
with NamedTemporaryFile(suffix=".txt") as output_file:
with NamedTemporaryFile(suffix=".txt", delete=False) as output_file:
# this is needed because tesseract adds .txt to the output file
output_file_name = output_file.name.replace(".txt", "")
try:
output_file_name = output_file.name.replace(".txt", "")
command = [
"tesseract",
str(input_file),
Expand Down Expand Up @@ -245,6 +246,10 @@ async def process_file(
except (RuntimeError, OSError) as e:
raise OCRError("Failed to OCR using tesseract") from e

finally:
output_file.close()
os.unlink(output_file.name)


async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
"""Process a single Pillow Image using Tesseract OCR.
Expand All @@ -258,9 +263,14 @@ async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMM
Returns:
str: Extracted text from the image.
"""
with NamedTemporaryFile(suffix=".png") as image_file:
await run_sync(image.save, image_file.name, format="PNG")
return await process_file(image_file.name, language=language, psm=psm, **kwargs)
with NamedTemporaryFile(suffix=".png", delete=False) as image_file:
try:
await run_sync(image.save, image_file.name, format="PNG")
return await process_file(image_file.name, language=language, psm=psm, **kwargs)

finally:
image_file.close()
os.unlink(image_file.name)


async def process_image_with_tesseract(
Expand Down
15 changes: 10 additions & 5 deletions kreuzberg/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from __future__ import annotations

import os
from mimetypes import guess_type
from pathlib import Path
from tempfile import NamedTemporaryFile
Expand Down Expand Up @@ -77,11 +78,15 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)

if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
temp_file.write(content)
return ExtractionResult(
content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
)
with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type], delete=False) as temp_file:
try:
temp_file.write(content)
return ExtractionResult(
content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
)
finally:
temp_file.close()
os.unlink(temp_file.name)

if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
Expand Down

0 comments on commit 2ded553

Please sign in to comment.