Fix Windows PermissionError caused by NamedTemporaryFile

Goldziher · Feb 13, 2025 · 2ded553 · 2ded553
1 parent a2ce9f7
commit 2ded553
Show file tree

Hide file tree

Showing 4 changed files with 98 additions and 55 deletions.
diff --git a/kreuzberg/_extractors.py b/kreuzberg/_extractors.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import re
 from contextlib import suppress
 from html import escape
@@ -95,14 +96,18 @@ async def extract_pdf(file_path_or_contents: Path | bytes, force_ocr: bool = Fal
         The extracted text.
     """
     if isinstance(file_path_or_contents, bytes):
-        with NamedTemporaryFile(suffix=".pdf") as pdf_file:
-            pdf_file.write(file_path_or_contents)
-            file_path = Path(pdf_file.name)
+        with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
+            try:
+                pdf_file.write(file_path_or_contents)
+                file_path = Path(pdf_file.name)
 
-            if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
-                return normalize_spaces(content)
+                if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
+                    return normalize_spaces(content)
 
-            return await extract_pdf_with_tesseract(file_path)
+                return await extract_pdf_with_tesseract(file_path)
+            finally:
+                pdf_file.close()
+                os.unlink(pdf_file.name)
 
     if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path_or_contents)):
         return normalize_spaces(content)
@@ -221,8 +226,11 @@ async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
     Raises:
         ParsingError: If the XLSX file could not be parsed.
     """
-    try:
-        with NamedTemporaryFile(suffix=".xlsx") as xlsx_file, NamedTemporaryFile(suffix=".csv") as csv_file:
+    with (
+        NamedTemporaryFile(suffix=".xlsx", delete=False) as xlsx_file,
+        NamedTemporaryFile(suffix=".csv", delete=False) as csv_file,
+    ):
+        try:
             if isinstance(file_path_or_contents, bytes):
                 xlsx_file.write(file_path_or_contents)
                 xlsx_file.flush()
@@ -233,14 +241,19 @@ async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
             await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
             result = await process_file(csv_file.name, mime_type="text/csv")
             return normalize_spaces(result.content)
-    except Exception as e:
-        raise ParsingError(
-            "Could not extract text from XLSX file",
-            context={
-                "error": str(e),
-                "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
-            },
-        ) from e
+        except Exception as e:
+            raise ParsingError(
+                "Could not extract text from XLSX file",
+                context={
+                    "error": str(e),
+                    "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
+                },
+            ) from e
+        finally:
+            xlsx_file.close()
+            csv_file.close()
+            os.unlink(xlsx_file.name)
+            os.unlink(csv_file.name)
 
 
 async def extract_html_string(file_path_or_contents: Path | bytes) -> str:

diff --git a/kreuzberg/_pandoc.py b/kreuzberg/_pandoc.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import os
 import subprocess
 from asyncio import gather
 from dataclasses import dataclass
@@ -320,7 +321,7 @@ async def _validate_pandoc_version() -> None:
 async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
 
-    with NamedTemporaryFile(suffix=".json") as metadata_file:
+    with NamedTemporaryFile(suffix=".json", delete=False) as metadata_file:
         try:
             command = [
                 "pandoc",
@@ -350,42 +351,51 @@ async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type
         except (RuntimeError, OSError, json.JSONDecodeError) as e:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
 
+        finally:
+            metadata_file.close()
+            os.unlink(metadata_file.name)
+
 
 async def _handle_extract_file(
     input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
 ) -> str:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
 
-    with NamedTemporaryFile(suffix=".md") as output_file:
-        command = [
-            "pandoc",
-            str(input_file),
-            f"--from={pandoc_type}",
-            "--to=markdown",
-            "--standalone",
-            "--wrap=preserve",
-            "--quiet",
-            "--output",
-            output_file.name,
-        ]
-
-        if extra_args:
-            command.extend(extra_args)
+    with NamedTemporaryFile(suffix=".md", delete=False) as output_file:
+        try:
+            command = [
+                "pandoc",
+                str(input_file),
+                f"--from={pandoc_type}",
+                "--to=markdown",
+                "--standalone",
+                "--wrap=preserve",
+                "--quiet",
+                "--output",
+                output_file.name,
+            ]
 
-        result = await run_sync(
-            subprocess.run,
-            command,
-            capture_output=True,
-        )
+            if extra_args:
+                command.extend(extra_args)
 
-        if result.returncode != 0:
-            raise ParsingError(
-                "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
+            result = await run_sync(
+                subprocess.run,
+                command,
+                capture_output=True,
             )
 
-        text = await AsyncPath(output_file.name).read_text()
+            if result.returncode != 0:
+                raise ParsingError(
+                    "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
+                )
+
+            text = await AsyncPath(output_file.name).read_text()
 
-        return normalize_spaces(text)
+            return normalize_spaces(text)
+
+        finally:
+            output_file.close()
+            os.unlink(output_file.name)
 
 
 async def process_file(
@@ -428,6 +438,11 @@ async def process_content(content: bytes, *, mime_type: str, extra_args: list[st
     """
     extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
 
-    with NamedTemporaryFile(suffix=f".{extension}") as input_file:
-        await AsyncPath(input_file.name).write_bytes(content)
-        return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
+    with NamedTemporaryFile(suffix=f".{extension}", delete=False) as input_file:
+        try:
+            await AsyncPath(input_file.name).write_bytes(content)
+            return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
+
+        finally:
+            input_file.close()
+            os.unlink(input_file.name)
diff --git a/kreuzberg/_tesseract.py b/kreuzberg/_tesseract.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import re
 import subprocess
 from asyncio import gather
@@ -214,10 +215,10 @@ async def process_file(
     Returns:
         str: Extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".txt") as output_file:
+    with NamedTemporaryFile(suffix=".txt", delete=False) as output_file:
         # this is needed because tesseract adds .txt to the output file
-        output_file_name = output_file.name.replace(".txt", "")
         try:
+            output_file_name = output_file.name.replace(".txt", "")
             command = [
                 "tesseract",
                 str(input_file),
@@ -245,6 +246,10 @@ async def process_file(
         except (RuntimeError, OSError) as e:
             raise OCRError("Failed to OCR using tesseract") from e
 
+        finally:
+            output_file.close()
+            os.unlink(output_file.name)
+
 
 async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
     """Process a single Pillow Image using Tesseract OCR.
@@ -258,9 +263,14 @@ async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMM
     Returns:
         str: Extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".png") as image_file:
-        await run_sync(image.save, image_file.name, format="PNG")
-        return await process_file(image_file.name, language=language, psm=psm, **kwargs)
+    with NamedTemporaryFile(suffix=".png", delete=False) as image_file:
+        try:
+            await run_sync(image.save, image_file.name, format="PNG")
+            return await process_file(image_file.name, language=language, psm=psm, **kwargs)
+
+        finally:
+            image_file.close()
+            os.unlink(image_file.name)
 
 
 async def process_image_with_tesseract(

diff --git a/kreuzberg/extraction.py b/kreuzberg/extraction.py
@@ -9,6 +9,7 @@
 
 from __future__ import annotations
 
+import os
 from mimetypes import guess_type
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -77,11 +78,15 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
         return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
 
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
-            temp_file.write(content)
-            return ExtractionResult(
-                content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
-            )
+        with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type], delete=False) as temp_file:
+            try:
+                temp_file.write(content)
+                return ExtractionResult(
+                    content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
+                )
+            finally:
+                temp_file.close()
+                os.unlink(temp_file.name)
 
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES