Add Extractor Function (#3)

developerproductivity · Jan 8, 2025 · a910a52 · a910a52
1 parent dde38d2
commit a910a52
Show file tree

Hide file tree

Showing 5 changed files with 77 additions and 12 deletions.
diff --git a/src/pdf_extract.py b/src/pdf_extract.py
@@ -0,0 +1,45 @@
+#
+# This module contains support functions which extract text and image objects
+# from a PDF file for inclusion in other media.
+#
+from typing import Any, Union
+
+from PIL.PngImagePlugin import ImageFile
+from pypdf import PdfReader
+
+# Type of the per-dashboard result: string keys mapping either to a list of
+# text strings or to a dict of image name -> image object.
+# NOTE(review): `ImageFile` as imported from PIL.PngImagePlugin looks like the
+# PIL.ImageFile *module* rather than the ImageFile class - confirm whether
+# `from PIL.ImageFile import ImageFile` was intended.
+PDF_ITEMS = dict[str, Union[list[str], dict[str, ImageFile]]]
+
+
+def get_pdf_objects(
+    config: dict[str, dict[str, Any]]
+) -> dict[str, dict[str, PDF_ITEMS]]:
+    """Extract all images and text from the configured PDF files.
+
+    Assumes that the configuration reflects the following YAML:
+        teams:
+          Practices Team:
+            team_dashboards:
+              Developer Practices Dashboard:
+                Filename: DPROD_Report.pdf
+              ...
+          ...
+        config:
+          Download_path: "downloaded_pdfs/"
+
+    Each PDF is read from ``Download_path`` joined with the dashboard's
+    ``Filename``; the directory may be given with or without a trailing
+    path separator.
+
+    Returns:
+        A mapping of team name -> dashboard name -> a dict with two keys:
+        ``"images"`` maps image name to image object, collected over all
+        pages, and ``"text"`` holds one layout-mode text entry per page,
+        in page order.
+
+    Raises:
+        KeyError: If ``teams``, ``team_dashboards``, ``config``,
+            ``Download_path`` or ``Filename`` is missing from ``config``.
+    """
+    # Function-scope import so the module's import header stays untouched.
+    import os.path
+
+    results = {}
+    for team, dashboards in config["teams"].items():
+        team_results = {}
+        for dashboard, options in dashboards["team_dashboards"].items():
+            # os.path.join adds the separator only when it is missing, so
+            # "downloaded_pdfs" and "downloaded_pdfs/" both resolve correctly
+            # (plain string concatenation required the trailing slash).
+            pdf_path = os.path.join(
+                config["config"]["Download_path"], options["Filename"]
+            )
+            pdf = PdfReader(pdf_path)
+            images = {}
+            text = []
+            for page in pdf.pages:
+                # NOTE: the image name is the only key, so an image on a later
+                # page replaces an earlier one that has the same name.
+                images.update({image.name: image.image for image in page.images})
+                text.append(
+                    page.extract_text(
+                        extraction_mode="layout",
+                        layout_mode_space_vertically=False,
+                    )
+                )
+            team_results[dashboard] = {"images": images, "text": text}
+        results[team] = team_results
+    return results
diff --git a/src/requirements.txt b/src/requirements.txt
@@ -1,2 +1,4 @@
+pillow~=11.0.0
 playwright==1.49.0
+pypdf~=5.1.0
 pyyaml==6.0.2
diff --git a/tests/fixtures/test.pdf b/tests/fixtures/test.pdf
diff --git a/tests/test_pdf_extract.py b/tests/test_pdf_extract.py
@@ -0,0 +1,30 @@
+import unittest
+
+from src.pdf_extract import get_pdf_objects
+
+
+class MyTestCase(unittest.TestCase):
+    """Exercises get_pdf_objects against the PDF stored in tests/fixtures/."""
+
+    def test_get_pdf_objects(self):
+        # Minimal configuration: one team with one dashboard backed by test.pdf.
+        dashboards = {"Mock Team Dashboard": {"Filename": "test.pdf"}}
+        config = {
+            "teams": {"Mock Team": {"team_dashboards": dashboards}},
+            "config": {"Download_path": "tests/fixtures/"},
+        }
+
+        extracted = get_pdf_objects(config)["Mock Team"]["Mock Team Dashboard"]
+
+        # The fixture is expected to yield 3 text entries and 5 images.
+        expectations = (
+            (3, "text", "Unexpected number of text sections found."),
+            (5, "images", "Unexpected number of images sections found."),
+        )
+        for expected_count, section, failure_message in expectations:
+            self.assertEqual(expected_count, len(extracted[section]), failure_message)
+
+
+# Run the test suite when this file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_placeholder.py b/tests/test_placeholder.py