Skip to content

Commit

Permalink
Add Extractor Function (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
webbnh authored Jan 8, 2025
1 parent dde38d2 commit a910a52
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 12 deletions.
45 changes: 45 additions & 0 deletions src/pdf_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#
# This module contains support functions which extract text and image objects
# from a PDF file for inclusion in other media.
#
from typing import Any, Union

from PIL.PngImagePlugin import ImageFile
from pypdf import PdfReader

# Shape of the extraction result for a single dashboard PDF: a dict whose
# values are either a list of strings (the per-page text) or a dict mapping
# image names to image objects.
# NOTE(review): `ImageFile` as imported from PIL.PngImagePlugin looks like the
# PIL.ImageFile *module* re-exported there rather than an image class --
# confirm the intended image type for static checkers.
PDF_ITEMS = dict[str, Union[list[str], dict[str, ImageFile]]]


def get_pdf_objects(
    config: dict[str, dict[str, Any]]
) -> dict[str, dict[str, PDF_ITEMS]]:
    """Extract all images and text from the configured PDF files.

    The configuration is expected to reflect the following YAML::

        teams:
          Practices Team:
            team_dashboards:
              Developer Practices Dashboard:
                Filename: DPROD_Report.pdf
              ...
          ...
        config:
          Download_path: "downloaded_pdfs/"

    Args:
        config: Parsed configuration. ``config["teams"]`` maps each team
            name to a mapping whose ``"team_dashboards"`` entry maps each
            dashboard name to its options; only the ``"Filename"`` option is
            read here. ``config["config"]["Download_path"]`` is the
            directory holding the PDF files (a trailing separator is
            optional); it is only looked up when there is at least one
            dashboard to process.

    Returns:
        A mapping of team name -> dashboard name -> result dict. Each result
        dict has an ``"images"`` entry mapping image names to the image
        objects reported by the PDF reader, and a ``"text"`` entry holding
        one extracted string per page, in page order.

    Raises:
        KeyError: If ``"teams"``, ``"team_dashboards"``, ``"config"``,
            ``"Download_path"`` or ``"Filename"`` is missing where it is
            needed.
    """
    # Function-scope import so this block does not depend on a change to the
    # module's import section.
    import os

    results = {}
    for team, dashboards in config["teams"].items():
        team_results = {}
        for dashboard, options in dashboards["team_dashboards"].items():
            dashboard_results = {"images": {}, "text": []}
            # Join the directory and file name instead of concatenating the
            # strings, so a Download_path without a trailing separator still
            # resolves to the right file.  (An absolute Filename replaces the
            # directory, per os.path.join semantics.)
            pdf_path = os.path.join(
                config["config"]["Download_path"], options["Filename"]
            )
            pdf = PdfReader(pdf_path)
            for page in pdf.pages:
                page_images = {image.name: image.image for image in page.images}
                page_text = page.extract_text(
                    extraction_mode="layout",
                    layout_mode_space_vertically=False,
                )
                # NOTE(review): images are keyed by name only, so an image on
                # a later page with the same name overwrites the earlier one
                # -- confirm names are unique across pages for these PDFs.
                dashboard_results["images"].update(page_images)
                dashboard_results["text"].append(page_text)
            team_results[dashboard] = dashboard_results
        results[team] = team_results
    return results
2 changes: 2 additions & 0 deletions src/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
pillow~=11.0.0
playwright==1.49.0
pypdf~=5.1.0
pyyaml==6.0.2
Binary file added tests/fixtures/test.pdf
Binary file not shown.
30 changes: 30 additions & 0 deletions tests/test_pdf_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import unittest

from src.pdf_extract import get_pdf_objects


class MyTestCase(unittest.TestCase):
    """Exercise get_pdf_objects() against the PDF fixture in tests/fixtures/."""

    def test_get_pdf_objects(self):
        """The fixture PDF should yield 3 text sections and 5 images."""
        # Build the configuration from the inside out: one mock team with a
        # single dashboard that points at the fixture file.
        dashboard_options = {"Filename": "test.pdf"}
        team_options = {
            "team_dashboards": {"Mock Team Dashboard": dashboard_options}
        }
        config = {
            "teams": {"Mock Team": team_options},
            "config": {"Download_path": "tests/fixtures/"},
        }

        result = get_pdf_objects(config)
        extracted = result["Mock Team"]["Mock Team Dashboard"]

        text_count = len(extracted["text"])
        self.assertEqual(
            3, text_count, "Unexpected number of text sections found."
        )

        image_count = len(extracted["images"])
        self.assertEqual(
            5, image_count, "Unexpected number of images sections found."
        )


# Allow this test module to be executed directly as a script, in addition to
# being collected by a test runner.
if __name__ == "__main__":
    unittest.main()
12 changes: 0 additions & 12 deletions tests/test_placeholder.py

This file was deleted.

0 comments on commit a910a52

Please sign in to comment.