-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
77 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# | ||
# This module contains support functions which extract text and image objects | ||
# from a PDF file for inclusion in other media. | ||
# | ||
from typing import Any, Union | ||
|
||
from PIL.PngImagePlugin import ImageFile | ||
from pypdf import PdfReader | ||
|
||
PDF_ITEMS = dict[str, Union[list[str], dict[str, ImageFile]]] | ||
|
||
|
||
def get_pdf_objects(
    config: dict[str, dict[str, Any]]
) -> dict[str, dict[str, PDF_ITEMS]]:
    """Extract all images and text from the configured PDF files.

    Assumes that the configuration reflects the following YAML::

        teams:
          Practices Team:
            team_dashboards:
              Developer Practices Dashboard:
                Filename: DPROD_Report.pdf
                ...
            ...
        config:
          Download_path: "downloaded_pdfs/"

    Args:
        config: Parsed configuration. ``config["teams"]`` maps each team name
            to a mapping with a ``"team_dashboards"`` entry, which in turn
            maps each dashboard name to options containing ``"Filename"``.
            ``config["config"]["Download_path"]`` is the directory the PDF
            files are read from; a trailing path separator is optional.

    Returns:
        A nested mapping ``{team: {dashboard: {"images": {...}, "text": [...]}}}``
        where ``"images"`` maps each image name to its image object and
        ``"text"`` holds one extracted string per page, in page order.

    Raises:
        KeyError: If one of the configuration keys read above is missing.
        Errors raised by ``PdfReader`` (for example for a missing or
        unreadable file) are not caught here.
    """
    # Function-scope import keeps this block self-contained.
    import os.path

    results = {}
    for team, dashboards in config["teams"].items():
        team_results = {}
        for dashboard, options in dashboards["team_dashboards"].items():
            # os.path.join gives the same path as plain concatenation when
            # Download_path ends with a separator, and still produces a valid
            # path when it does not.
            pdf_path = os.path.join(
                config["config"]["Download_path"], options["Filename"]
            )
            pdf = PdfReader(pdf_path)

            images = {}
            text = []
            for page in pdf.pages:
                # Images from every page share one dict keyed by image name,
                # so an image on a later page replaces an earlier one that
                # has the same name.
                images.update({image.name: image.image for image in page.images})
                text.append(
                    page.extract_text(
                        extraction_mode="layout",
                        layout_mode_space_vertically=False,
                    )
                )
            team_results[dashboard] = {"images": images, "text": text}
        results[team] = team_results
    return results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,4 @@ | ||
pillow~=11.0.0 | ||
playwright==1.49.0 | ||
pypdf~=5.1.0 | ||
pyyaml==6.0.2 |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import unittest | ||
|
||
from src.pdf_extract import get_pdf_objects | ||
|
||
|
||
class MyTestCase(unittest.TestCase):
    """Exercises ``get_pdf_objects`` against the ``test.pdf`` fixture."""

    def test_get_pdf_objects(self):
        # Minimal configuration: one team with one dashboard, pointing at the
        # fixture PDF under tests/fixtures/.
        team_name = "Mock Team"
        dashboard_name = "Mock Team Dashboard"
        dashboard_options = {"Filename": "test.pdf"}
        config = {
            "teams": {
                team_name: {"team_dashboards": {dashboard_name: dashboard_options}},
            },
            "config": {"Download_path": "tests/fixtures/"},
        }

        result = get_pdf_objects(config)

        # The counts below are the values this test expects from the fixture.
        text_sections = result[team_name][dashboard_name]["text"]
        self.assertEqual(
            3,
            len(text_sections),
            "Unexpected number of text sections found.",
        )
        image_sections = result[team_name][dashboard_name]["images"]
        self.assertEqual(
            5,
            len(image_sections),
            "Unexpected number of images sections found.",
        )
|
||
|
||
# Run the test suite when this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()
This file was deleted.
Oops, something went wrong.