Skip to content

Commit

Permalink
Merge branch 'hotfix/3.14.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
rlskoeser committed Nov 11, 2024
2 parents 0c07309 + 45b9248 commit 97ba180
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 4 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ CHANGELOG
3.14.1
------

- Bug fix: Gale page indexing with local OCR now catches and logs a warning on JSON decode error

3.14
----

- Revise EEBO-TCP content import logic to render tagged notes at the bottom of the page instead of displaying inline
- Optimize local OCR page indexing for Gale content by loading OCR from a single JSON file per volume
- Revise **index_pages** manage command arguments for indexing all records from a single source to make it easier to use (lowercase, support prefixes when unambiguous)
Expand Down
2 changes: 1 addition & 1 deletion ppa/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version_info__ = (3, 14, 0, None)
__version_info__ = (3, 14, 1, None)


# Dot-connect all but the last. Last is dash-connected if not None.
Expand Down
6 changes: 5 additions & 1 deletion ppa/archive/gale.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,8 @@ def get_item_pages(self, item_id, gale_record=None):
local_ocr_text = get_local_ocr(item_id)
except FileNotFoundError:
logger.warning(f"Local OCR not found for {item_id}")
except json.decoder.JSONDecodeError:
logger.warning(f"JSON decode error on local OCR file for {item_id}")

# iterate through the pages in the response
for page in gale_record["pageResponse"]["pages"]:
Expand All @@ -247,7 +249,9 @@ def get_item_pages(self, item_id, gale_record=None):
tags = ["local_ocr"]
# If page is not present in the data, use Gale OCR as fallback
else:
logger.warning(f"No local OCR for {item_id} {page_number}")
# don't warn for every page when no OCR text is found
if local_ocr_text:
logger.warning(f"No local OCR for {item_id} {page_number}")
# try getting the ocr from the gale api result
# (may be empty, since some pages have no text)
ocr_text = page.get("ocrText")
Expand Down
20 changes: 18 additions & 2 deletions ppa/archive/tests/test_gale.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests):
{
"pageNumber": "0003",
"image": {"id": "0765400456789", "url": "http://example.com/img/3"},
"ocrText": "ignored text",
"ocrText": "fallback gale text",
},
]
api_response = {
Expand Down Expand Up @@ -319,8 +319,24 @@ def test_get_item_pages(self, mock_get_item, mock_get_local_ocr, mockrequests):
assert [p["content"] for p in page_data] == [
None,
"more test content",
"ignored text",
"fallback gale text",
]
# NOTE: would be nice to test logging, but can't get
# pytest caplog or unittest logging assertions to work
# since the logging is already captured and displayed by the
# test runner

# confirm json decode error is handled appropriately
mock_get_local_ocr.side_effect = json.decoder.JSONDecodeError(
"invalid json", "file.json", 1
)
page_data = list(gale_api.get_item_pages(item_id))
assert [p["content"] for p in page_data] == [
None,
"more test content",
"fallback gale text",
]
# would be nice to test logging here also

# skip api call if record is provided
mock_get_item.reset_mock()
Expand Down

0 comments on commit 97ba180

Please sign in to comment.