Feature/process adjudication data (#130)
* Adds script & unit tests for processing Prodigy adjudication data
* Adds mypy type checking support for dev
Showing 3 changed files with 345 additions and 1 deletion.
200 changes: 200 additions & 0 deletions
src/corppa/poetry_detection/annotation/process_adjudication_data.py
""" | ||
This script processes the adjudication data produced by Prodigy for our | ||
poetry detection task into two outputs: | ||
1. A JSONL file that compiles the annotation data into page-level records. | ||
So, each record contains some page-level metdata and the compiled list | ||
of poetry excerpts (if any) determined in the adjudication process. | ||
2. A CSV file containing excerpt-level data per line. | ||
Note that the first file explicitly include information on the pages where | ||
no poetry was identified, while the second will only implicitly through | ||
absence and requires external knowledge of what pages were covered in | ||
the annotation rounds. So, the former is particularly useful for the evaluation | ||
process while the latter is better suited for building a final excerpt dataset. | ||
Example command line usage: | ||
``` | ||
python process_adjudication_data.py prodigy_data.jsonl adj_pages.jsonl adj_excerpts.csv | ||
``` | ||
""" | ||

import argparse
import csv
import pathlib
import sys
from collections.abc import Generator
from typing import Any

import orjsonl
from tqdm import tqdm
from xopen import xopen


def get_excerpts(page_annotation: dict[str, Any]) -> list[dict[str, int | str]]:
    """
    Extract excerpts from page-level annotation. Excerpts have the following
    fields:
    * start: character-level starting index
    * end: character-level end index (Pythonic, exclusive)
    * text: text of page excerpt

    Note: Currently ignoring span labels, since there's only one for the
    poetry detection task.
    """
    excerpts = []
    # Blank pages may not have a text field, so in these cases set to empty string
    page_text = page_annotation.get("text", "")
    if "spans" not in page_annotation:
        raise ValueError("Page annotation missing 'spans' field")
    for span in page_annotation["spans"]:
        excerpt = {
            "start": span["start"],
            "end": span["end"],
            "text": page_text[span["start"] : span["end"]],
        }
        excerpts.append(excerpt)
    return excerpts


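# A minimal sketch of the expected behavior (hypothetical annotation record):
#   get_excerpts({"text": "Roses are red", "spans": [{"start": 0, "end": 5}]})
#   returns [{"start": 0, "end": 5, "text": "Roses"}]
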
def process_page_annotation(page_annotation: dict[str, Any]) -> dict[str, Any]:
    """
    Extracts desired content from page-level annotation. The returned data has
    the following fields:
    * page_id: Page's PPA page identifier
    * work_id: PPA work identifier
    * work_title: Title of PPA work
    * work_author: Author of PPA work
    * work_year: Publication year of PPA work
    * n_excerpts: Number of poetry excerpts contained in page
    * excerpts: List of poetry excerpts identified within page
    """
    page_data: dict[str, Any] = {}
    page_data["page_id"] = page_annotation["id"]
    page_data["work_id"] = page_annotation["work_id"]
    page_data["work_title"] = page_annotation["meta"]["title"]
    page_data["work_author"] = page_annotation["meta"]["author"]
    page_data["work_year"] = page_annotation["meta"]["year"]
    page_data["excerpts"] = get_excerpts(page_annotation)
    page_data["n_excerpts"] = len(page_data["excerpts"])
    return page_data


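# A minimal sketch of the expected mapping (hypothetical values):
#   process_page_annotation({
#       "id": "p1", "work_id": "w1",
#       "meta": {"title": "T", "author": "A", "year": "1850"},
#       "text": "", "spans": [],
#   })
#   returns {"page_id": "p1", "work_id": "w1", "work_title": "T",
#            "work_author": "A", "work_year": "1850", "excerpts": [],
#            "n_excerpts": 0}
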
def get_excerpt_entries(
    page_data: dict[str, Any],
) -> Generator[dict[str, Any], None, None]:
    """
    Generate excerpt entries data from the processed page produced by
    `process_page_annotation`.
    """
    for excerpt in page_data["excerpts"]:
        entry = {
            "page_id": page_data["page_id"],
            "work_id": page_data["work_id"],
            "work_title": page_data["work_title"],
            "work_author": page_data["work_author"],
            "work_year": page_data["work_year"],
            "start": excerpt["start"],
            "end": excerpt["end"],
            "text": excerpt["text"],
        }
        yield entry


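# Each yielded entry flattens the page metadata together with one excerpt,
# e.g. (hypothetical values) {"page_id": "p1", "work_id": "w1", ...,
# "start": 0, "end": 5, "text": "Roses"} -- one CSV row per excerpt.
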
def process_adjudication_data(
    input_jsonl: pathlib.Path,
    output_pages: pathlib.Path,
    output_excerpts: pathlib.Path,
    disable_progress: bool = False,
) -> None:
    """
    Process adjudication annotation data and write output files containing
    page-level and excerpt-level information, as JSONL and CSV files
    respectively.
    """
    n_lines = sum(1 for line in xopen(input_jsonl, mode="rb"))
    progress_annos = tqdm(
        orjsonl.stream(input_jsonl),
        total=n_lines,
        disable=disable_progress,
    )
    csv_fieldnames = [
        "page_id",
        "work_id",
        "work_title",
        "work_author",
        "work_year",
        "start",
        "end",
        "text",
    ]
    with open(output_excerpts, mode="w", newline="") as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
        csv_writer.writeheader()
        for page_anno in progress_annos:
            # Get & save page data
            page_data = process_page_annotation(page_anno)
            orjsonl.append(output_pages, page_data)

            for row in get_excerpt_entries(page_data):
                csv_writer.writerow(row)


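# Minimal usage sketch, mirroring the docstring example (file names are
# illustrative):
#   process_adjudication_data(
#       pathlib.Path("prodigy_data.jsonl"),
#       pathlib.Path("adj_pages.jsonl"),
#       pathlib.Path("adj_excerpts.csv"),
#   )
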
def main():
    """
    Extracts page- and excerpt-level data from a Prodigy data file (JSONL)
    and writes the page-level excerpt data to a JSONL (`output_pages`) and the
    excerpt-level data to a CSV (`output_excerpts`).
    """
    parser = argparse.ArgumentParser(
        description="Extracts & saves page- and excerpt-level data from Prodigy data file",
    )
    parser.add_argument(
        "input",
        help="Path to Prodigy annotation data export (JSONL file)",
        type=pathlib.Path,
    )
    parser.add_argument(
        "output_pages",
        help="Filename where extracted page-level data (JSONL file) should be written",
        type=pathlib.Path,
    )
    parser.add_argument(
        "output_excerpts",
        help="Filename where extracted excerpt-level data (CSV file) should be written",
        type=pathlib.Path,
    )
    parser.add_argument(
        "--progress",
        help="Show progress",
        action=argparse.BooleanOptionalAction,
        default=True,
    )

    args = parser.parse_args()
    disable_progress = not args.progress

    # Check that input file exists
    if not args.input.is_file():
        print(f"Error: input file {args.input} does not exist", file=sys.stderr)
        sys.exit(1)

    # Check that output files do not exist
    for output_file in [args.output_pages, args.output_excerpts]:
        if output_file.exists():
            print(
                f"Error: output file {output_file} already exists, not overwriting",
                file=sys.stderr,
            )
            sys.exit(1)

    process_adjudication_data(
        args.input,
        args.output_pages,
        args.output_excerpts,
        disable_progress=disable_progress,
    )


if __name__ == "__main__":
    main()
140 changes: 140 additions & 0 deletions
test/test_poetry_detection/test_annotation/test_process_adjudication_data.py
from inspect import isgenerator
from unittest.mock import call, patch

import pytest

from corppa.poetry_detection.annotation.process_adjudication_data import (
    get_excerpt_entries,
    get_excerpts,
    process_adjudication_data,
    process_page_annotation,
)


def test_get_excerpts():
    page_annotation = {"text": "some page text"}

    # Missing spans field
    with pytest.raises(ValueError, match="Page annotation missing 'spans' field"):
        get_excerpts(page_annotation)

    # Empty spans field
    page_annotation["spans"] = []
    assert get_excerpts(page_annotation) == []

    # Regular case (i.e. non-empty spans field)
    page_annotation["spans"].append({"start": 0, "end": 4})
    page_annotation["spans"].append({"start": 10, "end": 14})
    results = get_excerpts(page_annotation)
    assert results[0] == {"start": 0, "end": 4, "text": "some"}
    assert results[1] == {"start": 10, "end": 14, "text": "text"}

    # Missing text field
    blank_page = {"spans": []}
    assert get_excerpts(blank_page) == []


@patch("corppa.poetry_detection.annotation.process_adjudication_data.get_excerpts") | ||
def test_process_page_annotation(mock_get_excerpts): | ||
mock_get_excerpts.return_value = ["some", "poetry", "excerpts"] | ||
page_annotation = { | ||
"id": "some-page-id", | ||
"work_id": "some-work-id", | ||
"meta": {"title": "some-title", "author": "some-author", "year": "some-year"}, | ||
"spans": "some-spans", | ||
} | ||
result = process_page_annotation(page_annotation) | ||
assert result == { | ||
"page_id": "some-page-id", | ||
"work_id": "some-work-id", | ||
"work_title": "some-title", | ||
"work_author": "some-author", | ||
"work_year": "some-year", | ||
"excerpts": ["some", "poetry", "excerpts"], | ||
"n_excerpts": 3, | ||
} | ||
mock_get_excerpts.assert_called_once_with(page_annotation) | ||
|
||
|
||
def test_get_excerpt_entries():
    page_meta = {
        "page_id": "some-page-id",
        "work_id": "some-work-id",
        "work_title": "some-title",
        "work_author": "some-author",
        "work_year": "some-year",
    }
    excerpts = [
        {"start": 0, "end": 3, "text": "a"},
        {"start": 5, "end": 6, "text": "b"},
    ]
    page_data = page_meta | {"excerpts": excerpts}
    expected_results = [page_meta | excerpt for excerpt in excerpts]

    result = get_excerpt_entries(page_data)
    assert isgenerator(result)
    assert list(result) == expected_results


@patch(
    "corppa.poetry_detection.annotation.process_adjudication_data.get_excerpt_entries"
)
@patch(
    "corppa.poetry_detection.annotation.process_adjudication_data.process_page_annotation"
)
@patch("corppa.poetry_detection.annotation.process_adjudication_data.orjsonl")
@patch("corppa.poetry_detection.annotation.process_adjudication_data.tqdm")
def test_process_adjudication_data(
    mock_tqdm,
    mock_orjsonl,
    mock_process_page_annotation,
    mock_get_excerpt_entries,
    tmpdir,
):
    input_jsonl = tmpdir / "input.jsonl"
    input_jsonl.write_text("some\ntext\n", encoding="utf-8")
    out_excerpts = tmpdir / "output.csv"

    # Default
    csv_fields = [
        "page_id",
        "work_id",
        "work_title",
        "work_author",
        "work_year",
        "start",
        "end",
        "text",
    ]
    mock_orjsonl.stream.return_value = "jsonl stream"
    mock_tqdm.return_value = ["a", "b"]
    mock_process_page_annotation.side_effect = lambda x: f"page {x}"
    mock_get_excerpt_entries.return_value = [{k: "test" for k in csv_fields}]

    process_adjudication_data(input_jsonl, "out.jsonl", out_excerpts)
    mock_orjsonl.stream.assert_called_once_with(input_jsonl)
    mock_tqdm.assert_called_once_with("jsonl stream", total=2, disable=False)
    assert mock_process_page_annotation.call_count == 2
    mock_process_page_annotation.assert_has_calls([call("a"), call("b")])
    assert mock_orjsonl.append.call_count == 2
    mock_orjsonl.append.assert_has_calls(
        [call("out.jsonl", "page a"), call("out.jsonl", "page b")]
    )
    assert mock_get_excerpt_entries.call_count == 2
    mock_get_excerpt_entries.assert_has_calls([call("page a"), call("page b")])
    csv_text = ",".join(csv_fields) + "\n"
    csv_text += ",".join(["test"] * 8) + "\n"
    csv_text += ",".join(["test"] * 8) + "\n"
    assert out_excerpts.read_text(encoding="utf-8") == csv_text

    # Disable progress
    mock_orjsonl.reset_mock()
    mock_orjsonl.stream.return_value = "jsonl stream"
    mock_tqdm.reset_mock()
    mock_tqdm.return_value = ["a", "b"]
    process_adjudication_data(
        input_jsonl, "out.jsonl", out_excerpts, disable_progress=True
    )
    mock_orjsonl.stream.assert_called_once_with(input_jsonl)
    mock_tqdm.assert_called_once_with("jsonl stream", total=2, disable=True)