Feature/process adjudication data (#130)
* Adds script & unit tests for processing Prodigy adjudication data

* Adds mypy type checking support for dev
laurejt authored Dec 19, 2024
1 parent cb03ab8 commit a386f5f
Showing 3 changed files with 345 additions and 1 deletion.
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -29,12 +29,16 @@ dependencies = [
]

[project.optional-dependencies]
type-checking = [
"mypy",
"types-tqdm",
]
test = [
"pytest",
"pytest-cov"
]
ocr = ["google-cloud-vision"]
dev = ["pre-commit", "ruff", "corppa[test]", "corppa[ocr]"]
dev = ["pre-commit", "ruff", "corppa[type-checking]", "corppa[test]", "corppa[ocr]"]

[project.scripts]
corppa-filter = "corppa.utils.filter:main"
200 changes: 200 additions & 0 deletions src/corppa/poetry_detection/annotation/process_adjudication_data.py
@@ -0,0 +1,200 @@
"""
This script processes the adjudication data produced by Prodigy for our
poetry detection task into two outputs:
1. A JSONL file that compiles the annotation data into page-level records.
   Each record contains page-level metadata and the compiled list of poetry
   excerpts (if any) determined in the adjudication process.
2. A CSV file containing one excerpt-level record per line.
Note that the first file explicitly includes information on the pages where
no poetry was identified, while the second conveys this only implicitly
through absence, and so requires external knowledge of which pages were
covered in the annotation rounds. The former is therefore particularly useful
for the evaluation process, while the latter is better suited for building a
final excerpt dataset.
Example command line usage:
```
python process_adjudication_data.py prodigy_data.jsonl adj_pages.jsonl adj_excerpts.csv
```
"""

import argparse
import csv
import pathlib
import sys
from collections.abc import Generator
from typing import Any

import orjsonl
from tqdm import tqdm
from xopen import xopen


def get_excerpts(page_annotation: dict[str, Any]) -> list[dict[str, int | str]]:
"""
Extract excerpts from page-level annotation. Excerpts have the following
fields:
* start: character-level starting index
* end: character-level end index (Pythonic, exclusive)
* text: text of page excerpt
Note: Currently ignoring span labels, since there's only one for the
poetry detection task.
"""
excerpts = []
    # Blank pages may not have a text field, so default to an empty string
page_text = page_annotation.get("text", "")
if "spans" not in page_annotation:
raise ValueError("Page annotation missing 'spans' field")
for span in page_annotation["spans"]:
excerpt = {
"start": span["start"],
"end": span["end"],
"text": page_text[span["start"] : span["end"]],
}
excerpts.append(excerpt)
return excerpts
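

# A hypothetical example: a page annotation like
#   {"text": "Roses are red", "spans": [{"start": 0, "end": 13, "label": "POETRY"}]}
# yields
#   [{"start": 0, "end": 13, "text": "Roses are red"}]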


def process_page_annotation(page_annotation) -> dict[str, Any]:
"""
Extracts desired content from page-level annotation. The returned data has
    the following fields:
* page_id: Page's PPA page identifier
* work_id: PPA work identifier
* work_title: Title of PPA work
* work_author: Author of PPA work
    * work_year: Publication year of PPA work
* n_excerpts: Number of poetry excerpts contained in page
* excerpts: List of poetry excerpts identified within page
"""
page_data = {}
page_data["page_id"] = page_annotation["id"]
page_data["work_id"] = page_annotation["work_id"]
page_data["work_title"] = page_annotation["meta"]["title"]
page_data["work_author"] = page_annotation["meta"]["author"]
page_data["work_year"] = page_annotation["meta"]["year"]
page_data["excerpts"] = get_excerpts(page_annotation)
page_data["n_excerpts"] = len(page_data["excerpts"])
return page_data
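

# A hypothetical example: a Prodigy page annotation with nested work metadata,
#   {"id": "work-p5", "work_id": "work", "meta": {"title": "T", "author": "A",
#    "year": "1887"}, "text": "...", "spans": [...]}
# is flattened into top-level work_* fields plus the extracted excerpts.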


def get_excerpt_entries(page_data: dict[str, Any]) -> Generator[dict[str, Any]]:
"""
    Generate excerpt entries from the processed page data produced by
    `process_page_annotation`.
"""
for excerpt in page_data["excerpts"]:
entry = {
"page_id": page_data["page_id"],
"work_id": page_data["work_id"],
"work_title": page_data["work_title"],
"work_author": page_data["work_author"],
"work_year": page_data["work_year"],
"start": excerpt["start"],
"end": excerpt["end"],
"text": excerpt["text"],
}
yield entry
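

# Each yielded entry flattens the page metadata together with one excerpt's
# start/end/text fields, matching the CSV columns written by
# process_adjudication_data below; a hypothetical example:
#   {"page_id": "work-p5", ..., "start": 10, "end": 24, "text": "..."}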


def process_adjudication_data(
input_jsonl: pathlib.Path,
output_pages: pathlib.Path,
output_excerpts: pathlib.Path,
disable_progress: bool = False,
) -> None:
"""
    Process adjudication annotation data and write two output files: a JSONL
    file with page-level records and a CSV file with excerpt-level rows.
"""
    # Count input lines to give the progress bar an accurate total; xopen
    # transparently handles compressed files
    n_lines = sum(1 for line in xopen(input_jsonl, mode="rb"))
progress_annos = tqdm(
orjsonl.stream(input_jsonl),
total=n_lines,
disable=disable_progress,
)
csv_fieldnames = [
"page_id",
"work_id",
"work_title",
"work_author",
"work_year",
"start",
"end",
"text",
]
with open(output_excerpts, mode="w", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
csv_writer.writeheader()
for page_anno in progress_annos:
# Get & save page data
page_data = process_page_annotation(page_anno)
orjsonl.append(output_pages, page_data)

for row in get_excerpt_entries(page_data):
csv_writer.writerow(row)


def main():
"""
Extracts page- and excerpt-level data from a Prodigy data file (JSONL)
and writes the page-level excerpt data to a JSONL (`output_pages`) and the
excerpt-level data to a CSV (`output_excerpts`).
"""
parser = argparse.ArgumentParser(
description="Extracts & saves page- and excerpt-level data from Prodigy data file",
)
parser.add_argument(
"input",
help="Path to Prodigy annotation data export (JSONL file)",
type=pathlib.Path,
)
parser.add_argument(
"output_pages",
help="Filename where extracted page-level data (JSONL file) should be written",
type=pathlib.Path,
)
parser.add_argument(
"output_excerpts",
help="Filename where extracted excerpt-level data (CSV file) should be written",
type=pathlib.Path,
)
parser.add_argument(
"--progress",
help="Show progress",
action=argparse.BooleanOptionalAction,
default=True,
)

args = parser.parse_args()
disable_progress = not args.progress

# Check that input file exists
if not args.input.is_file():
        print(f"Error: input file {args.input} does not exist", file=sys.stderr)
sys.exit(1)

    # Check that output files do not already exist
for output_file in [args.output_pages, args.output_excerpts]:
if output_file.exists():
print(
f"Error: output file {output_file} already exists, not overwriting",
file=sys.stderr,
)
sys.exit(1)

process_adjudication_data(
args.input,
args.output_pages,
args.output_excerpts,
disable_progress=disable_progress,
)


if __name__ == "__main__":
main()
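
The two outputs serve different downstream needs, as the module docstring
notes. As a rough illustration only (not part of this commit), here is a
minimal sketch of consuming both files, assuming the example filenames from
the usage above:

```
import csv

import orjsonl

# Page-level records explicitly include pages with no poetry
# (n_excerpts == 0), which is what makes them useful for evaluation.
for page in orjsonl.stream("adj_pages.jsonl"):
    if page["n_excerpts"] == 0:
        print(f"No poetry found on page {page['page_id']}")

# Excerpt-level rows exist only where poetry was found; empty pages are
# absent entirely.
with open("adj_excerpts.csv", newline="") as csvfile:
    for row in csv.DictReader(csvfile):
        print(row["page_id"], row["start"], row["end"])
```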
140 changes: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
import sys
from inspect import isgenerator
from unittest.mock import MagicMock, call, patch

import pytest

from corppa.poetry_detection.annotation.process_adjudication_data import (
get_excerpt_entries,
get_excerpts,
process_adjudication_data,
process_page_annotation,
)


def test_get_excerpts():
page_annotation = {"text": "some page text"}

# Missing spans field
with pytest.raises(ValueError, match="Page annotation missing 'spans' field"):
get_excerpts(page_annotation)

# Empty spans field
page_annotation["spans"] = []
assert get_excerpts(page_annotation) == []

# Regular case (i.e. non-empty spans field)
page_annotation["spans"].append({"start": 0, "end": 4})
page_annotation["spans"].append({"start": 10, "end": 14})
results = get_excerpts(page_annotation)
assert results[0] == {"start": 0, "end": 4, "text": "some"}
assert results[1] == {"start": 10, "end": 14, "text": "text"}

# Missing text field
blank_page = {"spans": []}
assert get_excerpts(blank_page) == []


@patch("corppa.poetry_detection.annotation.process_adjudication_data.get_excerpts")
def test_process_page_annotation(mock_get_excerpts):
mock_get_excerpts.return_value = ["some", "poetry", "excerpts"]
page_annotation = {
"id": "some-page-id",
"work_id": "some-work-id",
"meta": {"title": "some-title", "author": "some-author", "year": "some-year"},
"spans": "some-spans",
}
result = process_page_annotation(page_annotation)
assert result == {
"page_id": "some-page-id",
"work_id": "some-work-id",
"work_title": "some-title",
"work_author": "some-author",
"work_year": "some-year",
"excerpts": ["some", "poetry", "excerpts"],
"n_excerpts": 3,
}
mock_get_excerpts.assert_called_once_with(page_annotation)


def test_get_excerpt_entries():
page_meta = {
"page_id": "some-page-id",
"work_id": "some-work-id",
"work_title": "some-title",
"work_author": "some-author",
"work_year": "some-year",
}
excerpts = [
{"start": 0, "end": 3, "text": "a"},
{"start": 5, "end": 6, "text": "b"},
]
page_data = page_meta | {"excerpts": excerpts}
expected_results = [page_meta | excerpt for excerpt in excerpts]

result = get_excerpt_entries(page_data)
assert isgenerator(result)
assert list(result) == expected_results


@patch(
"corppa.poetry_detection.annotation.process_adjudication_data.get_excerpt_entries"
)
@patch(
"corppa.poetry_detection.annotation.process_adjudication_data.process_page_annotation"
)
@patch("corppa.poetry_detection.annotation.process_adjudication_data.orjsonl")
@patch("corppa.poetry_detection.annotation.process_adjudication_data.tqdm")
def test_process_adjudication_data(
mock_tqdm,
mock_orjsonl,
mock_process_page_annotation,
mock_get_excerpt_entries,
tmpdir,
):
input_jsonl = tmpdir / "input.jsonl"
input_jsonl.write_text("some\ntext\n", encoding="utf-8")
out_excerpts = tmpdir / "output.csv"
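    # The input file's contents only matter for the line count (tqdm's total);
    # orjsonl.stream itself is mocked, so no real JSONL parsing happens.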

# Default
csv_fields = [
"page_id",
"work_id",
"work_title",
"work_author",
"work_year",
"start",
"end",
"text",
]
mock_orjsonl.stream.return_value = "jsonl stream"
mock_tqdm.return_value = ["a", "b"]
mock_process_page_annotation.side_effect = lambda x: f"page {x}"
mock_get_excerpt_entries.return_value = [{k: "test" for k in csv_fields}]

process_adjudication_data(input_jsonl, "out.jsonl", out_excerpts)
mock_orjsonl.stream.assert_called_once_with(input_jsonl)
mock_tqdm.assert_called_once_with("jsonl stream", total=2, disable=False)
assert mock_process_page_annotation.call_count == 2
mock_process_page_annotation.assert_has_calls([call("a"), call("b")])
assert mock_orjsonl.append.call_count == 2
mock_orjsonl.append.assert_has_calls(
[call("out.jsonl", "page a"), call("out.jsonl", "page b")]
)
assert mock_get_excerpt_entries.call_count == 2
mock_get_excerpt_entries.assert_has_calls([call("page a"), call("page b")])
csv_text = ",".join(csv_fields) + "\n"
csv_text += ",".join(["test"] * 8) + "\n"
csv_text += ",".join(["test"] * 8) + "\n"
assert out_excerpts.read_text(encoding="utf-8") == csv_text

# Disable progress
mock_orjsonl.reset_mock()
mock_orjsonl.stream.return_value = "jsonl stream"
mock_tqdm.reset_mock()
mock_tqdm.return_value = ["a", "b"]
process_adjudication_data(
input_jsonl, "out.jsonl", out_excerpts, disable_progress=True
)
mock_orjsonl.stream.assert_called_once_with(input_jsonl)
mock_tqdm.assert_called_once_with("jsonl stream", total=2, disable=True)
