Feature/process adjudication data (#130)
* Adds script & unit tests for processing Prodigy adjudication data

* Adds mypy type checking support for dev
laurejt authored Dec 19, 2024
1 parent cb03ab8 commit a386f5f
Showing 3 changed files with 345 additions and 1 deletion.
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -29,12 +29,16 @@ dependencies = [
]

[project.optional-dependencies]
type-checking = [
"mypy",
"types-tqdm",
]
test = [
"pytest",
"pytest-cov"
]
ocr = ["google-cloud-vision"]
dev = ["pre-commit", "ruff", "corppa[test]", "corppa[ocr]"]
dev = ["pre-commit", "ruff", "corppa[type-checking]", "corppa[test]", "corppa[ocr]"]

[project.scripts]
corppa-filter = "corppa.utils.filter:main"
200 changes: 200 additions & 0 deletions src/corppa/poetry_detection/annotation/process_adjudication_data.py
@@ -0,0 +1,200 @@
"""
This script processes the adjudication data produced by Prodigy for our
poetry detection task into two outputs:
1. A JSONL file that compiles the annotation data into page-level records.
   Each record contains page-level metadata and the compiled list of poetry
   excerpts (if any) determined in the adjudication process.
2. A CSV file containing one excerpt-level record per line.
Note that the first file explicitly includes information on the pages where
no poetry was identified, while the second conveys this only implicitly
through absence, and so requires external knowledge of which pages were
covered in the annotation rounds. The former is therefore particularly useful
for the evaluation process, while the latter is better suited for building a
final excerpt dataset.
Example command line usage:
```
python process_adjudication_data.py prodigy_data.jsonl adj_pages.jsonl adj_excerpts.csv
```
"""

import argparse
import csv
import pathlib
import sys
from collections.abc import Generator
from typing import Any

import orjsonl
from tqdm import tqdm
from xopen import xopen


def get_excerpts(page_annotation: dict[str, Any]) -> list[dict[str, int | str]]:
"""
Extract excerpts from page-level annotation. Excerpts have the following
fields:
* start: character-level starting index
* end: character-level end index (Pythonic, exclusive)
* text: text of page excerpt
Note: Currently ignoring span labels, since there's only one for the
poetry detection task.
"""
excerpts = []
    # Blank pages may not have a text field, so default to an empty string
page_text = page_annotation.get("text", "")
if "spans" not in page_annotation:
raise ValueError("Page annotation missing 'spans' field")
for span in page_annotation["spans"]:
excerpt = {
"start": span["start"],
"end": span["end"],
"text": page_text[span["start"] : span["end"]],
}
excerpts.append(excerpt)
return excerpts
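

# A hypothetical example: a page annotation like
#   {"text": "Roses are red", "spans": [{"start": 0, "end": 13, "label": "POETRY"}]}
# yields
#   [{"start": 0, "end": 13, "text": "Roses are red"}]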


def process_page_annotation(page_annotation) -> dict[str, Any]:
"""
Extracts desired content from page-level annotation. The returned data has
    the following fields:
* page_id: Page's PPA page identifier
* work_id: PPA work identifier
* work_title: Title of PPA work
* work_author: Author of PPA work
    * work_year: Publication year of PPA work
* n_excerpts: Number of poetry excerpts contained in page
* excerpts: List of poetry excerpts identified within page
"""
page_data = {}
page_data["page_id"] = page_annotation["id"]
page_data["work_id"] = page_annotation["work_id"]
page_data["work_title"] = page_annotation["meta"]["title"]
page_data["work_author"] = page_annotation["meta"]["author"]
page_data["work_year"] = page_annotation["meta"]["year"]
page_data["excerpts"] = get_excerpts(page_annotation)
page_data["n_excerpts"] = len(page_data["excerpts"])
return page_data
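

# A hypothetical example: a Prodigy page annotation with nested work metadata,
#   {"id": "work-p5", "work_id": "work", "meta": {"title": "T", "author": "A",
#    "year": "1887"}, "text": "...", "spans": [...]}
# is flattened into top-level work_* fields plus the extracted excerpts.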


def get_excerpt_entries(page_data: dict[str, Any]) -> Generator[dict[str, Any]]:
"""
    Generate excerpt entries from the processed page data produced by
    `process_page_annotation`.
"""
for excerpt in page_data["excerpts"]:
entry = {
"page_id": page_data["page_id"],
"work_id": page_data["work_id"],
"work_title": page_data["work_title"],
"work_author": page_data["work_author"],
"work_year": page_data["work_year"],
"start": excerpt["start"],
"end": excerpt["end"],
"text": excerpt["text"],
}
yield entry
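

# Each yielded entry flattens the page metadata together with one excerpt's
# start/end/text fields, matching the CSV columns written by
# process_adjudication_data below; a hypothetical example:
#   {"page_id": "work-p5", ..., "start": 10, "end": 24, "text": "..."}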


def process_adjudication_data(
input_jsonl: pathlib.Path,
output_pages: pathlib.Path,
output_excerpts: pathlib.Path,
disable_progress: bool = False,
) -> None:
"""
    Process adjudication annotation data and write two output files: a JSONL
    file with page-level records and a CSV file with excerpt-level rows.
"""
    # Count input lines to give the progress bar an accurate total; xopen
    # transparently handles compressed files
    n_lines = sum(1 for line in xopen(input_jsonl, mode="rb"))
progress_annos = tqdm(
orjsonl.stream(input_jsonl),
total=n_lines,
disable=disable_progress,
)
csv_fieldnames = [
"page_id",
"work_id",
"work_title",
"work_author",
"work_year",
"start",
"end",
"text",
]
with open(output_excerpts, mode="w", newline="") as csvfile:
csv_writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
csv_writer.writeheader()
for page_anno in progress_annos:
# Get & save page data
page_data = process_page_annotation(page_anno)
orjsonl.append(output_pages, page_data)

for row in get_excerpt_entries(page_data):
csv_writer.writerow(row)


def main():
"""
Extracts page- and excerpt-level data from a Prodigy data file (JSONL)
and writes the page-level excerpt data to a JSONL (`output_pages`) and the
excerpt-level data to a CSV (`output_excerpts`).
"""
parser = argparse.ArgumentParser(
description="Extracts & saves page- and excerpt-level data from Prodigy data file",
)
parser.add_argument(
"input",
help="Path to Prodigy annotation data export (JSONL file)",
type=pathlib.Path,
)
parser.add_argument(
"output_pages",
help="Filename where extracted page-level data (JSONL file) should be written",
type=pathlib.Path,
)
parser.add_argument(
"output_excerpts",
help="Filename where extracted excerpt-level data (CSV file) should be written",
type=pathlib.Path,
)
parser.add_argument(
"--progress",
help="Show progress",
action=argparse.BooleanOptionalAction,
default=True,
)

args = parser.parse_args()
disable_progress = not args.progress

# Check that input file exists
if not args.input.is_file():
        print(f"Error: input file {args.input} does not exist", file=sys.stderr)
sys.exit(1)

    # Check that output files do not already exist
for output_file in [args.output_pages, args.output_excerpts]:
if output_file.exists():
print(
f"Error: output file {output_file} already exists, not overwriting",
file=sys.stderr,
)
sys.exit(1)

process_adjudication_data(
args.input,
args.output_pages,
args.output_excerpts,
disable_progress=disable_progress,
)


if __name__ == "__main__":
main()
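
The two outputs serve different downstream needs, as the module docstring
notes. As a rough illustration only (not part of this commit), here is a
minimal sketch of consuming both files, assuming the example filenames from
the usage above:

```
import csv

import orjsonl

# Page-level records explicitly include pages with no poetry
# (n_excerpts == 0), which is what makes them useful for evaluation.
for page in orjsonl.stream("adj_pages.jsonl"):
    if page["n_excerpts"] == 0:
        print(f"No poetry found on page {page['page_id']}")

# Excerpt-level rows exist only where poetry was found; empty pages are
# absent entirely.
with open("adj_excerpts.csv", newline="") as csvfile:
    for row in csv.DictReader(csvfile):
        print(row["page_id"], row["start"], row["end"])
```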
140 changes: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
import sys
from inspect import isgenerator
from unittest.mock import MagicMock, call, patch

import pytest

from corppa.poetry_detection.annotation.process_adjudication_data import (
get_excerpt_entries,
get_excerpts,
process_adjudication_data,
process_page_annotation,
)


def test_get_excerpts():
page_annotation = {"text": "some page text"}

# Missing spans field
with pytest.raises(ValueError, match="Page annotation missing 'spans' field"):
get_excerpts(page_annotation)

# Empty spans field
page_annotation["spans"] = []
assert get_excerpts(page_annotation) == []

# Regular case (i.e. non-empty spans field)
page_annotation["spans"].append({"start": 0, "end": 4})
page_annotation["spans"].append({"start": 10, "end": 14})
results = get_excerpts(page_annotation)
assert results[0] == {"start": 0, "end": 4, "text": "some"}
assert results[1] == {"start": 10, "end": 14, "text": "text"}

# Missing text field
blank_page = {"spans": []}
assert get_excerpts(blank_page) == []


@patch("corppa.poetry_detection.annotation.process_adjudication_data.get_excerpts")
def test_process_page_annotation(mock_get_excerpts):
mock_get_excerpts.return_value = ["some", "poetry", "excerpts"]
page_annotation = {
"id": "some-page-id",
"work_id": "some-work-id",
"meta": {"title": "some-title", "author": "some-author", "year": "some-year"},
"spans": "some-spans",
}
result = process_page_annotation(page_annotation)
assert result == {
"page_id": "some-page-id",
"work_id": "some-work-id",
"work_title": "some-title",
"work_author": "some-author",
"work_year": "some-year",
"excerpts": ["some", "poetry", "excerpts"],
"n_excerpts": 3,
}
mock_get_excerpts.assert_called_once_with(page_annotation)


def test_get_excerpt_entries():
page_meta = {
"page_id": "some-page-id",
"work_id": "some-work-id",
"work_title": "some-title",
"work_author": "some-author",
"work_year": "some-year",
}
excerpts = [
{"start": 0, "end": 3, "text": "a"},
{"start": 5, "end": 6, "text": "b"},
]
page_data = page_meta | {"excerpts": excerpts}
expected_results = [page_meta | excerpt for excerpt in excerpts]

result = get_excerpt_entries(page_data)
assert isgenerator(result)
assert list(result) == expected_results


@patch(
"corppa.poetry_detection.annotation.process_adjudication_data.get_excerpt_entries"
)
@patch(
"corppa.poetry_detection.annotation.process_adjudication_data.process_page_annotation"
)
@patch("corppa.poetry_detection.annotation.process_adjudication_data.orjsonl")
@patch("corppa.poetry_detection.annotation.process_adjudication_data.tqdm")
def test_process_adjudication_data(
mock_tqdm,
mock_orjsonl,
mock_process_page_annotation,
mock_get_excerpt_entries,
tmpdir,
):
input_jsonl = tmpdir / "input.jsonl"
input_jsonl.write_text("some\ntext\n", encoding="utf-8")
out_excerpts = tmpdir / "output.csv"
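    # The input file's contents only matter for the line count (tqdm's total);
    # orjsonl.stream itself is mocked, so no real JSONL parsing happens.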

# Default
csv_fields = [
"page_id",
"work_id",
"work_title",
"work_author",
"work_year",
"start",
"end",
"text",
]
mock_orjsonl.stream.return_value = "jsonl stream"
mock_tqdm.return_value = ["a", "b"]
mock_process_page_annotation.side_effect = lambda x: f"page {x}"
mock_get_excerpt_entries.return_value = [{k: "test" for k in csv_fields}]

process_adjudication_data(input_jsonl, "out.jsonl", out_excerpts)
mock_orjsonl.stream.assert_called_once_with(input_jsonl)
mock_tqdm.assert_called_once_with("jsonl stream", total=2, disable=False)
assert mock_process_page_annotation.call_count == 2
mock_process_page_annotation.assert_has_calls([call("a"), call("b")])
assert mock_orjsonl.append.call_count == 2
mock_orjsonl.append.assert_has_calls(
[call("out.jsonl", "page a"), call("out.jsonl", "page b")]
)
assert mock_get_excerpt_entries.call_count == 2
mock_get_excerpt_entries.assert_has_calls([call("page a"), call("page b")])
csv_text = ",".join(csv_fields) + "\n"
csv_text += ",".join(["test"] * 8) + "\n"
csv_text += ",".join(["test"] * 8) + "\n"
assert out_excerpts.read_text(encoding="utf-8") == csv_text

# Disable progress
mock_orjsonl.reset_mock()
mock_orjsonl.stream.return_value = "jsonl stream"
mock_tqdm.reset_mock()
mock_tqdm.return_value = ["a", "b"]
process_adjudication_data(
input_jsonl, "out.jsonl", out_excerpts, disable_progress=True
)
mock_orjsonl.stream.assert_called_once_with(input_jsonl)
mock_tqdm.assert_called_once_with("jsonl stream", total=2, disable=True)
