From e47c530e529ad0a3a3d960aaf89b6f8de723af7a Mon Sep 17 00:00:00 2001 From: Dan LaManna Date: Tue, 14 Nov 2023 07:36:15 -0500 Subject: [PATCH 1/2] Refactor metadata consistency checks --- isic/ingest/utils/metadata.py | 38 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/isic/ingest/utils/metadata.py b/isic/ingest/utils/metadata.py index 71dd317a..62d74a54 100644 --- a/isic/ingest/utils/metadata.py +++ b/isic/ingest/utils/metadata.py @@ -1,4 +1,5 @@ from collections import defaultdict +from typing import Callable from django.forms.models import ModelForm from isic_metadata.metadata import MetadataRow @@ -56,42 +57,41 @@ def validate_csv_format_and_filenames(df, cohort): return problems -def validate_internal_consistency(df): +def _validate_df_consistency(df, row_preprocessor: Callable[[dict], dict] | None = None) -> dict: # keyed by column, message column_problems: dict[tuple[str, str], list[int]] = defaultdict(list) for i, (_, row) in enumerate(df.iterrows(), start=2): try: - MetadataRow.model_validate(row.to_dict()) + row_dict = row.to_dict() + if row_preprocessor is not None: + row_dict = row_preprocessor(row_dict) + + MetadataRow.model_validate(row_dict) except PydanticValidationError as e: for error in e.errors(): column = error["loc"][0] - column_problems[(column, error["msg"])].append(i) + column_problems[(str(column), error["msg"])].append(i) + + # defaultdict doesn't work with django templates, see https://stackoverflow.com/a/12842716 + column_problems.default_factory = None + + return column_problems - # TODO: defaultdict doesn't work in django templates? 
- return dict(column_problems) + +def validate_internal_consistency(df): + return _validate_df_consistency(df) def validate_archive_consistency(df, cohort): - # keyed by column, message - column_problems: dict[tuple[str, str], list[int]] = defaultdict(list) accessions = Accession.objects.filter( cohort=cohort, original_blob_name__in=df["filename"] ).values_list("original_blob_name", "metadata") # TODO: easier way to do this? accessions_dict = {x[0]: x[1] for x in accessions} - for i, (_, row) in enumerate(df.iterrows(), start=2): + def prepopulate_row(row: dict) -> dict: existing = accessions_dict[row["filename"]] - row = existing | {k: v for k, v in row.items() if v is not None} - - try: - MetadataRow.model_validate(row) - except PydanticValidationError as e: - for error in e.errors(): - column = error["loc"][0] - - column_problems[(column, error["msg"])].append(i) + return existing | {k: v for k, v in row.items() if v is not None} - # TODO: defaultdict doesn't work in django templates? - return dict(column_problems) + return _validate_df_consistency(df, row_preprocessor=prepopulate_row) From 12cd8581b8bee965a0781e002d125fa1925a7c5f Mon Sep 17 00:00:00 2001 From: Dan LaManna Date: Tue, 14 Nov 2023 08:05:23 -0500 Subject: [PATCH 2/2] Add metadata batch validation --- .../partials/metadata_validation_output.html | 62 +++++++---- isic/ingest/tests/test_metadata.py | 103 +++++++++++++++-- isic/ingest/utils/metadata.py | 105 +++++++++++++----- isic/ingest/views/__init__.py | 2 +- isic/ingest/views/metadata.py | 67 +++++------ 5 files changed, 248 insertions(+), 91 deletions(-) diff --git a/isic/ingest/templates/ingest/partials/metadata_validation_output.html b/isic/ingest/templates/ingest/partials/metadata_validation_output.html index f52ec5be..d86262bf 100644 --- a/isic/ingest/templates/ingest/partials/metadata_validation_output.html +++ b/isic/ingest/templates/ingest/partials/metadata_validation_output.html @@ -28,36 +28,42 @@
-
Checkpoint 1 - {{ checkpoint.1.title }}
+
Checkpoint 1 - Filename checks
- {% if checkpoint.1.run %} - {% for problem in checkpoint.1.problems %} -
- {{ problem.message }} - {% if problem.context %} -
- {{ problem.context }} -
- {% endif %} -
- {% empty %} - passed. - - {% endfor %} - {% endif %} + {% for problem in csv_check %} +
+ {{ problem.message }} + {% if problem.context %} +
+ {{ problem.context }} +
+ {% endif %} +
+ {% empty %} + passed. + {% endfor %}
-
Checkpoint 2 - {{ checkpoint.2.title }}
+
Checkpoint 2 - Internal Consistency
- {% if checkpoint.2.run %} - {% if checkpoint.2.problems %} + {% if internal_check is not None %} + {% if internal_check.0 or internal_check.1 %}
    - {% for key, values in checkpoint.2.problems.items %} + {% for key, values in internal_check.0.items %}
  • {{ key.0 }} - {{ key.1 }} - lines: {{ values|slice:"5"|join:", " }} {% if values|length > 5 %}(and {{ values|length|add:"-5" }} more){% endif %}
  • {% endfor %} + + {% for batch_problem in internal_check.1 %} +
  • + {{ batch_problem.message }} + - instances: {{ batch_problem.context|slice:"5"|join:", " }} + {% if batch_problem.context|length > 5 %}(and {{batch_problem.context|length|add:"-5" }} more){% endif %} +
  • + {% endfor %} +
{% else %} passed. @@ -67,17 +73,25 @@ {% endif %}
-
Checkpoint 3 - {{ checkpoint.3.title }}
+
Checkpoint 3 - Archive Consistency
- {% if checkpoint.3.run %} - {% if checkpoint.3.problems %} + {% if archive_check is not None %} + {% if archive_check.0 or archive_check.1 %}
    - {% for key, values in checkpoint.3.problems.items %} + {% for key, values in archive_check.0.items %}
  • {{ key.0 }} - {{ key.1 }} - lines: {{ values|slice:"5"|join:", " }} {% if values|length > 5 %}(and {{ values|length|add:"-5" }} more){% endif %}
  • {% endfor %} + + {% for batch_problem in archive_check.1 %} +
  • + {{ batch_problem.message }} + - instances: {{ batch_problem.context|slice:"5"|join:", " }} + {% if batch_problem.context|length > 5 %}(and {{batch_problem.context|length|add:"-5" }} more){% endif %} +
  • + {% endfor %}
{% else %} passed. diff --git a/isic/ingest/tests/test_metadata.py b/isic/ingest/tests/test_metadata.py index c576e209..08a241a1 100644 --- a/isic/ingest/tests/test_metadata.py +++ b/isic/ingest/tests/test_metadata.py @@ -1,6 +1,6 @@ import csv import io -from typing import BinaryIO +from typing import BinaryIO, cast from django.urls.base import reverse from django.utils import timezone @@ -10,6 +10,7 @@ from isic.ingest.tasks import update_metadata_task from isic.ingest.tests.csv_streams import StreamWriter from isic.ingest.utils.metadata import validate_csv_format_and_filenames +from isic.ingest.views.metadata import ApplyMetadataContext @pytest.fixture @@ -31,6 +32,45 @@ def csv_stream_diagnosis_sex() -> BinaryIO: return file_stream +@pytest.fixture +def csv_stream_diagnosis_sex_lesion_patient() -> BinaryIO: + file_stream = StreamWriter(io.BytesIO()) + writer = csv.DictWriter( + file_stream, fieldnames=["filename", "diagnosis", "sex", "lesion_id", "patient_id"] + ) + writer.writeheader() + writer.writerow( + { + "filename": "filename.jpg", + "diagnosis": "melanoma", + "sex": "female", + "lesion_id": "lesion1", + "patient_id": "patient1", + } + ) + return file_stream + + +@pytest.fixture +def csv_stream_diagnosis_sex_disagreeing_lesion_patient() -> BinaryIO: + # a version that maps the same lesion to a different patient + file_stream = StreamWriter(io.BytesIO()) + writer = csv.DictWriter( + file_stream, fieldnames=["filename", "diagnosis", "sex", "lesion_id", "patient_id"] + ) + writer.writeheader() + writer.writerow( + { + "filename": "filename2.jpg", + "diagnosis": "nevus", + "sex": "male", + "lesion_id": "lesion1", + "patient_id": "patient2", + } + ) + return file_stream + + @pytest.fixture def csv_stream_benign() -> BinaryIO: file_stream = StreamWriter(io.BytesIO()) @@ -52,6 +92,7 @@ def csv_stream_diagnosis_sex_invalid() -> BinaryIO: @pytest.fixture def cohort_with_accession(cohort, accession_factory): cohort.accessions.add(accession_factory(cohort=cohort, 
original_blob_name="filename.jpg")) + cohort.accessions.add(accession_factory(cohort=cohort, original_blob_name="filename2.jpg")) return cohort @@ -130,13 +171,15 @@ def test_apply_metadata_step2_invalid( reverse("validate-metadata", args=[cohort_with_accession.pk]), {"metadata_file": metadatafile.pk}, ) + r.context = cast(ApplyMetadataContext, r.context) + assert not r.context["form"].errors, r.context["form"].errors assert r.status_code == 200, r.status_code - assert r.context["checkpoint"][1]["problems"] == [] + assert r.context["csv_check"] == [] # Ensure there's an error with the diagnosis field in step 2 - assert r.context["checkpoint"][2]["problems"] - assert list(r.context["checkpoint"][2]["problems"].items())[0][0][0] == "diagnosis" - assert r.context["checkpoint"][3]["problems"] == {} + assert r.context["internal_check"] + assert list(r.context["internal_check"][0].keys())[0][0] == "diagnosis" + assert not r.context["archive_check"] @pytest.mark.django_db @@ -171,12 +214,54 @@ def test_apply_metadata_step3( reverse("validate-metadata", args=[cohort_with_accession.pk]), {"metadata_file": benign_metadatafile.pk}, ) + r.context = cast(ApplyMetadataContext, r.context) assert not r.context["form"].errors, r.context["form"].errors assert r.status_code == 200, r.status_code - assert r.context["checkpoint"][1]["problems"] == [] - assert r.context["checkpoint"][2]["problems"] == {} - assert r.context["checkpoint"][3]["problems"] - assert list(r.context["checkpoint"][3]["problems"].items())[0][0][0] == "diagnosis" + assert r.context["csv_check"] == [] + assert r.context["internal_check"] and not any(r.context["internal_check"]) + assert r.context["archive_check"] + assert list(r.context["archive_check"][0].keys())[0][0] == "diagnosis" + + +@pytest.mark.django_db +def test_apply_metadata_step3_full_cohort( + user, + staff_client, + cohort_with_accession, + csv_stream_diagnosis_sex_lesion_patient, + csv_stream_diagnosis_sex_disagreeing_lesion_patient, + 
metadata_file_factory, +): + metadatafile = metadata_file_factory( + blob__from_func=lambda: csv_stream_diagnosis_sex_lesion_patient, + cohort=cohort_with_accession, + ) + + r = staff_client.post( + reverse("validate-metadata", args=[cohort_with_accession.pk]), + {"metadata_file": metadatafile.pk}, + ) + assert not r.context["form"].errors, r.context["form"].errors + assert r.status_code == 200, r.status_code + + update_metadata_task(user.pk, metadatafile.pk) + + # test step 3 by trying to upload a disagreeing lesion/patient pair. + disagreeing_metadatafile = metadata_file_factory( + blob__from_func=lambda: csv_stream_diagnosis_sex_disagreeing_lesion_patient, + cohort=cohort_with_accession, + ) + + r = staff_client.post( + reverse("validate-metadata", args=[cohort_with_accession.pk]), + {"metadata_file": disagreeing_metadatafile.pk}, + ) + r.context = cast(ApplyMetadataContext, r.context) + assert not r.context["form"].errors, r.context["form"].errors + assert r.context["csv_check"] == [] + assert r.context["internal_check"] and not any(r.context["internal_check"]) + assert r.context["archive_check"] + assert "belong to multiple patients" in str(r.context["archive_check"][1][0].message) @pytest.mark.django_db diff --git a/isic/ingest/utils/metadata.py b/isic/ingest/utils/metadata.py index 62d74a54..cdb7708d 100644 --- a/isic/ingest/utils/metadata.py +++ b/isic/ingest/utils/metadata.py @@ -1,8 +1,8 @@ from collections import defaultdict -from typing import Callable +from typing import Callable, Iterable from django.forms.models import ModelForm -from isic_metadata.metadata import MetadataRow +from isic_metadata.metadata import MetadataBatch, MetadataRow import pandas as pd from pydantic import ValidationError as PydanticValidationError from pydantic.main import BaseModel @@ -24,7 +24,11 @@ class Problem(BaseModel): type: str | None = "error" -def validate_csv_format_and_filenames(df, cohort): +# A dictionary of (column name, error message) -> list of row indices 
with that error +ColumnRowErrors = dict[tuple[str, str], list[int]] + + +def validate_csv_format_and_filenames(df, cohort) -> list[Problem]: problems = [] # TODO: duplicate columns @@ -57,41 +61,92 @@ def validate_csv_format_and_filenames(df, cohort): return problems -def _validate_df_consistency(df, row_preprocessor: Callable[[dict], dict] | None = None) -> dict: - # keyed by column, message - column_problems: dict[tuple[str, str], list[int]] = defaultdict(list) +def _validate_df_consistency( + batch: Iterable[dict], row_preprocessor: Callable[[dict], dict] | None = None +) -> tuple[ColumnRowErrors, list[Problem]]: + column_error_rows: ColumnRowErrors = defaultdict(list) + batch_problems: list[Problem] = [] - for i, (_, row) in enumerate(df.iterrows(), start=2): + # Since rows have to be evaluated twice, we need to convert the iterator to a list + batch = list(batch) + + for i, row in enumerate(batch): try: - row_dict = row.to_dict() if row_preprocessor is not None: - row_dict = row_preprocessor(row_dict) + row = row_preprocessor(row) - MetadataRow.model_validate(row_dict) + MetadataRow.model_validate(row) except PydanticValidationError as e: for error in e.errors(): column = error["loc"][0] - column_problems[(str(column), error["msg"])].append(i) + column_error_rows[(str(column), error["msg"])].append(i) + + # validate the metadata as a "batch". this is for all checks that span rows. since this + # currently only applies to patient/lesion checks, we can sparsely populate the MetadataRow + # objects to save on memory. 
+ try: + MetadataBatch( + items=[ + MetadataRow(patient_id=row.get("patient_id"), lesion_id=row.get("lesion_id")) + for row in batch + ] + ) + except PydanticValidationError as e: + for error in e.errors(): + examples = error["ctx"]["examples"] if "ctx" in error else [] + batch_problems.append(Problem(message=error["msg"], context=examples)) # defaultdict doesn't work with django templates, see https://stackoverflow.com/a/12842716 - column_problems.default_factory = None + return dict(column_error_rows), batch_problems - return column_problems +def validate_internal_consistency(df) -> tuple[ColumnRowErrors, list[Problem]]: + return _validate_df_consistency(df.to_dict(orient="records")) -def validate_internal_consistency(df): - return _validate_df_consistency(df) +def validate_archive_consistency(df, cohort) -> tuple[ColumnRowErrors, list[Problem]]: + """ + Validate that a CSV is consistent with the existing cohort metadata. -def validate_archive_consistency(df, cohort): - accessions = Accession.objects.filter( - cohort=cohort, original_blob_name__in=df["filename"] - ).values_list("original_blob_name", "metadata") - # TODO: easier way to do this? - accessions_dict = {x[0]: x[1] for x in accessions} + This merges the existing cohort metadata with the proposed df and validates the merged + metadata. This allows for cross column checks e.g. an existing benign accession against + a df with diagnosis=melanoma. It also enables cross row checks, such as verifying that + a lesion doesn't belong to more than one patient. + """ - def prepopulate_row(row: dict) -> dict: - existing = accessions_dict[row["filename"]] - return existing | {k: v for k, v in row.items() if v is not None} + def cohort_df_merged_metadata_rows(): + """ + Yield the merged metadata rows for the cohort and df. + + The merged metadata rows are generated by iterating over the cohort accessions and + yielding the metadata for each accession. 
It merges if necessary and then yields the + merged result, removing it from the dataframe. + """ + accessions = cohort.accessions.values_list( + "original_blob_name", + "metadata", + "lesion__private_lesion_id", + "patient__private_patient_id", + ) - return _validate_df_consistency(df, row_preprocessor=prepopulate_row) + for original_blob_name, metadata, lesion_id, patient_id in accessions.iterator(): + # lesion/patient need to be treated as top level metadata even though + # they're stored differently in the database. + if lesion_id: + metadata["lesion_id"] = lesion_id + if patient_id: + metadata["patient_id"] = patient_id + + in_df = len(df[df["filename"] == original_blob_name]) > 0 + if in_df: + df_row = df[df["filename"] == original_blob_name].iloc[0] + yield metadata | df_row.to_dict() + df.drop(df_row.name, inplace=True) + else: + yield metadata + + # yield all the rows in the dataframe that don't already exist in the cohort + for df_row in df.itertuples(): + yield df_row._asdict() + + return _validate_df_consistency(cohort_df_merged_metadata_rows()) diff --git a/isic/ingest/views/__init__.py b/isic/ingest/views/__init__.py index 90dfcfe0..ccdaeec3 100644 --- a/isic/ingest/views/__init__.py +++ b/isic/ingest/views/__init__.py @@ -3,7 +3,7 @@ from isic.ingest.models.cohort import Cohort -def make_breadcrumbs(cohort: Cohort | None = None) -> list: +def make_breadcrumbs(cohort: Cohort | None = None) -> list[list[str]]: ret = [[reverse("ingest-review"), "Ingest Review"]] if cohort: diff --git a/isic/ingest/views/metadata.py b/isic/ingest/views/metadata.py index 20fbf436..3f768d0c 100644 --- a/isic/ingest/views/metadata.py +++ b/isic/ingest/views/metadata.py @@ -1,4 +1,5 @@ import os +from typing import TypedDict from django import forms from django.contrib.admin.views.decorators import staff_member_required @@ -13,6 +14,8 @@ from isic.core.permissions import get_visible_objects, needs_object_permission from isic.ingest.models import Cohort, MetadataFile from 
isic.ingest.utils.metadata import ( + ColumnRowErrors, + Problem, validate_archive_consistency, validate_csv_format_and_filenames, validate_internal_consistency, @@ -66,64 +69,64 @@ def metadata_file_create(request, cohort_pk): return render(request, "ingest/metadata_file_create.html", {"form": form}) +class ApplyMetadataContext(TypedDict): + cohort: Cohort + breadcrumbs: list[list[str]] + metadata_file_id: int + unstructured_columns: list[str] + form: ValidateMetadataForm + successful: bool + + csv_check: list[Problem] | None + internal_check: tuple[ColumnRowErrors, list[Problem]] | None + archive_check: tuple[ColumnRowErrors, list[Problem]] | None + + @staff_member_required def apply_metadata(request, cohort_pk): - cohort = get_object_or_404( + cohort: Cohort = get_object_or_404( Cohort.objects.prefetch_related( Prefetch("metadata_files", queryset=MetadataFile.objects.order_by("-created")) ), pk=cohort_pk, ) + # casting ctx to ApplyMetadataContext causes errors because all of the keys are required + # but not provided in the initial assignment. workarounds cause more headaches than they + # solve, so we only use the TypedDict in the tests. ctx = { "cohort": cohort, "breadcrumbs": make_breadcrumbs(cohort) - + [[reverse("validate-metadata", args=[cohort.id]), "Validate Metadata"]], - } - # TODO: Find a cleaner way to implement this system altogether. 
- checkpoints = { - 1: { - "title": "Filename checks", - "run": False, - "problems": {}, - }, - 2: { - "title": "Internal consistency", - "run": False, - "problems": {}, - }, - 3: { - "title": "Archive consistency", - "run": False, - "problems": {}, - }, + + [[reverse("validate-metadata", args=[cohort.pk]), "Validate Metadata"]], } + csv_check: list[Problem] | None = None + internal_check: tuple[ColumnRowErrors, list[Problem]] | None = None + archive_check: tuple[ColumnRowErrors, list[Problem]] | None = None + if request.method == "POST": form = ValidateMetadataForm(request.user, cohort, request.POST) if form.is_valid(): metadata_file = MetadataFile.objects.get(id=int(form.cleaned_data["metadata_file"])) - ctx["metadata_file_id"] = metadata_file.id + ctx["metadata_file_id"] = metadata_file.pk df = metadata_file.to_df() ctx["unstructured_columns"] = get_unstructured_columns(df) - checkpoints[1]["problems"] = validate_csv_format_and_filenames(df, cohort) - checkpoints[1]["run"] = True + csv_check = validate_csv_format_and_filenames(df, cohort) - if not checkpoints[1]["problems"]: - checkpoints[2]["problems"] = validate_internal_consistency(df) - checkpoints[2]["run"] = True + if not any(csv_check): + internal_check = validate_internal_consistency(df) - if not checkpoints[2]["problems"]: - checkpoints[3]["problems"] = validate_archive_consistency(df, cohort) - checkpoints[3]["run"] = True + if not any(internal_check): + archive_check = validate_archive_consistency(df, cohort) - if not checkpoints[3]["problems"]: + if not any(archive_check): ctx["successful"] = True - else: form = ValidateMetadataForm(request.user, cohort) - ctx["checkpoint"] = checkpoints ctx["form"] = form + ctx["csv_check"] = csv_check + ctx["internal_check"] = internal_check + ctx["archive_check"] = archive_check return render(request, "ingest/apply_metadata.html", ctx)