From e47c530e529ad0a3a3d960aaf89b6f8de723af7a Mon Sep 17 00:00:00 2001 From: Dan LaManna Date: Tue, 14 Nov 2023 07:36:15 -0500 Subject: [PATCH 1/2] Refactor metadata consistency checks --- isic/ingest/utils/metadata.py | 38 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/isic/ingest/utils/metadata.py b/isic/ingest/utils/metadata.py index 71dd317a..62d74a54 100644 --- a/isic/ingest/utils/metadata.py +++ b/isic/ingest/utils/metadata.py @@ -1,4 +1,5 @@ from collections import defaultdict +from typing import Callable from django.forms.models import ModelForm from isic_metadata.metadata import MetadataRow @@ -56,42 +57,41 @@ def validate_csv_format_and_filenames(df, cohort): return problems -def validate_internal_consistency(df): +def _validate_df_consistency(df, row_preprocessor: Callable[[dict], dict] | None = None) -> dict: # keyed by column, message column_problems: dict[tuple[str, str], list[int]] = defaultdict(list) for i, (_, row) in enumerate(df.iterrows(), start=2): try: - MetadataRow.model_validate(row.to_dict()) + row_dict = row.to_dict() + if row_preprocessor is not None: + row_dict = row_preprocessor(row_dict) + + MetadataRow.model_validate(row_dict) except PydanticValidationError as e: for error in e.errors(): column = error["loc"][0] - column_problems[(column, error["msg"])].append(i) + column_problems[(str(column), error["msg"])].append(i) + + # defaultdict doesn't work with django templates, see https://stackoverflow.com/a/12842716 + column_problems.default_factory = None + + return column_problems - # TODO: defaultdict doesn't work in django templates? 
- return dict(column_problems) + +def validate_internal_consistency(df): + return _validate_df_consistency(df) def validate_archive_consistency(df, cohort): - # keyed by column, message - column_problems: dict[tuple[str, str], list[int]] = defaultdict(list) accessions = Accession.objects.filter( cohort=cohort, original_blob_name__in=df["filename"] ).values_list("original_blob_name", "metadata") # TODO: easier way to do this? accessions_dict = {x[0]: x[1] for x in accessions} - for i, (_, row) in enumerate(df.iterrows(), start=2): + def prepopulate_row(row: dict) -> dict: existing = accessions_dict[row["filename"]] - row = existing | {k: v for k, v in row.items() if v is not None} - - try: - MetadataRow.model_validate(row) - except PydanticValidationError as e: - for error in e.errors(): - column = error["loc"][0] - - column_problems[(column, error["msg"])].append(i) + return existing | {k: v for k, v in row.items() if v is not None} - # TODO: defaultdict doesn't work in django templates? - return dict(column_problems) + return _validate_df_consistency(df, row_preprocessor=prepopulate_row) From 12cd8581b8bee965a0781e002d125fa1925a7c5f Mon Sep 17 00:00:00 2001 From: Dan LaManna Date: Tue, 14 Nov 2023 08:05:23 -0500 Subject: [PATCH 2/2] Add metadata batch validation --- .../partials/metadata_validation_output.html | 62 +++++++---- isic/ingest/tests/test_metadata.py | 103 +++++++++++++++-- isic/ingest/utils/metadata.py | 105 +++++++++++++----- isic/ingest/views/__init__.py | 2 +- isic/ingest/views/metadata.py | 67 +++++------ 5 files changed, 248 insertions(+), 91 deletions(-) diff --git a/isic/ingest/templates/ingest/partials/metadata_validation_output.html b/isic/ingest/templates/ingest/partials/metadata_validation_output.html index f52ec5be..d86262bf 100644 --- a/isic/ingest/templates/ingest/partials/metadata_validation_output.html +++ b/isic/ingest/templates/ingest/partials/metadata_validation_output.html @@ -28,36 +28,42 @@
-
Checkpoint 1 - {{ checkpoint.1.title }}
+
Checkpoint 1 - Filename checks
- {% if checkpoint.1.run %} - {% for problem in checkpoint.1.problems %} -
- {{ problem.message }} - {% if problem.context %} -
- {{ problem.context }} -
- {% endif %} -
- {% empty %} - passed. - - {% endfor %} - {% endif %} + {% for problem in csv_check %} +
+ {{ problem.message }} + {% if problem.context %} +
+ {{ problem.context }} +
+ {% endif %} +
+ {% empty %} + passed. + {% endfor %}
-
Checkpoint 2 - {{ checkpoint.2.title }}
+
Checkpoint 2 - Internal Consistency
- {% if checkpoint.2.run %} - {% if checkpoint.2.problems %} + {% if internal_check is not None %} + {% if internal_check.0 or internal_check.1 %}
    - {% for key, values in checkpoint.2.problems.items %} + {% for key, values in internal_check.0.items %}
  • {{ key.0 }} - {{ key.1 }} - lines: {{ values|slice:"5"|join:", " }} {% if values|length > 5 %}(and {{ values|length|add:"-5" }} more){% endif %}
  • {% endfor %} + + {% for batch_problem in internal_check.1 %} +
  • + {{ batch_problem.message }} + - instances: {{ batch_problem.context|slice:"5"|join:", " }} + {% if batch_problem.context|length > 5 %}(and {{batch_problem.context|length|add:"-5" }} more){% endif %} +
  • + {% endfor %} +
{% else %} passed. @@ -67,17 +73,25 @@ {% endif %}
-
Checkpoint 3 - {{ checkpoint.3.title }}
+
Checkpoint 3 - Archive Consistency
- {% if checkpoint.3.run %} - {% if checkpoint.3.problems %} + {% if archive_check is not None %} + {% if archive_check.0 or archive_check.1 %}
    - {% for key, values in checkpoint.3.problems.items %} + {% for key, values in archive_check.0.items %}
  • {{ key.0 }} - {{ key.1 }} - lines: {{ values|slice:"5"|join:", " }} {% if values|length > 5 %}(and {{ values|length|add:"-5" }} more){% endif %}
  • {% endfor %} + + {% for batch_problem in archive_check.1 %} +
  • + {{ batch_problem.message }} + - instances: {{ batch_problem.context|slice:"5"|join:", " }} + {% if batch_problem.context|length > 5 %}(and {{batch_problem.context|length|add:"-5" }} more){% endif %} +
  • + {% endfor %}
{% else %} passed. diff --git a/isic/ingest/tests/test_metadata.py b/isic/ingest/tests/test_metadata.py index c576e209..08a241a1 100644 --- a/isic/ingest/tests/test_metadata.py +++ b/isic/ingest/tests/test_metadata.py @@ -1,6 +1,6 @@ import csv import io -from typing import BinaryIO +from typing import BinaryIO, cast from django.urls.base import reverse from django.utils import timezone @@ -10,6 +10,7 @@ from isic.ingest.tasks import update_metadata_task from isic.ingest.tests.csv_streams import StreamWriter from isic.ingest.utils.metadata import validate_csv_format_and_filenames +from isic.ingest.views.metadata import ApplyMetadataContext @pytest.fixture @@ -31,6 +32,45 @@ def csv_stream_diagnosis_sex() -> BinaryIO: return file_stream +@pytest.fixture +def csv_stream_diagnosis_sex_lesion_patient() -> BinaryIO: + file_stream = StreamWriter(io.BytesIO()) + writer = csv.DictWriter( + file_stream, fieldnames=["filename", "diagnosis", "sex", "lesion_id", "patient_id"] + ) + writer.writeheader() + writer.writerow( + { + "filename": "filename.jpg", + "diagnosis": "melanoma", + "sex": "female", + "lesion_id": "lesion1", + "patient_id": "patient1", + } + ) + return file_stream + + +@pytest.fixture +def csv_stream_diagnosis_sex_disagreeing_lesion_patient() -> BinaryIO: + # a version that maps the same lesion to a different patient + file_stream = StreamWriter(io.BytesIO()) + writer = csv.DictWriter( + file_stream, fieldnames=["filename", "diagnosis", "sex", "lesion_id", "patient_id"] + ) + writer.writeheader() + writer.writerow( + { + "filename": "filename2.jpg", + "diagnosis": "nevus", + "sex": "male", + "lesion_id": "lesion1", + "patient_id": "patient2", + } + ) + return file_stream + + @pytest.fixture def csv_stream_benign() -> BinaryIO: file_stream = StreamWriter(io.BytesIO()) @@ -52,6 +92,7 @@ def csv_stream_diagnosis_sex_invalid() -> BinaryIO: @pytest.fixture def cohort_with_accession(cohort, accession_factory): cohort.accessions.add(accession_factory(cohort=cohort, 
original_blob_name="filename.jpg")) + cohort.accessions.add(accession_factory(cohort=cohort, original_blob_name="filename2.jpg")) return cohort @@ -130,13 +171,15 @@ def test_apply_metadata_step2_invalid( reverse("validate-metadata", args=[cohort_with_accession.pk]), {"metadata_file": metadatafile.pk}, ) + r.context = cast(ApplyMetadataContext, r.context) + assert not r.context["form"].errors, r.context["form"].errors assert r.status_code == 200, r.status_code - assert r.context["checkpoint"][1]["problems"] == [] + assert r.context["csv_check"] == [] # Ensure there's an error with the diagnosis field in step 2 - assert r.context["checkpoint"][2]["problems"] - assert list(r.context["checkpoint"][2]["problems"].items())[0][0][0] == "diagnosis" - assert r.context["checkpoint"][3]["problems"] == {} + assert r.context["internal_check"] + assert list(r.context["internal_check"][0].keys())[0][0] == "diagnosis" + assert not r.context["archive_check"] @pytest.mark.django_db @@ -171,12 +214,54 @@ def test_apply_metadata_step3( reverse("validate-metadata", args=[cohort_with_accession.pk]), {"metadata_file": benign_metadatafile.pk}, ) + r.context = cast(ApplyMetadataContext, r.context) assert not r.context["form"].errors, r.context["form"].errors assert r.status_code == 200, r.status_code - assert r.context["checkpoint"][1]["problems"] == [] - assert r.context["checkpoint"][2]["problems"] == {} - assert r.context["checkpoint"][3]["problems"] - assert list(r.context["checkpoint"][3]["problems"].items())[0][0][0] == "diagnosis" + assert r.context["csv_check"] == [] + assert r.context["internal_check"] and not any(r.context["internal_check"]) + assert r.context["archive_check"] + assert list(r.context["archive_check"][0].keys())[0][0] == "diagnosis" + + +@pytest.mark.django_db +def test_apply_metadata_step3_full_cohort( + user, + staff_client, + cohort_with_accession, + csv_stream_diagnosis_sex_lesion_patient, + csv_stream_diagnosis_sex_disagreeing_lesion_patient, + 
metadata_file_factory, +): + metadatafile = metadata_file_factory( + blob__from_func=lambda: csv_stream_diagnosis_sex_lesion_patient, + cohort=cohort_with_accession, + ) + + r = staff_client.post( + reverse("validate-metadata", args=[cohort_with_accession.pk]), + {"metadata_file": metadatafile.pk}, + ) + assert not r.context["form"].errors, r.context["form"].errors + assert r.status_code == 200, r.status_code + + update_metadata_task(user.pk, metadatafile.pk) + + # test step 3 by trying to upload a disagreeing lesion/patient pair. + disagreeing_metadatafile = metadata_file_factory( + blob__from_func=lambda: csv_stream_diagnosis_sex_disagreeing_lesion_patient, + cohort=cohort_with_accession, + ) + + r = staff_client.post( + reverse("validate-metadata", args=[cohort_with_accession.pk]), + {"metadata_file": disagreeing_metadatafile.pk}, + ) + r.context = cast(ApplyMetadataContext, r.context) + assert not r.context["form"].errors, r.context["form"].errors + assert r.context["csv_check"] == [] + assert r.context["internal_check"] and not any(r.context["internal_check"]) + assert r.context["archive_check"] + assert "belong to multiple patients" in str(r.context["archive_check"][1][0].message) @pytest.mark.django_db diff --git a/isic/ingest/utils/metadata.py b/isic/ingest/utils/metadata.py index 62d74a54..cdb7708d 100644 --- a/isic/ingest/utils/metadata.py +++ b/isic/ingest/utils/metadata.py @@ -1,8 +1,8 @@ from collections import defaultdict -from typing import Callable +from typing import Callable, Iterable from django.forms.models import ModelForm -from isic_metadata.metadata import MetadataRow +from isic_metadata.metadata import MetadataBatch, MetadataRow import pandas as pd from pydantic import ValidationError as PydanticValidationError from pydantic.main import BaseModel @@ -24,7 +24,11 @@ class Problem(BaseModel): type: str | None = "error" -def validate_csv_format_and_filenames(df, cohort): +# A dictionary of (column name, error message) -> list of row indices 
with that error +ColumnRowErrors = dict[tuple[str, str], list[int]] + + +def validate_csv_format_and_filenames(df, cohort) -> list[Problem]: problems = [] # TODO: duplicate columns @@ -57,41 +61,92 @@ def validate_csv_format_and_filenames(df, cohort): return problems -def _validate_df_consistency(df, row_preprocessor: Callable[[dict], dict] | None = None) -> dict: - # keyed by column, message - column_problems: dict[tuple[str, str], list[int]] = defaultdict(list) +def _validate_df_consistency( + batch: Iterable[dict], row_preprocessor: Callable[[dict], dict] | None = None +) -> tuple[ColumnRowErrors, list[Problem]]: + column_error_rows: ColumnRowErrors = defaultdict(list) + batch_problems: list[Problem] = [] - for i, (_, row) in enumerate(df.iterrows(), start=2): + # Since rows have to be evaluated twice, we need to convert the iterator to a list + batch = list(batch) + + for i, row in enumerate(batch): try: - row_dict = row.to_dict() if row_preprocessor is not None: - row_dict = row_preprocessor(row_dict) + row = row_preprocessor(row) - MetadataRow.model_validate(row_dict) + MetadataRow.model_validate(row) except PydanticValidationError as e: for error in e.errors(): column = error["loc"][0] - column_problems[(str(column), error["msg"])].append(i) + column_error_rows[(str(column), error["msg"])].append(i) + + # validate the metadata as a "batch". this is for all checks that span rows. since this + # currently only applies to patient/lesion checks, we can sparsely populate the MetadataRow + # objects to save on memory. 
+ try: + MetadataBatch( + items=[ + MetadataRow(patient_id=row.get("patient_id"), lesion_id=row.get("lesion_id")) + for row in batch + ] + ) + except PydanticValidationError as e: + for error in e.errors(): + examples = error["ctx"]["examples"] if "ctx" in error else [] + batch_problems.append(Problem(message=error["msg"], context=examples)) # defaultdict doesn't work with django templates, see https://stackoverflow.com/a/12842716 - column_problems.default_factory = None + return dict(column_error_rows), batch_problems - return column_problems +def validate_internal_consistency(df) -> tuple[ColumnRowErrors, list[Problem]]: + return _validate_df_consistency(df.to_dict(orient="records")) -def validate_internal_consistency(df): - return _validate_df_consistency(df) +def validate_archive_consistency(df, cohort) -> tuple[ColumnRowErrors, list[Problem]]: + """ + Validate that a CSV is consistent with the existing cohort metadata. -def validate_archive_consistency(df, cohort): - accessions = Accession.objects.filter( - cohort=cohort, original_blob_name__in=df["filename"] - ).values_list("original_blob_name", "metadata") - # TODO: easier way to do this? - accessions_dict = {x[0]: x[1] for x in accessions} + This merges the existing cohort metadata with the proposed df and validates the merged + metadata. This allows for cross column checks e.g. an existing benign accession against + a df with diagnosis=melanoma. It also enables cross row checks, such as verifying that + a lesion doesn't belong to more than one patient. + """ - def prepopulate_row(row: dict) -> dict: - existing = accessions_dict[row["filename"]] - return existing | {k: v for k, v in row.items() if v is not None} + def cohort_df_merged_metadata_rows(): + """ + Yield the merged metadata rows for the cohort and df. + + The merged metadata rows are generated by iterating over the cohort accessions and + yielding the metadata for each accession. 
It merges if necessary and then yields the + merged result, removing it from the dataframe. + """ + accessions = cohort.accessions.values_list( + "original_blob_name", + "metadata", + "lesion__private_lesion_id", + "patient__private_patient_id", + ) - return _validate_df_consistency(df, row_preprocessor=prepopulate_row) + for original_blob_name, metadata, lesion_id, patient_id in accessions.iterator(): + # lesion/patient need to be treated as top level metadata even though + # they're stored differently in the database. + if lesion_id: + metadata["lesion_id"] = lesion_id + if patient_id: + metadata["patient_id"] = patient_id + + in_df = len(df[df["filename"] == original_blob_name]) > 0 + if in_df: + df_row = df[df["filename"] == original_blob_name].iloc[0] + yield metadata | df_row.to_dict() + df.drop(df_row.name, inplace=True) + else: + yield metadata + + # yield all the rows in the dataframe that don't already exist in the cohort + for df_row in df.itertuples(): + yield df_row._asdict() + + return _validate_df_consistency(cohort_df_merged_metadata_rows()) diff --git a/isic/ingest/views/__init__.py b/isic/ingest/views/__init__.py index 90dfcfe0..ccdaeec3 100644 --- a/isic/ingest/views/__init__.py +++ b/isic/ingest/views/__init__.py @@ -3,7 +3,7 @@ from isic.ingest.models.cohort import Cohort -def make_breadcrumbs(cohort: Cohort | None = None) -> list: +def make_breadcrumbs(cohort: Cohort | None = None) -> list[list[str]]: ret = [[reverse("ingest-review"), "Ingest Review"]] if cohort: diff --git a/isic/ingest/views/metadata.py b/isic/ingest/views/metadata.py index 20fbf436..3f768d0c 100644 --- a/isic/ingest/views/metadata.py +++ b/isic/ingest/views/metadata.py @@ -1,4 +1,5 @@ import os +from typing import TypedDict from django import forms from django.contrib.admin.views.decorators import staff_member_required @@ -13,6 +14,8 @@ from isic.core.permissions import get_visible_objects, needs_object_permission from isic.ingest.models import Cohort, MetadataFile from 
isic.ingest.utils.metadata import ( + ColumnRowErrors, + Problem, validate_archive_consistency, validate_csv_format_and_filenames, validate_internal_consistency, @@ -66,64 +69,64 @@ def metadata_file_create(request, cohort_pk): return render(request, "ingest/metadata_file_create.html", {"form": form}) +class ApplyMetadataContext(TypedDict): + cohort: Cohort + breadcrumbs: list[list[str]] + metadata_file_id: int + unstructured_columns: list[str] + form: ValidateMetadataForm + successful: bool + + csv_check: list[Problem] | None + internal_check: tuple[ColumnRowErrors, list[Problem]] | None + archive_check: tuple[ColumnRowErrors, list[Problem]] | None + + @staff_member_required def apply_metadata(request, cohort_pk): - cohort = get_object_or_404( + cohort: Cohort = get_object_or_404( Cohort.objects.prefetch_related( Prefetch("metadata_files", queryset=MetadataFile.objects.order_by("-created")) ), pk=cohort_pk, ) + # casting ctx to ApplyMetadataContext causes errors because all of the keys are required + # but not provided in the initial assignment. workarounds cause more headaches than they + # solve, so we only use the TypedDict in the tests. ctx = { "cohort": cohort, "breadcrumbs": make_breadcrumbs(cohort) - + [[reverse("validate-metadata", args=[cohort.id]), "Validate Metadata"]], - } - # TODO: Find a cleaner way to implement this system altogether. 
- checkpoints = { - 1: { - "title": "Filename checks", - "run": False, - "problems": {}, - }, - 2: { - "title": "Internal consistency", - "run": False, - "problems": {}, - }, - 3: { - "title": "Archive consistency", - "run": False, - "problems": {}, - }, + + [[reverse("validate-metadata", args=[cohort.pk]), "Validate Metadata"]], } + csv_check: list[Problem] | None = None + internal_check: tuple[ColumnRowErrors, list[Problem]] | None = None + archive_check: tuple[ColumnRowErrors, list[Problem]] | None = None + if request.method == "POST": form = ValidateMetadataForm(request.user, cohort, request.POST) if form.is_valid(): metadata_file = MetadataFile.objects.get(id=int(form.cleaned_data["metadata_file"])) - ctx["metadata_file_id"] = metadata_file.id + ctx["metadata_file_id"] = metadata_file.pk df = metadata_file.to_df() ctx["unstructured_columns"] = get_unstructured_columns(df) - checkpoints[1]["problems"] = validate_csv_format_and_filenames(df, cohort) - checkpoints[1]["run"] = True + csv_check = validate_csv_format_and_filenames(df, cohort) - if not checkpoints[1]["problems"]: - checkpoints[2]["problems"] = validate_internal_consistency(df) - checkpoints[2]["run"] = True + if not any(csv_check): + internal_check = validate_internal_consistency(df) - if not checkpoints[2]["problems"]: - checkpoints[3]["problems"] = validate_archive_consistency(df, cohort) - checkpoints[3]["run"] = True + if not any(internal_check): + archive_check = validate_archive_consistency(df, cohort) - if not checkpoints[3]["problems"]: + if not any(archive_check): ctx["successful"] = True - else: form = ValidateMetadataForm(request.user, cohort) - ctx["checkpoint"] = checkpoints ctx["form"] = form + ctx["csv_check"] = csv_check + ctx["internal_check"] = internal_check + ctx["archive_check"] = archive_check return render(request, "ingest/apply_metadata.html", ctx)