Skip to content

Commit

Permalink
Merge pull request #805 from ImageMarkup/metadata-batch-validation
Browse files Browse the repository at this point in the history
  • Loading branch information
danlamanna authored Dec 4, 2023
2 parents 5e4c938 + 12cd858 commit 4161f02
Show file tree
Hide file tree
Showing 5 changed files with 256 additions and 99 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,36 +28,42 @@
<div id="checkpoint-problems">

<div>
<div class="heading-4">Checkpoint 1 - {{ checkpoint.1.title }}</div>
<div class="heading-4">Checkpoint 1 - Filename checks</div>
</div>
{% if checkpoint.1.run %}
{% for problem in checkpoint.1.problems %}
<div>
{{ problem.message }}
{% if problem.context %}
<div>
{{ problem.context }}
</div>
{% endif %}
</div>
{% empty %}
passed.

{% endfor %}
{% endif %}
{% for problem in csv_check %}
<div>
{{ problem.message }}
{% if problem.context %}
<div>
{{ problem.context }}
</div>
{% endif %}
</div>
{% empty %}
passed.
{% endfor %}

<div>
<div class="heading-4">Checkpoint 2 - {{ checkpoint.2.title }}</div>
<div class="heading-4">Checkpoint 2 - Internal Consistency</div>
</div>
{% if checkpoint.2.run %}
{% if checkpoint.2.problems %}
{% if internal_check is not None %}
{% if internal_check.0 or internal_check.1 %}
<ul>
{% for key, values in checkpoint.2.problems.items %}
{% for key, values in internal_check.0.items %}
<li>
<strong>{{ key.0 }}</strong> - {{ key.1 }} - lines: {{ values|slice:"5"|join:", " }}
{% if values|length > 5 %}(and {{ values|length|add:"-5" }} more){% endif %}
</li>
{% endfor %}

{% for batch_problem in internal_check.1 %}
<li>
<strong>{{ batch_problem.message }}</strong>
- instances: {{ batch_problem.context|slice:"5"|join:", " }}
{% if batch_problem.context|length > 5 %}(and {{batch_problem.context|length|add:"-5" }} more){% endif %}
</li>
{% endfor %}

</ul>
{% else %}
passed.
Expand All @@ -67,17 +73,25 @@
{% endif %}

<div>
<div class="heading-4">Checkpoint 3 - {{ checkpoint.3.title }}</div>
<div class="heading-4">Checkpoint 3 - Archive Consistency</div>
</div>
{% if checkpoint.3.run %}
{% if checkpoint.3.problems %}
{% if archive_check is not None %}
{% if archive_check.0 or archive_check.1 %}
<ul>
{% for key, values in checkpoint.3.problems.items %}
{% for key, values in archive_check.0.items %}
<li>
<strong>{{ key.0 }}</strong> - {{ key.1 }} - lines: {{ values|slice:"5"|join:", " }}
{% if values|length > 5 %}(and {{ values|length|add:"-5" }} more){% endif %}
</li>
{% endfor %}

{% for batch_problem in archive_check.1 %}
<li>
<strong>{{ batch_problem.message }}</strong>
- instances: {{ batch_problem.context|slice:"5"|join:", " }}
{% if batch_problem.context|length > 5 %}(and {{batch_problem.context|length|add:"-5" }} more){% endif %}
</li>
{% endfor %}
</ul>
{% else %}
passed.
Expand Down
103 changes: 94 additions & 9 deletions isic/ingest/tests/test_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv
import io
from typing import BinaryIO
from typing import BinaryIO, cast

from django.urls.base import reverse
from django.utils import timezone
Expand All @@ -10,6 +10,7 @@
from isic.ingest.tasks import update_metadata_task
from isic.ingest.tests.csv_streams import StreamWriter
from isic.ingest.utils.metadata import validate_csv_format_and_filenames
from isic.ingest.views.metadata import ApplyMetadataContext


@pytest.fixture
Expand All @@ -31,6 +32,45 @@ def csv_stream_diagnosis_sex() -> BinaryIO:
return file_stream


@pytest.fixture
def csv_stream_diagnosis_sex_lesion_patient() -> BinaryIO:
    """Return a CSV stream with one row tying filename.jpg to lesion1/patient1."""
    stream = StreamWriter(io.BytesIO())
    row = {
        "filename": "filename.jpg",
        "diagnosis": "melanoma",
        "sex": "female",
        "lesion_id": "lesion1",
        "patient_id": "patient1",
    }
    # fieldnames taken from the row keys; dict insertion order preserves the
    # original column order (filename, diagnosis, sex, lesion_id, patient_id).
    writer = csv.DictWriter(stream, fieldnames=list(row))
    writer.writeheader()
    writer.writerow(row)
    return stream


@pytest.fixture
def csv_stream_diagnosis_sex_disagreeing_lesion_patient() -> BinaryIO:
    """Return a CSV stream that maps the same lesion (lesion1) to a different patient."""
    stream = StreamWriter(io.BytesIO())
    row = {
        "filename": "filename2.jpg",
        "diagnosis": "nevus",
        "sex": "male",
        "lesion_id": "lesion1",
        "patient_id": "patient2",
    }
    # fieldnames taken from the row keys; dict insertion order preserves the
    # original column order (filename, diagnosis, sex, lesion_id, patient_id).
    writer = csv.DictWriter(stream, fieldnames=list(row))
    writer.writeheader()
    writer.writerow(row)
    return stream


@pytest.fixture
def csv_stream_benign() -> BinaryIO:
file_stream = StreamWriter(io.BytesIO())
Expand All @@ -52,6 +92,7 @@ def csv_stream_diagnosis_sex_invalid() -> BinaryIO:
@pytest.fixture
def cohort_with_accession(cohort, accession_factory):
    """Return a cohort pre-populated with accessions for filename.jpg and filename2.jpg."""
    for blob_name in ("filename.jpg", "filename2.jpg"):
        cohort.accessions.add(accession_factory(cohort=cohort, original_blob_name=blob_name))
    return cohort


Expand Down Expand Up @@ -130,13 +171,15 @@ def test_apply_metadata_step2_invalid(
reverse("validate-metadata", args=[cohort_with_accession.pk]),
{"metadata_file": metadatafile.pk},
)
r.context = cast(ApplyMetadataContext, r.context)

assert not r.context["form"].errors, r.context["form"].errors
assert r.status_code == 200, r.status_code
assert r.context["checkpoint"][1]["problems"] == []
assert r.context["csv_check"] == []
# Ensure there's an error with the diagnosis field in step 2
assert r.context["checkpoint"][2]["problems"]
assert list(r.context["checkpoint"][2]["problems"].items())[0][0][0] == "diagnosis"
assert r.context["checkpoint"][3]["problems"] == {}
assert r.context["internal_check"]
assert list(r.context["internal_check"][0].keys())[0][0] == "diagnosis"
assert not r.context["archive_check"]


@pytest.mark.django_db
Expand Down Expand Up @@ -171,12 +214,54 @@ def test_apply_metadata_step3(
reverse("validate-metadata", args=[cohort_with_accession.pk]),
{"metadata_file": benign_metadatafile.pk},
)
r.context = cast(ApplyMetadataContext, r.context)
assert not r.context["form"].errors, r.context["form"].errors
assert r.status_code == 200, r.status_code
assert r.context["checkpoint"][1]["problems"] == []
assert r.context["checkpoint"][2]["problems"] == {}
assert r.context["checkpoint"][3]["problems"]
assert list(r.context["checkpoint"][3]["problems"].items())[0][0][0] == "diagnosis"
assert r.context["csv_check"] == []
assert r.context["internal_check"] and not any(r.context["internal_check"])
assert r.context["archive_check"]
assert list(r.context["archive_check"][0].keys())[0][0] == "diagnosis"


@pytest.mark.django_db
def test_apply_metadata_step3_full_cohort(
    user,
    staff_client,
    cohort_with_accession,
    csv_stream_diagnosis_sex_lesion_patient,
    csv_stream_diagnosis_sex_disagreeing_lesion_patient,
    metadata_file_factory,
):
    """Verify that archive consistency (step 3) catches a lesion mapped to two patients."""
    # First, validate and apply a baseline CSV that maps lesion1 -> patient1.
    initial_file = metadata_file_factory(
        blob__from_func=lambda: csv_stream_diagnosis_sex_lesion_patient,
        cohort=cohort_with_accession,
    )

    response = staff_client.post(
        reverse("validate-metadata", args=[cohort_with_accession.pk]),
        {"metadata_file": initial_file.pk},
    )
    assert not response.context["form"].errors, response.context["form"].errors
    assert response.status_code == 200, response.status_code

    update_metadata_task(user.pk, initial_file.pk)

    # test step 3 by trying to upload a disagreeing lesion/patient pair.
    conflicting_file = metadata_file_factory(
        blob__from_func=lambda: csv_stream_diagnosis_sex_disagreeing_lesion_patient,
        cohort=cohort_with_accession,
    )

    response = staff_client.post(
        reverse("validate-metadata", args=[cohort_with_accession.pk]),
        {"metadata_file": conflicting_file.pk},
    )
    response.context = cast(ApplyMetadataContext, response.context)
    assert not response.context["form"].errors, response.context["form"].errors
    # Filename and internal checks pass; only the cross-upload (archive) batch check fails.
    assert response.context["csv_check"] == []
    assert response.context["internal_check"] and not any(response.context["internal_check"])
    assert response.context["archive_check"]
    assert "belong to multiple patients" in str(response.context["archive_check"][1][0].message)


@pytest.mark.django_db
Expand Down
121 changes: 88 additions & 33 deletions isic/ingest/utils/metadata.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from collections import defaultdict
from typing import Callable, Iterable

from django.forms.models import ModelForm
from isic_metadata.metadata import MetadataRow
from isic_metadata.metadata import MetadataBatch, MetadataRow
import pandas as pd
from pydantic import ValidationError as PydanticValidationError
from pydantic.main import BaseModel
Expand All @@ -23,7 +24,11 @@ class Problem(BaseModel):
type: str | None = "error"


def validate_csv_format_and_filenames(df, cohort):
# A dictionary of (column name, error message) -> list of row indices with that error
ColumnRowErrors = dict[tuple[str, str], list[int]]


def validate_csv_format_and_filenames(df, cohort) -> list[Problem]:
problems = []

# TODO: duplicate columns
Expand Down Expand Up @@ -56,42 +61,92 @@ def validate_csv_format_and_filenames(df, cohort):
return problems


def validate_internal_consistency(df):
# keyed by column, message
column_problems: dict[tuple[str, str], list[int]] = defaultdict(list)

for i, (_, row) in enumerate(df.iterrows(), start=2):
try:
MetadataRow.model_validate(row.to_dict())
except PydanticValidationError as e:
for error in e.errors():
column = error["loc"][0]
column_problems[(column, error["msg"])].append(i)

# TODO: defaultdict doesn't work in django templates?
return dict(column_problems)


def validate_archive_consistency(df, cohort):
# keyed by column, message
column_problems: dict[tuple[str, str], list[int]] = defaultdict(list)
accessions = Accession.objects.filter(
cohort=cohort, original_blob_name__in=df["filename"]
).values_list("original_blob_name", "metadata")
# TODO: easier way to do this?
accessions_dict = {x[0]: x[1] for x in accessions}
def _validate_df_consistency(
    batch: Iterable[dict], row_preprocessor: Callable[[dict], dict] | None = None
) -> tuple[ColumnRowErrors, list[Problem]]:
    """
    Validate metadata rows individually and as a batch.

    Each row is validated as a MetadataRow (optionally after row_preprocessor is
    applied); the whole set is then validated as a MetadataBatch for cross-row checks.

    Returns a tuple of:
    - a dict keyed by (column name, error message) mapping to the indices of the
      rows exhibiting that error, and
    - a list of Problems found by batch (cross-row) validation, with context
      holding example instances where the validator supplies them.
    """
    column_error_rows: ColumnRowErrors = defaultdict(list)
    batch_problems: list[Problem] = []

    # Since rows have to be evaluated twice (per-row and as a batch), we need to
    # convert the iterator to a list.
    batch = list(batch)

    for i, row in enumerate(batch):
        try:
            if row_preprocessor is not None:
                row = row_preprocessor(row)

            MetadataRow.model_validate(row)
        except PydanticValidationError as e:
            for error in e.errors():
                # loc[0] is the offending field name; group identical errors by
                # (column, message) so the template can list affected rows together.
                column = error["loc"][0]
                column_error_rows[(str(column), error["msg"])].append(i)

    # validate the metadata as a "batch". this is for all checks that span rows. since this
    # currently only applies to patient/lesion checks, we can sparsely populate the MetadataRow
    # objects to save on memory.
    try:
        MetadataBatch(
            items=[
                MetadataRow(patient_id=row.get("patient_id"), lesion_id=row.get("lesion_id"))
                for row in batch
            ]
        )
    except PydanticValidationError as e:
        for error in e.errors():
            # "ctx" carries example offending instances when the validator provides them.
            examples = error["ctx"]["examples"] if "ctx" in error else []
            batch_problems.append(Problem(message=error["msg"], context=examples))

    # defaultdict doesn't work with django templates, see https://stackoverflow.com/a/12842716
    return dict(column_error_rows), batch_problems


def validate_internal_consistency(df) -> tuple[ColumnRowErrors, list[Problem]]:
    """Validate a metadata dataframe on its own, without reference to existing cohort data."""
    rows = df.to_dict(orient="records")
    return _validate_df_consistency(rows)


def validate_archive_consistency(df, cohort) -> tuple[ColumnRowErrors, list[Problem]]:
    """
    Validate that a CSV is consistent with the existing cohort metadata.

    This merges the existing cohort metadata with the proposed df and validates the merged
    metadata. This allows for cross column checks e.g. an existing benign accession against
    a df with diagnosis=melanoma. It also enables cross row checks, such as verifying that
    a lesion doesn't belong to more than one patient.
    """

    def cohort_df_merged_metadata_rows():
        """
        Yield the merged metadata rows for the cohort and df.

        The merged metadata rows are generated by iterating over the cohort accessions and
        yielding the metadata for each accession. It merges if necessary and then yields the
        merged result, removing it from the dataframe.
        """
        accessions = cohort.accessions.values_list(
            "original_blob_name",
            "metadata",
            "lesion__private_lesion_id",
            "patient__private_patient_id",
        )

        for original_blob_name, metadata, lesion_id, patient_id in accessions.iterator():
            # lesion/patient need to be treated as top level metadata even though
            # they're stored differently in the database.
            if lesion_id:
                metadata["lesion_id"] = lesion_id
            if patient_id:
                metadata["patient_id"] = patient_id

            # Scan the dataframe once per accession instead of twice.
            matching_rows = df[df["filename"] == original_blob_name]
            if len(matching_rows) > 0:
                df_row = matching_rows.iloc[0]
                # CSV values take precedence over the existing accession metadata.
                yield metadata | df_row.to_dict()
                df.drop(df_row.name, inplace=True)
            else:
                yield metadata

        # yield all the rows in the dataframe that don't already exist in the cohort.
        # NOTE(review): itertuples()._asdict() also includes an "Index" key — presumably
        # ignored by MetadataRow validation; confirm.
        for df_row in df.itertuples():
            yield df_row._asdict()

    return _validate_df_consistency(cohort_df_merged_metadata_rows())
2 changes: 1 addition & 1 deletion isic/ingest/views/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from isic.ingest.models.cohort import Cohort


def make_breadcrumbs(cohort: Cohort | None = None) -> list:
def make_breadcrumbs(cohort: Cohort | None = None) -> list[list[str]]:
ret = [[reverse("ingest-review"), "Ingest Review"]]

if cohort:
Expand Down
Loading

0 comments on commit 4161f02

Please sign in to comment.