Skip to content

Commit

Permalink
Merge pull request #805 from ImageMarkup/metadata-batch-validation
Browse files Browse the repository at this point in the history
  • Loading branch information
danlamanna authored Dec 4, 2023
2 parents 5e4c938 + 12cd858 commit 4161f02
Show file tree
Hide file tree
Showing 5 changed files with 256 additions and 99 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,36 +28,42 @@
<div id="checkpoint-problems">

<div>
<div class="heading-4">Checkpoint 1 - {{ checkpoint.1.title }}</div>
<div class="heading-4">Checkpoint 1 - Filename checks</div>
</div>
{% if checkpoint.1.run %}
{% for problem in checkpoint.1.problems %}
<div>
{{ problem.message }}
{% if problem.context %}
<div>
{{ problem.context }}
</div>
{% endif %}
</div>
{% empty %}
passed.

{% endfor %}
{% endif %}
{% for problem in csv_check %}
<div>
{{ problem.message }}
{% if problem.context %}
<div>
{{ problem.context }}
</div>
{% endif %}
</div>
{% empty %}
passed.
{% endfor %}

<div>
<div class="heading-4">Checkpoint 2 - {{ checkpoint.2.title }}</div>
<div class="heading-4">Checkpoint 2 - Internal Consistency</div>
</div>
{% if checkpoint.2.run %}
{% if checkpoint.2.problems %}
{% if internal_check is not None %}
{% if internal_check.0 or internal_check.1 %}
<ul>
{% for key, values in checkpoint.2.problems.items %}
{% for key, values in internal_check.0.items %}
<li>
<strong>{{ key.0 }}</strong> - {{ key.1 }} - lines: {{ values|slice:"5"|join:", " }}
{% if values|length > 5 %}(and {{ values|length|add:"-5" }} more){% endif %}
</li>
{% endfor %}

{% for batch_problem in internal_check.1 %}
<li>
<strong>{{ batch_problem.message }}</strong>
- instances: {{ batch_problem.context|slice:"5"|join:", " }}
{% if batch_problem.context|length > 5 %}(and {{batch_problem.context|length|add:"-5" }} more){% endif %}
</li>
{% endfor %}

</ul>
{% else %}
passed.
Expand All @@ -67,17 +73,25 @@
{% endif %}

<div>
<div class="heading-4">Checkpoint 3 - {{ checkpoint.3.title }}</div>
<div class="heading-4">Checkpoint 3 - Archive Consistency</div>
</div>
{% if checkpoint.3.run %}
{% if checkpoint.3.problems %}
{% if archive_check is not None %}
{% if archive_check.0 or archive_check.1 %}
<ul>
{% for key, values in checkpoint.3.problems.items %}
{% for key, values in archive_check.0.items %}
<li>
<strong>{{ key.0 }}</strong> - {{ key.1 }} - lines: {{ values|slice:"5"|join:", " }}
{% if values|length > 5 %}(and {{ values|length|add:"-5" }} more){% endif %}
</li>
{% endfor %}

{% for batch_problem in archive_check.1 %}
<li>
<strong>{{ batch_problem.message }}</strong>
- instances: {{ batch_problem.context|slice:"5"|join:", " }}
{% if batch_problem.context|length > 5 %}(and {{batch_problem.context|length|add:"-5" }} more){% endif %}
</li>
{% endfor %}
</ul>
{% else %}
passed.
Expand Down
103 changes: 94 additions & 9 deletions isic/ingest/tests/test_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import csv
import io
from typing import BinaryIO
from typing import BinaryIO, cast

from django.urls.base import reverse
from django.utils import timezone
Expand All @@ -10,6 +10,7 @@
from isic.ingest.tasks import update_metadata_task
from isic.ingest.tests.csv_streams import StreamWriter
from isic.ingest.utils.metadata import validate_csv_format_and_filenames
from isic.ingest.views.metadata import ApplyMetadataContext


@pytest.fixture
Expand All @@ -31,6 +32,45 @@ def csv_stream_diagnosis_sex() -> BinaryIO:
return file_stream


@pytest.fixture
def csv_stream_diagnosis_sex_lesion_patient() -> BinaryIO:
    """Return a CSV stream with one row tying filename.jpg to lesion1/patient1."""
    stream = StreamWriter(io.BytesIO())
    row = {
        "filename": "filename.jpg",
        "diagnosis": "melanoma",
        "sex": "female",
        "lesion_id": "lesion1",
        "patient_id": "patient1",
    }
    # fieldnames taken from the row keys; dict insertion order preserves the
    # original column order (filename, diagnosis, sex, lesion_id, patient_id).
    writer = csv.DictWriter(stream, fieldnames=list(row))
    writer.writeheader()
    writer.writerow(row)
    return stream


@pytest.fixture
def csv_stream_diagnosis_sex_disagreeing_lesion_patient() -> BinaryIO:
    """Return a CSV stream that maps the same lesion (lesion1) to a different patient."""
    stream = StreamWriter(io.BytesIO())
    row = {
        "filename": "filename2.jpg",
        "diagnosis": "nevus",
        "sex": "male",
        "lesion_id": "lesion1",
        "patient_id": "patient2",
    }
    # fieldnames taken from the row keys; dict insertion order preserves the
    # original column order (filename, diagnosis, sex, lesion_id, patient_id).
    writer = csv.DictWriter(stream, fieldnames=list(row))
    writer.writeheader()
    writer.writerow(row)
    return stream


@pytest.fixture
def csv_stream_benign() -> BinaryIO:
file_stream = StreamWriter(io.BytesIO())
Expand All @@ -52,6 +92,7 @@ def csv_stream_diagnosis_sex_invalid() -> BinaryIO:
@pytest.fixture
def cohort_with_accession(cohort, accession_factory):
    """Return a cohort pre-populated with accessions for filename.jpg and filename2.jpg."""
    for blob_name in ("filename.jpg", "filename2.jpg"):
        cohort.accessions.add(accession_factory(cohort=cohort, original_blob_name=blob_name))
    return cohort


Expand Down Expand Up @@ -130,13 +171,15 @@ def test_apply_metadata_step2_invalid(
reverse("validate-metadata", args=[cohort_with_accession.pk]),
{"metadata_file": metadatafile.pk},
)
r.context = cast(ApplyMetadataContext, r.context)

assert not r.context["form"].errors, r.context["form"].errors
assert r.status_code == 200, r.status_code
assert r.context["checkpoint"][1]["problems"] == []
assert r.context["csv_check"] == []
# Ensure there's an error with the diagnosis field in step 2
assert r.context["checkpoint"][2]["problems"]
assert list(r.context["checkpoint"][2]["problems"].items())[0][0][0] == "diagnosis"
assert r.context["checkpoint"][3]["problems"] == {}
assert r.context["internal_check"]
assert list(r.context["internal_check"][0].keys())[0][0] == "diagnosis"
assert not r.context["archive_check"]


@pytest.mark.django_db
Expand Down Expand Up @@ -171,12 +214,54 @@ def test_apply_metadata_step3(
reverse("validate-metadata", args=[cohort_with_accession.pk]),
{"metadata_file": benign_metadatafile.pk},
)
r.context = cast(ApplyMetadataContext, r.context)
assert not r.context["form"].errors, r.context["form"].errors
assert r.status_code == 200, r.status_code
assert r.context["checkpoint"][1]["problems"] == []
assert r.context["checkpoint"][2]["problems"] == {}
assert r.context["checkpoint"][3]["problems"]
assert list(r.context["checkpoint"][3]["problems"].items())[0][0][0] == "diagnosis"
assert r.context["csv_check"] == []
assert r.context["internal_check"] and not any(r.context["internal_check"])
assert r.context["archive_check"]
assert list(r.context["archive_check"][0].keys())[0][0] == "diagnosis"


@pytest.mark.django_db
def test_apply_metadata_step3_full_cohort(
    user,
    staff_client,
    cohort_with_accession,
    csv_stream_diagnosis_sex_lesion_patient,
    csv_stream_diagnosis_sex_disagreeing_lesion_patient,
    metadata_file_factory,
):
    """Verify that archive consistency (step 3) catches a lesion mapped to two patients."""
    # First, validate and apply a baseline CSV that maps lesion1 -> patient1.
    initial_file = metadata_file_factory(
        blob__from_func=lambda: csv_stream_diagnosis_sex_lesion_patient,
        cohort=cohort_with_accession,
    )

    response = staff_client.post(
        reverse("validate-metadata", args=[cohort_with_accession.pk]),
        {"metadata_file": initial_file.pk},
    )
    assert not response.context["form"].errors, response.context["form"].errors
    assert response.status_code == 200, response.status_code

    update_metadata_task(user.pk, initial_file.pk)

    # test step 3 by trying to upload a disagreeing lesion/patient pair.
    conflicting_file = metadata_file_factory(
        blob__from_func=lambda: csv_stream_diagnosis_sex_disagreeing_lesion_patient,
        cohort=cohort_with_accession,
    )

    response = staff_client.post(
        reverse("validate-metadata", args=[cohort_with_accession.pk]),
        {"metadata_file": conflicting_file.pk},
    )
    response.context = cast(ApplyMetadataContext, response.context)
    assert not response.context["form"].errors, response.context["form"].errors
    # Filename and internal checks pass; only the cross-upload (archive) batch check fails.
    assert response.context["csv_check"] == []
    assert response.context["internal_check"] and not any(response.context["internal_check"])
    assert response.context["archive_check"]
    assert "belong to multiple patients" in str(response.context["archive_check"][1][0].message)


@pytest.mark.django_db
Expand Down
121 changes: 88 additions & 33 deletions isic/ingest/utils/metadata.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from collections import defaultdict
from typing import Callable, Iterable

from django.forms.models import ModelForm
from isic_metadata.metadata import MetadataRow
from isic_metadata.metadata import MetadataBatch, MetadataRow
import pandas as pd
from pydantic import ValidationError as PydanticValidationError
from pydantic.main import BaseModel
Expand All @@ -23,7 +24,11 @@ class Problem(BaseModel):
type: str | None = "error"


def validate_csv_format_and_filenames(df, cohort):
# A dictionary of (column name, error message) -> list of row indices with that error
ColumnRowErrors = dict[tuple[str, str], list[int]]


def validate_csv_format_and_filenames(df, cohort) -> list[Problem]:
problems = []

# TODO: duplicate columns
Expand Down Expand Up @@ -56,42 +61,92 @@ def validate_csv_format_and_filenames(df, cohort):
return problems


def validate_internal_consistency(df):
# keyed by column, message
column_problems: dict[tuple[str, str], list[int]] = defaultdict(list)

for i, (_, row) in enumerate(df.iterrows(), start=2):
try:
MetadataRow.model_validate(row.to_dict())
except PydanticValidationError as e:
for error in e.errors():
column = error["loc"][0]
column_problems[(column, error["msg"])].append(i)

# TODO: defaultdict doesn't work in django templates?
return dict(column_problems)


def validate_archive_consistency(df, cohort):
# keyed by column, message
column_problems: dict[tuple[str, str], list[int]] = defaultdict(list)
accessions = Accession.objects.filter(
cohort=cohort, original_blob_name__in=df["filename"]
).values_list("original_blob_name", "metadata")
# TODO: easier way to do this?
accessions_dict = {x[0]: x[1] for x in accessions}
def _validate_df_consistency(
    batch: Iterable[dict], row_preprocessor: Callable[[dict], dict] | None = None
) -> tuple[ColumnRowErrors, list[Problem]]:
    """
    Validate metadata rows individually and as a batch.

    Each row is validated as a MetadataRow (optionally after row_preprocessor is
    applied); the whole set is then validated as a MetadataBatch for cross-row checks.

    Returns a tuple of:
    - a dict keyed by (column name, error message) mapping to the indices of the
      rows exhibiting that error, and
    - a list of Problems found by batch (cross-row) validation, with context
      holding example instances where the validator supplies them.
    """
    column_error_rows: ColumnRowErrors = defaultdict(list)
    batch_problems: list[Problem] = []

    # Since rows have to be evaluated twice (per-row and as a batch), we need to
    # convert the iterator to a list.
    batch = list(batch)

    for i, row in enumerate(batch):
        try:
            if row_preprocessor is not None:
                row = row_preprocessor(row)

            MetadataRow.model_validate(row)
        except PydanticValidationError as e:
            for error in e.errors():
                # loc[0] is the offending field name; group identical errors by
                # (column, message) so the template can list affected rows together.
                column = error["loc"][0]
                column_error_rows[(str(column), error["msg"])].append(i)

    # validate the metadata as a "batch". this is for all checks that span rows. since this
    # currently only applies to patient/lesion checks, we can sparsely populate the MetadataRow
    # objects to save on memory.
    try:
        MetadataBatch(
            items=[
                MetadataRow(patient_id=row.get("patient_id"), lesion_id=row.get("lesion_id"))
                for row in batch
            ]
        )
    except PydanticValidationError as e:
        for error in e.errors():
            # "ctx" carries example offending instances when the validator provides them.
            examples = error["ctx"]["examples"] if "ctx" in error else []
            batch_problems.append(Problem(message=error["msg"], context=examples))

    # defaultdict doesn't work with django templates, see https://stackoverflow.com/a/12842716
    return dict(column_error_rows), batch_problems


def validate_internal_consistency(df) -> tuple[ColumnRowErrors, list[Problem]]:
    """Validate a metadata dataframe on its own, without reference to existing cohort data."""
    rows = df.to_dict(orient="records")
    return _validate_df_consistency(rows)


def validate_archive_consistency(df, cohort) -> tuple[ColumnRowErrors, list[Problem]]:
    """
    Validate that a CSV is consistent with the existing cohort metadata.

    This merges the existing cohort metadata with the proposed df and validates the merged
    metadata. This allows for cross column checks e.g. an existing benign accession against
    a df with diagnosis=melanoma. It also enables cross row checks, such as verifying that
    a lesion doesn't belong to more than one patient.
    """

    def cohort_df_merged_metadata_rows():
        """
        Yield the merged metadata rows for the cohort and df.

        The merged metadata rows are generated by iterating over the cohort accessions and
        yielding the metadata for each accession. It merges if necessary and then yields the
        merged result, removing it from the dataframe.
        """
        accessions = cohort.accessions.values_list(
            "original_blob_name",
            "metadata",
            "lesion__private_lesion_id",
            "patient__private_patient_id",
        )

        for original_blob_name, metadata, lesion_id, patient_id in accessions.iterator():
            # lesion/patient need to be treated as top level metadata even though
            # they're stored differently in the database.
            if lesion_id:
                metadata["lesion_id"] = lesion_id
            if patient_id:
                metadata["patient_id"] = patient_id

            # Scan the dataframe once per accession instead of twice.
            matching_rows = df[df["filename"] == original_blob_name]
            if len(matching_rows) > 0:
                df_row = matching_rows.iloc[0]
                # CSV values take precedence over the existing accession metadata.
                yield metadata | df_row.to_dict()
                df.drop(df_row.name, inplace=True)
            else:
                yield metadata

        # yield all the rows in the dataframe that don't already exist in the cohort.
        # NOTE(review): itertuples()._asdict() also includes an "Index" key — presumably
        # ignored by MetadataRow validation; confirm.
        for df_row in df.itertuples():
            yield df_row._asdict()

    return _validate_df_consistency(cohort_df_merged_metadata_rows())
2 changes: 1 addition & 1 deletion isic/ingest/views/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from isic.ingest.models.cohort import Cohort


def make_breadcrumbs(cohort: Cohort | None = None) -> list:
def make_breadcrumbs(cohort: Cohort | None = None) -> list[list[str]]:
ret = [[reverse("ingest-review"), "Ingest Review"]]

if cohort:
Expand Down
Loading

0 comments on commit 4161f02

Please sign in to comment.