Strip whitespace and remove empty identifiers (#586)
* Strip whitespace and remove empty identifiers
stefpiatek authored Jan 7, 2025
1 parent 807f84b commit ecd5abf
Showing 4 changed files with 48 additions and 12 deletions.
8 changes: 4 additions & 4 deletions cli/src/pixl_cli/_database.py
@@ -89,8 +89,8 @@ def _filter_existing_images(
) -> pd.DataFrame:
    # DataFrame indices must match when using df.isin (or df.index.isin)
# So we re-index the DataFrames to match on the columns we want to compare
-    messages_df_reindexed = messages_df.set_index(["accession_number", "mrn", "study_date"])
-    images_df_reindexed = images_df.set_index(["accession_number", "mrn", "study_date"])
+    messages_df_reindexed = messages_df.set_index(["accession_number", "mrn", "study_uid"])
+    images_df_reindexed = images_df.set_index(["accession_number", "mrn", "study_uid"])
keep_indices = ~messages_df_reindexed.index.isin(images_df_reindexed.index)
return messages_df[keep_indices]

@@ -101,7 +101,7 @@ def _filter_exported_messages(
) -> pd.DataFrame:
merged = messages_df.merge(
images_df,
on=["accession_number", "mrn", "study_date"],
on=["accession_number", "mrn", "study_uid"],
how="left",
validate="one_to_one",
suffixes=(None, None),
@@ -131,7 +131,7 @@ def all_images_for_project(project_slug: str) -> pd.DataFrame:
PixlSession = sessionmaker(engine)

query = (
-        select(Image.accession_number, Image.study_date, Image.mrn, Image.exported_at)
+        select(Image.accession_number, Image.study_uid, Image.mrn, Image.exported_at)
.join(Extract)
.where(Extract.slug == project_slug)
)
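For readers unfamiliar with the pattern, here is a minimal sketch of the reindex-and-isin filter used in `_filter_existing_images`, now keyed on `study_uid` instead of `study_date`. Only the column names come from the diff; the sample rows and variable names are invented for illustration.

```python
import pandas as pd

# Hypothetical sample data: two pending messages, one image already in the database.
messages_df = pd.DataFrame(
    {
        "accession_number": ["A1", "A2"],
        "mrn": ["111", "222"],
        "study_uid": ["1.2.3", "4.5.6"],
        "study_date": ["2022-01-01", "2022-01-02"],
    }
)
images_df = pd.DataFrame(
    {"accession_number": ["A1"], "mrn": ["111"], "study_uid": ["1.2.3"]}
)

# df.index.isin compares like-for-like, so both frames are indexed on the same key columns.
key_columns = ["accession_number", "mrn", "study_uid"]
messages_idx = messages_df.set_index(key_columns)
images_idx = images_df.set_index(key_columns)

keep = ~messages_idx.index.isin(images_idx.index)
print(messages_df[keep])  # only the "A2" row remains, as it has no existing image
```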
21 changes: 13 additions & 8 deletions cli/src/pixl_cli/_io.py
@@ -64,19 +64,25 @@ def read_patient_info(resources_path: Path) -> pd.DataFrame:
messages_df = _load_csv(resources_path)
else:
messages_df = _load_parquet(resources_path)
+    # Tidy up dataframe in case of whitespace or no way to identify images
+    unique_columns = ["project_name", "mrn", "accession_number", "study_uid"]
+    filtered_df = messages_df.dropna(subset=["accession_number", "study_uid"], how="all")
+    for column in unique_columns:
+        filtered_df[column] = filtered_df[column].str.strip()
+    filtered_df = filtered_df[
+        ~(filtered_df["accession_number"].eq("") & filtered_df["study_uid"].eq(""))
+    ]

-    messages_df = messages_df.sort_values(by=["project_name", "study_date"])
-    messages_df = messages_df.drop_duplicates(
-        subset=["project_name", "mrn", "accession_number", "study_date"]
-    )
+    filtered_df = filtered_df.sort_values(by=["project_name", "study_date"])
+    filtered_df = filtered_df.drop_duplicates(subset=unique_columns)

-    if len(messages_df) == 0:
+    if len(filtered_df) == 0:
msg = f"Failed to find any messages in {resources_path}"
raise ValueError(msg)

logger.info("Created {} messages from {}", len(messages_df), resources_path)
logger.info("Created {} messages from {}", len(filtered_df), resources_path)

-    return messages_df
+    return filtered_df


def _load_csv(filepath: Path) -> pd.DataFrame:
@@ -168,7 +174,6 @@ class DF_COLUMNS(StrEnum): # noqa: N801
"participant_id": "pseudo_patient_id",
}


MAP_PARQUET_TO_MESSAGE_KEYS = {
"PrimaryMrn": "mrn",
"AccessionNumber": "accession_number",
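The tidy-up added to `read_patient_info` can be exercised in isolation with a toy DataFrame. The sketch below mirrors the new lines (column names from the diff; sample rows invented, echoing the new test fixture) and should leave a single cleaned row.

```python
import pandas as pd

# Hypothetical rows: whitespace around values, an exact duplicate, and two rows
# whose image identifiers are blank or missing.
df = pd.DataFrame(
    {
        "project_name": [" p ", " p ", "p", "p"],
        "mrn": [" m1 ", " m1 ", "m2", "m3"],
        "accession_number": [" 123 ", " 123 ", "   ", None],
        "study_uid": [" 1.2.3 ", " 1.2.3 ", "   ", None],
        "study_date": ["2022-01-01"] * 4,
    }
)

unique_columns = ["project_name", "mrn", "accession_number", "study_uid"]

# Drop rows where both identifiers are missing, then strip surrounding whitespace.
filtered = df.dropna(subset=["accession_number", "study_uid"], how="all")
for column in unique_columns:
    filtered[column] = filtered[column].str.strip()

# Remove rows whose identifiers are empty after stripping, then de-duplicate.
filtered = filtered[~(filtered["accession_number"].eq("") & filtered["study_uid"].eq(""))]
filtered = filtered.sort_values(by=["project_name", "study_date"])
filtered = filtered.drop_duplicates(subset=unique_columns)

print(filtered)  # one cleaned row survives
```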
26 changes: 26 additions & 0 deletions cli/tests/test_messages_from_files.py
@@ -55,6 +55,32 @@ def test_messages_from_csv(omop_resources: Path) -> None:
assert messages == expected_messages


def test_whitespace_and_na_processing(omop_resources: Path) -> None:
"""
    GIVEN a csv with leading and trailing whitespace, a duplicate entry,
    and rows with no image identifiers (empty or whitespace-only).
    WHEN the messages are generated from the file
    THEN a single message should be generated, with no leading or trailing whitespace
"""
# Arrange
test_csv = omop_resources / "test_whitespace_and_na_processing.csv"
messages_df = read_patient_info(test_csv)
# Act
messages = messages_from_df(messages_df)
# Assert
assert messages == [
Message(
procedure_occurrence_id=0,
mrn="patient_identifier",
accession_number="123456789",
study_uid="1.2.3.4.5.6.7.8",
project_name="ms-pinpoint-test",
extract_generated_timestamp=datetime.datetime.fromisoformat("2023-01-01T00:01:00Z"),
study_date=datetime.date.fromisoformat("2022-01-01"),
),
]


def test_messages_from_csv_multiple_projects(
omop_resources: Path, rows_in_session, mock_publisher
) -> None:
5 changes: 5 additions & 0 deletions test_whitespace_and_na_processing.csv
@@ -0,0 +1,5 @@
procedure_id,mrn,accession_number,project_name,extract_generated_timestamp,study_date,study_uid,participant_id
0, patient_identifier , 123456789 , ms-pinpoint-test ,2023-01-01T00:01:00Z,2022-01-01, 1.2.3.4.5.6.7.8 ,
1, patient_identifier , 123456789 , ms-pinpoint-test ,2023-01-01T00:01:00Z,2022-01-01, 1.2.3.4.5.6.7.8 ,
2, whitespace_identifiers , ,ms-pinpoint-test,2023-01-01T00:01:00Z,2022-01-01, ,
3, NA_identifiers ,,ms-pinpoint-test,2023-01-01T00:01:00Z,2022-01-01,,
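As a rough trace of the fixture above (a sketch, not part of the commit: the fixture's on-disk path and the final assertions are assumptions based on the test earlier in this diff), reading it back through `read_patient_info` should leave exactly one cleaned row.

```python
from pathlib import Path

from pixl_cli._io import read_patient_info

# Assumed location of the new fixture; the repository's actual resources directory may differ.
csv_path = Path("cli/tests/resources/omop/test_whitespace_and_na_processing.csv")

cleaned = read_patient_info(csv_path)

# Rows 0 and 1 collapse into one row once whitespace is stripped and duplicates dropped;
# rows 2 and 3 are removed because their image identifiers are blank or missing.
assert len(cleaned) == 1
assert cleaned.iloc[0]["accession_number"] == "123456789"
```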
