Strip whitespace and remove empty identifiers (#586)
* Strip whitespace and remove empty identifiers
stefpiatek authored Jan 7, 2025
1 parent 807f84b commit ecd5abf
Showing 4 changed files with 48 additions and 12 deletions.
8 changes: 4 additions & 4 deletions cli/src/pixl_cli/_database.py
@@ -89,8 +89,8 @@ def _filter_existing_images(
) -> pd.DataFrame:
    # DataFrame indices must match when using df.isin (or df.index.isin)
# So we re-index the DataFrames to match on the columns we want to compare
-    messages_df_reindexed = messages_df.set_index(["accession_number", "mrn", "study_date"])
-    images_df_reindexed = images_df.set_index(["accession_number", "mrn", "study_date"])
+    messages_df_reindexed = messages_df.set_index(["accession_number", "mrn", "study_uid"])
+    images_df_reindexed = images_df.set_index(["accession_number", "mrn", "study_uid"])
keep_indices = ~messages_df_reindexed.index.isin(images_df_reindexed.index)
return messages_df[keep_indices]

@@ -101,7 +101,7 @@ def _filter_exported_messages(
) -> pd.DataFrame:
merged = messages_df.merge(
images_df,
on=["accession_number", "mrn", "study_date"],
on=["accession_number", "mrn", "study_uid"],
how="left",
validate="one_to_one",
suffixes=(None, None),
@@ -131,7 +131,7 @@ def all_images_for_project(project_slug: str) -> pd.DataFrame:
PixlSession = sessionmaker(engine)

query = (
-        select(Image.accession_number, Image.study_date, Image.mrn, Image.exported_at)
+        select(Image.accession_number, Image.study_uid, Image.mrn, Image.exported_at)
.join(Extract)
.where(Extract.slug == project_slug)
)
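For readers unfamiliar with the pattern, here is a minimal sketch of the reindex-and-isin filter used in `_filter_existing_images`, now keyed on `study_uid` instead of `study_date`. Only the column names come from the diff; the sample rows and variable names are invented for illustration.

```python
import pandas as pd

# Hypothetical sample data: two pending messages, one image already in the database.
messages_df = pd.DataFrame(
    {
        "accession_number": ["A1", "A2"],
        "mrn": ["111", "222"],
        "study_uid": ["1.2.3", "4.5.6"],
        "study_date": ["2022-01-01", "2022-01-02"],
    }
)
images_df = pd.DataFrame(
    {"accession_number": ["A1"], "mrn": ["111"], "study_uid": ["1.2.3"]}
)

# df.index.isin compares like-for-like, so both frames are indexed on the same key columns.
key_columns = ["accession_number", "mrn", "study_uid"]
messages_idx = messages_df.set_index(key_columns)
images_idx = images_df.set_index(key_columns)

keep = ~messages_idx.index.isin(images_idx.index)
print(messages_df[keep])  # only the "A2" row remains, as it has no existing image
```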
21 changes: 13 additions & 8 deletions cli/src/pixl_cli/_io.py
@@ -64,19 +64,25 @@ def read_patient_info(resources_path: Path) -> pd.DataFrame:
messages_df = _load_csv(resources_path)
else:
messages_df = _load_parquet(resources_path)
+    # Tidy up dataframe in case of whitespace or no way to identify images
+    unique_columns = ["project_name", "mrn", "accession_number", "study_uid"]
+    filtered_df = messages_df.dropna(subset=["accession_number", "study_uid"], how="all")
+    for column in unique_columns:
+        filtered_df[column] = filtered_df[column].str.strip()
+    filtered_df = filtered_df[
+        ~(filtered_df["accession_number"].eq("") & filtered_df["study_uid"].eq(""))
+    ]

-    messages_df = messages_df.sort_values(by=["project_name", "study_date"])
-    messages_df = messages_df.drop_duplicates(
-        subset=["project_name", "mrn", "accession_number", "study_date"]
-    )
+    filtered_df = filtered_df.sort_values(by=["project_name", "study_date"])
+    filtered_df = filtered_df.drop_duplicates(subset=unique_columns)

-    if len(messages_df) == 0:
+    if len(filtered_df) == 0:
msg = f"Failed to find any messages in {resources_path}"
raise ValueError(msg)

logger.info("Created {} messages from {}", len(messages_df), resources_path)
logger.info("Created {} messages from {}", len(filtered_df), resources_path)

-    return messages_df
+    return filtered_df


def _load_csv(filepath: Path) -> pd.DataFrame:
@@ -168,7 +174,6 @@ class DF_COLUMNS(StrEnum): # noqa: N801
"participant_id": "pseudo_patient_id",
}


MAP_PARQUET_TO_MESSAGE_KEYS = {
"PrimaryMrn": "mrn",
"AccessionNumber": "accession_number",
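The tidy-up added to `read_patient_info` can be exercised in isolation with a toy DataFrame. The sketch below mirrors the new lines (column names from the diff; sample rows invented, echoing the new test fixture) and should leave a single cleaned row.

```python
import pandas as pd

# Hypothetical rows: whitespace around values, an exact duplicate, and two rows
# whose image identifiers are blank or missing.
df = pd.DataFrame(
    {
        "project_name": [" p ", " p ", "p", "p"],
        "mrn": [" m1 ", " m1 ", "m2", "m3"],
        "accession_number": [" 123 ", " 123 ", "   ", None],
        "study_uid": [" 1.2.3 ", " 1.2.3 ", "   ", None],
        "study_date": ["2022-01-01"] * 4,
    }
)

unique_columns = ["project_name", "mrn", "accession_number", "study_uid"]

# Drop rows where both identifiers are missing, then strip surrounding whitespace.
filtered = df.dropna(subset=["accession_number", "study_uid"], how="all")
for column in unique_columns:
    filtered[column] = filtered[column].str.strip()

# Remove rows whose identifiers are empty after stripping, then de-duplicate.
filtered = filtered[~(filtered["accession_number"].eq("") & filtered["study_uid"].eq(""))]
filtered = filtered.sort_values(by=["project_name", "study_date"])
filtered = filtered.drop_duplicates(subset=unique_columns)

print(filtered)  # one cleaned row survives
```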
26 changes: 26 additions & 0 deletions cli/tests/test_messages_from_files.py
@@ -55,6 +55,32 @@ def test_messages_from_csv(omop_resources: Path) -> None:
assert messages == expected_messages


def test_whitespace_and_na_processing(omop_resources: Path) -> None:
"""
    GIVEN a csv with leading and trailing whitespace, a duplicate entry,
    and rows with no image identifiers (empty or whitespace-only).
    WHEN the messages are generated from the file
    THEN a single message should be generated, with no leading or trailing whitespace
"""
# Arrange
test_csv = omop_resources / "test_whitespace_and_na_processing.csv"
messages_df = read_patient_info(test_csv)
# Act
messages = messages_from_df(messages_df)
# Assert
assert messages == [
Message(
procedure_occurrence_id=0,
mrn="patient_identifier",
accession_number="123456789",
study_uid="1.2.3.4.5.6.7.8",
project_name="ms-pinpoint-test",
extract_generated_timestamp=datetime.datetime.fromisoformat("2023-01-01T00:01:00Z"),
study_date=datetime.date.fromisoformat("2022-01-01"),
),
]


def test_messages_from_csv_multiple_projects(
omop_resources: Path, rows_in_session, mock_publisher
) -> None:
5 changes: 5 additions & 0 deletions test_whitespace_and_na_processing.csv
@@ -0,0 +1,5 @@
procedure_id,mrn,accession_number,project_name,extract_generated_timestamp,study_date,study_uid,participant_id
0, patient_identifier , 123456789 , ms-pinpoint-test ,2023-01-01T00:01:00Z,2022-01-01, 1.2.3.4.5.6.7.8 ,
1, patient_identifier , 123456789 , ms-pinpoint-test ,2023-01-01T00:01:00Z,2022-01-01, 1.2.3.4.5.6.7.8 ,
2, whitespace_identifiers , ,ms-pinpoint-test,2023-01-01T00:01:00Z,2022-01-01, ,
3, NA_identifiers ,,ms-pinpoint-test,2023-01-01T00:01:00Z,2022-01-01,,
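As a rough trace of the fixture above (a sketch, not part of the commit: the fixture's on-disk path and the final assertions are assumptions based on the test earlier in this diff), reading it back through `read_patient_info` should leave exactly one cleaned row.

```python
from pathlib import Path

from pixl_cli._io import read_patient_info

# Assumed location of the new fixture; the repository's actual resources directory may differ.
csv_path = Path("cli/tests/resources/omop/test_whitespace_and_na_processing.csv")

cleaned = read_patient_info(csv_path)

# Rows 0 and 1 collapse into one row once whitespace is stripped and duplicates dropped;
# rows 2 and 3 are removed because their image identifiers are blank or missing.
assert len(cleaned) == 1
assert cleaned.iloc[0]["accession_number"] == "123456789"
```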
