From f43491e29eec0612f1c448e0591edd59358a2345 Mon Sep 17 00:00:00 2001 From: Chao Pang Date: Sat, 26 Oct 2024 22:20:06 -0400 Subject: [PATCH] Revert "Fix omop meds order (#68)" This reverts commit 25e2406fb507b42458e5376b5e2d916a9b0b4d1a. --- .../hf_data_generator/hf_dataset_mapping.py | 76 ++++++++----------- .../meds_to_cehrbert_base.py | 9 --- .../meds_to_cehrbert_micmic4.py | 3 - .../meds_to_cehrbert_omop.py | 3 - .../hf_data_generator/meds_utils.py | 8 +- .../hf_data_generator/patient_block.py | 35 +++++---- 6 files changed, 54 insertions(+), 80 deletions(-) diff --git a/src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py b/src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py index 2714e73..9d6f492 100644 --- a/src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py +++ b/src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py @@ -30,7 +30,6 @@ "Visit/61", "NUCC/315D00000X", ] -ED_VISIT_TYPE_CODES = ["VISIT/ER"] DISCHARGE_FACILITY_TYPES = [ "8536", "8863", @@ -174,7 +173,7 @@ def _update_cehrbert_record( mlm_skip_value: int = 0, unit: str = NA, ) -> None: - cehrbert_record["concept_ids"].append(replace_escape_chars(code)) + cehrbert_record["concept_ids"].append(code) cehrbert_record["visit_concept_orders"].append(visit_concept_order) cehrbert_record["ages"].append(age) cehrbert_record["dates"].append(date) @@ -239,11 +238,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: # We assume the first measurement to be the visit type of the current visit visit_type = visit["visit_type"] - is_er_or_inpatient = ( - visit_type in INPATIENT_VISIT_TYPES - or visit_type in INPATIENT_VISIT_TYPE_CODES - or visit_type in ED_VISIT_TYPE_CODES - ) + is_inpatient = visit_type in INPATIENT_VISIT_TYPES or visit_type in INPATIENT_VISIT_TYPE_CODES # Add artificial time tokens to the patient timeline if timedelta exists if time_delta: @@ -280,12 +275,39 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: visit_segment=visit_segment, visit_concept_id=visit_type, ) - # Keep track of the existing outpatient events, we don't want to add them again - existing_outpatient_events = list() + for e in events: # If the event doesn't have a time stamp, we skip it if not e["time"]: continue + # Add a medical token to the patient timeline + # If this is an inpatient visit, we use the event time stamps to calculate age and date + # because the patient can stay in the hospital for a period of time. + if is_inpatient: + # Calculate age using the event time stamp + age = relativedelta(e["time"], birth_datetime).years + # Calculate the week number since the epoch time + date = (e["time"] - datetime.datetime(year=1970, month=1, day=1)).days // 7 + else: + # For outpatient visits, we use the visit time stamp to calculate age and time because we assume + # the outpatient visits start and end on the same day + pass + + # Calculate the time diff in days w.r.t the previous measurement + meas_time_diff = (e["time"] - date_cursor).days + # Update the date_cursor if the time diff between two neighboring measurements is greater than and + # equal to 1 day + if meas_time_diff > 0: + date_cursor = e["time"] + if self._inpatient_time_token_function: + # This generates an artificial time token depending on the choice of the time token functions + self._update_cehrbert_record( + cehrbert_record, + code=f"i-{self._inpatient_time_token_function(meas_time_diff)}", + visit_concept_order=i + 1, + visit_segment=visit_segment, + visit_concept_id=visit_type, + ) # If numeric_value exists, this is a concept/value tuple, we indicate this using a concept_value_mask numeric_value = e.get("numeric_value", None) @@ -294,7 +316,6 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: concept_value_mask = int(numeric_value is not None) concept_value = numeric_value if concept_value_mask == 1 else -1.0 code = replace_escape_chars(e["code"]) - # If the value mask is 1, this indicates a numeric value associated with the concept if concept_value_mask != 1: # Otherwise we will try to concatenate the answer with the code if the categorical value is provide @@ -303,37 +324,6 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: text_value_replaced = replace_escape_chars(text_value) code = f"{code}//option:{text_value_replaced}" - # Add a medical token to the patient timeline - # If this is an inpatient visit, we use the event time stamps to calculate age and date - # because the patient can stay in the hospital for a period of time. - if is_er_or_inpatient: - # Calculate age using the event time stamp - age = relativedelta(e["time"], birth_datetime).years - # Calculate the week number since the epoch time - date = (e["time"] - datetime.datetime(year=1970, month=1, day=1)).days // 7 - # Calculate the time diff in days w.r.t the previous measurement - meas_time_diff = (e["time"] - date_cursor).days - # Update the date_cursor if the time diff between two neighboring measurements is greater than and - # equal to 1 day - if meas_time_diff > 0: - date_cursor = e["time"] - if self._inpatient_time_token_function: - # This generates an artificial time token depending on the choice of the time token functions - self._update_cehrbert_record( - cehrbert_record, - code=f"i-{self._inpatient_time_token_function(meas_time_diff)}", - visit_concept_order=i + 1, - visit_segment=visit_segment, - visit_concept_id=visit_type, - ) - else: - # For outpatient visits, we use the visit time stamp to calculate age and time because we assume - # the outpatient visits start and end on the same day. - # We check whether the date/code/value combination already exists in the existing events - # If they exist, we do not add them to the patient timeline for outpatient visits. - if (date, code, concept_value) in existing_outpatient_events: - continue - self._update_cehrbert_record( cehrbert_record, code=code, @@ -347,10 +337,8 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: unit=unit, mlm_skip_value=concept_value_mask, ) - existing_outpatient_events.append((date, code, concept_value)) - # For inpatient or ER visits, we want to discharge_facility to the end of the visit - if is_er_or_inpatient: + if is_inpatient: # If visit_end_datetime is populated for the inpatient visit, we update the date_cursor visit_end_datetime = visit.get("visit_end_datetime", None) if visit_end_datetime: diff --git a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_base.py b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_base.py index b0f0e51..e6579b1 100644 --- a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_base.py +++ b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_base.py @@ -69,12 +69,6 @@ def __init__(self, **kwargs): self._discharge_matching_rules = self._create_discharge_matching_rules() self._text_event_numeric_event_map = {r.code: r for r in self._create_text_event_to_numeric_event_rules()} - @abstractmethod - def _create_visit_matching_rules(self) -> List[str]: - raise NotImplementedError( - "Must implement the matching rules for identifying the visits other than ED/admission" - ) - @abstractmethod def _create_ed_admission_matching_rules(self) -> List[str]: """ @@ -128,9 +122,6 @@ def _create_text_event_to_numeric_event_rules(self) -> List[EventConversionRule] """ raise NotImplementedError("Must implement the event mapping rules for converting text events to numeric events") - def get_other_visit_matching_rules(self) -> List[str]: - return self._create_visit_matching_rules() - def get_ed_admission_matching_rules(self) -> List[str]: return self._ed_admission_matching_rules diff --git a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_micmic4.py b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_micmic4.py index 5bb39ad..c78f546 100644 --- a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_micmic4.py +++ b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_micmic4.py @@ -13,9 +13,6 @@ def __init__(self, default_visit_id, **kwargs): super().__init__(**kwargs) self.default_visit_id = default_visit_id - def _create_visit_matching_rules(self) -> List[str]: - return [] - def _create_ed_admission_matching_rules(self) -> List[str]: return ["ED_REGISTRATION//", "TRANSFER_TO//ED"] diff --git a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_omop.py b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_omop.py index ffe7f36..1a94b26 100644 --- a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_omop.py +++ b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_omop.py @@ -8,9 +8,6 @@ class MedsToCehrbertOMOP(MedsToCehrBertConversion): - def _create_visit_matching_rules(self) -> List[str]: - return ["Visit/"] - def _create_ed_admission_matching_rules(self) -> List[str]: return ["Visit/ER"] diff --git a/src/cehrbert/data_generators/hf_data_generator/meds_utils.py b/src/cehrbert/data_generators/hf_data_generator/meds_utils.py index 149a6dc..6f0e281 100644 --- a/src/cehrbert/data_generators/hf_data_generator/meds_utils.py +++ b/src/cehrbert/data_generators/hf_data_generator/meds_utils.py @@ -10,11 +10,7 @@ from datasets import Dataset, DatasetDict, Split from transformers.utils import logging -from cehrbert.data_generators.hf_data_generator import ( - DEFAULT_ED_CONCEPT_ID, - DEFAULT_INPATIENT_CONCEPT_ID, - UNKNOWN_VALUE, -) +from cehrbert.data_generators.hf_data_generator import DEFAULT_INPATIENT_CONCEPT_ID, UNKNOWN_VALUE from cehrbert.data_generators.hf_data_generator.hf_dataset import apply_cehrbert_dataset_mapping from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import MedToCehrBertDatasetMapping from cehrbert.data_generators.hf_data_generator.meds_to_cehrbert_conversion_rules import MedsToCehrBertConversion @@ -136,7 +132,7 @@ def convert_one_patient( visit_end_datetime = max([b.max_time for b in blocks]) discharge_facility = ( next(filter(None, [b.get_discharge_facility() for b in blocks]), None) - if visit_type in [DEFAULT_INPATIENT_CONCEPT_ID, DEFAULT_ED_CONCEPT_ID] + if visit_type == DEFAULT_INPATIENT_CONCEPT_ID else None ) visit_events = list() diff --git a/src/cehrbert/data_generators/hf_data_generator/patient_block.py b/src/cehrbert/data_generators/hf_data_generator/patient_block.py index 837127e..6fa4560 100644 --- a/src/cehrbert/data_generators/hf_data_generator/patient_block.py +++ b/src/cehrbert/data_generators/hf_data_generator/patient_block.py @@ -73,8 +73,7 @@ def __init__( # Cache these variables so we don't need to compute self.has_ed_admission = self._has_ed_admission() self.has_admission = self._has_admission() - self.discharged_to = self.get_discharge_facility() - self.has_discharge = self.discharged_to is not None + self.has_discharge = self._has_discharge() # Infer the visit_type from the events # Admission takes precedence over ED @@ -83,14 +82,7 @@ def __init__( elif self.has_ed_admission: self.visit_type = DEFAULT_ED_CONCEPT_ID else: - self.visit_type = self._infer_visit_type() - - def _infer_visit_type(self) -> str: - for event in self.events: - for matching_rule in self.conversion.get_other_visit_matching_rules(): - if re.match(matching_rule, event.code): - return event.code - return DEFAULT_OUTPATIENT_CONCEPT_ID + self.visit_type = DEFAULT_OUTPATIENT_CONCEPT_ID def _has_ed_admission(self) -> bool: """ @@ -118,7 +110,7 @@ def _has_admission(self) -> bool: return True return False - def get_discharge_facility(self) -> Optional[str]: + def _has_discharge(self) -> bool: """ Determines if the visit includes a discharge event. @@ -128,7 +120,23 @@ def get_discharge_facility(self) -> Optional[str]: for event in self.events: for matching_rule in self.conversion.get_discharge_matching_rules(): if re.match(matching_rule, event.code): - return event.code + return True + return False + + def get_discharge_facility(self) -> Optional[str]: + """ + Extracts the discharge facility code from the discharge event, if present. + + Returns: + Optional[str]: The sanitized discharge facility code, or None if no discharge event is found. + """ + if self._has_discharge(): + for event in self.events: + for matching_rule in self.conversion.get_discharge_matching_rules(): + if matching_rule in event.code: + discharge_facility = event.code.replace(matching_rule, "") + discharge_facility = re.sub(r"[^a-zA-Z]", "_", discharge_facility) + return discharge_facility return None def _convert_event(self, event) -> List[Event]: @@ -188,9 +196,6 @@ def get_meds_events(self) -> Iterable[Event]: """ events = [] for e in self.events: - # We only convert the events that are not visit type and discharge facility events - if (e.code == self.visit_type) or (self.discharged_to is not None and e.code == self.discharged_to): - continue events.extend(self._convert_event(e)) return events