From a45532dc284bfc0af3a2a59a13212b3a64336e59 Mon Sep 17 00:00:00 2001 From: Chao Pang Date: Wed, 9 Oct 2024 12:09:18 -0400 Subject: [PATCH] moved the logic for generating PatientBlocks to the corresponding conversion rules --- .../meds_to_cehrbert_base.py | 25 +++- .../meds_to_cehrbert_micmic4.py | 58 +++++++- .../meds_to_cehrbert_omop.py | 56 ++++++- .../hf_data_generator/meds_utils.py | 140 ++++++++++++------ 4 files changed, 227 insertions(+), 52 deletions(-) diff --git a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_base.py b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_base.py index d1c034d3..1a37dc66 100644 --- a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_base.py +++ b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_base.py @@ -1,7 +1,12 @@ import re from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional +from datetime import datetime +from typing import List, Optional, Tuple + +import meds_reader + +from cehrbert.data_generators.hf_data_generator.meds_utils import PatientBlock, PatientDemographics  # NOTE(review): meds_utils imports MedsToCehrBertConversion from this module, so this import is circular; confirm both modules still import cleanly @dataclass @@ -69,6 +74,24 @@ def __init__(self): self._discharge_matching_rules = self._create_discharge_matching_rules() self._text_event_numeric_event_map = {r.code: r for r in self._create_text_event_to_numeric_event_rules()} + @abstractmethod + def generate_demographics_and_patient_blocks( + self, patient: meds_reader.Subject, prediction_time: datetime = None + ) -> Tuple[PatientDemographics, List[PatientBlock]]: + """ + Abstract method for generating demographics and a list of patient blocks from a meds_reader Subject. 
+ + Args: + patient: The meds_reader Subject whose events are converted. + prediction_time: Optional cutoff time; the MIMIC-IV and OMOP implementations stop at the first event timestamped after it. + + Returns: + Tuple[PatientDemographics, List[PatientBlock]] + """ + raise NotImplementedError( + "Must implement the method for generating the patient blocks from a meds_reader Subject" + ) + @abstractmethod def _create_ed_admission_matching_rules(self) -> List[str]: """ diff --git a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_micmic4.py b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_micmic4.py index 9e8a57dd..e5eee68e 100644 --- a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_micmic4.py +++ b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_micmic4.py @@ -1,14 +1,70 @@ import re -from typing import List +from datetime import datetime +from typing import List, Tuple +import meds_reader + +from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import birth_codes from cehrbert.data_generators.hf_data_generator.meds_to_cehrbert_conversion_rules.meds_to_cehrbert_base import ( EventConversionRule, MedsToCehrBertConversion, ) +from cehrbert.data_generators.hf_data_generator.meds_utils import PatientBlock, PatientDemographics class MedsToBertMimic4(MedsToCehrBertConversion): + def __init__(self, default_visit_id): + super().__init__() + self.default_visit_id = default_visit_id + + def generate_demographics_and_patient_blocks( + self, patient: meds_reader.Subject, prediction_time: datetime = None + ) -> Tuple[PatientDemographics, List[PatientBlock]]: + """Collect demographics and group timestamped events into one PatientBlock per run of same-day events.""" + birth_datetime = None + race = None + gender = None + ethnicity = None + + visit_id = self.default_visit_id + 
current_date = None + events_for_current_date = [] + patient_blocks = [] + for e in patient.events: + + # Stop at the first event timestamped after the prediction time; nothing later is read + if prediction_time is not None and e.time is not None: + if e.time > 
prediction_time: + break + + # Demographic events are recorded by code and are never added to a block + if e.code in birth_codes: + birth_datetime = e.time + elif e.code.upper().startswith("RACE"): + race = e.code + elif e.code.upper().startswith("GENDER"): + gender = e.code + elif e.code.upper().startswith("ETHNICITY"): + ethnicity = e.code + elif e.time is not None: + if not current_date: + current_date = e.time + + if current_date.date() == e.time.date(): + events_for_current_date.append(e) + else: + patient_blocks.append(PatientBlock(events_for_current_date, visit_id, self)) + events_for_current_date = [e] + current_date = e.time + visit_id += 1  # the block started by this event gets the next visit id + + if events_for_current_date: + patient_blocks.append(PatientBlock(events_for_current_date, visit_id, self)) + + demographics = PatientDemographics(birth_datetime=birth_datetime, race=race, gender=gender, ethnicity=ethnicity) + return demographics, patient_blocks + def _create_ed_admission_matching_rules(self) -> List[str]: return ["ED_REGISTRATION//", "TRANSFER_TO//ED"] diff --git a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_omop.py b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_omop.py index 1a94b261..c2570088 100644 --- a/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_omop.py +++ b/src/cehrbert/data_generators/hf_data_generator/meds_to_cehrbert_conversion_rules/meds_to_cehrbert_omop.py @@ -1,13 +1,67 @@ -from typing import List +from datetime import datetime +from typing import List, Tuple +import meds_reader + +from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import birth_codes from cehrbert.data_generators.hf_data_generator.meds_to_cehrbert_conversion_rules.meds_to_cehrbert_base import ( EventConversionRule, MedsToCehrBertConversion, ) +from 
cehrbert.data_generators.hf_data_generator.meds_utils import PatientBlock, PatientDemographics class 
MedsToCehrbertOMOP(MedsToCehrBertConversion): + def generate_demographics_and_patient_blocks( + self, patient: meds_reader.Subject, prediction_time: datetime = None + ) -> Tuple[PatientDemographics, List[PatientBlock]]: + """Collect demographics and group timestamped events into one PatientBlock per run of same-day events.""" + birth_datetime = None + race = None + gender = None + ethnicity = None + + current_visit_id = None + current_date = None + events_for_current_date = [] + patient_blocks = [] + for e in patient.events: + + # Stop at the first event timestamped after the prediction time; nothing later is read + if prediction_time is not None and e.time is not None: + if e.time > prediction_time: + break + + # Keep the first truthy visit_id seen; NOTE(review): it is never updated afterwards, so all later blocks share it - confirm this is intended + if not current_visit_id: + current_visit_id = e.visit_id if hasattr(e, "visit_id") else None + + # Demographic events are recorded by code and are never added to a block + if e.code in birth_codes: + birth_datetime = e.time + elif e.code.upper().startswith("RACE"): + race = e.code + elif e.code.upper().startswith("GENDER"): + gender = e.code + elif e.code.upper().startswith("ETHNICITY"): + ethnicity = e.code + elif e.time is not None: + if not current_date: + current_date = e.time + if current_date.date() == e.time.date(): + events_for_current_date.append(e) + else: + patient_blocks.append(PatientBlock(events_for_current_date, current_visit_id, self)) + events_for_current_date = [e] + current_date = e.time + + if events_for_current_date: + patient_blocks.append(PatientBlock(events_for_current_date, current_visit_id, self)) + + demographics = PatientDemographics(birth_datetime=birth_datetime, race=race, gender=gender, ethnicity=ethnicity) + return demographics, patient_blocks + def _create_ed_admission_matching_rules(self) -> List[str]: return ["Visit/ER"] diff --git 
a/src/cehrbert/data_generators/hf_data_generator/meds_utils.py b/src/cehrbert/data_generators/hf_data_generator/meds_utils.py index ce64a00f..f277a866 100644 --- a/src/cehrbert/data_generators/hf_data_generator/meds_utils.py +++ b/src/cehrbert/data_generators/hf_data_generator/meds_utils.py @@ -2,6 +2,7 @@ 
import functools import os import re +from dataclasses import dataclass from datetime import datetime from typing import Dict, Iterable, List, Optional, Tuple, Union @@ -15,6 +16,9 @@ from cehrbert.data_generators.hf_data_generator.meds_to_cehrbert_conversion_rules.meds_to_cehrbert_base import ( MedsToCehrBertConversion, ) +from cehrbert.data_generators.hf_data_generator.meds_to_cehrbert_conversion_rules.meds_to_cehrbert_omop import (  # NOTE(review): that module imports PatientBlock and PatientDemographics from this one, so this import is circular; confirm it resolves + MedsToCehrbertOMOP, +) from cehrbert.med_extension.schema_extension import CehrBertPatient, Event, Visit from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments, MedsToCehrBertConversionType @@ -49,6 +53,14 @@ def get_subject_split(meds_reader_db_path: str) -> Dict[str, List[int]]: return result +@dataclass +class PatientDemographics: + birth_datetime: datetime = None  # time of the event whose code is in birth_codes; None if no such event was read + race: str = None  # raw event code starting with "RACE" (case-insensitive), if any + gender: str = None  # raw event code starting with "GENDER" (case-insensitive), if any + ethnicity: str = None  # raw event code starting with "ETHNICITY" (case-insensitive), if any + + class PatientBlock: """ Represents a block of medical events for a single patient visit, including. 
@@ -225,46 +237,74 @@ def convert_one_patient( prediction_time: datetime = None, label: Union[int, float] = None, ) -> CehrBertPatient: - birth_datetime = None - race = None - gender = None - ethnicity = None - - visit_id = default_visit_id - current_date = None - events_for_current_date = [] - patient_blocks = [] - for e in patient.events: - - # Skip out of the loop if the events's time stamps are beyond the prediction time - if prediction_time is not None and e.time is not None: - if e.time > prediction_time: - break - - # This indicates demographics features - if e.code in birth_codes: - birth_datetime = e.time - elif e.code.upper().startswith("RACE"): - race = e.code - elif e.code.upper().startswith("GENDER"): - gender = e.code - elif e.code.upper().startswith("ETHNICITY"): - ethnicity = e.code - elif e.time is not None: - if not current_date: - current_date = e.time - - if current_date.date() == e.time.date(): - events_for_current_date.append(e) - else: - patient_blocks.append(PatientBlock(events_for_current_date, visit_id, conversion)) - events_for_current_date = list() - events_for_current_date.append(e) - current_date = e.time - visit_id += 1 - - if events_for_current_date: - patient_blocks.append(PatientBlock(events_for_current_date, visit_id, conversion)) + """ + Converts a patient's event data into a CehrBertPatient object. + + Processes their medical history, visit details, and demographic information. + + Parameters: + ---------- + patient : meds_reader.Subject + The patient's event data, including time-stamped medical events such as + demographic data (race, gender, ethnicity) and clinical visits (ED admissions, + hospital admissions, discharges). + + conversion : MedsToCehrBertConversion + The conversion object to map and process medical event data into the format + required by CehrBert. + + default_visit_id : int, optional (default=1) + The starting ID for patient visits. 
NOTE(review): the visit-numbering loop that + consumed it was moved into the conversion rules; confirm it is still used here. 
+ + prediction_time : datetime, optional (default=None) + The cutoff time for processing events. Events occurring after this time are + ignored. + + label : Union[int, float], optional (default=None) + The prediction label associated with this patient, which could represent a + clinical outcome (e.g., survival or treatment response). + + Returns: + ------- + CehrBertPatient + An object containing the patient's transformed event data, visits, demographics, + and associated label in a structure compatible with CehrBert's input requirements. + + Description: + ----------- + This function processes a patient's medical history, including demographic + information (birth date, race, gender, and ethnicity) and visit details. It iterates + through the patient's events and groups them into visits (ED, admission, discharge). + Visits are formed based on timestamps, and certain logic is applied to merge ED visits + into hospital admissions if they occur within 24 hours of each other. + + For each event, demographic attributes like birth date, race, gender, and ethnicity + are extracted. If the event has a timestamp, it is compared with `prediction_time` to + filter out events that occurred after the specified time. + + The function handles ongoing (incomplete) visits and cases where multiple visits + should be merged (e.g., ED followed by hospital admission within 24 hours). After + processing the events, visits are built with details such as visit type, start/end + datetime, and events during the visit. + + The function returns a `CehrBertPatient` object that includes the patient's medical + events, structured into visits, along with demographic information, and optionally + a prediction label. 
+ + Example Usage: + ------------- + patient_data = convert_one_patient( + patient=some_patient_object, + conversion=some_conversion_object, + default_visit_id=1, + prediction_time=datetime.now(), + label=1 + ) + """ + demographics, patient_blocks = conversion.generate_demographics_and_patient_blocks(  # demographic extraction and event grouping are delegated to the conversion rule + patient=patient, prediction_time=prediction_time + ) admit_discharge_pairs = [] active_ed_index = None @@ -359,24 +399,26 @@ def convert_one_patient( ) ) age_at_index = -1 - if prediction_time is not None and birth_datetime is not None: - age_at_index = prediction_time.year - birth_datetime.year + if prediction_time is not None and demographics.birth_datetime is not None: + age_at_index = prediction_time.year - demographics.birth_datetime.year if (prediction_time.month, prediction_time.day) < ( - birth_datetime.month, - birth_datetime.day, + demographics.birth_datetime.month, + demographics.birth_datetime.day, ): age_at_index -= 1 # birth_datetime can not be None - assert birth_datetime is not None, f"patient_id: {patient.subject_id} does not have a valid birth_datetime" + assert (  # NOTE(review): assert statements are skipped under python -O; confirm that is acceptable for this data check + demographics.birth_datetime is not None + ), f"patient_id: {patient.subject_id} does not have a valid birth_datetime" return CehrBertPatient( patient_id=patient.subject_id, - birth_datetime=birth_datetime, + birth_datetime=demographics.birth_datetime, visits=visits, - race=race if race else UNKNOWN_VALUE, - gender=gender if gender else UNKNOWN_VALUE, - ethnicity=ethnicity if ethnicity else UNKNOWN_VALUE, + race=demographics.race if demographics.race else UNKNOWN_VALUE, + gender=demographics.gender if demographics.gender else UNKNOWN_VALUE, + ethnicity=demographics.ethnicity if demographics.ethnicity else UNKNOWN_VALUE, index_date=prediction_time, age_at_index=age_at_index, label=label,