Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert OMOP MEDS data to CEHR-BERT data by linking events by visit_id #59

Merged
merged 16 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
61fb30a
updated the regex for identifying Race/Gender/Ethnicity
ChaoPang Oct 8, 2024
882a1ec
added a new src/cehrbert/data_generators/hf_data_generator/meds_to_ce…
ChaoPang Oct 9, 2024
d5793c6
moved the logic for generating PatientBlocks to the corresponding con…
ChaoPang Oct 9, 2024
b1c09fd
moved the MIMIC MEDS to cehrbert logic to meds_to_cehrbert_micmic4.py
ChaoPang Oct 10, 2024
5fcbb2c
completed the meds_to_cehrbert_omop.py logic for converting the OMOP …
ChaoPang Oct 10, 2024
d954916
moved generate_demographics_and_patient_blocks logic to patient_block.py
ChaoPang Oct 10, 2024
fd982d5
fixed the circular imports for the meds_to_cehrbert_omop conversion rule
ChaoPang Oct 10, 2024
d5e6a7f
added a helper function to remove the trailing slashes from the folde…
ChaoPang Oct 10, 2024
fb578ab
added a unit-test to test the OMOP meds to cehrbert inputs conversion…
ChaoPang Oct 10, 2024
44c2cff
added a check to ensure unit is set to the default value N/A when the…
ChaoPang Oct 11, 2024
4d234b0
added a filter to remove patient records without any domain events
ChaoPang Oct 11, 2024
4495f67
added logic for removing records that don't have any clinical events
ChaoPang Oct 11, 2024
8c40a27
added filter to remove the short sequences from the training/validati…
ChaoPang Oct 11, 2024
3375240
added num_of_concepts and num_of_visits to the saved pretraining data…
ChaoPang Oct 11, 2024
fd702d5
added logging information for MEDS_reader dataset related arguments
ChaoPang Oct 11, 2024
37b5d1d
removed the data filter from _create_cehrbert_data_from_meds
ChaoPang Oct 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/cehrbert/data_generators/hf_data_generator/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Default visit concept codes, one per visit category (inpatient, outpatient,
# emergency department).
DEFAULT_INPATIENT_CONCEPT_ID = "Visit/IP"
DEFAULT_OUTPATIENT_CONCEPT_ID = "Visit/OP"
DEFAULT_ED_CONCEPT_ID = "Visit/ER"

# Placeholder string; presumably substituted where a value is missing --
# TODO confirm against the callers that import it.
UNKNOWN_VALUE = "Unknown"
19 changes: 19 additions & 0 deletions src/cehrbert/data_generators/hf_data_generator/hf_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
"concept_values",
"concept_value_masks",
"mlm_skip_values",
"num_of_concepts",
"num_of_visits",
]

TRANSFORMER_COLUMNS = ["input_ids", "labels"]
Expand All @@ -34,6 +36,15 @@ def create_cehrbert_pretraining_dataset(
data_args: DataTrainingArguments,
) -> Dataset:
required_columns = TRANSFORMER_COLUMNS + CEHRBERT_COLUMNS

# Remove patients without any records
dataset = dataset.filter(
lambda batch: [num_of_concepts > 0 for num_of_concepts in batch["num_of_concepts"]],
num_proc=data_args.preprocessing_num_workers if not data_args.streaming else None,
batched=True,
batch_size=data_args.preprocessing_batch_size,
)

# If the data is already in meds, we don't need to sort the sequence anymore
if data_args.is_data_in_med:
mapping_functions = [HFTokenizationMapping(concept_tokenizer, True)]
Expand Down Expand Up @@ -70,6 +81,14 @@ def create_cehrbert_finetuning_dataset(
) -> Dataset:
required_columns = TRANSFORMER_COLUMNS + CEHRBERT_COLUMNS + FINETUNING_COLUMNS

# Remove patients without any records
dataset = dataset.filter(
lambda batch: [num_of_concepts > 0 for num_of_concepts in batch["num_of_concepts"]],
num_proc=data_args.preprocessing_num_workers if not data_args.streaming else None,
batched=True,
batch_size=data_args.preprocessing_batch_size,
)

if data_args.is_data_in_med:
mapping_functions = [
HFFineTuningMapping(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,8 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:

# If numeric_value exists, this is a concept/value tuple, we indicate this using a concept_value_mask
numeric_value = e.get("numeric_value", None)
unit = e.get("unit", NA)
# The unit might be populated with a None value
unit = e.get("unit", NA) if e.get("unit", NA) else NA
concept_value_mask = int(numeric_value is not None)
concept_value = numeric_value if concept_value_mask == 1 else -1.0
code = replace_escape_chars(e["code"])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .meds_to_cehrbert_base import MedsToCehrBertConversion
from .meds_to_cehrbert_micmic4 import MedsToBertMimic4
from .meds_to_cehrbert_omop import MedsToCehrbertOMOP
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class MedsToCehrBertConversion(ABC):
or None if no rule exists.
"""

def __init__(self):
def __init__(self, **kwargs):
"""
Initializes the MedsToCehrBertConversion class by caching the matching rules and.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@

class MedsToBertMimic4(MedsToCehrBertConversion):

def __init__(self, default_visit_id, **kwargs):
# Forward any extra keyword arguments to the base-class initializer, then
# keep the caller-supplied default_visit_id on the instance.
# NOTE(review): how default_visit_id is consumed is not visible in this hunk;
# presumably it is the visit type used when none can be inferred -- confirm.
super().__init__(**kwargs)
self.default_visit_id = default_visit_id

def _create_ed_admission_matching_rules(self) -> List[str]:
return ["ED_REGISTRATION//", "TRANSFER_TO//ED"]

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import List

from cehrbert.data_generators.hf_data_generator.meds_to_cehrbert_conversion_rules.meds_to_cehrbert_base import (
EventConversionRule,
MedsToCehrBertConversion,
)


class MedsToCehrbertOMOP(MedsToCehrBertConversion):
    """Matching rules for converting OMOP-derived MEDS events to CEHR-BERT inputs.

    Each hook returns a fresh list of code strings (or conversion rules); how
    the lists are consumed is defined by ``MedsToCehrBertConversion``.
    """

    def _create_ed_admission_matching_rules(self) -> List[str]:
        """Return the codes that mark an emergency-department admission."""
        ed_admission_codes = ["Visit/ER"]
        return ed_admission_codes

    def _create_admission_matching_rules(self) -> List[str]:
        """Return the codes that mark an inpatient admission."""
        # NOTE(review): "CMS Place of Service/51" and ".../61" are also listed
        # in the discharge rules below -- confirm the overlap is intended.
        admission_codes = [
            "Visit/IP",
            "Visit/ERIP",
            "CMS Place of Service/51",
            "CMS Place of Service/61",
        ]
        return admission_codes

    def _create_discharge_matching_rules(self) -> List[str]:
        """Return the codes that mark a discharge event.

        The order of the entries is kept exactly as originally authored.
        """
        discharge_codes = [
            "PCORNet/Generic-NI",
            "CMS Place of Service/12",
            "SNOMED/371827001",
            "CMS Place of Service/21",
            "NUCC/261Q00000X",
            "CMS Place of Service/31",
            "SNOMED/397709008",
            "Medicare Specialty/A4",
            "SNOMED/225928004",
            "CMS Place of Service/34",
            "CMS Place of Service/61",
            "CMS Place of Service/51",
            "CMS Place of Service/23",
            "PCORNet/Generic-OT",
            "CMS Place of Service/27",
            "CMS Place of Service/24",
            "CMS Place of Service/09",
            "CMS Place of Service/33",
            "SNOMED/34596002",
            "CMS Place of Service/25",
            "CMS Place of Service/32",
            "CMS Place of Service/20",
        ]
        return discharge_codes

    def _create_text_event_to_numeric_event_rules(self) -> List[EventConversionRule]:
        """Return the text-to-numeric event conversion rules (none for OMOP)."""
        no_rules: List[EventConversionRule] = []
        return no_rules
Loading
Loading