Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cehrbert streaming for meds #90

Merged
merged 6 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,13 @@ dependencies = [
"Jinja2==3.1.3",
"meds==0.3.3",
"meds_reader==0.1.9",
"networkx==3.2.1",
"networkx>=3.2.1",
"numpy==1.24.3",
"packaging==23.2",
"pandas==2.2.0",
"peft>=0.10.0",
"Pillow==10.3.0",
"pyarrow==15.0.0",
"pydantic==2.6.0",
"pyarrow>=15.0.0",
"python-dateutil==2.8.2",
"PyYAML==6.0.1",
"scikit-learn==1.4.0",
Expand Down
Empty file.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/code/data
Binary file not shown.
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/code/zdict
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/hadm_id/zdict
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/language/zdict
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions sample_data/MIMIC-IV-meds/meds_reader/meds_reader.version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"train/0": [10022281, 10004235, 10005909, 10018081, 10003046, 10036156, 10020306, 10002930, 10020944, 10018501, 10007795, 10035185, 10021938, 10021312, 10014078, 10006053, 10021118, 10023239, 10007058, 10024043, 10020786, 10005817, 10039997, 10015860, 10038999, 10021487, 10017492, 10020187, 10018845, 10019917, 10008287, 10011398, 10010867, 10037861, 10018328, 10029291, 10022041, 10010471, 10019568, 10005866, 10015931, 10021666, 10037928, 10032725, 10009628, 10002428, 10040025, 10014354, 10007818, 10003400, 10022017, 10007928, 10012552, 10027602, 10005348, 10023117, 10026406, 10008454, 10013049, 10025612, 10039708, 10035631, 10037975, 10009035, 10004422, 10020640, 10027445, 10000032, 10019003, 10006580, 10002495, 10015272, 10001217, 10004720, 10004733, 10031757, 10031404, 10038933, 10016150, 10004457], "tuning/0": [10020740, 10038992, 10009049, 10038081, 10039831, 10019777, 10019172, 10001725, 10016810, 10026255], "held_out/0": [10025463, 10019385, 10014729, 10022880, 10023771, 10012853, 10016742, 10018423, 10029484, 10026354]}
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"dataset_name": "MIMIC-IV", "dataset_version": "3.1:0.0.3", "etl_name": "MEDS_transforms", "etl_version": "0.1.1", "meds_version": "0.3.3", "created_at": "2025-02-23T20:51:17.919792"}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/race/data
Binary file not shown.
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/race/zdict
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/route/data
Binary file not shown.
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/route/zdict
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/subject_id
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/time/data
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/time/zdict
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/unit/data
Binary file not shown.
Binary file not shown.
Binary file added sample_data/MIMIC-IV-meds/meds_reader/unit/zdict
Binary file not shown.
3 changes: 2 additions & 1 deletion src/cehrbert/models/hf_models/tokenization_hf_cehrbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,11 +379,12 @@ def batched_generator():
)

if data_args.streaming:
first_example = next(iter(dataset))
parts = dataset.map(
partial(agg_helper, map_func=map_statistics_partial),
batched=True,
batch_size=data_args.preprocessing_batch_size,
remove_columns=dataset.column_names,
remove_columns=first_example.keys(),
)
else:
parts = dataset.map(
Expand Down
9 changes: 9 additions & 0 deletions src/cehrbert/runners/hf_cehrbert_finetune_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,17 @@ def load_finetuned_model(model_args: ModelArguments, model_name_or_path: str) ->


def main():

data_args, model_args, training_args = parse_runner_args()

if data_args.streaming:
# This happens only when streaming is enabled. This is for disabling the warning message
# https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# The iterable dataset doesn't have sharding implemented, so the number of workers has to
# be set to 0. Otherwise the trainer will throw an error.
training_args.dataloader_num_workers = 0

tokenizer = load_pretrained_tokenizer(model_args)
prepared_ds_path = generate_prepared_ds_path(data_args, model_args, data_folder=data_args.cohort_folder)

Expand Down
15 changes: 12 additions & 3 deletions src/cehrbert/runners/hf_cehrbert_pretrain_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import os
from typing import Optional, Union

import torch
from datasets import Dataset, DatasetDict, IterableDatasetDict, load_from_disk
from transformers import AutoConfig, Trainer, set_seed
from transformers import Trainer, set_seed
from transformers.utils import logging

from cehrbert.data_generators.hf_data_generator.hf_dataset import create_cehrbert_pretraining_dataset
Expand Down Expand Up @@ -95,7 +96,7 @@ def load_and_create_model(model_args: ModelArguments, tokenizer: CehrBertTokeniz
model = load_and_create_model(model_args, tokenizer)
"""
try:
model_config = AutoConfig.from_pretrained(os.path.expanduser(model_args.model_name_or_path))
model_config = CehrBertConfig.from_pretrained(os.path.expanduser(model_args.model_name_or_path))
except (OSError, ValueError, FileNotFoundError, json.JSONDecodeError) as e:
LOG.warning(e)
model_config = CehrBertConfig(
Expand Down Expand Up @@ -262,9 +263,17 @@ def filter_func(examples):
if not data_args.streaming:
processed_dataset.set_format("pt")

def data_collator(features):
    """Collate ``features`` with ``collator`` and downcast float64 tensors to float32.

    Entries that are not tensors, or whose dtype is not float64, are returned
    exactly as ``collator`` produced them.
    """
    collated = collator(features)
    for name in collated:
        value = collated[name]
        if not isinstance(value, torch.Tensor):
            continue
        if value.dtype == torch.float64:
            # Replace the entry in place with a float32 copy of the tensor.
            collated[name] = value.to(torch.float32)
    return collated

trainer = Trainer(
model=model,
data_collator=collator,
data_collator=data_collator,
train_dataset=processed_dataset["train"],
eval_dataset=processed_dataset["validation"],
args=training_args,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import os
import shutil
import sys
import tempfile
import unittest
from pathlib import Path

from datasets import disable_caching

from cehrbert.runners.hf_cehrbert_pretrain_runner import main

# Process-wide configuration applied when this test module is imported.
# NOTE(review): these run after the runner module has already been imported;
# confirm the libraries involved still read the variables at that point.
disable_caching()
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "",
        "WANDB_MODE": "disabled",
        "TRANSFORMERS_VERBOSITY": "info",
    }
)


class HfCehrBertRunnerIntegrationTest(unittest.TestCase):
    """Integration test that runs the pretraining runner's ``main`` in streaming mode.

    ``setUp`` points the runner at the bundled
    ``sample_data/MIMIC-IV-meds/meds_reader`` folder through ``sys.argv`` and
    routes the model, tokenizer and prepared-dataset outputs into a temporary
    directory. ``tearDown`` restores ``sys.argv`` and deletes that directory.
    """

    def setUp(self):
        """Create the temporary output folders and install the runner's command line."""
        # Get the root folder of the project (four levels above this file)
        root_folder = Path(os.path.abspath(__file__)).parent.parent.parent.parent
        data_folder = os.path.join(root_folder, "sample_data", "MIMIC-IV-meds", "meds_reader")
        # Create a temporary directory to store model and tokenizer
        self.temp_dir = tempfile.mkdtemp()
        self.model_folder_path = os.path.join(self.temp_dir, "model")
        Path(self.model_folder_path).mkdir(parents=True, exist_ok=True)
        self.dataset_prepared_path = os.path.join(self.temp_dir, "dataset_prepared_path")
        Path(self.dataset_prepared_path).mkdir(parents=True, exist_ok=True)
        # sys.argv is process-global: remember the real command line so that
        # tearDown can put it back and later tests do not inherit these flags.
        self._original_argv = sys.argv
        sys.argv = [
            "hf_cehrbert_pretraining_runner.py",
            "--model_name_or_path",
            self.model_folder_path,
            "--tokenizer_name_or_path",
            self.model_folder_path,
            "--output_dir",
            self.model_folder_path,
            "--data_folder",
            data_folder,
            "--dataset_prepared_path",
            self.dataset_prepared_path,
            "--max_steps",
            "10",
            "--streaming",
            "--is_data_in_meds",
        ]

    def tearDown(self):
        """Restore ``sys.argv`` and remove the temporary directory."""
        # Restore first so the command line is reset even if the removal raises.
        sys.argv = self._original_argv
        # Remove the temporary directory
        shutil.rmtree(self.temp_dir)

    def test_train_model(self):
        """Run the runner end to end; the test passes if ``main`` does not raise."""
        main()


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()