From 3d09645e7205f11edc733996e831150a554ff498 Mon Sep 17 00:00:00 2001 From: Chao Pang Date: Fri, 6 Sep 2024 17:07:44 -0400 Subject: [PATCH] removed the absolute model and tokenizer paths from pretraining and finetuning (#50) --- .../runners/hf_cehrbert_finetune_runner.py | 6 ++---- .../runners/hf_cehrbert_pretrain_runner.py | 15 ++++++--------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/cehrbert/runners/hf_cehrbert_finetune_runner.py b/src/cehrbert/runners/hf_cehrbert_finetune_runner.py index 05ecd860..87499acd 100644 --- a/src/cehrbert/runners/hf_cehrbert_finetune_runner.py +++ b/src/cehrbert/runners/hf_cehrbert_finetune_runner.py @@ -66,8 +66,7 @@ def load_pretrained_model_and_tokenizer( ) -> Tuple[CehrBertPreTrainedModel, CehrBertTokenizer]: # Try to load the pretrained tokenizer try: - tokenizer_abspath = os.path.abspath(model_args.tokenizer_name_or_path) - tokenizer = CehrBertTokenizer.from_pretrained(tokenizer_abspath) + tokenizer = CehrBertTokenizer.from_pretrained(model_args.tokenizer_name_or_path) except Exception: raise ValueError(f"Can not load the pretrained tokenizer from {model_args.tokenizer_name_or_path}") @@ -82,8 +81,7 @@ def load_pretrained_model_and_tokenizer( # Try to load the pretrained model try: - model_abspath = os.path.abspath(model_args.model_name_or_path) - model = finetune_model_cls.from_pretrained(model_abspath) + model = finetune_model_cls.from_pretrained(model_args.model_name_or_path) except Exception as e: LOG.warning(e) model_config = CehrBertConfig( diff --git a/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py b/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py index 6d0b026c..e53fac93 100644 --- a/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py +++ b/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py @@ -12,9 +12,8 @@ from cehrbert.models.hf_models.config import CehrBertConfig from cehrbert.models.hf_models.hf_cehrbert import CehrBertForPreTraining from 
cehrbert.models.hf_models.tokenization_hf_cehrbert import CehrBertTokenizer - -from .hf_runner_argument_dataclass import DataTrainingArguments, ModelArguments -from .runner_util import ( +from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments, ModelArguments +from cehrbert.runners.runner_util import ( generate_prepared_ds_path, get_last_hf_checkpoint, get_meds_extension_path, @@ -54,14 +53,13 @@ def load_and_create_tokenizer( tokenizer = load_and_create_tokenizer(data_args, model_args, dataset) """ # Try to load the pretrained tokenizer - tokenizer_abspath = os.path.abspath(model_args.tokenizer_name_or_path) try: - tokenizer = CehrBertTokenizer.from_pretrained(tokenizer_abspath) + tokenizer = CehrBertTokenizer.from_pretrained(model_args.tokenizer_name_or_path) except (OSError, RuntimeError, FileNotFoundError, json.JSONDecodeError) as e: LOG.warning( "Failed to load the tokenizer from %s with the error " "\n%s\nTried to create the tokenizer, however the dataset is not provided.", - tokenizer_abspath, + model_args.tokenizer_name_or_path, e, ) if dataset is None: @@ -69,7 +67,7 @@ def load_and_create_tokenizer( tokenizer = CehrBertTokenizer.train_tokenizer( dataset, feature_names=["concept_ids"], concept_name_mapping={}, data_args=data_args ) - tokenizer.save_pretrained(tokenizer_abspath) + tokenizer.save_pretrained(model_args.tokenizer_name_or_path) return tokenizer @@ -95,8 +93,7 @@ def load_and_create_model(model_args: ModelArguments, tokenizer: CehrBertTokeniz model = load_and_create_model(model_args, tokenizer) """ try: - model_abspath = os.path.abspath(model_args.model_name_or_path) - model_config = AutoConfig.from_pretrained(model_abspath) + model_config = AutoConfig.from_pretrained(model_args.model_name_or_path) except (OSError, ValueError, FileNotFoundError, json.JSONDecodeError) as e: LOG.warning(e) model_config = CehrBertConfig(