diff --git a/src/cehrbert/data_generators/hf_data_generator/hf_dataset_collator.py b/src/cehrbert/data_generators/hf_data_generator/hf_dataset_collator.py
index 00b4faa4..96d121fa 100644
--- a/src/cehrbert/data_generators/hf_data_generator/hf_dataset_collator.py
+++ b/src/cehrbert/data_generators/hf_data_generator/hf_dataset_collator.py
@@ -12,12 +12,12 @@ class CehrBertDataCollator:

     def __init__(
-        self,
-        tokenizer: CehrBertTokenizer,
-        max_length: int,
-        mlm_probability: float = 0.15,
-        is_pretraining: bool = True,
-        truncate_type: TruncationType = TruncationType.RANDOM_RIGHT_TRUNCATION,
+            self,
+            tokenizer: CehrBertTokenizer,
+            max_length: int,
+            mlm_probability: float = 0.15,
+            is_pretraining: bool = True,
+            truncate_type: TruncationType = TruncationType.RANDOM_RIGHT_TRUNCATION,
     ):
         self.tokenizer = tokenizer
         self.max_length = max_length
@@ -29,12 +29,12 @@ def __init__(
         # Pre-compute these so we can use them later on
         # We used VS for the historical data, currently, we use the new [VS] for the newer data
         # so we need to check both cases.
-        self.vs_token_id = tokenizer._convert_token_to_id("VS")
-        if self.vs_token_id == tokenizer._oov_token_index:
-            self.vs_token_id = tokenizer._convert_token_to_id("[VS]")
-        self.ve_token_id = tokenizer._convert_token_to_id("VE")
-        if self.ve_token_id == tokenizer._oov_token_index:
-            self.ve_token_id = tokenizer._convert_token_to_id("[VE]")
+        self.vs_token_id = tokenizer.convert_token_to_id("VS")
+        if self.vs_token_id == tokenizer.oov_token_index:
+            self.vs_token_id = tokenizer.convert_token_to_id("[VS]")
+        self.ve_token_id = tokenizer.convert_token_to_id("VE")
+        if self.ve_token_id == tokenizer.oov_token_index:
+            self.ve_token_id = tokenizer.convert_token_to_id("[VE]")

     @staticmethod
     def _convert_to_tensor(features: Any) -> torch.Tensor:
@@ -205,9 +205,9 @@ def torch_mask_tokens(self, inputs: torch.Tensor, labels: torch.Tensor) -> Tuple

         # 10% of the time, we replace masked input tokens with random word
         indices_random = (
-            torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
-            & masked_indices
-            & ~indices_replaced
+                torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
+                & masked_indices
+                & ~indices_replaced
         )
         random_words = torch.randint(self.tokenizer.vocab_size, labels.shape, dtype=torch.long)
         inputs[indices_random] = random_words[indices_random]
@@ -233,8 +233,8 @@ def generate_start_end_index(self, record: Dict[str, Any]) -> Dict[str, Any]:
             start_index = random.randint(0, seq_length - new_max_length)
             end_index = min(seq_length, start_index + new_max_length)
         elif self.truncate_type in (
-            TruncationType.RANDOM_RIGHT_TRUNCATION,
-            TruncationType.RANDOM_COMPLETE,
+                TruncationType.RANDOM_RIGHT_TRUNCATION,
+                TruncationType.RANDOM_COMPLETE,
         ):
             # We randomly pick a [VS] token
             starting_points = []
@@ -266,9 +266,9 @@ def generate_start_end_index(self, record: Dict[str, Any]) -> Dict[str, Any]:
         new_record = collections.OrderedDict()
         for k, v in record.items():
             if (
-                isinstance(v, list)
-                or isinstance(v, np.ndarray)
-                or (isinstance(v, torch.Tensor) and v.dim() > 0)
+                    isinstance(v, list)
+                    or isinstance(v, np.ndarray)
+                    or (isinstance(v, torch.Tensor) and v.dim() > 0)
             ):
                 if len(v) == seq_length:
                     new_record[k] = v[start_index:end_index]
diff --git a/src/cehrbert/models/hf_models/tokenization_hf_cehrbert.py b/src/cehrbert/models/hf_models/tokenization_hf_cehrbert.py
index 783007b7..5c5aaf77 100644
--- a/src/cehrbert/models/hf_models/tokenization_hf_cehrbert.py
+++ b/src/cehrbert/models/hf_models/tokenization_hf_cehrbert.py
@@ -31,7 +31,7 @@
 LAB_STATS_FILE_NAME = "cehrgpt_lab_stats.json"


-def load_json_file(json_file):
+def load_json_file(json_file) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
     """
     Loads a JSON file and returns the parsed JSON object.

@@ -56,10 +56,10 @@ def load_json_file(json_file):
 class CehrBertTokenizer(PushToHubMixin):

     def __init__(
-        self,
-        tokenizer: Tokenizer,
-        lab_stats: List[Dict[str, Any]],
-        concept_name_mapping: Dict[str, str],
+            self,
+            tokenizer: Tokenizer,
+            lab_stats: List[Dict[str, Any]],
+            concept_name_mapping: Dict[str, str],
     ):
         self._tokenizer = tokenizer
         self._lab_stats = lab_stats
@@ -139,10 +139,10 @@ def convert_tokens_to_string(self, tokens):
         return out_string

     def save_pretrained(
-        self,
-        save_directory: Union[str, os.PathLike],
-        push_to_hub: bool = False,
-        **kwargs,
+            self,
+            save_directory: Union[str, os.PathLike],
+            push_to_hub: bool = False,
+            **kwargs,
     ):
         """
         Save the Cehrbert tokenizer.
@@ -190,9 +190,9 @@ def save_pretrained(

     @classmethod
     def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
-        **kwargs,
+            cls,
+            pretrained_model_name_or_path: Union[str, os.PathLike],
+            **kwargs,
     ):
         """
         Load the CehrBert tokenizer.
@@ -216,7 +216,7 @@
         )

         if not tokenizer_file:
-            return None
+            raise RuntimeError(f"tokenizer_file does not exist: {tokenizer_file}")

         tokenizer = Tokenizer.from_file(tokenizer_file)

@@ -224,13 +224,15 @@
             pretrained_model_name_or_path, LAB_STATS_FILE_NAME, **kwargs
         )
         if not lab_stats_file:
-            return None
+            raise RuntimeError(f"lab_stats_file does not exist: {lab_stats_file}")

         concept_name_mapping_file = transformers.utils.hub.cached_file(
             pretrained_model_name_or_path, CONCEPT_MAPPING_FILE_NAME, **kwargs
         )
         if not concept_name_mapping_file:
-            return None
+            raise RuntimeError(
+                f"concept_name_mapping_file does not exist: {concept_name_mapping_file}"
+            )

         lab_stats = load_json_file(lab_stats_file)
@@ -240,11 +242,11 @@

     @classmethod
     def train_tokenizer(
-        cls,
-        dataset: Union[Dataset, DatasetDict],
-        feature_names: List[str],
-        concept_name_mapping: Dict[str, str],
-        data_args: DataTrainingArguments,
+            cls,
+            dataset: Union[Dataset, DatasetDict],
+            feature_names: List[str],
+            concept_name_mapping: Dict[str, str],
+            data_args: DataTrainingArguments,
     ):
         """
         Train a huggingface word level tokenizer.
diff --git a/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py b/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py
index 3214bd1d..955c2ef3 100644
--- a/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py
+++ b/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py
@@ -1,3 +1,4 @@
+import json
 import os
 from typing import Optional, Union

@@ -62,7 +63,7 @@ def load_and_create_tokenizer(
     tokenizer_abspath = os.path.abspath(model_args.tokenizer_name_or_path)
     try:
         tokenizer = CehrBertTokenizer.from_pretrained(tokenizer_abspath)
-    except RuntimeError as e:
+    except (OSError, RuntimeError, FileNotFoundError, json.JSONDecodeError) as e:
         LOG.warning(
             "Failed to load the tokenizer from %s with the error "
             "\n%s\nTried to create the tokenizer, however the dataset is not provided.",
@@ -104,7 +105,7 @@ def load_and_create_model(
     try:
         model_abspath = os.path.abspath(model_args.model_name_or_path)
         model_config = AutoConfig.from_pretrained(model_abspath)
-    except RuntimeError as e:
+    except (OSError, ValueError, FileNotFoundError, json.JSONDecodeError) as e:
         LOG.warning(e)
         model_config = CehrBertConfig(
             vocab_size=tokenizer.vocab_size,
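The sketch below is a hypothetical usage example, not part of the patch. It shows how the changed error handling is meant to compose: CehrBertTokenizer.from_pretrained now raises RuntimeError when one of its artifact files is missing instead of returning None, and the pretrain runner catches a broader set of exceptions before falling back to training a new tokenizer. The helper name load_or_train_tokenizer and its arguments are placeholders; only from_pretrained and train_tokenizer come from the code above.

import json

from cehrbert.models.hf_models.tokenization_hf_cehrbert import CehrBertTokenizer


def load_or_train_tokenizer(tokenizer_path, dataset, feature_names, concept_name_mapping, data_args):
    # Hypothetical helper mirroring the fallback pattern in hf_cehrbert_pretrain_runner.py.
    try:
        # With this patch, from_pretrained raises (e.g. RuntimeError) when the tokenizer,
        # lab-stats, or concept-mapping file cannot be found, rather than returning None.
        return CehrBertTokenizer.from_pretrained(tokenizer_path)
    except (OSError, RuntimeError, FileNotFoundError, json.JSONDecodeError):
        # Fall back to training a fresh word-level tokenizer from the dataset,
        # using the signature shown for train_tokenizer above.
        return CehrBertTokenizer.train_tokenizer(
            dataset=dataset,
            feature_names=feature_names,
            concept_name_mapping=concept_name_mapping,
            data_args=data_args,
        )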