From 179ee21afc01e04166b60a612169a8d8a20fd1cb Mon Sep 17 00:00:00 2001 From: Chao Pang Date: Fri, 13 Sep 2024 13:18:32 -0400 Subject: [PATCH] fixed a bug where the normalized value should be bounded by a multiple of the standard deviation because the normalized value is assumed to follow the standard normal --- .../models/hf_models/tokenization_hf_cehrbert.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cehrbert/models/hf_models/tokenization_hf_cehrbert.py b/src/cehrbert/models/hf_models/tokenization_hf_cehrbert.py index 810517a..27ab92d 100644 --- a/src/cehrbert/models/hf_models/tokenization_hf_cehrbert.py +++ b/src/cehrbert/models/hf_models/tokenization_hf_cehrbert.py @@ -356,13 +356,13 @@ def normalize(self, concept_id, concept_value) -> float: mean_ = concept_value - self._lab_stat_mapping[concept_id]["mean"] std = self._lab_stat_mapping[concept_id]["std"] if std > 0: + value_outlier_std = self._lab_stat_mapping[concept_id]["value_outlier_std"] normalized_value = mean_ / self._lab_stat_mapping[concept_id]["std"] # Clip the value between the lower and upper bounds of the corresponding lab - normalized_value = max( - self._lab_stat_mapping[concept_id]["lower_bound"], - min(self._lab_stat_mapping[concept_id]["upper_bound"], normalized_value), - ) + normalized_value = max(-value_outlier_std, min(value_outlier_std, normalized_value)) else: - normalized_value = mean_ + # If there is not a valid standard deviation, + # we just set the normalized value to the mean of the standard normal + normalized_value = 0.0 return normalized_value return concept_value