diff --git a/deep_autoviml/__version__.py b/deep_autoviml/__version__.py index 176d0d1..dfdf086 100644 --- a/deep_autoviml/__version__.py +++ b/deep_autoviml/__version__.py @@ -20,6 +20,6 @@ __author__ = "Ram Seshadri" __description__ = "deep_autoviml - build and test multiple Tensorflow 2.0 models and pipelines" __url__ = "https://github.com/Auto_ViML/deep_autoviml.git" -__version__ = "0.0.78.dev2" +__version__ = "0.0.79" __license__ = "Apache License 2.0" __copyright__ = "2020-21 Google" diff --git a/deep_autoviml/data_load/__pycache__/classify_features.cpython-38.pyc b/deep_autoviml/data_load/__pycache__/classify_features.cpython-38.pyc index b96bc36..758e194 100644 Binary files a/deep_autoviml/data_load/__pycache__/classify_features.cpython-38.pyc and b/deep_autoviml/data_load/__pycache__/classify_features.cpython-38.pyc differ diff --git a/deep_autoviml/data_load/__pycache__/extract.cpython-38.pyc b/deep_autoviml/data_load/__pycache__/extract.cpython-38.pyc index f61f657..a54b820 100644 Binary files a/deep_autoviml/data_load/__pycache__/extract.cpython-38.pyc and b/deep_autoviml/data_load/__pycache__/extract.cpython-38.pyc differ diff --git a/deep_autoviml/data_load/classify_features.py b/deep_autoviml/data_load/classify_features.py index da64635..e451e40 100644 --- a/deep_autoviml/data_load/classify_features.py +++ b/deep_autoviml/data_load/classify_features.py @@ -217,7 +217,7 @@ def classify_columns(df_preds, model_options={}, verbose=0): #### If there are 30 chars are more in a discrete_string_var, it is then considered an NLP variable ### if a variable has more than this many chars, it will be treated like a NLP variable - max_nlp_char_size = check_model_options(model_options, "nlp_char_limit", 30) + max_nlp_char_size = check_model_options(model_options, "nlp_char_limit", 50) ### if a variable has more than this limit, it will not be treated like a cat variable # #### Cat_Limit defines the max number of categories a column can have to be called a categorical colum cat_limit = check_model_options(model_options, "variable_cat_limit", 30) @@ -502,7 +502,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos nlps = [] bools = [] ### if a variable has more than this many chars, it will be treated like a NLP variable - nlp_char_limit = check_model_options(model_options, "nlp_char_limit", 30) + nlp_char_limit = check_model_options(model_options, "nlp_char_limit", 50) ### if a variable has more than this limit, it will not be treated like a cat variable # cat_limit = check_model_options(model_options, "variable_cat_limit", 30) ### Classify features using the previously define function ############# @@ -540,7 +540,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos floats = [] preds_copy = copy.deepcopy(preds) for key in preds_copy: - if data_sample[key].dtype in ['object'] or str(data_sample[key].dtype) == 'category': + if str(data_sample[key].dtype) in ['object', 'category']: if type('str') in data_sample[key].map(type).value_counts().index: feats_max_min[key]["dtype"] = "string" elif data_sample[key].map(type).value_counts().index[0] == int: @@ -574,7 +574,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos discrete_strings.remove(key) var_df1['discrete_string_vars'] = copy.deepcopy(discrete_strings) #### This is not a mistake - you have to test it again. 
That way we make sure type is safe - if data_sample[key].dtype in ['object'] or str(data_sample[key].dtype) == 'category': + if str(data_sample[key].dtype) in ['object', 'category']: if data_sample[key].map(type).value_counts().index[0] == object or data_sample[key].map(type).value_counts().index[0] == str: feats_max_min[key]["dtype"] = "string" elif data_sample[key].dtype in ['bool']: @@ -627,10 +627,11 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos feats_max_min[key]["vocab"] = vocab feats_max_min[key]['size_of_vocab'] = len(vocab) elif feats_max_min[key]['dtype'] in ['string']: - data_types = len(data_sample[key].fillna("missing").map(type).value_counts()) + data_sample[[key]] = data_sample[[key]].fillna("missing") + data_types = len(data_sample[key].map(type).value_counts()) if data_types > 1: print('\nDATA CLEANING ALERT: Dropping %s since it has %s mixed data types.' %(key, data_types)) - print(' Transform variable to single data type and re-run. Continuing...') + print(' Convert this variable to a single data type and re-run deep_autoviml.') ignore_variables.append(key) preds.remove(key) feats_max_min['predictors_in_train'] = preds @@ -642,7 +643,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos discrete_strings.remove(key) var_df1['discrete_string_vars'] = copy.deepcopy(discrete_strings) if not key in ignore_variables: - if np.mean(data_sample[key].fillna("missing").map(len)) >= nlp_char_limit: + if np.max(data_sample[key].map(len)) >= nlp_char_limit: ### This is for NLP variables. You want to remove duplicates ##### if key in dates: continue @@ -652,7 +653,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos elif key in discrete_strings: discrete_strings.remove(key) var_df1['discrete_string_vars'] = discrete_strings - print('%s is detected and will be treated as an NLP variable' %key) + print('%s is detected as an NLP variable' %key) if key not in var_df1['nlp_vars']: var_df1['nlp_vars'].append(key) #### Now let's calculate some statistics on this NLP variable ### @@ -663,14 +664,14 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos ### Immediately cap the vocab size to 300,000 - don't measure its vocab!! 
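The detection rule above now fills missing values before inspecting a string column and flags it as NLP when its longest entry (rather than the mean length) reaches nlp_char_limit, whose default rises from 30 to 50. A minimal pandas sketch of that heuristic, assuming only the behaviour visible in this hunk (the helper name and sample data are illustrative, not the library's code):

```python
import numpy as np
import pandas as pd

def looks_like_nlp(series: pd.Series, nlp_char_limit: int = 50) -> bool:
    """Flag a string column as NLP when its longest value reaches the char limit."""
    filled = series.fillna("missing").astype(str)   # fill NaNs first, as the hunk above does
    return np.max(filled.map(len)) >= nlp_char_limit

df = pd.DataFrame({
    "city": ["Paris", "Rome", None],
    "review": ["The room was spotless and the front desk staff were endlessly helpful.",
               "Too noisy.", None],
})
print(looks_like_nlp(df["city"]))    # False -> stays a discrete string / categorical column
print(looks_like_nlp(df["review"]))  # True  -> routed to the NLP pipeline
```

Keying off the maximum keeps a column with a few long free-text entries from slipping through just because most of its rows are short.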
data_sample = data_sample.sample(frac=0.1, random_state=0) try: - vocab = np.concatenate(data_sample[key].fillna('missing').map(tokenize_fast)) + vocab = np.concatenate(data_sample[key].map(tokenize_fast)) except: - vocab = np.concatenate(data_sample[key].fillna('missing').map(tokenize_fast).values) + vocab = np.concatenate(data_sample[key].map(tokenize_fast).values) vocab = np.unique(vocab).tolist() feats_max_min[key]["vocab"] = vocab try: - feats_max_min[key]['seq_length'] = int(data_sample[key].fillna("missing").map(len).max()) - num_words_in_each_row = data_sample[key].fillna("missing").map(lambda x: len(x.split(" "))).mean() + feats_max_min[key]['seq_length'] = int(data_sample[key].map(len).max()) + num_words_in_each_row = data_sample[key].map(lambda x: len(x.split(" "))).mean() feats_max_min[key]['size_of_vocab'] = int(num_rows_in_data*num_words_in_each_row) except: feats_max_min[key]['seq_length'] = len(vocab) // num_rows_in_data @@ -679,10 +680,8 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos ### This is for string variables ######## #### Now we select features if they are present in the data set ### num_rows_in_data = model_options['DS_LEN'] - if data_sample[key].isnull().sum() > 0: - vocab = data_sample[key].fillna("missing").unique().tolist() - else: - vocab = data_sample[key].unique().tolist() + data_sample[[key]] = data_sample[[key]].fillna("missing") + vocab = data_sample[key].unique().tolist() vocab = np.unique(vocab).tolist() #vocab = ['missing' if type(x) != str else x for x in vocab] feats_max_min[key]["vocab"] = vocab @@ -748,7 +747,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos print('Not performing feature crossing for categorical nor integer variables' ) return data_sample, var_df1, feats_max_min ############################################################################################ -def EDA_classify_and_return_cols_by_type(df1, nlp_char_limit=20): +def EDA_classify_and_return_cols_by_type(df1, nlp_char_limit=50): """ EDA stands for Exploratory data analysis. 
This function performs EDA - hence the name ######################################################################################## @@ -763,7 +762,8 @@ def EDA_classify_and_return_cols_by_type(df1, nlp_char_limit=20): nlpcols = [] for each_cat in cats: try: - if df1[each_cat].map(len).mean() >=nlp_char_limit: + df1[[each_cat]] = df1[[each_cat]].fillna('missing') + if df1[each_cat].map(len).max() >=nlp_char_limit: nlpcols.append(each_cat) catcols.remove(each_cat) except: @@ -775,7 +775,7 @@ def EDA_classify_and_return_cols_by_type(df1, nlp_char_limit=20): floatcols = df1.select_dtypes(include='float').columns.tolist() return catcols, int_cats, intcols, floatcols, nlpcols ############################################################################################ -def EDA_classify_features(train, target, idcols, nlp_char_limit=20): +def EDA_classify_features(train, target, idcols, nlp_char_limit=50): ### Test Labeler is a very important dictionary that will help transform test data same as train #### test_labeler = defaultdict(list) @@ -1081,7 +1081,7 @@ def classify_dtypes_using_TF2(data_sample, preds, idcols, verbose=0): """ print_features = False nlps = [] - nlp_char_limit = 30 + nlp_char_limit = 50 all_ints = [] floats = [] cats = [] @@ -1108,7 +1108,8 @@ def classify_dtypes_using_TF2(data_sample, preds, idcols, verbose=0): int_vocab = tf.unique(value)[0].numpy().tolist() feats_max_min[key]['size_of_vocab'] = len(int_vocab) elif feats_max_min[key]['dtype'] in [tf.string]: - if tf.reduce_mean(tf.strings.length(feature_batch[key])).numpy() >= nlp_char_limit: + feature_batch[[key]] = feature_batch[[key]].fillna("missing") + if tf.reduce_max(tf.strings.length(feature_batch[key])).numpy() >= nlp_char_limit: print('%s is detected and will be treated as an NLP variable') nlps.append(key) else: diff --git a/deep_autoviml/data_load/extract.py b/deep_autoviml/data_load/extract.py index 9a2b81d..6c576b2 100644 --- a/deep_autoviml/data_load/extract.py +++ b/deep_autoviml/data_load/extract.py @@ -127,7 +127,7 @@ def transform_train_target(train_target, target, modeltype, model_label, cat_voc train_target = copy.deepcopy(train_target) cat_vocab_dict = copy.deepcopy(cat_vocab_dict) ### Just have to change the target from string to Numeric in entire dataframe! 
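The same cutoff is applied on the TensorFlow side in classify_dtypes_using_TF2 above, switching from the mean to the maximum string length. A standalone sketch of that check; only the tf.strings.length / tf.reduce_max pattern and the 50-character limit come from the hunk, the feature_batch contents are made up:

```python
import tensorflow as tf

nlp_char_limit = 50
feature_batch = {"review": tf.constant([
    "Great product, arrived quickly and works exactly as described in the listing.",
    "Fine.",
])}
max_len = tf.reduce_max(tf.strings.length(feature_batch["review"])).numpy()
print(int(max_len), max_len >= nlp_char_limit)   # longest review crosses the limit -> NLP column
```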
### - + if modeltype != 'Regression': if model_label == 'Multi_Label': target_copy = copy.deepcopy(target) @@ -310,34 +310,50 @@ def load_train_data_file(train_datafile, target, keras_options, model_options, v ### if modeltype is given, then do not find the model type using this function _, _, usecols = find_problem_type(train_small, target, model_options, verbose) - label_encode_flag = False + ########## Find small details about the data to help create the right model ### - - if modeltype == 'Classification' or modeltype == 'Multi_Classification': + label_encode_flag = model_options["label_encode_flag"] + if isinstance(label_encode_flag, str): + if modeltype == 'Classification' or modeltype == 'Multi_Classification': + if isinstance(target, str): + #### This is for Single-Label problems ######## + if train_small[target].dtype == 'object' or str(train_small[target].dtype).lower() == 'category': + label_encode_flag = True + elif 0 not in np.unique(train_small[target]): + label_encode_flag = False + if verbose: + print(' label encoding must be done since there is no zero class!') + target_vocab = train_small[target].unique() + num_classes = len(target_vocab) + elif isinstance(target, list): + #### This is for Multi-Label problems ######## + num_classes = [] + for each_target in target: + if train_small[each_target].dtype == 'object' or str(train_small[target[0]].dtype).lower() == 'category': + label_encode_flag = True + elif 0 not in np.unique(train_small[each_target]): + label_encode_flag = False + if verbose: + print(' label encoding must be done since there is no zero class!') + target_vocab = train_small[each_target].unique().tolist() + num_classes.append(len(target_vocab)) + else: + num_classes = 1 + target_vocab = [] + label_encode_flag = False + else: if isinstance(target, str): - #### This is for Single-Label problems ######## - if train_small[target].dtype == 'object' or str(train_small[target].dtype).lower() == 'category': - label_encode_flag = True - elif 0 not in np.unique(train_small[target]): - label_encode_flag = True ### label encoding must be done since no zero class! target_vocab = train_small[target].unique() num_classes = len(target_vocab) - elif isinstance(target, list): - #### This is for Multi-Label problems ######## - num_classes = [] - for each_target in target: - if train_small[each_target].dtype == 'object' or str(train_small[target[0]].dtype).lower() == 'category': - label_encode_flag = True - elif 0 not in np.unique(train_small[each_target]): - label_encode_flag = True - target_vocab = train_small[each_target].unique().tolist() - num_classes.append(len(target_vocab)) - else: - num_classes = 1 - target_vocab = [] + else: + for each_target in copy_target: + target_vocab = train_small[target].unique().tolist() + num_classes_each = len(target_vocab) + num_classes.append(int(num_classes_each)) + #### This is where we set the model_options for num_classes and num_labels ######### model_options['num_classes'] = num_classes - + ############# Sample Data classifying features into variaous types ################## print('Loaded a small data sample of size = %s into pandas dataframe to analyze...' 
%(train_small.shape,)) ### classify variables using the small dataframe ## @@ -727,37 +743,49 @@ def load_train_data_frame(train_dataframe, target, keras_options, model_options, cat_vocab_dict['modeltype'] = modeltype model_options['batch_size'] = batch_size ########## Find small details about the data to help create the right model ### - target_transformed = False - if modeltype != 'Regression': - if isinstance(target, str): - #### This is for Single Label Problems ###### - if train_small[target].dtype == 'object' or str(train_small[target].dtype).lower() == 'category': - target_transformed = True - target_vocab = train_small[target].unique() - num_classes = len(target_vocab) - else: - if 0 not in np.unique(train_small[target]): - target_transformed = True ### label encoding must be done since no zero class! - target_vocab = train_small[target].unique() - num_classes = len(train_small[target].value_counts()) - elif isinstance(target, list): - #### This is for Multi-Label Problems ####### - copy_target = copy.deepcopy(target) - num_classes = [] - for each_target in copy_target: - if train_small[target[0]].dtype == 'object' or str(train_small[target[0]].dtype).lower() == 'category': + target_transformed = model_options["label_encode_flag"] + if isinstance(target_transformed, str): + if modeltype != 'Regression': + if isinstance(target, str): + #### This is for Single Label Problems ###### + if train_small[target].dtype == 'object' or str(train_small[target].dtype).lower() == 'category': target_transformed = True - target_vocab = train_small[target].unique().tolist() - num_classes_each = len(target_vocab) + target_vocab = train_small[target].unique() + num_classes = len(target_vocab) else: - if 0 not in np.unique(train_small[target[0]]): + if 0 not in np.unique(train_small[target]): target_transformed = True ### label encoding must be done since no zero class! - target_vocab = train_small[target[0]].unique() - num_classes_each = train_small[target].apply(np.unique).apply(len).max() - num_classes.append(int(num_classes_each)) + target_vocab = train_small[target].unique() + num_classes = len(train_small[target].value_counts()) + elif isinstance(target, list): + #### This is for Multi-Label Problems ####### + copy_target = copy.deepcopy(target) + num_classes = [] + for each_target in copy_target: + if train_small[target[0]].dtype == 'object' or str(train_small[target[0]].dtype).lower() == 'category': + target_transformed = True + target_vocab = train_small[target].unique().tolist() + num_classes_each = len(target_vocab) + else: + if 0 not in np.unique(train_small[target[0]]): + target_transformed = True ### label encoding must be done since no zero class! 
+ target_vocab = train_small[target[0]].unique() + num_classes_each = train_small[target].apply(np.unique).apply(len).max() + num_classes.append(int(num_classes_each)) + else: + num_classes = 1 + target_vocab = [] + target_transformed = False else: - num_classes = 1 - target_vocab = [] + if isinstance(target, str): + target_vocab = train_small[target].unique() + num_classes = len(target_vocab) + else: + for each_target in copy_target: + target_vocab = train_small[target].unique().tolist() + num_classes_each = len(target_vocab) + num_classes.append(int(num_classes_each)) + ########### find the number of labels in data #### if isinstance(target, str): num_labels = 1 @@ -772,7 +800,7 @@ def load_train_data_frame(train_dataframe, target, keras_options, model_options, cat_vocab_dict['num_labels'] = num_labels cat_vocab_dict['num_classes'] = num_classes cat_vocab_dict["target_transformed"] = target_transformed - + #### once the dataframe has been classified, you can again change train_small to original dataframe ## train_small = copy.deepcopy(train_dataframe) @@ -1054,18 +1082,23 @@ def combine_nlp_text(features): y[NLP_COLUMN] = tf.strings.reduce_join([features[i] for i in NLP_VARS],axis=0, keepdims=False, separator=' ') return y - ################################################################ + ###################################################################################### ### You have to load only the NLP or text variables into dataset. ### otherwise, it will fail during predict. Yo still need to create input for them. ### In mixed_NLP models, you drop original NLP vars and combine them into one NLP var. - if NLP_VARS and keras_model_type.lower() in ['nlp','text']: + ###################################################################################### + + if NLP_VARS and keras_model_type.lower() in ['nlp','text', 'mixed_nlp', 'combined_nlp']: if keras_model_type.lower() in ['nlp', 'text']: train_ds = train_ds.map(lambda x, y: (process_NLP_features(x), y)) #train_ds = train_ds.unbatch().batch(batch_size) print(' processed NLP or text vars: %s successfully' %NLP_VARS) - else: + elif keras_model_type.lower() in ['combined_nlp']: train_ds = train_ds.map(lambda x, y: (combine_nlp_text(x), y)) print(' combined NLP or text vars: %s into a single feature successfully' %NLP_VARS) + else: + ### Mixed NLP is to keep NLP vars separate so they can be processed individually ## + print(' keeping NLP vars separate') else: print(' No special text preprocessing done for NLP vars.') ############################################################################ diff --git a/deep_autoviml/deep_autoviml.py b/deep_autoviml/deep_autoviml.py index f5261b1..3b650f3 100644 --- a/deep_autoviml/deep_autoviml.py +++ b/deep_autoviml/deep_autoviml.py @@ -193,6 +193,11 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep Another option would be to inform autoviml about encoding in CSV file for it to read such as 'latin-1' by setting {"csv_encoding": 'latin-1'} Other examples: + "nlp_char_limit": default 50. Beyond this max limit of chars in column, it + will be considered NLP column and treated as such. + "variable_cat_limit": default 30. if a variable has more than this limit, it + will NOT be treated as a categorical variable. + "DS_LEN": default "". Number of rows in dataset. You can leave it "" to calculate automatically. "csv_encoding": default='utf-8'. You can change to 'latin-1', 'iso-8859-1', 'cp1252', etc. 
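These model_options additions are the user-facing core of this change set. A hypothetical call sketch follows, assuming fit() accepts model_options and verbose keyword arguments and the import path documented in the project README; the file name, target name, and project name are placeholders, and further options are described just after this example:

```python
from deep_autoviml import deep_autoviml as deepauto

model_options = {
    "nlp_char_limit": 50,       # longest string >= 50 chars -> column treated as NLP
    "variable_cat_limit": 30,   # more categories than this -> not treated as categorical
    "label_encode_flag": True,  # force label encoding; leave "" to let the loader decide
    "compression": "zip",       # read a zip-compressed CSV directly
    "csv_encoding": "latin-1",
}

model, cat_vocab_dict = deepauto.fit(
    "train.csv.zip", target="label", keras_model_type="basic",
    project_name="demo_project", model_options=model_options, verbose=1)
```

When label_encode_flag is left at its "" default, the loaders in extract.py fall back to automatic detection based on the target's dtype and on whether a zero class is already present.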
"cat_feat_cross_flag": if you want to cross categorical features such as A*B, B*C... "sep" : default = "," comma but you can override it. Separator used in read_csv. @@ -205,6 +210,9 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep We will figure out single label or multi-label problem based on your target being string or list. "header": default = 0 ### this is the header row for pandas to read + "compression": None => you can set it to zip or other file compression formats if your data is compressed + "csv_encoding": default 'utf-8'. But you can set it to any other csv encoding format your data is in + "label_encode_flag": False. But you can set it to True if you want it encoded. "max_trials": default = 30 ## number of Storm Tuner trials ### Lower this for faster processing. "tuner": default = 'storm' ## Storm Tuner is the default tuner. Optuna is the other option. "embedding_size": default = 50 ## this is the NLP embedding size minimum @@ -260,7 +268,7 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep print('Model and logs being saved in %s' %save_model_path) if keras_model_type.lower() in ['image', 'images', "image_classification"]: - ############### Now do special image processing here ################################### + ############### Now do special IMAGE processing here ################################### if 'image_directory' in model_options.keys(): print(' Image directory given as %s' %model_options['image_directory']) image_dir = model_options["image_directory"] @@ -287,7 +295,7 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep print(deep_model.summary()) return deep_model, cat_vocab_dict elif keras_model_type.lower() in ['text', 'text classification', "text_classification"]: - ############### Now do special image processing here ################################### + ############### Now do special TEXT processing here ################################### text_alt = True ### This means you use the text directory option if 'text_directory' in model_options.keys(): print(' text directory given as %s' %model_options['text_directory']) @@ -363,8 +371,8 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep print(' %s : %s' %(key, keras_options_copy[key])) keras_options[key] = keras_options_copy[key] - list_of_model_options = ["idcols","modeltype","sep","cat_feat_cross_flag", "model_use_case", - "nlp_char_limit", "variable_cat_limit", "csv_encoding", "header", + list_of_model_options = ["idcols","modeltype","sep","cat_feat_cross_flag", "model_use_case", "label_encode_flag", + "nlp_char_limit", "variable_cat_limit", "compression", "csv_encoding", "header", "max_trials","tuner", "embedding_size", "tf_hub_model", "image_directory", 'image_height', 'image_width', "image_channels", "save_model_path"] @@ -378,6 +386,8 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep model_options_defaults["nlp_char_limit"] = 30 model_options_defaults["variable_cat_limit"] = 30 model_options_defaults["csv_encoding"] = 'utf-8' + model_options_defaults['compression'] = None ## is is needed in case to read Zip files + model_options_defaults["label_encode_flag"] = '' ## User can set it to True or False depending on their need. 
model_options_defaults["header"] = 0 ### this is the header row for pandas to read model_options_defaults["max_trials"] = 30 ## number of Storm Tuner trials ### model_options_defaults['tuner'] = 'storm' ## Storm Tuner is the default tuner. Optuna is the other option. @@ -446,7 +456,7 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep #### There may be other use cases for model_use_case in future hence leave this empty for now # #### you must create a functional model here - print('\nCreating a new Functional model here...') + print('\nCreating a new Functional keras model now...') print(''' ################################################################################# ########### C R E A T I N G A K E R A S M O D E L ############ @@ -483,7 +493,7 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep keras_options, model_options, var_df, cat_vocab_dict, project_name, save_model_flag, verbose) else: #### This is used only for custom auto models and is out of the strategy scope ####### - print('Building and training an automatic model using %s Tuner...' %model_options['tuner']) + print('Building and training a(n) %s model using %s Tuner...' %(keras_model_type, model_options['tuner'])) deep_model, cat_vocab_dict = train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, batched_data, target, keras_model_type, keras_options, model_options, var_df, cat_vocab_dict, project_name, @@ -509,6 +519,6 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep ############################################################################################ def get_save_folder(save_dir): - run_id = time.strftime("model_%Y_%m_%d-%H_%M_%S") + run_id = time.strftime("model_%Y_%m_%d_%H_%M_%S") return os.path.join(save_dir, run_id) ############################################################################################ \ No newline at end of file diff --git a/deep_autoviml/modeling/__pycache__/create_model.cpython-38.pyc b/deep_autoviml/modeling/__pycache__/create_model.cpython-38.pyc index 6c8996f..a3260f5 100644 Binary files a/deep_autoviml/modeling/__pycache__/create_model.cpython-38.pyc and b/deep_autoviml/modeling/__pycache__/create_model.cpython-38.pyc differ diff --git a/deep_autoviml/modeling/__pycache__/predict_model.cpython-38.pyc b/deep_autoviml/modeling/__pycache__/predict_model.cpython-38.pyc index 96a2da1..7e5756c 100644 Binary files a/deep_autoviml/modeling/__pycache__/predict_model.cpython-38.pyc and b/deep_autoviml/modeling/__pycache__/predict_model.cpython-38.pyc differ diff --git a/deep_autoviml/modeling/__pycache__/train_custom_model.cpython-38.pyc b/deep_autoviml/modeling/__pycache__/train_custom_model.cpython-38.pyc index 4a6978a..8ace3f6 100644 Binary files a/deep_autoviml/modeling/__pycache__/train_custom_model.cpython-38.pyc and b/deep_autoviml/modeling/__pycache__/train_custom_model.cpython-38.pyc differ diff --git a/deep_autoviml/modeling/__pycache__/train_model.cpython-38.pyc b/deep_autoviml/modeling/__pycache__/train_model.cpython-38.pyc index 0d06530..74b8523 100644 Binary files a/deep_autoviml/modeling/__pycache__/train_model.cpython-38.pyc and b/deep_autoviml/modeling/__pycache__/train_model.cpython-38.pyc differ diff --git a/deep_autoviml/modeling/create_model.py b/deep_autoviml/modeling/create_model.py index 42de2c0..eb1d59b 100644 --- a/deep_autoviml/modeling/create_model.py +++ b/deep_autoviml/modeling/create_model.py @@ -228,7 +228,7 @@ def create_model(use_my_model, 
nlp_inputs, meta_inputs, meta_outputs, nlp_output fast_models2 = ['deep_and_cross', 'deep_cross', 'deep cross', 'fast2'] nlp_models = ['bert', 'use', 'text', 'mixed_nlp'] #### The Deep and Wide Model is a bit more complicated. So it needs some changes in inputs! ###### - prebuilt_models = ['basic', 'simple', 'default','dnn','reg_dnn', + prebuilt_models = ['basic', 'simple', 'default','dnn','reg_dnn', 'deep', 'big deep', 'dnn_drop', 'big_deep', 'giant_deep', 'giant deep', 'cnn1', 'cnn','cnn2'] ###### Just do a simple check for auto models here #################### @@ -269,10 +269,10 @@ def create_model(use_my_model, nlp_inputs, meta_inputs, meta_outputs, nlp_output elif keras_model_type.lower() in ['dnn', 'simple_dnn']: ########## Now that we have setup the layers correctly, we can build some more hidden layers model_body = dnn.model - elif keras_model_type.lower() in ['dnn_drop', 'big_deep']: + elif keras_model_type.lower() in ['dnn_drop', 'big_deep', 'big deep']: #################################################### model_body = dnn_drop.model - elif keras_model_type.lower() in ['giant', 'giant_deep']: + elif keras_model_type.lower() in ['giant', 'giant_deep', 'giant deep']: #################################################### model_body = giant_deep.model elif keras_model_type.lower() in ['cnn', 'cnn1','cnn2']: @@ -442,6 +442,7 @@ def create_model(use_my_model, nlp_inputs, meta_inputs, meta_outputs, nlp_output #### This final outputs is the one that is taken into final dense layer and compiled print(' %s model loaded successfully. Now compiling model...' %keras_model_type) ############# You need to compile the non-auto models here ############### + model_body = get_compiled_model(all_inputs, model_body, output_activation, num_predicts, modeltype, optimizer, val_loss, val_metrics, cols_len, targets) print(' %s model loaded and compiled successfully...' 
%keras_model_type) diff --git a/deep_autoviml/modeling/train_custom_model.py b/deep_autoviml/modeling/train_custom_model.py index 6a2361b..a36f9a1 100644 --- a/deep_autoviml/modeling/train_custom_model.py +++ b/deep_autoviml/modeling/train_custom_model.py @@ -52,6 +52,7 @@ def set_seed(seed=31415): from tensorflow.keras.layers import BatchNormalization from tensorflow.keras.optimizers import SGD from tensorflow.keras import regularizers +from tensorflow.keras.layers import LeakyReLU ##################################################################################### from deep_autoviml.modeling.create_model import return_optimizer from deep_autoviml.utilities.utilities import get_model_defaults, get_compiled_model @@ -150,17 +151,17 @@ def build_model_optuna(trial, inputs, meta_outputs, output_activation, num_predi #K.clear_session() #reset_keras() #tf.keras.backend.reset_uids() - - n_layers = trial.suggest_int("n_layers", 1, 4) + ### Keep the number of layers slightly higher to increase model complexity ## + n_layers = trial.suggest_int("n_layers", 2, 8) #num_hidden = trial.suggest_categorical("n_units", [32, 48, 64, 96, 128]) num_hidden = trial.suggest_categorical("n_units", [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]) #weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-3, log=True) - weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-7,1e-6, 1e-5,1e-4, 1e-3,1e-2, 1e-1) + weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-7,1e-6, 1e-5,1e-4, 1e-3) use_bias = trial.suggest_categorical("use_bias", [True, False]) batch_norm = trial.suggest_categorical("batch_norm", [True, False]) add_noise = trial.suggest_categorical("add_noise", [True, False]) - dropout = trial.suggest_float("dropout", 0, 0.5) - activation_fn = trial.suggest_categorical("activation", ['relu', 'tanh', 'elu', 'selu']) + dropout = trial.suggest_float("dropout", 0.5, 0.9) + activation_fn = trial.suggest_categorical("activation", ['relu', 'elu', 'selu']) kernel_initializer = trial.suggest_categorical("kernel_initializer", ['glorot_uniform','he_normal','lecun_normal','he_uniform']) kernel_size = num_hidden @@ -183,7 +184,7 @@ def build_model_optuna(trial, inputs, meta_outputs, output_activation, num_predi model.add(BatchNormalization(name="opt_batchnorm_"+str(i))) if add_noise: - model.add(GaussianNoise(trial.suggest_float("adam_learning_rate", 1e-5, 1e-1, log=True))) + model.add(GaussianNoise(trial.suggest_float("adam_learning_rate", 1e-7, 1e-3, log=True))) model.add(Dropout(dropout, name="opt_drop_"+str(i))) @@ -198,13 +199,13 @@ def build_model_optuna(trial, inputs, meta_outputs, output_activation, num_predi else: optimizer_selected = trial.suggest_categorical("optimizer", optimizer_options) if optimizer_selected == "Adam": - kwargs["learning_rate"] = trial.suggest_float("adam_learning_rate", 1e-5, 1e-1, log=True) + kwargs["learning_rate"] = trial.suggest_float("adam_learning_rate", 1e-7, 1e-3, log=True) kwargs["epsilon"] = trial.suggest_float( "adam_epsilon", 1e-14, 1e-4, log=True ) elif optimizer_selected == "SGD": kwargs["learning_rate"] = trial.suggest_float( - "sgd_opt_learning_rate", 1e-5, 1e-2, log=True + "sgd_opt_learning_rate", 1e-7, 1e-3, log=True ) kwargs["momentum"] = trial.suggest_float("sgd_opt_momentum", 0.8, 0.95) @@ -224,27 +225,27 @@ def build_model_optuna(trial, inputs, meta_outputs, output_activation, num_predi def build_model_storm(hp, *args): #### Before every sequential model definition you need to clear the Keras backend ## keras.backend.clear_session() - + ###### we 
need to use the batch_size in a few small sizes #### if len(args) == 2: batch_limit, batch_nums = args[0], args[1] - batch_size = hp.Param('batch_size', [32, 48, 64, 96, 128, 256], + batch_size = hp.Param('batch_size', [32, 64, 128, 256, 512, 1024, 2048], ordered=True) elif len(args) == 1: batch_size = args[0] - hp.Param('batch_size', [batch_size]) + batch_size = hp.Param('batch_size', [batch_size]) else: - hp.Param('batch_size', [32]) + batch_size = hp.Param('batch_size', [64]) num_layers = hp.Param('num_layers', [1, 2, 3], ordered=True) ##### Now let us build the model body ############### model_body = Sequential([]) # example of model-wide unordered categorical parameter - activation_fn = hp.Param('activation', ['tanh','relu', 'selu', 'elu']) + activation_fn = hp.Param('activation', ['relu', 'selu', 'elu']) use_bias = hp.Param('use_bias', [True, False]) #weight_decay = hp.Param("weight_decay", np.logspace(-8, -3)) - weight_decay = hp.Param("weight_decay", [1e-8, 1e-7,1e-6, 1e-5,1e-4, 1e-3,1e-2, 1e-1]) + weight_decay = hp.Param("weight_decay", [1e-8, 1e-7,1e-6, 1e-5,1e-4]) batch_norm = hp.Param("batch_norm", [True, False]) kernel_initializer = hp.Param("kernel_initializer", @@ -275,14 +276,14 @@ def build_model_storm(hp, *args): # this param will not affect the configuration hash, if this block of code isn't executed # this is to ensure we do not test configurations that are functionally the same # but have different values for unused parameters - model_body.add(Dropout(hp.Param('dropout_value', [0.1, 0.2, 0.3, 0.4, 0.5], ordered=True), + model_body.add(Dropout(hp.Param('dropout_value', [0.5, 0.6, 0.7, 0.8, 0.9], ordered=True), name="dropout_0")) kernel_size = hp.values['kernel_size_' + str(0)] if dropout_flag: dropout_value = hp.values['dropout_value'] else: - dropout_value = 0.00 + dropout_value = 0.5 batch_norm_flag = hp.values['use_batch_norm'] # example of inline ordered parameter num_copy = copy.deepcopy(num_layers) @@ -367,10 +368,12 @@ def run_trial(self, trial, *args): save_model_architecture(comp_model, project_name, keras_model_type, cat_vocab_dict, model_options, chart_name="model_before") #print(' Custom model compiled successfully. 
Training model next...') + batch_numbers = [32, 64, 128, 256, 512, 1024, 2048, 4096] shuffle_size = 1000 - batch_sizes = np.linspace(8, batch_limit,batch_nums).astype(int).tolist() - batch_size = hp.Param('batch_size', batch_sizes, ordered=True) - #print('storm batch size = %s' %batch_size) + batch_sizes = batch_numbers[:batch_nums] + #print('storm batch sizes = %s' %batch_sizes) + batch_size = np.random.choice(batch_sizes) + #print(' selected batch size = %s' %batch_size) train_ds = train_ds.unbatch().batch(batch_size) train_ds = train_ds.shuffle(shuffle_size, reshuffle_each_iteration=False, seed=42).prefetch(batch_size)#.repeat(5) @@ -421,22 +424,23 @@ def return_optimizer_trials(hp, hpq_optimizer): nadam = keras.optimizers.Nadam(lr=0.001, beta_1=0.9, beta_2=0.999) best_optimizer = '' ############################################################################# + lr_list = [1e-2, 1e-3, 1e-4] if hpq_optimizer.lower() in ['adam']: - best_optimizer = tf.keras.optimizers.Adam(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = tf.keras.optimizers.Adam(lr=hp.Param('init_lr', lr_list), epsilon=hp.Param('epsilon', [1e-6, 1e-8, 1e-10, 1e-12, 1e-14], ordered=True)) elif hpq_optimizer.lower() in ['sgd']: - best_optimizer = keras.optimizers.SGD(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = keras.optimizers.SGD(lr=hp.Param('init_lr', lr_list), momentum=0.9) elif hpq_optimizer.lower() in ['nadam']: - best_optimizer = keras.optimizers.Nadam(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = keras.optimizers.Nadam(lr=hp.Param('init_lr', lr_list), beta_1=0.9, beta_2=0.999) elif hpq_optimizer.lower() in ['adamax']: - best_optimizer = keras.optimizers.Adamax(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = keras.optimizers.Adamax(lr=hp.Param('init_lr', lr_list), beta_1=0.9, beta_2=0.999) elif hpq_optimizer.lower() in ['adagrad']: - best_optimizer = keras.optimizers.Adagrad(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4])) + best_optimizer = keras.optimizers.Adagrad(lr=hp.Param('init_lr', lr_list)) elif hpq_optimizer.lower() in ['rmsprop']: - best_optimizer = keras.optimizers.RMSprop(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = keras.optimizers.RMSprop(lr=hp.Param('init_lr', lr_list), rho=0.9) elif hpq_optimizer.lower() in ['nesterov']: best_optimizer = keras.optimizers.SGD(lr=0.001, momentum=0.9, nesterov=True) @@ -480,6 +484,10 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ data_size = check_keras_options(keras_options, 'data_size', 10000) batch_size = check_keras_options(keras_options, 'batchsize', 64) class_weights = check_keras_options(keras_options, 'class_weight', {}) + if not isinstance(model_options["label_encode_flag"], str): + if not model_options["label_encode_flag"]: + print(' removing class weights since label_encode_flag is set to False which means classes can be anything.') + class_weights = {} print(' Class weights: %s' %class_weights) num_classes = model_options["num_classes"] num_labels = model_options["num_labels"] @@ -503,7 +511,7 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ if keras_options['lr_scheduler'] in ['expo', 'ExponentialDecay', 'exponentialdecay']: print(' chosen ExponentialDecay learning rate scheduler') expo_steps = (NUMBER_OF_EPOCHS*data_size)//batch_size - learning_rate = keras.optimizers.schedules.ExponentialDecay(0.01, expo_steps, 0.1) + learning_rate = keras.optimizers.schedules.ExponentialDecay(0.0001, expo_steps, 0.1) else: 
learning_rate = check_keras_options(keras_options, "learning_rate", 5e-2) #### The steps are actually not needed but remove them later.### @@ -542,10 +550,21 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ val_loss, num_predicts, output_activation)) #### just use modeltype for printing that's all ### modeltype = cat_vocab_dict['modeltype'] - ### set some flags for choosing the right model buy here ################### + + ############################################################################ + ### A Regular body does not have separate NLP outputs. #################### + ### However an Irregular body like fast models have separate NLP outputs. ## + ############################################################################ regular_body = True if isinstance(meta_outputs, list): - regular_body = False + if nlp_flag: + if len(nlp_outputs) > 0: + ### This is a true nlp and we need to use nlp inputs ## + regular_body = False + else: + regular_body = True + else: + regular_body = False ############################################################################ ### check the defaults for the following! @@ -584,7 +603,7 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ try: y_test = np.concatenate(list(heldout_ds.map(lambda x,y: y).as_numpy_iterator())) print(' Single-Label: Heldout data shape: %s' %(y_test.shape,)) - max_batch_size = y_test.shape[0] + max_batch_size = int(min(y_test.shape[0], 4096)) except: max_batch_size = 48 pass @@ -644,7 +663,7 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ tune_mode = val_mode if tuner.lower() == "storm": ######## S T O R M T U N E R D E F I N E D H E R E ########### - randomization_factor = 0.25 + randomization_factor = 0.5 tuner = MyTuner(project_dir=trials_saved_path, build_fn=build_model_storm, objective_direction=tune_mode, @@ -657,14 +676,14 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ #### This is where you find best model parameters for keras using SToRM ##### ############################################################################# start_time1 = time.time() - print(' STORM Tuner max_trials = %d, randomization factor = %0.1f' %( + print(' STORM Tuner max_trials = %d, randomization factor = %0.2f' %( max_trials, randomization_factor)) tuner_epochs = 100 ### keep this low so you can run fast tuner_steps = STEPS_PER_EPOCH ## keep this also very low - batch_limit = min(max_batch_size, int(2 * find_batch_size(data_size))) - batch_nums = int(min(5, 0.1 * batch_limit)) + batch_limit = min(max_batch_size, int(5 * find_batch_size(data_size))) + batch_nums = int(min(8, math.log(batch_limit, 3))) print('Max. 
batch size = %d, number of batch sizes to try: %d' %(batch_limit, batch_nums)) - + #### You have to make sure that inputs are unique, otherwise error #### tuner.search(train_ds, valid_ds, tuner_epochs, tuner_steps, inputs, meta_outputs, cols_len, output_activation, @@ -825,7 +844,7 @@ def objective(trial): print('Model training with best hyperparameters for %d epochs' %NUMBER_OF_EPOCHS) for each_callback in callbacks_list: print(' Callback added: %s' %str(each_callback).split(".")[-1]) - + pdb.set_trace() ############################ M O D E L T R A I N I N G ################## np.random.seed(42) tf.random.set_seed(42) diff --git a/deep_autoviml/modeling/train_model.py b/deep_autoviml/modeling/train_model.py index ad23afa..64218d4 100644 --- a/deep_autoviml/modeling/train_model.py +++ b/deep_autoviml/modeling/train_model.py @@ -120,6 +120,10 @@ def train_model(deep_model, full_ds, target, keras_model_type, keras_options, patience = check_keras_options(keras_options, "patience", 10) optimizer = keras_options['optimizer'] class_weights = check_keras_options(keras_options, "class_weight", {}) + if not isinstance(model_options["label_encode_flag"], str): + if not model_options["label_encode_flag"]: + print(' removing class weights since label_encode_flag is set to False which means classes can be anything.') + class_weights = {} print(' class_weights: %s' %class_weights) cols_len = len([item for sublist in list(var_df.values()) for item in sublist]) print(' original datasize = %s, initial batchsize = %s' %(data_size, batch_size)) diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing.cpython-38.pyc index fb3fc8f..55f9c3d 100644 Binary files a/deep_autoviml/preprocessing/__pycache__/preprocessing.cpython-38.pyc and b/deep_autoviml/preprocessing/__pycache__/preprocessing.cpython-38.pyc differ diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing_images.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing_images.cpython-38.pyc index d93de86..730f3d4 100644 Binary files a/deep_autoviml/preprocessing/__pycache__/preprocessing_images.cpython-38.pyc and b/deep_autoviml/preprocessing/__pycache__/preprocessing_images.cpython-38.pyc differ diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing_nlp.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing_nlp.cpython-38.pyc index 6004ebf..02c3187 100644 Binary files a/deep_autoviml/preprocessing/__pycache__/preprocessing_nlp.cpython-38.pyc and b/deep_autoviml/preprocessing/__pycache__/preprocessing_nlp.cpython-38.pyc differ diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing_tabular.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing_tabular.cpython-38.pyc index cecbfcc..d5812c9 100644 Binary files a/deep_autoviml/preprocessing/__pycache__/preprocessing_tabular.cpython-38.pyc and b/deep_autoviml/preprocessing/__pycache__/preprocessing_tabular.cpython-38.pyc differ diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing_text.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing_text.cpython-38.pyc index 7b82048..07413b2 100644 Binary files a/deep_autoviml/preprocessing/__pycache__/preprocessing_text.cpython-38.pyc and b/deep_autoviml/preprocessing/__pycache__/preprocessing_text.cpython-38.pyc differ diff --git a/deep_autoviml/preprocessing/preprocessing.py b/deep_autoviml/preprocessing/preprocessing.py index 23e944f..0e9779f 100644 --- 
a/deep_autoviml/preprocessing/preprocessing.py +++ b/deep_autoviml/preprocessing/preprocessing.py @@ -24,7 +24,7 @@ # Make numpy values easier to read. np.set_printoptions(precision=3, suppress=True) from collections import defaultdict - +import os ############################################################################################ # data pipelines and feature engg here from deep_autoviml.preprocessing.preprocessing_tabular import preprocessing_tabular @@ -65,6 +65,7 @@ from tensorflow.keras import regularizers from tensorflow.keras.layers import Dense, LSTM, GRU, Input, concatenate, Embedding from tensorflow.keras.layers import Reshape, Activation, Flatten +import tensorflow_hub as hub from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error from IPython.core.display import Image, display @@ -183,24 +184,38 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, nlp_names = [] embedding = [] ################## All other Features are Proprocessed Here ################ - fast_models = ['fast','deep_and_wide','deep_wide','wide_deep', 'mixed_nlp', + ### make sure you include mixed_nlp and combined_nlp in this list since you want it separated + fast_models = ['fast','deep_and_wide','deep_wide','wide_deep', "mixed_nlp","combined_nlp", 'wide_and_deep','deep wide', 'wide deep', 'fast1', 'deep_and_cross', 'deep_cross', 'deep cross', 'fast2',"text"] ############################################################################## meta_outputs = [] print('Preprocessing non-NLP layers for %s Keras model...' %keras_model_type) - + if not keras_model_type.lower() in fast_models: - ################################################################################ - ############ T H I S I S F O R "A U T O" M O D E L S O N L Y ######### - ################################################################################ + ############################################################################################ + ############ I N "A U T O" M O D E L S we use Lat and Lon with NLP right here ######### + ############################################################################################ if len(lats+lons) > 0: - print(' starting categorical, float and integer layer preprocessing...') + print(' Now combine all numeric and non-numeric vars into a Deep only model...') meta_outputs, meta_inputs, meta_names = preprocessing_tabular(train_ds, var_df, cat_feat_cross_flag, model_options, cat_vocab_dict, keras_model_type, verbose) - print(' All Non-NLP feature preprocessing for %s completed.' %keras_model_type) + print(' All Non-NLP feature preprocessing completed.') ### this is the order in which columns have been trained ### + if len(nlps) > 0: + print('Starting NLP string column layer preprocessing...') + nlp_inputs = create_nlp_inputs(nlps) + max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries(nlps, cat_vocab_dict, model_options) + nlp_encoded = encode_nlp_inputs(nlp_inputs, cat_vocab_dict) + ### we call nlp_outputs as embedding in this section of the program #### + print('NLP Preprocessing completed.') + #merged = [meta_outputs, nlp_encoded] + merged = layers.concatenate([nlp_encoded, meta_outputs]) + print(' combined categorical+numeric with nlp outputs successfully for %s model...' %keras_model_type) + nlp_inputs = list(nlp_inputs.values()) + else: + merged = meta_outputs final_training_order = nlp_names + meta_names ### find their dtypes - remember to use element_spec[0] for train data sets! 
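The new branch above concatenates the encoded NLP columns with the tabular (meta) outputs before the model body is built. A minimal Keras functional-API sketch of that pattern using TF 2.6+ preprocessing layers; the layer sizes and column names are made up, and the real code builds its inputs from cat_vocab_dict rather than hard-coding them:

```python
import tensorflow as tf
from tensorflow.keras import layers

# one numeric branch and one text branch, encoded separately
num_input = tf.keras.Input(shape=(1,), name="price", dtype=tf.float32)
txt_input = tf.keras.Input(shape=(1,), name="review", dtype=tf.string)

meta_outputs = layers.Dense(16, activation="relu")(num_input)

vectorizer = layers.TextVectorization(max_tokens=1000, output_sequence_length=20)
vectorizer.adapt(["tiny corpus used only to build a toy vocabulary"])
nlp_encoded = layers.Flatten()(layers.Embedding(1000, 8)(vectorizer(txt_input)))

# same merge step as in the hunk above: NLP embedding first, then meta outputs
merged = layers.concatenate([nlp_encoded, meta_outputs])
model = tf.keras.Model([txt_input, num_input], layers.Dense(1)(merged))
model.summary()
```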
ds_types = dict([(col_name, train_ds.element_spec[0][col_name].dtype) for col_name in final_training_order ]) @@ -209,48 +224,61 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, print('Inferred column names, layers and types (double-check for duplicates and correctness!): \n%s' %col_type_tuples) print(' %s model loaded and compiled successfully...' %keras_model_type) else: - ####### Now combine all vars into a complete auto deep and wide model ############## + ############################################################################################ + #### In "auto" vs. "mixed_nlp", the NLP processings are different. Numeric process is same. + #### Here both NLP and NON-NLP varas are combined with embedding to form a deep wide model # + ############################################################################################ + print(' Now combine all numeric+cat+NLP vars into a Deep and Wide model') ## Since we are processing NLPs separately we need to remove them from inputs ### if len(NON_NLP_VARS) == 0: - print(' Non-NLP vars is zero in this dataset. No tabular preprocesing needed...') + print(' There are zero non-NLP variables in this dataset. No non-NLP preprocesing needed...') meta_inputs = [] else: - #### Here both NLP and NON-NLP varas are combined with embedding to form a deep wide model # FEATURE_NAMES = left_subtract(FEATURE_NAMES, nlps) dropout_rate = 0.1 hidden_units = [dense_layer2, dense_layer3] inputs = create_fast_inputs(FEATURE_NAMES, NUMERIC_FEATURE_NAMES, FLOATS) #all_inputs = dict(zip(meta_names,meta_inputs)) + #### In auto models we want "wide" to be short. Hence use_embedding to be True. wide = encode_auto_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict, - hidden_units, use_embedding=False) + hidden_units, use_embedding=True) wide = layers.BatchNormalization()(wide) deep = encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict, use_embedding=True) + deep = layers.BatchNormalization()(deep) meta_inputs = list(inputs.values()) ### convert input layers to a list #### If there are NLP vars in dataset, you must combine the nlp_outputs ## + print(' All Non-NLP feature preprocessing completed.') if len(nlps) > 0: print('Starting NLP string column layer preprocessing...') nlp_inputs = create_nlp_inputs(nlps) max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries(nlps, cat_vocab_dict, model_options) nlp_encoded = encode_nlp_inputs(nlp_inputs, cat_vocab_dict) ### we call nlp_outputs as embedding in this section of the program #### - print(' NLP Preprocessing completed.') + print('NLP preprocessing completed.') merged = [wide, deep, nlp_encoded] - print(' %s combined wide, deep and nlp outputs successfully...' %keras_model_type) + print(' Combined wide, deep and nlp outputs successfully') nlp_inputs = list(nlp_inputs.values()) else: merged = [wide, deep] print(' %s combined wide and deep successfully...' 
%keras_model_type) - return nlp_inputs, meta_inputs, merged, embedding - elif keras_model_type.lower() == 'mixed_nlp': + ### if NLP_outputs is NOT a list, it means there is some NLP variable in the data set + if not isinstance(merged, list): + print('Shape of output from all preprocessing layers before model training = %s' %(merged.shape,)) + return nlp_inputs, meta_inputs, merged, embedding + elif keras_model_type.lower() in ['mixed_nlp', 'combined_nlp']: ### this is similar to auto models but uses TFHub models for NLP preprocessing ##### if len(NON_NLP_VARS) == 0: print(' Non-NLP vars is zero in this dataset. No tabular preprocesing needed...') meta_inputs = [] else: + ############################################################################################ + #### In "auto" vs. "mixed_nlp", the NLP processings are different. Numeric process is same. + ############################################################################################ + print(' Now combine all numeric and non-numeric vars into a Deep and Wide model...') #### Here both NLP and NON-NLP varas are combined with embedding to form a deep wide model # FEATURE_NAMES = left_subtract(FEATURE_NAMES, nlps) - dropout_rate = 0.1 + dropout_rate = 0.5 hidden_units = [dense_layer2, dense_layer3] inputs = create_fast_inputs(FEATURE_NAMES, NUMERIC_FEATURE_NAMES, FLOATS) #all_inputs = dict(zip(meta_names,meta_inputs)) @@ -259,20 +287,27 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, wide = layers.BatchNormalization()(wide) deep = encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict, use_embedding=True) + deep = layers.BatchNormalization()(deep) meta_inputs = list(inputs.values()) ### convert input layers to a list + print(' All Non-NLP feature preprocessing completed.') #### If there are NLP vars in dataset, you use TFHub models in this case ## if len(nlps) > 0: print('Starting NLP string column layer preprocessing...') - nlp_inputs, embedding, nlp_names = preprocessing_nlp(train_ds, model_options, + nlp_inputs, embedding, nlp_names = mixed_preprocessing_nlp(train_ds, model_options, var_df, cat_vocab_dict, keras_model_type, verbose) ### we call nlp_outputs as embedding in this section of the program #### print(' NLP Preprocessing completed.') - print('There are no NLP variables in this dataset for preprocessing...') else: + print('There are no NLP variables in this dataset for preprocessing...') embedding = [] - meta_outputs = layers.concatenate([wide, deep]) - print(' %s model: combined wide, deep and NLP (with TFHub) successfully...' %keras_model_type) + if isinstance(embedding, list): + ### This means embedding is an empty list with nothing in it ### + meta_outputs = layers.concatenate([wide, deep]) + print(' Combined wide, deep layers successfully.') + else: + meta_outputs = layers.concatenate([wide, deep, embedding]) + print(' Combined wide, deep and NLP (with TFHub) successfully.') else: meta_inputs = [] ##### You need to send in the ouput from embedding layer to this sequence of layers #### @@ -348,13 +383,13 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, print('There is no numeric or cat or int variables in this data set.') if isinstance(nlp_outputs, list): ### if NLP_outputs is a list, it means there is no NLP variable in the data set - print(' There is no NLP variable in this data set. Returning') + print('There is no NLP variable in this data set. 
Returning') consolidated_outputs = meta_outputs else: - print(' %s vector dimensions from NLP variable' %(nlp_outputs.shape,)) + print('Shape of encoded NLP variables just before training: %s' %(nlp_outputs.shape,)) consolidated_outputs = nlp_outputs else: - print(' Shape of output from numeric+integer+cat variables before model training = %s' %(meta_outputs.shape,)) + print('Shape of non-NLP encoded variables just before model training = %s' %(meta_outputs.shape,)) if isinstance(nlp_outputs, list): ### if NLP_outputs is a list, it means there is no NLP variable in the data set print(' There is no NLP variable in this data set. Continuing...') @@ -362,8 +397,72 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, consolidated_outputs = meta_outputs else: ### if NLP_outputs is NOT a list, it means there is some NLP variable in the data set - print(' %s vector dimensions from NLP variable' %(nlp_outputs.shape,)) + print(' Shape of encoded NLP variables just before training: %s' %(nlp_outputs.shape,)) consolidated_outputs = layers.concatenate([nlp_outputs, meta_outputs]) print('Shape of output from all preprocessing layers before model training = %s' %(consolidated_outputs.shape,)) return nlp_inputs, meta_inputs, consolidated_outputs, nlp_outputs ########################################################################################## +def mixed_preprocessing_nlp(train_ds, model_options, + var_df, cat_vocab_dict, + keras_model_type, verbose=0): + """ + This is only for mixed NLP preprocessing of tabular and nlp datasets + """ + nlp_inputs = [] + all_nlp_encoded = [] + all_nlp_embeddings = [] + nlp_col_names = [] + nlp_columns = var_df['nlp_vars'] + nlp_columns = list(set(nlp_columns)) + + if len(nlp_columns) == 1: + nlp_column = nlp_columns[0] + elif keras_model_type.lower() == 'combined_nlp': + nlp_column = 'combined_nlp_text' ### this is when there are multiple nlp columns ## + else: + ### This is to keep nlp columns separate ### + nlp_column = '' + + #### Now perform NLP preproprocessing for each nlp_column ###### + ######### This is where we load Swivel model and process each nlp column ### + try: + bert_model_name = "Swivel-20" + if os.name == 'nt': + tfhub_path = os.path.join(keras_model_type, 'tf_cache') + os.environ['TFHUB_CACHE_DIR'] = tfhub_path + tfhub_handle_encoder = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1' + else: + tfhub_handle_encoder = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1' + hub_layer = hub.KerasLayer(tfhub_handle_encoder, + input_shape=[], + dtype=tf.string, + trainable=False, name="Swivel20_encoder") + print(f' {bert_model_name} selected from: {tfhub_handle_encoder}') + ### this is for mixed nlp models. You use Swivel to embed NLP columns fast #### + if len(nlp_columns) > 1: + copy_nlp_columns = copy.deepcopy(nlp_columns) + for each_nlp in copy_nlp_columns: + nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=each_nlp) + nlp_inputs.append(nlp_input) + x = hub_layer(nlp_input) + all_nlp_encoded.append(x) + nlp_col_names.append(each_nlp) + else: + nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=nlp_column) + x = hub_layer(nlp_input) + ### Now we combine all inputs and outputs in one place here ########### + nlp_inputs.append(nlp_input) + all_nlp_encoded.append(x) + nlp_col_names.append(nlp_column) + except: + print(' Error: Skipping %s for keras layer preprocessing...' %nlp_column) + ### we gather all outputs above into a single list here called all_features! 
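The new mixed_preprocessing_nlp helper above embeds each NLP column with a frozen Swivel-20 TF Hub layer. A standalone sketch of that building block, using the same handle, layer arguments, and combined_nlp column name as the hunk (downloading the module requires network access):

```python
import tensorflow as tf
import tensorflow_hub as hub

tfhub_handle_encoder = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(tfhub_handle_encoder, input_shape=[], dtype=tf.string,
                           trainable=False, name="Swivel20_encoder")

nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name="combined_nlp_text")
embedded = hub_layer(nlp_input)      # each string becomes a 20-dim vector: shape (None, 20)
print(embedded.shape)
```

Because the layer is frozen, mixed_nlp and combined_nlp models get a cheap, fixed-size text representation that can be concatenated with the wide and deep branches.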
+ if len(all_nlp_encoded) == 0: + print('There are no NLP string variables in this dataset to preprocess!') + elif len(all_nlp_encoded) == 1: + all_nlp_embeddings = all_nlp_encoded[0] + else: + all_nlp_embeddings = layers.concatenate(all_nlp_encoded) + + return nlp_inputs, all_nlp_embeddings, nlp_col_names +################################################################################# diff --git a/deep_autoviml/preprocessing/preprocessing_nlp.py b/deep_autoviml/preprocessing/preprocessing_nlp.py index 17a345b..fc18fc2 100644 --- a/deep_autoviml/preprocessing/preprocessing_nlp.py +++ b/deep_autoviml/preprocessing/preprocessing_nlp.py @@ -123,7 +123,8 @@ def preprocessing_nlp(train_ds, model_options, var_df, cat_vocab_dict, keras_mod 'wide_and_deep','deep wide', 'wide deep', 'fast1', 'deep_and_cross', 'deep_cross', 'deep cross', 'fast2'] - max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries(nlp_columns, cat_vocab_dict, model_options) + max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries( + nlp_columns, cat_vocab_dict, model_options, verbose) if len(nlp_columns) == 1: nlp_column = nlp_columns[0] @@ -360,7 +361,7 @@ def encode_NLP_column(train_ds, nlp_column, nlp_input, vocab_size, sequence_leng #print(f" {nlp_column} vocab size = {vocab_size}, sequence_length={sequence_length}") return nlp_vectorized ################################################################################################ -def aggregate_nlp_dictionaries(nlp_columns, cat_vocab_dict, model_options): +def aggregate_nlp_dictionaries(nlp_columns, cat_vocab_dict, model_options, verbose=0): """ This function aggregates all the dictionaries you need for nlp processing. Just send in a list of nlp variables and a small data sample and it will compute all @@ -380,20 +381,24 @@ def aggregate_nlp_dictionaries(nlp_columns, cat_vocab_dict, model_options): if len(nlps_copy) > 0: vocab_train_small = [] for each_name in nlps_copy: - print('Creating aggregate_nlp_dictionaries for nlp column = %s' %each_name) + if verbose >= 2: + print('Creating aggregate_nlp_dictionaries for nlp column = %s' %each_name) max_tokens_zip[each_name] = cat_vocab_dict[each_name]['size_of_vocab'] print(' size of vocabulary = %s' %max_tokens_zip[each_name]) seq_tokens_zip[each_name] = cat_vocab_dict[each_name]['seq_length'] seq_lengths.append(seq_tokens_zip[each_name]) - print(' sequence length = %s' %seq_tokens_zip[each_name]) + if verbose >= 2: + print(' sequence length = %s' %seq_tokens_zip[each_name]) vocab_size = cat_vocab_dict[each_name]['size_of_vocab'] vocab_train_small += cat_vocab_dict[each_name]['vocab'] vocab_train_small = np.unique(vocab_train_small).tolist() - best_embedding_size = closest(lst, vocab_size//4000) - print(' recommended embedding_size = %s' %best_embedding_size) + best_embedding_size = closest(lst, vocab_size//50000) + if verbose >= 2: + print(' recommended embedding_size = %s' %best_embedding_size) input_embedding_size = check_model_options(model_options, "embedding_size", best_embedding_size) if input_embedding_size != best_embedding_size: - print(' input embedding size given as %d. Overriding recommended embedding_size...' %input_embedding_size) + if verbose >= 2: + print(' input embedding size given as %d. Overriding recommended embedding_size...' 
%input_embedding_size) best_embedding_size = input_embedding_size embed_tokens_zip[each_name] = best_embedding_size return max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small diff --git a/deep_autoviml/preprocessing/preprocessing_tabular.py b/deep_autoviml/preprocessing/preprocessing_tabular.py index 46c82af..7c260b0 100644 --- a/deep_autoviml/preprocessing/preprocessing_tabular.py +++ b/deep_autoviml/preprocessing/preprocessing_tabular.py @@ -327,7 +327,7 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option except: print(' Error: Skipping %s since Keras Bolean preprocessing is erroring' %each_bool) - ###### This is where we handle Boolean Integer variables - we just combine them ################## + ###### This is where we handle Boolean + Integer variables - we just combine them ################## int_bools_copy = copy.deepcopy(int_bools) if len(int_bools_copy) > 0: for each_int in int_bools_copy: @@ -361,16 +361,24 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option else: nums_bin = max(20, int(max_tokens_zip[each_int]/40)) int_input = keras.Input(shape=(1,), name=each_int, dtype="int32") - encoded = encode_any_integer_to_hash_categorical(int_input, each_int, - train_ds, nums_bin) + if (max_tokens_zip[each_int] >= high_cats_alert): + encoded = encode_any_integer_to_hash_categorical(int_input, each_int, + train_ds, nums_bin) + if verbose: + print(' %s encoded: %d categories, %d bins. After integer HASH encoding shape = %s' %(each_int, + max_tokens_zip[each_int], nums_bin, encoded.shape[1])) + else: + encoded = encode_categorical_and_integer_features(int_input, each_int, + train_ds, is_string=False) + if verbose: + print(' %s encoded: %d categories. After integer encoding shape: %s' %(each_int, + max_tokens_zip[each_int], encoded.shape[1])) all_int_inputs.append(int_input) all_int_encoded.append(encoded) all_input_names.append(each_int) if verbose: - print(' %s number of categories = %d and bins = %d: after integer hash encoding shape: %s' %(each_int, - max_tokens_zip[each_int], nums_bin, encoded.shape[1])) - if (encoded.shape[1] >= high_cats_alert) or (max_tokens_zip[each_int] >= high_cats_alert): - print(' Alert! excessive feature trap. Should this not be a float variable?? %s' %each_int) + if (encoded.shape[1] >= high_cats_alert): + print(' High Dims Alert! Convert %s to float??' %each_int) except: print(' Error: Skipping %s since Keras Integer preprocessing erroring' %each_int) @@ -384,16 +392,19 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option int_input = keras.Input(shape=(1,), name=each_int, dtype="int32") cat_input_dict[each_int] = int_input vocab = max_tokens_zip[each_int] - encoded = encode_integer_to_categorical_feature(int_input, each_int, - train_ds, vocab) + #encoded = encode_integer_to_categorical_feature(int_input, each_int, + # train_ds, vocab) + encoded = encode_categorical_and_integer_features(int_input, each_int, + train_ds, is_string=False) all_int_cat_inputs.append(int_input) all_int_cat_encoded.append(encoded) all_input_names.append(each_int) if verbose: - print(' %s number of categories = %d: after integer categorical encoding shape: %s' %( - each_int, len(vocab), encoded.shape[1])) + print(' %s encoded: %d categories. After integer encoding shape: %s' %(each_int, + len(vocab), encoded.shape[1])) if encoded.shape[1] > high_cats_alert: - print(' Alert! excessive feature dimension created. 
Check if necessary to have this many.') + if verbose: + print(' High Dims Alert! Convert %s to float??' %each_int) except: print(' Error: Skipping %s since Keras Integer Categorical preprocessing erroring' %each_int) @@ -408,8 +419,10 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option cat_input_dict[each_cat] = cat_input vocab = max_tokens_zip[each_cat] max_tokens = len(vocab) - cat_encoded = encode_string_categorical_feature_categorical(cat_input, each_cat, - train_ds, vocab) + cat_encoded = encode_categorical_and_integer_features(cat_input, each_cat, + train_ds, is_string=True) + #cat_encoded = encode_string_categorical_feature_categorical(cat_input, each_cat, + # train_ds, vocab) all_cat_inputs.append(cat_input) all_cat_encoded.append(cat_encoded) cat_encoded_dict[each_cat] = cat_encoded @@ -418,7 +431,8 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option print(' %s number of categories = %d: after string to categorical encoding shape: %s' %( each_cat, max_tokens, cat_encoded.shape[1])) if cat_encoded.shape[1] > high_cats_alert: - print(' Alert! excessive feature dimension created. Check if necessary to have this many.') + if verbose: + print(' High Dims Alert! Convert %s to float??' %each_int) except: print(' Error: Skipping %s since Keras Categorical preprocessing erroring' %each_cat) @@ -487,9 +501,9 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option all_num_encoded.append(encoded) num_only_encoded.append(encoded) all_input_names.append(each_num) - print(' %s numeric column left as is for feature preprocessing' %each_num) + print(' %s numeric column left as is since float' %each_num) except: - print(' Error: Skipping %s since Keras Float preprocessing erroring' %each_num) + print(' Error: Skipping %s due to Keras float preprocessing error' %each_num) # Latitude and Longitude Numerical features are Binned first and then Category Encoded ####### @@ -617,9 +631,16 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option meta_input_categ1 = all_low_cat_encoded[0] meta_categ1 = layers.Dense(concat_layer_neurons, kernel_initializer=concat_kernel_initializer)(meta_input_categ1) else: - meta_input_categ1 = layers.concatenate(all_low_cat_encoded) - #WIDE - This Dense layer connects to input layer - Categorical Data - meta_categ1 = layers.Dense(concat_layer_neurons, kernel_initializer=concat_kernel_initializer)(meta_input_categ1) + int_list = [x for x in all_low_cat_encoded if x.dtype in [np.int8, np.int16, np.int32, np.int64]] + float_list = [ x for x in all_low_cat_encoded if x.dtype in [np.float32, np.float64]] + if len(float_list) == len(all_low_cat_encoded): + ### All of them are floats ### + all_high_cat_encoded += float_list + else: + meta_input_categ1 = layers.concatenate(int_list) + all_high_cat_encoded += float_list + #WIDE - This Dense layer connects to input layer - Categorical Data + meta_categ1 = layers.Dense(concat_layer_neurons, kernel_initializer=concat_kernel_initializer)(meta_input_categ1) skip_meta_categ2 = False if len(all_high_cat_encoded) == 0: @@ -779,6 +800,22 @@ def encode_binning_numeric_feature_categorical(feature, name, dataset, bins_lat, return encoded_feature ########################################################################################### +def encode_categorical_and_integer_features(feature, name, dataset, is_string): + lookup_class = StringLookup if is_string else IntegerLookup + # Create a lookup layer which will turn strings 
into integer indices + lookup = lookup_class(output_mode="binary") + + # Prepare a Dataset that only yields our feature + feature_ds = dataset.map(lambda x, y: x[name]) + feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1)) + + # Learn the set of possible string values and assign them a fixed integer index + lookup.adapt(feature_ds) + + # Turn the string input into integer indices + encoded_feature = lookup(feature) + return encoded_feature +############################################################################## def encode_string_categorical_feature_categorical(feature_input, name, dataset, vocab): """ Inputs: @@ -796,7 +833,7 @@ def encode_string_categorical_feature_categorical(feature_input, name, dataset, Outputs: ----------- encoded_feature: a keras.Tensor. You can use this tensor in keras models for training. - The Tensor has a shape of (None, 1) - None indicates that it has not been + The Tensor has a shape of (None, 1) - None indicates that it is not batched. When the output_mode = "binary" or "count", the output is in float otherwise it is integer. """ extra_oov = 3 @@ -1076,7 +1113,8 @@ def encode_any_feature_to_embed_categorical(feature_input, name, dataset, vocabu # Learn the set of possible string values and assign them a fixed integer index #lookup.adapt(feature_ds) encoded_feature = lookup(feature_input) - embedding_dims = int(math.sqrt(len(vocabulary))) + #embedding_dims = int(math.sqrt(len(vocabulary))) + embedding_dims = int(max(2, math.log(len(vocabulary), 2))) # Create an embedding layer with the specified dimensions. embedding = tf.keras.layers.Embedding( input_dim=len(vocabulary)+extra_oov, output_dim=embedding_dims @@ -1281,18 +1319,32 @@ def encode_auto_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE numeric_encoded = [] text_encoded = [] encoded_features = [] - + #### In "auto" model, "wide" part is short. Hence we use "count" with "embedding" flag. for feature_name in inputs: vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] extra_oov = 3 if feature_name in CATEGORICAL_FEATURE_NAMES: cat_encoded.append('') cat_len = len(vocabulary) - encoded_feature = inputs[feature_name] - encoded_feature = tf.keras.layers.experimental.preprocessing.StringLookup( - vocabulary=vocabulary, mask_token=None, oov_token = '~UNK~')(encoded_feature) - cat_encoded[-1] = tf.keras.layers.experimental.preprocessing.CategoryEncoding( - num_tokens = cat_len + 1)(encoded_feature) + lookup = StringLookup(vocabulary=vocabulary, + mask_token=None, + oov_token = '~UNK~') + if len(vocabulary) > 32: + # Convert the string input values into integer indices. + encoded_feature = inputs[feature_name] + encoded_feature = lookup(encoded_feature) + embedding_dims = int(max(2, math.log(len(vocabulary), 2))) + # Create an embedding layer with the specified dimensions. + embedding = Embedding( + input_dim=len(vocabulary)+extra_oov, output_dim=embedding_dims + ) + # Convert the index values to embedding representations. + encoded_feature = embedding(encoded_feature) + cat_encoded[-1] = Flatten()(encoded_feature) + else: + encoded_feature = inputs[feature_name] + encoded_feature = lookup(encoded_feature) + cat_encoded[-1] = CategoryEncoding(num_tokens = cat_len + 1)(encoded_feature) elif feature_name in FLOATS: ### you just ignore the floats in cross models #### numeric_encoded.append('') @@ -1303,7 +1355,7 @@ def encode_auto_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE else: cat_encoded.append('') if len(vocabulary) > 100: - print(' ALERT! 
Excessive feature dimension of %s. Should %s be a float variable?' %( + print(' ALERT! Excessive dimensions in %s. Should integer %s be a float variable?' %( len(vocabulary), feature_name)) use_embedding = True lookup = IntegerLookup( @@ -1333,7 +1385,7 @@ def encode_fast_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE # Create a lookup to convert string values to an integer indices. # Since we are not using a mask token but expecting some out of vocabulary # (oov) tokens, we set mask_token to None and num_oov_indices to extra_oov. - if len(vocabulary) > 50: + if len(vocabulary) > 32: use_embedding = True lookup = StringLookup( vocabulary=vocabulary, @@ -1346,7 +1398,8 @@ def encode_fast_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE # Convert the string input values into integer indices. encoded_feature = inputs[feature_name] encoded_feature = lookup(encoded_feature) - embedding_dims = int(math.sqrt(len(vocabulary))) + #embedding_dims = int(math.sqrt(len(vocabulary))) + embedding_dims = int(max(2, math.log(len(vocabulary), 2))) # Create an embedding layer with the specified dimensions. embedding = layers.Embedding( input_dim=len(vocabulary)+extra_oov, output_dim=embedding_dims @@ -1365,7 +1418,7 @@ def encode_fast_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE encoded_feature = normalizer(inputs[feature_name]) else: if len(vocabulary) > 100: - print(' ALERT! Excessive feature dimension of %s. Should %s be a float variable?' %( + print(' ALERT! Excessive feature dimension in %s. Should %s be a float variable?' %( len(vocabulary), feature_name)) use_embedding = True lookup = IntegerLookup( @@ -1374,7 +1427,7 @@ def encode_fast_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE num_oov_indices=extra_oov, max_tokens=None, oov_token=-9999, - output_mode="count" if not use_embedding else "binary", + output_mode="count" if use_embedding else "binary", ) # Use the numerical features as-is. encoded_feature = inputs[feature_name] @@ -1407,8 +1460,9 @@ def encode_nlp_inputs(inputs, CATEGORICAL_FEATURES_WITH_VOCABULARY): for feature_name in inputs: vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]['vocab'] extra_oov = 50 - vocab_size = int(math.sqrt(len(vocabulary))) - best_embedding_size = closest(list_embedding_sizes, vocab_size//4000) + #vocab_size = int(math.sqrt(len(vocabulary))) + #best_embedding_size = closest(list_embedding_sizes, vocab_size//4000) + best_embedding_size = int(max(2, math.log(len(vocabulary), 2))) lookup = StringLookup( vocabulary=vocabulary, @@ -1483,7 +1537,7 @@ def encode_num_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEA #################################################################################################### def encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEATURES_WITH_VOCABULARY, use_embedding=False): - + #### This is a new version intended to reduce dimensions ################# encoded_features = [] for feature_name in inputs: vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] @@ -1492,7 +1546,7 @@ def encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEA # Create a lookup to convert string values to an integer indices. # Since we are not using a mask token but expecting some out of vocabulary # (oov) tokens, we set mask_token to None and num_oov_indices to extra_oov. 
- if len(vocabulary) > 50: + if len(vocabulary) > 32: use_embedding = True lookup = StringLookup( vocabulary=vocabulary, @@ -1505,7 +1559,7 @@ def encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEA # Convert the string input values into integer indices. encoded_feature = inputs[feature_name] encoded_feature = lookup(encoded_feature) - embedding_dims = int(math.sqrt(len(vocabulary))) + embedding_dims = int(max(2, math.log(len(vocabulary), 2))) # Create an embedding layer with the specified dimensions. embedding = layers.Embedding( input_dim=len(vocabulary)+extra_oov, output_dim=embedding_dims @@ -1525,8 +1579,24 @@ def encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEA encoded_feature = normalizer(inputs[feature_name]) #encoded_feature = inputs[feature_name] encoded_features.append(encoded_feature) + ################### + int_list = [x for x in encoded_features if x.dtype in [np.int8, np.int16, np.int32, np.int64]] + float_list = [ x for x in encoded_features if x.dtype in [np.float32, np.float64]] + if len(int_list) > 0: + all_int_features = layers.concatenate(int_list) + meta_int1 = layers.Dense(32)(all_int_features) + if len(float_list) > 0: + all_float_features = layers.concatenate(float_list) + meta_float1 = layers.Dense(32)(all_float_features) + #### You can add a Dense layer if needed here ########### + if len(int_list) > 0: + if len(float_list) > 0: + all_features = layers.concatenate([meta_int1, meta_float1]) + else: + all_features = layers.concatenate([meta_int1]) + else: + all_features = layers.concatenate([meta_float1]) ##### This is where are float encoded features are combined ### - all_features = layers.concatenate(encoded_features) return all_features ################################################################################ from itertools import combinations diff --git a/deep_autoviml/utilities/__pycache__/utilities.cpython-38.pyc b/deep_autoviml/utilities/__pycache__/utilities.cpython-38.pyc index 14a9697..076be18 100644 Binary files a/deep_autoviml/utilities/__pycache__/utilities.cpython-38.pyc and b/deep_autoviml/utilities/__pycache__/utilities.cpython-38.pyc differ diff --git a/deep_autoviml/utilities/utilities.py b/deep_autoviml/utilities/utilities.py index ec1a952..f12dc39 100644 --- a/deep_autoviml/utilities/utilities.py +++ b/deep_autoviml/utilities/utilities.py @@ -913,6 +913,7 @@ def get_callbacks(val_mode, val_monitor, patience, learning_rate, save_weights_o callbacks_dict['tensor_board'] = tb callbacks_dict['print'] = pr callbacks_dict['reducer'] = rlr + callbacks_dict['rlr'] = rlr callbacks_dict['decay'] = lr_decay_cb return callbacks_dict, tensorboard_logpath @@ -925,14 +926,14 @@ def get_chosen_callback(callbacks_dict, keras_options): lr_scheduler = callbacks_dict['onecycle2'] elif keras_options['lr_scheduler'] == 'onecycle': lr_scheduler = callbacks_dict['onecycle'] - elif keras_options['lr_scheduler'] == 'reducer': + elif keras_options['lr_scheduler'] in ['reducer', 'rlr']: lr_scheduler = callbacks_dict['reducer'] elif keras_options['lr_scheduler'] == 'decay': lr_scheduler = callbacks_dict['decay'] elif keras_options['lr_scheduler'] == "scheduler": lr_scheduler = callbacks_dict['scheduler'] else: - lr_scheduler = callbacks_dict['scheduler'] + lr_scheduler = callbacks_dict['rlr'] return lr_scheduler ################################################################################################ def get_chosen_callback2(callbacks_dict, keras_options): @@ -948,8 +949,8 @@ def 
get_chosen_callback2(callbacks_dict, keras_options): elif keras_options['lr_scheduler'] == 'decay': lr_scheduler = callbacks_dict['lr_decay_cb'] else: - lr_scheduler = callbacks_dict['lr_sched'] - keras_options['lr_scheduler'] = "lr_sched" + lr_scheduler = callbacks_dict['rlr'] + keras_options['lr_scheduler'] = "rlr" return lr_scheduler ################################################################################################ import math diff --git a/requirements.txt b/requirements.txt index 4c27ea8..da502f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,17 @@ ipython jupyter -tensorflow==2.5.2 +tensorflow~=2.5 pandas -numpy==1.19.2 +numpy~=1.19.2 matplotlib -scikit-learn>=0.23.1 +scikit-learn>=0.23.1, <=0.24.2 regex storm-tuner>=0.0.8 emoji xlrd tensorflow_hub>=0.12.0 -tensorflow-text==2.5.0 +tensorflow-text~=2.5 optuna -mlflow==1.22.0 +statsmodels +seaborn +scikit-image diff --git a/setup.py b/setup.py index 3d169ce..c75a9e3 100644 --- a/setup.py +++ b/setup.py @@ -15,36 +15,18 @@ ############################################################################################ import setuptools -base_packages = [ - "ipython", - "jupyter", - "tensorflow==2.5.2", - "pandas", - "matplotlib", - "numpy==1.19.2", - "scikit-learn>=0.23.1", - "regex", - "emoji", - "storm-tuner>=0.0.8", - "optuna", - "tensorflow_hub==0.12.0", - "xlrd", - "mlflow==1.22.0", - ] - with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() setuptools.setup( name="deep_autoviml", - version="0.0.78.dev2", + version="0.0.79", author="Ram Seshadri", # author_email="author@example.com", description="Automatically Build Deep Learning Models and Pipelines fast!", long_description=long_description, long_description_content_type="text/markdown", license='Apache License 2.0', - license_files=("LICENSE",), url="https://github.com/AutoViML/deep_autoviml", packages = [ "deep_autoviml", @@ -55,10 +37,22 @@ "deep_autoviml.utilities", ], include_package_data=True, - install_requires=base_packages, - extras_require={ - "text": ["tensorflow-text==2.5.0",] - }, + install_requires=[ + "ipython", + "jupyter", + "tensorflow~=2.5", + "pandas", + "matplotlib", + "numpy~=1.19.2", + "scikit-learn>=0.23.1, <=0.24.2", + "regex", + "emoji", + "storm-tuner>=0.0.8", + "optuna", + "tensorflow_hub~=0.12.0", + "tensorflow-text~=2.5", + "xlrd" + ], classifiers=[ "Programming Language :: Python :: 3", "Operating System :: OS Independent",
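The sketch below is not part of the patch; it is a minimal illustration of three ideas the hunks above rely on: the adapt-then-encode pattern behind the new encode_categorical_and_integer_features() helper, the log2-based embedding-size heuristic that replaces int(math.sqrt(len(vocabulary))), and the Swivel-20 TF Hub layer that mixed_preprocessing_nlp() attaches to NLP columns. The feature names ("city", "review_text"), the toy dataset, and the small vocabulary are hypothetical, and output_mode="binary" is spelled "multi_hot" in newer TensorFlow releases, so the exact keyword depends on the installed version.

    import math
    import tensorflow as tf
    import tensorflow_hub as hub
    # tf.keras.layers.StringLookup in TF >= 2.6; the experimental path is assumed here
    from tensorflow.keras.layers.experimental.preprocessing import StringLookup

    # Toy (features, label) dataset mirroring the train_ds structure the helper expects.
    toy_ds = tf.data.Dataset.from_tensor_slices(
        ({"city": ["paris", "tokyo", "paris", "lima"]}, [0, 1, 0, 1])
    ).batch(2)

    # Learn the vocabulary from the data and emit a multi-hot ("binary") encoding,
    # as encode_categorical_and_integer_features() does for string and integer columns.
    lookup = StringLookup(output_mode="binary")
    feature_ds = toy_ds.map(lambda x, y: tf.expand_dims(x["city"], -1))
    lookup.adapt(feature_ds)
    city_input = tf.keras.Input(shape=(1,), dtype=tf.string, name="city")
    city_encoded = lookup(city_input)            # shape (None, vocab size incl. OOV)

    # Log2 embedding-size heuristic used throughout this patch for larger vocabularies:
    # a 100,000-term vocabulary now gets ~16 dimensions instead of ~316 with sqrt().
    vocabulary = ["paris", "tokyo", "lima"]
    embedding_dims = int(max(2, math.log(len(vocabulary), 2)))   # 3 categories -> 2 dims
    embedding = tf.keras.layers.Embedding(
        input_dim=len(vocabulary) + 3,           # +3 extra OOV slots, as in the patch
        output_dim=embedding_dims)

    # Swivel-20 sentence embedding used by mixed_preprocessing_nlp() for NLP columns
    # (downloads the module from TF Hub on first use).
    hub_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1",
                               input_shape=[], dtype=tf.string,
                               trainable=False, name="Swivel20_encoder")
    nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name="review_text")
    nlp_embedding = hub_layer(nlp_input)         # fixed 20-dim embedding per row

The design trade-off behind the log2 heuristic is dimensionality control: with sqrt(), a single high-cardinality column could dominate the concatenated feature vector, whereas log2 keeps each categorical branch to a few dozen dimensions at most, which is consistent with the lower len(vocabulary) > 32 embedding threshold introduced in the same hunks.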