diff --git a/R-package/demo/GPBoost_algorithm.R b/R-package/demo/GPBoost_algorithm.R
index 81e25d14..dcb6e2d4 100644
--- a/R-package/demo/GPBoost_algorithm.R
+++ b/R-package/demo/GPBoost_algorithm.R
@@ -127,17 +127,58 @@ legend(legend=c("True F","Pred F"), "bottomright", bty="n", lwd=3, col=c(2,4))
 plot(b1, pred$random_effect_mean, xlab="truth", ylab="predicted",
      main="Comparison of true and predicted random effects")
 
+#--------------------Choosing tuning parameters using Bayesian optimization and the 'mlrMBO' R package ----------------
+library(mlrMBO)
+library(DiceKriging)
+library(rgenoud)
+source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_tune_pars_bayesian_optimization.R")# Load required function
+# Define search space
+# Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
+search_space <- list("learning_rate" = c(0.001, 10),
+                     "min_data_in_leaf" = c(1, 1000),
+                     "max_depth" = c(-1, -1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
+                     "num_leaves" = c(2, 2^10),
+                     "lambda_l2" = c(0, 100),
+                     "max_bin" = c(63, min(n,10000)),
+                     "line_search_step_length" = c(TRUE, FALSE))
+metric = "mse" # Define metric
+if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
+  metric = "binary_logloss"
+}
+# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
+gp_model <- GPModel(group_data = group, likelihood = likelihood)
+data_train <- gpb.Dataset(data = X, label = y)
+# Run parameter optimization using Bayesian optimization and k-fold CV
+crit = makeMBOInfillCritCB() # other criterion options: makeMBOInfillCritEI()
+opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
+                                              data = data_train, gp_model = gp_model,
+                                              nfold = 5, nrounds = 1000, early_stopping_rounds = 20,
+                                              metric = metric, crit = crit,
+                                              cv_seed = 4, verbose_eval = 1)
+print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params),
+                                                       function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params,
+                                                       n=names(opt_params$best_params))), collapse=", ")))
+print(paste0("Best number of iterations: ", opt_params$best_iter))
+print(paste0("Best score: ", round(opt_params$best_score, digits=3)))
+
+# Alternatively and faster: using manually defined validation data instead of cross-validation
+valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
+folds <- list(valid_tune_idx)
+opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
+                                              data = data_train, gp_model = gp_model,
+                                              folds = folds, nrounds = 1000, early_stopping_rounds = 20,
+                                              metric = metric, crit = crit,
+                                              cv_seed = 4, verbose_eval = 1)
+
 #--------------------Choosing tuning parameters using random grid search----------------
 param_grid <- list("learning_rate" = c(0.001, 0.01, 0.1, 1, 10),
                    "min_data_in_leaf" = c(1, 10, 100, 1000),
-                   "max_depth" = c(-1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
+                   "max_depth" = c(-1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
                    "num_leaves" = 2^(1:10),
                    "lambda_l2" = c(0, 1, 10, 100),
                    "max_bin" = c(250, 500, 1000, min(n,10000)),
                    "line_search_step_length" = c(TRUE, FALSE))
-other_params <- list(verbose = 0) # avoid trace information when training models
-# Define metric
-metric = "mse"
+metric = "mse" # Define metric
 if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
   metric = "binary_logloss"
 }
@@ -145,16 +186,15 @@ if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
 gp_model <- GPModel(group_data = group, likelihood = likelihood)
 data_train <- gpb.Dataset(data = X, label = y)
 set.seed(1)
-opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid, params = other_params,
-                                              num_try_random = 100, nfold = 4,
+# Run parameter optimization using random grid search and k-fold CV
+# Note: deterministic grid search can be done by setting 'num_try_random=NULL'
+opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid,
                                               data = data_train, gp_model = gp_model,
-                                              use_gp_model_for_validation = TRUE, verbose_eval = 1,
+                                              num_try_random = 100, nfold = 5,
                                               nrounds = 1000, early_stopping_rounds = 20,
-                                              metric = metric)
-print(paste0("Best parameters: ",
-             paste0(unlist(lapply(seq_along(opt_params$best_params),
-                                  function(y, n, i) { paste0(n[[i]],": ", y[[i]]) },
-                                  y=opt_params$best_params,
+                                              verbose_eval = 1, metric = metric, cv_seed = 4)
+print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params),
+                                                       function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params,
                                                        n=names(opt_params$best_params))), collapse=", ")))
 print(paste0("Best number of iterations: ", opt_params$best_iter))
 print(paste0("Best score: ", round(opt_params$best_score, digits=3)))
@@ -162,19 +202,17 @@ print(paste0("Best score: ", round(opt_params$best_score, digits=3)))
 # Alternatively and faster: using manually defined validation data instead of cross-validation
 valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
 folds <- list(valid_tune_idx)
-opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid, params = other_params,
-                                              num_try_random = 100, folds = folds,
-                                              data = dataset, gp_model = gp_model,
-                                              use_gp_model_for_validation = TRUE, verbose_eval = 1,
+opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid,
+                                              data = data_train, gp_model = gp_model,
+                                              num_try_random = 5, folds = folds,
                                               nrounds = 1000, early_stopping_rounds = 20,
-                                              metric = metric)
+                                              verbose_eval = 1, metric = metric, cv_seed = 4)
 
 #--------------------Cross-validation for determining number of iterations----------------
 gp_model <- GPModel(group_data = group, likelihood = likelihood)
 dataset <- gpb.Dataset(data = X, label = y)
-bst <- gpb.cv(data = dataset, gp_model = gp_model,
-              use_gp_model_for_validation = TRUE, params = params,
-              nrounds = 1000, nfold = 4, early_stopping_rounds = 20)
+bst <- gpb.cv(data = dataset, gp_model = gp_model, params = params,
+              nrounds = 1000, nfold = 5, early_stopping_rounds = 20)
 print(paste0("Optimal number of iterations: ", bst$best_iter))
 
 #--------------------Using a validation set for finding number of iterations----------------
@@ -188,7 +226,7 @@ gp_model <- GPModel(group_data = group[train_ind], likelihood = likelihood)
 gp_model$set_prediction_data(group_data_pred = group[-train_ind])
 bst <- gpb.train(data = dtrain, gp_model = gp_model, nrounds = 1000,
                  params = params, verbose = 1, valids = valids,
-                 early_stopping_rounds = 20, use_gp_model_for_validation = TRUE)
+                 early_stopping_rounds = 20)
 print(paste0("Optimal number of iterations: ", bst$best_iter,
              ", best test error: ", bst$best_score))
 # Plot validation error
@@ -204,7 +242,7 @@ if (likelihood == "gaussian") {
   params_newton$learning_rate <- 0.1
   bst <- gpb.train(data = dtrain, gp_model = gp_model, nrounds = 1000,
                    params = params_newton, verbose = 1, valids = valids,
-                   early_stopping_rounds = 10, use_gp_model_for_validation = TRUE,
+                   early_stopping_rounds = 20,
                    leaves_newton_update = TRUE)
   print(paste0("Optimal number of iterations: ", bst$best_iter,
                ", best test error: ", bst$best_score))
@@ -245,7 +283,7 @@ shap_long <- shap.prep(bst, X_train = X)
 shap.plot.dependence(data_long = shap_long, x = "Covariate_1",
                      color_feature = "Covariate_2", smooth = FALSE)
 # SHAP interaction values
-source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/unify_gpboost_treeshap.R")# Load required function
+source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_unify_gpboost_treeshap.R")# Load required function
 library(treeshap)
 library(shapviz)
 unified_bst <- gpboost_unify_treeshap(bst, X)
@@ -286,9 +324,8 @@ gp_model <- GPModel(group_data = group, likelihood = likelihood)
 dataset <- gpb.Dataset(X, label = y)
 # Stage 1: run cross-validation to (i) determine the optimal number of iterations
 # and (ii) estimate the GPModel on the out-of-sample data
-cvbst <- gpb.cv(data = dataset, gp_model = gp_model,
-                use_gp_model_for_validation = TRUE, params = params,
-                nrounds = 1000, nfold = 4, early_stopping_rounds = 5,
+cvbst <- gpb.cv(data = dataset, gp_model = gp_model, params = params,
+                nrounds = 1000, nfold = 5, early_stopping_rounds = 20,
                 fit_GP_cov_pars_OOS = TRUE, verbose = 0)
 print(paste0("Optimal number of iterations: ", cvbst$best_iter))
 # Fitted model (note: ideally, one would have to find the optimal combination of
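
The last hunk above is the two-stage approach: gpb.cv both selects the number of boosting iterations and, via fit_GP_cov_pars_OOS = TRUE, estimates the GPModel covariance parameters on the out-of-sample fold predictions. For reference, a minimal sketch of the same two-stage step in Python, assuming that gpb.cv in the Python package (changed in the next file) mirrors the fit_GP_cov_pars_OOS argument of the R function, and that X, y, group, params, and likelihood are defined as in the demos:

import gpboost as gpb

# Stage 1: cross-validation picks the number of iterations and, assuming the
# Python gpb.cv mirrors the R 'fit_GP_cov_pars_OOS' argument shown above,
# estimates the GPModel covariance parameters on the out-of-sample data
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
data_train = gpb.Dataset(data=X, label=y)
cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model,
               num_boost_round=1000, nfold=5, early_stopping_rounds=20,
               fit_GP_cov_pars_OOS=True, verbose_eval=False)
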
diff --git a/examples/python-guide/GPBoost_algorithm.py b/examples/python-guide/GPBoost_algorithm.py
index d5f2c808..cba8f3d4 100644
--- a/examples/python-guide/GPBoost_algorithm.py
+++ b/examples/python-guide/GPBoost_algorithm.py
@@ -153,21 +153,22 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 # Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
 search_space = { 'learning_rate': [0.001, 10],
                  'min_data_in_leaf': [1, 1000],
-                 'max_depth': [-1,-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., 'max_depth': [-1,10]
+                 'max_depth': [-1,-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., 'max_depth': [-1,10]
                  'num_leaves': [2, 1024],
                  'lambda_l2': [0, 100],
                  'max_bin': [63, np.min([10000,n])],
                  'line_search_step_length': [True, False] }
-# Define metric
-metric = "mse"
+metric = "mse" # Define metric
 if likelihood in ("bernoulli_probit", "bernoulli_logit"):
     metric = "binary_logloss"
 # Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
 gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
-# Run parameter optimization using the TPE algorithm and 4-fold CV
-opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
-                                                nfold=4, gp_model=gp_model, metric=metric, tpe_seed=1,
-                                                max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
+# Run parameter optimization using the TPE algorithm and k-fold CV
+opt_params = gpb.tune_pars_TPE_algorithm_optuna(search_space=search_space, n_trials=100,
+                                                X=X, y=y, gp_model=gp_model,
+                                                max_num_boost_round=1000, early_stopping_rounds=20,
+                                                nfold=5, metric=metric,
+                                                cv_seed=4, tpe_seed=1)
 print("Best parameters: " + str(opt_params['best_params']))
 print("Best number of iterations: " + str(opt_params['best_iter']))
 print("Best score: " + str(opt_params['best_score']))
@@ -178,35 +179,36 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 train_tune_idx = permute_aux[0:int(0.8 * n)] # use 20% of the data as validation data
 valid_tune_idx = permute_aux[int(0.8 * n):n]
 folds = [(train_tune_idx, valid_tune_idx)]
-opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
-                                                folds=folds, gp_model=gp_model, metric=metric, tpe_seed=1,
-                                                max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
+opt_params = gpb.tune_pars_TPE_algorithm_optuna(search_space=search_space, n_trials=100,
+                                                X=X, y=y, gp_model=gp_model,
+                                                max_num_boost_round=1000, early_stopping_rounds=20,
+                                                folds=folds, metric=metric,
+                                                cv_seed=4, tpe_seed=1)
 
 #--------------------Choosing tuning parameters using random grid search----------------
 # Define parameter search grid
 # Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
 param_grid = { 'learning_rate': [0.001, 0.01, 0.1, 1, 10],
                'min_data_in_leaf': [1, 10, 100, 1000],
-               'max_depth': [-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., 'max_depth': [-1, 1, 2, 3, 5, 10]
+               'max_depth': [-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., 'max_depth': [-1, 1, 2, 3, 5, 10]
                'num_leaves': 2**np.arange(1,10),
                'lambda_l2': [0, 1, 10, 100],
                'max_bin': [250, 500, 1000, np.min([10000,n])],
                'line_search_step_length': [True, False]}
 other_params = {'verbose': 0} # avoid trace information when training models
-# Define metric
-metric = "mse"
+metric = "mse" # Define metric
 if likelihood in ("bernoulli_probit", "bernoulli_logit"):
     metric = "binary_logloss"
+# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
 gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
 data_train = gpb.Dataset(data=X, label=y)
-# Run parameter optimization using random grid search and 4-fold CV
+# Run parameter optimization using random grid search and k-fold CV
 # Note: deterministic grid search can be done by setting 'num_try_random=None'
 opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
-                                             num_try_random=100, nfold=4, seed=1000,
                                              train_set=data_train, gp_model=gp_model,
-                                             use_gp_model_for_validation=True, verbose_eval=1,
+                                             num_try_random=100, nfold=5,
                                              num_boost_round=1000, early_stopping_rounds=20,
-                                             metric=metric)
+                                             verbose_eval=1, metric=metric, seed=4)
 print("Best parameters: " + str(opt_params['best_params']))
 print("Best number of iterations: " + str(opt_params['best_iter']))
 print("Best score: " + str(opt_params['best_score']))
@@ -218,19 +220,18 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 valid_tune_idx = permute_aux[int(0.8 * n):n]
 folds = [(train_tune_idx, valid_tune_idx)]
 opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
-                                             num_try_random=100, folds=folds, seed=1000,
                                              train_set=data_train, gp_model=gp_model,
-                                             use_gp_model_for_validation=True, verbose_eval=1,
+                                             num_try_random=100, folds=folds,
                                              num_boost_round=1000, early_stopping_rounds=20,
-                                             metric=metric)
-
+                                             verbose_eval=1, metric=metric, seed=4)
+
+
 #--------------------Cross-validation for determining number of iterations----------------
 gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
 data_train = gpb.Dataset(data=X, label=y)
-cvbst = gpb.cv(params=params, train_set=data_train,
-               gp_model=gp_model, use_gp_model_for_validation=True,
+cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model,
                num_boost_round=1000, early_stopping_rounds=20,
-               nfold=4, verbose_eval=True, show_stdv=False, seed=1)
+               nfold=5, verbose_eval=True, show_stdv=False, seed=1)
 metric_name = list(cvbst.keys())[0]
 print("Best number of iterations: " + str(np.argmin(cvbst[metric_name]) + 1))
@@ -246,8 +247,7 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 evals_result = {}  # record eval results for plotting
 bst = gpb.train(params=params, train_set=data_train, num_boost_round=1000,
                 gp_model=gp_model, valid_sets=data_eval,
-                early_stopping_rounds=20, use_gp_model_for_validation=True,
-                evals_result=evals_result)
+                early_stopping_rounds=20, evals_result=evals_result)
 gpb.plot_metric(evals_result, figsize=(10, 5))# plot validation scores
 plt.show(block=False)
@@ -258,8 +258,8 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 params_newton['learning_rate'] = 0.1
 evals_result = {}  # record eval results for plotting
 bst = gpb.train(params=params_newton, train_set=data_train, num_boost_round=1000,
-                gp_model=gp_model, valid_sets=data_eval, early_stopping_rounds=5,
-                use_gp_model_for_validation=True, evals_result=evals_result)
+                gp_model=gp_model, valid_sets=data_eval, early_stopping_rounds=20,
+                evals_result=evals_result)
 gpb.plot_metric(evals_result, figsize=(10, 5))# plot validation scores
 
 #--------------------Model interpretation----------------
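
After either tuning approach, the selected configuration is typically used to train a final model on all data. A minimal sketch of that step, assuming X, y, group, likelihood, and the opt_params result from the hunks above are in scope (opt_params['best_params'] and opt_params['best_iter'] are the fields printed there):

import gpboost as gpb

# Train the final model with the tuned parameters and the tuned number of
# boosting iterations; no validation set or early stopping is needed here
params = dict(opt_params['best_params'])  # e.g., learning_rate, num_leaves, lambda_l2
params['verbose'] = 0
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
data_train = gpb.Dataset(data=X, label=y)
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model,
                num_boost_round=opt_params['best_iter'])
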
diff --git a/examples/python-guide/generalized_linear_Gaussian_process_mixed_effects_models.py b/examples/python-guide/generalized_linear_Gaussian_process_mixed_effects_models.py
index 69cee778..d4453d92 100644
--- a/examples/python-guide/generalized_linear_Gaussian_process_mixed_effects_models.py
+++ b/examples/python-guide/generalized_linear_Gaussian_process_mixed_effects_models.py
@@ -419,7 +419,7 @@ def simulate_response_variable(lp, rand_eff, likelihood):
 # -> covariance parameter estimates can have high variance
 
 # Predict latent GP at training data locations (=smoothing)
-GP_smooth = gp_model.predict_training_data_random_effects(predict_var = True) # predict_var = True gives uncertainty for random effect predictions
+GP_smooth = gp_model.predict_training_data_random_effects(predict_var = False) # predict_var = True gives uncertainty for random effect predictions
 # Compare true and predicted random effects
 plt.scatter(b_train, GP_smooth['GP'], label="Intercept GP", alpha=0.5)
 plt.scatter(b2, GP_smooth['GP_rand_coef_nb_1'], label="1. random coef. GP", alpha=0.5)
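
This last hunk turns off predict_var in the demo call; per the unchanged comment, predict_var = True remains the way to obtain uncertainty for the random effect predictions. A minimal sketch, assuming the gp_model from this demo is in scope; the names of the variance columns are not shown in the diff, so they are inspected rather than assumed:

# Request prediction variances in addition to the posterior means
GP_smooth_var = gp_model.predict_training_data_random_effects(predict_var=True)
print(GP_smooth_var.columns)  # inspect which mean / variance columns are returned
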