[R-package][python-package] changes in demos
fabsig committed Nov 16, 2024
1 parent a459585 commit 60bad15
Showing 3 changed files with 92 additions and 55 deletions.
89 changes: 63 additions & 26 deletions R-package/demo/GPBoost_algorithm.R
@@ -127,54 +127,92 @@ legend(legend=c("True F","Pred F"), "bottomright", bty="n", lwd=3, col=c(2,4))
plot(b1, pred$random_effect_mean, xlab="truth", ylab="predicted",
main="Comparison of true and predicted random effects")

#--------------------Choosing tuning parameters using Bayesian optimization and the 'mlrMBO' R package ----------------
library(mlrMBO)
library(DiceKriging)
library(rgenoud)
source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_tune_pars_bayesian_optimization.R")# Load required function
# Define search space
# Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
search_space <- list("learning_rate" = c(0.001, 10),
"min_data_in_leaf" = c(1, 1000),
"max_depth" = c(-1, -1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
"num_leaves" = c(2, 2^10),
"lambda_l2" = c(0, 100),
"max_bin" = c(63, min(n,10000)),
"line_search_step_length" = c(TRUE, FALSE))
metric = "mse" # Define metric
if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
metric = "binary_logloss"
}
# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
gp_model <- GPModel(group_data = group, likelihood = likelihood)
data_train <- gpb.Dataset(data = X, label = y)
# Run parameter optimization using Bayesian optimization and k-fold CV
crit = makeMBOInfillCritCB() # other criterion options: makeMBOInfillCritEI()
opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
data = dataset, gp_model = gp_model,
nfold = 5, nrounds = 1000, early_stopping_rounds = 20,
metric = metric, crit = crit,
cv_seed = 4, verbose_eval = 1)
print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params),
function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params,
n=names(opt_params$best_params))), collapse=", ")))
print(paste0("Best number of iterations: ", opt_params$best_iter))
print(paste0("Best score: ", round(opt_params$best_score, digits=3)))

# Alternatively and faster: using manually defined validation data instead of cross-validation
valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
folds <- list(valid_tune_idx)
opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
data = dataset, gp_model = gp_model,
folds = folds, nrounds = 1000, early_stopping_rounds = 20,
metric = metric, crit = crit,
cv_seed = 4, verbose_eval = 1)

#--------------------Choosing tuning parameters using random grid search----------------
param_grid <- list("learning_rate" = c(0.001, 0.01, 0.1, 1, 10),
"min_data_in_leaf" = c(1, 10, 100, 1000),
"max_depth" = c(-1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
"max_depth" = c(-1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
"num_leaves" = 2^(1:10),
"lambda_l2" = c(0, 1, 10, 100),
"max_bin" = c(250, 500, 1000, min(n,10000)),
"line_search_step_length" = c(TRUE, FALSE))
other_params <- list(verbose = 0) # avoid trace information when training models
# Define metric
metric = "mse"
metric = "mse" # Define metric
if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
metric = "binary_logloss"
}
# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
gp_model <- GPModel(group_data = group, likelihood = likelihood)
data_train <- gpb.Dataset(data = X, label = y)
set.seed(1)
opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid, params = other_params,
num_try_random = 100, nfold = 4,
# Run parameter optimization using random grid search and k-fold CV
# Note: deterministic grid search can be done by setting 'num_try_random=NULL'
opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid,
data = data_train, gp_model = gp_model,
use_gp_model_for_validation = TRUE, verbose_eval = 1,
num_try_random = 100, nfold = 5,
nrounds = 1000, early_stopping_rounds = 20,
metric = metric)
print(paste0("Best parameters: ",
paste0(unlist(lapply(seq_along(opt_params$best_params),
function(y, n, i) { paste0(n[[i]],": ", y[[i]]) },
y=opt_params$best_params,
verbose_eval = 1, metric = metric, cv_seed = 4)
print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params),
function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params,
n=names(opt_params$best_params))), collapse=", ")))
print(paste0("Best number of iterations: ", opt_params$best_iter))
print(paste0("Best score: ", round(opt_params$best_score, digits=3)))

# Alternatively and faster: using manually defined validation data instead of cross-validation
valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
folds <- list(valid_tune_idx)
opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid, params = other_params,
num_try_random = 100, folds = folds,
data = dataset, gp_model = gp_model,
use_gp_model_for_validation = TRUE, verbose_eval = 1,
opt_params <- gpb.grid.search.tune.parameters(param_grid = param_grid,
data = data_train, gp_model = gp_model,
num_try_random = 5, folds = folds,
nrounds = 1000, early_stopping_rounds = 20,
metric = metric)
verbose_eval = 1, metric = metric, cv_seed = 4)

#--------------------Cross-validation for determining number of iterations----------------
gp_model <- GPModel(group_data = group, likelihood = likelihood)
dataset <- gpb.Dataset(data = X, label = y)
bst <- gpb.cv(data = dataset, gp_model = gp_model,
use_gp_model_for_validation = TRUE, params = params,
nrounds = 1000, nfold = 4, early_stopping_rounds = 20)
bst <- gpb.cv(data = dataset, gp_model = gp_model, params = params,
nrounds = 1000, nfold = 5, early_stopping_rounds = 20)
print(paste0("Optimal number of iterations: ", bst$best_iter))

#--------------------Using a validation set for finding number of iterations----------------
@@ -188,7 +226,7 @@ gp_model <- GPModel(group_data = group[train_ind], likelihood = likelihood)
gp_model$set_prediction_data(group_data_pred = group[-train_ind])
bst <- gpb.train(data = dtrain, gp_model = gp_model, nrounds = 1000,
params = params, verbose = 1, valids = valids,
early_stopping_rounds = 20, use_gp_model_for_validation = TRUE)
early_stopping_rounds = 20)
print(paste0("Optimal number of iterations: ", bst$best_iter,
", best test error: ", bst$best_score))
# Plot validation error
@@ -204,7 +242,7 @@ if (likelihood == "gaussian") {
params_newton$learning_rate <- 0.1
bst <- gpb.train(data = dtrain, gp_model = gp_model, nrounds = 1000,
params = params_newton, verbose = 1, valids = valids,
early_stopping_rounds = 10, use_gp_model_for_validation = TRUE,
early_stopping_rounds = 20,
leaves_newton_update = TRUE)
print(paste0("Optimal number of iterations: ", bst$best_iter,
", best test error: ", bst$best_score))
@@ -245,7 +283,7 @@ shap_long <- shap.prep(bst, X_train = X)
shap.plot.dependence(data_long = shap_long, x = "Covariate_1",
color_feature = "Covariate_2", smooth = FALSE)
# SHAP interaction values
source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/unify_gpboost_treeshap.R")# Load required function
source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_unify_gpboost_treeshap.R")# Load required function
library(treeshap)
library(shapviz)
unified_bst <- gpboost_unify_treeshap(bst, X)
@@ -286,9 +324,8 @@ gp_model <- GPModel(group_data = group, likelihood = likelihood)
dataset <- gpb.Dataset(X, label = y)
# Stage 1: run cross-validation to (i) determine the optimal number of iterations
# and (ii) to estimate the GPModel on the out-of-sample data
cvbst <- gpb.cv(data = dataset, gp_model = gp_model,
use_gp_model_for_validation = TRUE, params = params,
nrounds = 1000, nfold = 4, early_stopping_rounds = 5,
cvbst <- gpb.cv(data = dataset, gp_model = gp_model, params = params,
nrounds = 1000, nfold = 5, early_stopping_rounds = 20,
fit_GP_cov_pars_OOS = TRUE, verbose = 0)
print(paste0("Optimal number of iterations: ", cvbst$best_iter))
# Fitted model (note: ideally, one would have to find the optimal combination of
56 changes: 28 additions & 28 deletions examples/python-guide/GPBoost_algorithm.py
@@ -153,21 +153,22 @@ def simulate_response_variable(lp, rand_eff, likelihood):
# Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
search_space = { 'learning_rate': [0.001, 10],
'min_data_in_leaf': [1, 1000],
'max_depth': [-1,-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., 'max_depth': [-1,10]
'max_depth': [-1,-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., 'max_depth': [-1,10]
'num_leaves': [2, 1024],
'lambda_l2': [0, 100],
'max_bin': [63, np.min([10000,n])],
'line_search_step_length': [True, False] }
# Define metric
metric = "mse"
metric = "mse" # Define metric
if likelihood in ("bernoulli_probit", "bernoulli_logit"):
metric = "binary_logloss"
# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
# Run parameter optimization using the TPE algorithm and 4-fold CV
opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
nfold=4, gp_model=gp_model, metric=metric, tpe_seed=1,
max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
# Run parameter optimization using the TPE algorithm and k-fold CV
opt_params = gpb.tune_pars_TPE_algorithm_optuna(search_space=search_space, n_trials=100,
X=X, y=y, gp_model=gp_model,
max_num_boost_round=1000, early_stopping_rounds=20,
nfold=5, metric=metric,
cv_seed=4, tpe_seed=1)
print("Best parameters: " + str(opt_params['best_params']))
print("Best number of iterations: " + str(opt_params['best_iter']))
print("Best score: " + str(opt_params['best_score']))
@@ -178,35 +179,36 @@ def simulate_response_variable(lp, rand_eff, likelihood):
train_tune_idx = permute_aux[0:int(0.8 * n)] # use 20% of the data as validation data
valid_tune_idx = permute_aux[int(0.8 * n):n]
folds = [(train_tune_idx, valid_tune_idx)]
opt_params = gpb.tune_pars_TPE_algorithm_optuna(X=X, y=y, search_space=search_space,
folds=folds, gp_model=gp_model, metric=metric, tpe_seed=1,
max_num_boost_round=1000, n_trials=100, early_stopping_rounds=20)
opt_params = gpb.tune_pars_TPE_algorithm_optuna(search_space=search_space, n_trials=100,
X=X, y=y, gp_model=gp_model,
max_num_boost_round=1000, early_stopping_rounds=20,
folds=folds, metric=metric,
cv_seed=4, tpe_seed=1)

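As an illustration (a sketch, not code from the changed demo files), the tuned parameters returned above could be used to fit a final model roughly as follows; this reuses X, y, group, likelihood, and opt_params from the demo, assumes opt_params['best_params'] is a plain dict of parameter values, and mirrors the gpb.train() call shown further below.

import gpboost as gpb

final_params = dict(opt_params['best_params'])  # tuned hyperparameters
final_params['verbose'] = 0                     # suppress trace output during training
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
data_train = gpb.Dataset(data=X, label=y)
# Train with the number of boosting iterations found during tuning
bst = gpb.train(params=final_params, train_set=data_train, gp_model=gp_model,
                num_boost_round=opt_params['best_iter'])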
#--------------------Choosing tuning parameters using random grid search----------------
# Define parameter search grid
# Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
param_grid = { 'learning_rate': [0.001, 0.01, 0.1, 1, 10],
'min_data_in_leaf': [1, 10, 100, 1000],
'max_depth': [-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionaly tune 'max_depth', e.g., 'max_depth': [-1, 1, 2, 3, 5, 10]
'max_depth': [-1], # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., 'max_depth': [-1, 1, 2, 3, 5, 10]
'num_leaves': 2**np.arange(1,10),
'lambda_l2': [0, 1, 10, 100],
'max_bin': [250, 500, 1000, np.min([10000,n])],
'line_search_step_length': [True, False]}
other_params = {'verbose': 0} # avoid trace information when training models
# Define metric
metric = "mse"
metric = "mse" # Define metric
if likelihood in ("bernoulli_probit", "bernoulli_logit"):
metric = "binary_logloss"
# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
data_train = gpb.Dataset(data=X, label=y)
# Run parameter optimization using random grid search and 4-fold CV
# Run parameter optimization using random grid search and k-fold CV
# Note: deterministic grid search can be done by setting 'num_try_random=None'
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
num_try_random=100, nfold=4, seed=1000,
train_set=data_train, gp_model=gp_model,
use_gp_model_for_validation=True, verbose_eval=1,
num_try_random=100, nfold=5,
num_boost_round=1000, early_stopping_rounds=20,
metric=metric)
verbose_eval=1, metric=metric, seed=4)
print("Best parameters: " + str(opt_params['best_params']))
print("Best number of iterations: " + str(opt_params['best_iter']))
print("Best score: " + str(opt_params['best_score']))
@@ -218,19 +220,18 @@ def simulate_response_variable(lp, rand_eff, likelihood):
valid_tune_idx = permute_aux[int(0.8 * n):n]
folds = [(train_tune_idx, valid_tune_idx)]
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
num_try_random=100, folds=folds, seed=1000,
train_set=data_train, gp_model=gp_model,
use_gp_model_for_validation=True, verbose_eval=1,
num_try_random=100, folds=folds,
num_boost_round=1000, early_stopping_rounds=20,
metric=metric)

verbose_eval=1, metric=metric, seed=4)


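As a side note, the 'folds' argument used above expects a list of (train_indices, validation_indices) tuples. A sketch of how such folds could alternatively be generated with scikit-learn's KFold (assuming scikit-learn is installed; this is not part of the demo code):

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=4)
# List of (train_idx, valid_idx) tuples in the format expected by 'folds'
folds = [(train_idx, valid_idx) for train_idx, valid_idx in kf.split(X)]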
#--------------------Cross-validation for determining number of iterations----------------
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
data_train = gpb.Dataset(data=X, label=y)
cvbst = gpb.cv(params=params, train_set=data_train,
gp_model=gp_model, use_gp_model_for_validation=True,
cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model,
num_boost_round=1000, early_stopping_rounds=20,
nfold=4, verbose_eval=True, show_stdv=False, seed=1)
nfold=5, verbose_eval=True, show_stdv=False, seed=1)
metric_name = list(cvbst.keys())[0]
print("Best number of iterations: " + str(np.argmin(cvbst[metric_name]) + 1))

@@ -246,8 +247,7 @@ def simulate_response_variable(lp, rand_eff, likelihood):
evals_result = {} # record eval results for plotting
bst = gpb.train(params=params, train_set=data_train, num_boost_round=1000,
gp_model=gp_model, valid_sets=data_eval,
early_stopping_rounds=20, use_gp_model_for_validation=True,
evals_result=evals_result)
early_stopping_rounds=20, evals_result=evals_result)
gpb.plot_metric(evals_result, figsize=(10, 5))# plot validation scores
plt.show(block=False)

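Once trained, predictions with the booster combine the tree ensemble (fixed effects) and the random effects model. A rough sketch follows; the test covariates and grouping data (called X_test and group_test here) are placeholders, and the argument and key names (group_data_pred, pred_latent, 'fixed_effect', 'random_effect_mean', 'response_mean') follow the GPBoost prediction interface as used in its demos but should be checked against the package documentation:

# Predictions of the latent fixed and random effects
pred = bst.predict(data=X_test, group_data_pred=group_test, pred_latent=True)
y_pred_latent = pred['fixed_effect'] + pred['random_effect_mean']
# Predictions on the response variable scale
pred_resp = bst.predict(data=X_test, group_data_pred=group_test, pred_latent=False)
y_pred = pred_resp['response_mean']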
@@ -258,8 +258,8 @@ def simulate_response_variable(lp, rand_eff, likelihood):
params_newton['learning_rate'] = 0.1
evals_result = {} # record eval results for plotting
bst = gpb.train(params=params_newton, train_set=data_train, num_boost_round=1000,
gp_model=gp_model, valid_sets=data_eval, early_stopping_rounds=5,
use_gp_model_for_validation=True, evals_result=evals_result)
gp_model=gp_model, valid_sets=data_eval, early_stopping_rounds=20,
evals_result=evals_result)
gpb.plot_metric(evals_result, figsize=(10, 5))# plot validation scores

#--------------------Model interpretation----------------
@@ -419,7 +419,7 @@ def simulate_response_variable(lp, rand_eff, likelihood):
# -> covariance parameter estimates can have high variance

# Predict latent GP at training data locations (=smoothing)
GP_smooth = gp_model.predict_training_data_random_effects(predict_var = True) # predict_var = True gives uncertainty for random effect predictions
GP_smooth = gp_model.predict_training_data_random_effects(predict_var = False) # predict_var = True gives uncertainty for random effect predictions
# Compare true and predicted random effects
plt.scatter(b_train, GP_smooth['GP'], label="Intercept GP", alpha=0.5)
plt.scatter(b2, GP_smooth['GP_rand_coef_nb_1'], label="1. random coef. GP", alpha=0.5)
