-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtune_gbm_model.R
92 lines (85 loc) · 2.51 KB
/
tune_gbm_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#' Tune GBM Model
#'
#' Tunes the hyperparameters of a Gradient Boosting Machine (GBM)
#' model via a full grid search, fitting one model per grid row in
#' parallel (foreach/doParallel) and scoring each on a 25% holdout
#' (via `train.fraction = 0.75`).
#'
#' @param data data.frame. The dataset to be used for training the
#'   model.
#' @param formula formula. The formula specifying the model to be
#'   trained (may include an `offset()` term).
#' @return data.frame. The hyperparameter grid with, for each
#'   combination, the optimal number of trees (iteration minimizing
#'   the validation error) and `min_RMSE`
#'   (`sqrt(min(valid.error))`), sorted ascending by `min_RMSE`.
#'
#' @examples
#' \dontrun{
#' library(gbm)
#' library(dplyr)
#' library(doParallel)
#' library(foreach)
#'
#' # Example data
#' random_data_simp <- data.frame(
#'   PIWO_occ = sample(0:1, 100, replace = TRUE),
#'   offset = runif(100),
#'   var1 = rnorm(100),
#'   var2 = rnorm(100)
#' )
#'
#' tuned_results <- tune_gbm_model(
#'   data = random_data_simp,
#'   formula = PIWO_occ ~ . + offset(offset)
#' )
#' print(tuned_results)
#' }
tune_gbm_model <- function(data, formula) {
  # Step 1: Create hyperparameter grid (full factorial: 72 rows)
  hyper_grid <- expand.grid(
    shrinkage = c(.001, .01),
    interaction.depth = c(2, 3),
    n.minobsinnode = c(10, 15, 20, 30, 50, 100),
    bag.fraction = c(.5, .75, .85),
    optimal_trees = 0, # Placeholder for results
    min_RMSE = 0 # Placeholder for results
  )
  # Step 2: Set up parallel processing. Leave one core free, but
  # never request zero workers on a single-core machine.
  cl <- makeCluster(max(1L, detectCores() - 1L))
  registerDoParallel(cl)
  # Guarantee the cluster is released even if a worker errors;
  # otherwise an error inside %dopar% would leak the workers.
  on.exit(stopCluster(cl), add = TRUE)
  # Step 3: Fit one GBM per grid row in parallel
  results <- foreach(
    i = seq_len(nrow(hyper_grid)),
    .combine = rbind,
    .packages = c("gbm", "dplyr")
  ) %dopar% {
    # Ensure reproducibility within each worker
    set.seed(123)
    # Train model on this row's hyperparameter combination
    gbm.tune <- gbm(
      formula = formula,
      distribution = "bernoulli",
      data = data,
      n.trees = 5000,
      interaction.depth = hyper_grid$interaction.depth[i],
      shrinkage = hyper_grid$shrinkage[i],
      n.minobsinnode = hyper_grid$n.minobsinnode[i],
      bag.fraction = hyper_grid$bag.fraction[i],
      train.fraction = .75, # last 25% of rows held out for validation
      verbose = FALSE
    )
    # NOTE(review): with distribution = "bernoulli", valid.error is
    # a deviance, so sqrt(min(valid.error)) is not strictly an RMSE.
    # The column name is kept for backward compatibility with callers.
    data.frame(
      optimal_trees = which.min(gbm.tune$valid.error),
      min_RMSE = sqrt(min(gbm.tune$valid.error))
    )
  }
  # Step 4: Attach per-row results and sort by ascending error
  hyper_grid$optimal_trees <- results$optimal_trees
  hyper_grid$min_RMSE <- results$min_RMSE
  hyper_grid <- hyper_grid %>%
    dplyr::arrange(min_RMSE)
  # Step 5: Cluster shutdown is handled by on.exit() above
  hyper_grid
}