-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbootstrap_brt.R
114 lines (109 loc) · 3.76 KB
/
bootstrap_brt.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#' Perform Bootstrap Iterations for Boosted Regression Tree
#'
#' This function performs bootstrap iterations for a boosted
#' regression tree (BRT). It returns an object containing a
#' dataframe of model statistics and a list of the models.
#'
#' @param data data.frame. The data to be used for bootstrapping.
#' @param n_iterations integer. The number of bootstrap iterations
#' to perform.
#' @param tuned_param data.frame. A data frame containing the tuned
#' hyperparameters.
#' @return list. A list containing:
#' - models: A list of the bootstrapped models.
#' - stats_df: A dataframe with statistics for each model.
#'
#' @example
#' # Example usage of the function
#' library(gbm)
#' library(dplyr)
#' library(pROC)
#' library(parallel)
#'
#' # Example data
#' random_data <- data.frame(
#' PIWO_occ = sample(0:1, 100, replace = TRUE),
#' PIWO_offset = runif(100),
#' var1 = rnorm(100),
#' var2 = rnorm(100)
#' )
#'
#' # Example tuned parameters
#' tuned_param <- data.frame(
#' interaction.depth = 3,
#' n.minobsinnode = 20,
#' shrinkage = 0.01,
#' bag.fraction = 0.75
#' )
#'
#' # Example function call
#' bootstrap_models <- bootstrap_brt(data = random_data,
#' n_iterations = 10,
#' tuned_param = tuned_param)
#'
#' # Example result printing
#' print(bootstrap_models$stats_df)
bootstrap_brt <- function(data, n_iterations, tuned_param) {
# Step 1: Initialize an empty dataframe for stats
num_predictors <- ncol(data) - 1
cov_columns <- paste0("cov_", 1:num_predictors)
all_stats_df <- data.frame(
model = character(),
deviance.mean = numeric(),
correlation.mean = numeric(),
discrimination.mean = numeric(),
deviance.null = numeric(),
deviance.explained = numeric(),
predict_AUC = numeric(),
stringsAsFactors = FALSE
)
for (cov in cov_columns) {
all_stats_df[[cov]] <- numeric()
}
# Step 2: Initialize a list to store models
models_list <- list()
# Step 3: Define the function to run a single bootstrap iteration
run_iteration <- function(i) {
samp <- sample(nrow(data), round(0.75 * nrow(data)),
replace = TRUE)
train_data <- data[samp, ]
test_data <- data[-samp, ]
# o_train <- train_data$PIWO_offset
# train_data <- dplyr::select(train_data, -PIWO_offset)
# o_test <- test_data$PIWO_offset
# test_data <- dplyr::select(test_data, -PIWO_offset)
model <- gbm.step(
data = train_data,
gbm.x = 2:ncol(train_data),
gbm.y = 1,
# offset = o_train,
family = "bernoulli",
tree.complexity = tuned_param$interaction.depth[1],
n.minobsinnode = tuned_param$n.minobsinnode[1],
learning.rate = tuned_param$shrinkage[1],
bag.fraction = tuned_param$bag.fraction[1],
silent = FALSE,
plot.main = FALSE
)
predictions <- predict(model, newdata = test_data[, -1],
n.trees = model$gbm.call$best.trees,
type = "response")
roc_result <- pROC::roc(response = test_data[, 1],
predictor = predictions)
auc_value <- pROC::auc(roc_result)
model_stats <- calculate_brt_stats(model = model)
model_name <- paste("model", i, sep = "_")
model_stats$model <- model_name
model_stats$predict_AUC <- auc_value
return(list(model = model, stats = model_stats))
}
# Step 4: Run the bootstrap iterations in parallel
results <- mclapply(1:n_iterations, run_iteration,
mc.cores = detectCores() - 1)
# Step 5: Combine the results
for (result in results) {
models_list[[result$stats$model]] <- result$model
all_stats_df <- rbind(all_stats_df, result$stats)
}
return(list(models = models_list, stats_df = all_stats_df))
}