######## Introduction ##########
# Non-linear model to predict the housing prices of a town or a suburb
# based on the features of the locality provided to us
# Maher Daoud
# We train a machine learning algorithm on a training subset and evaluate
# its predictions on a held-out test set.
# Each record in the dataset describes a Boston suburb or town.
# The data was drawn from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970.
###### Load Required Libraries #######
# Helper: check whether a package is already installed
isInstalled <- function(mypkg){
  is.element(mypkg, installed.packages()[, 1])
}
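# Quick usage check: "stats" ships with base R, so this should return TRUE
isInstalled("stats")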
if(!isInstalled("readr")) install.packages("readr",repos = "http://cran.us.r-project.org")
if(!isInstalled("dplyr")) install.packages("dplyr",repos = "http://cran.us.r-project.org")
if(!isInstalled("tidyr")) install.packages("readr",repos = "http://cran.us.r-project.org")
if(!isInstalled("caret")) install.packages("caret",repos = "http://cran.us.r-project.org")
if(!isInstalled("rsample")) install.packages("rsample",repos = "http://cran.us.r-project.org")
if(!isInstalled("randomForest")) install.packages("randomForest",repos = "http://cran.us.r-project.org")
if(!isInstalled("ranger")) install.packages("ranger",repos = "http://cran.us.r-project.org")
if(!isInstalled("h2o")) install.packages("h2o",repos = "http://cran.us.r-project.org")
library(readr)        # data import
library(dplyr)        # data manipulation
library(tidyr)        # data tidying (gather)
library(ggplot2)      # plotting (also attached by caret)
library(rsample)      # data splitting
library(randomForest) # basic random forest implementation
library(ranger)       # a faster implementation of random forest
library(caret)        # an aggregator package for running many machine learning models
library(h2o)          # an extremely fast Java-based machine learning platform
###### Import the Data #######
# In case you face any problem downloading the dataset,
# please use this link to download it:
# https://github.com/maherdaoud/Boston-House-Price-Prediction/blob/main/Boston.csv
# then read it with the following code (adjust the path to where you saved the file):
# df <- read.csv("C:/download-path/Boston.csv")
# Import the data from GitHub
urlfile <-
  "https://raw.githubusercontent.com/maherdaoud/Boston-House-Price-Prediction/main/Boston.csv"
df <- data.frame(read_csv(url(urlfile), show_col_types = FALSE), stringsAsFactors = FALSE)
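# A quick sanity check on the imported data (a lightweight sketch; nothing
# below is required for the analysis)
dim(df)            # rows and columns
str(df)            # column types; all features should be numeric
summary(df$MEDV)   # the target: median home value in $1000s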
###### Transformation #######
# Calculate the log of MEDV
df$MEDV_log <- log(df$MEDV)
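# Optional visual check of the transformation (a quick sketch): MEDV is
# right-skewed, and taking the log makes it closer to symmetric
par(mfrow = c(1, 2))
hist(df$MEDV, main = "MEDV", xlab = "Median value ($1000s)")
hist(df$MEDV_log, main = "log(MEDV)", xlab = "Log median value")
par(mfrow = c(1, 1))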
# Drop MEDV variable
# We will use the log version
df$MEDV <- NULL
# As the checks above show, the data is clean and we don't need further transformation.
# Let's implement the random forest and apply some tuning techniques.
###### Modeling #######
# Create training (70%) and test (30%) sets for the Boston data
# (the ames_ prefix in the variable names below is incidental; the data is the Boston set).
# Use set.seed for reproducibility
set.seed(123)
ames_split <- initial_split(df, prop = .7)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
# for reproducibility
set.seed(123)
# default RF model
m1 <- randomForest(
  formula = MEDV_log ~ .,
  data = ames_train
)
# Show Results of the Model
m1
# Plot the Model
plot(m1)
# The plotted error rate above is based on the OOB sample error
# and can be accessed directly at m1$mse. Thus, we can find the number of trees
# that gives the lowest error rate: here, 402 trees, with an average
# prediction error of 0.1469607 on the log scale.
# number of trees with lowest MSE
which.min(m1$mse)
# RMSE of this optimal random forest
sqrt(m1$mse[which.min(m1$mse)])
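# Because the target is log(MEDV), an RMSE of r on the log scale corresponds
# roughly to a typical multiplicative error of exp(r); e.g. exp(0.147) is about
# 1.16, i.e. predictions are typically off by about 16%
exp(sqrt(m1$mse[which.min(m1$mse)]))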
# randomForest also allows us to use a validation set to measure predictive accuracy
# if we do not want to rely on the OOB samples.
# Here we split our training set further to create a training and validation set.
# We then supply the validation data in the xtest and ytest arguments.
# create training and validation data
set.seed(123)
valid_split <- initial_split(ames_train, .8)
# training data
ames_train_v2 <- analysis(valid_split)
# validation data
ames_valid <- assessment(valid_split)
x_test <- ames_valid[setdiff(names(ames_valid), "MEDV_log")]
y_test <- ames_valid$MEDV_log
rf_oob_comp <- randomForest(
  formula = MEDV_log ~ .,
  data = ames_train_v2,
  xtest = x_test,
  ytest = y_test
)
# extract OOB & validation errors
oob <- sqrt(rf_oob_comp$mse)
validation <- sqrt(rf_oob_comp$test$mse)
# compare error rates
tibble::tibble(
  `Out of Bag Error` = oob,
  `Test error` = validation,
  ntrees = 1:rf_oob_comp$ntree
) %>%
  gather(Metric, RMSE, -ntrees) %>%
  ggplot(aes(ntrees, RMSE, color = Metric)) +
  geom_line() +
  xlab("Number of trees") +
  theme_bw()
##### Basic Tuning #####
## Initial tuning with randomForest ##
# If we are interested in just tuning the mtry parameter,
# we can use randomForest::tuneRF for a quick and easy tuning assessment.
# tuneRF starts at the mtry value you supply and increases it by a step factor
# until the OOB error stops improving by a specified amount.
# For example, the call below starts with mtry = 5 and increases by a factor of 1.5
# until the OOB error stops improving by 1%.
# names of features
features <- setdiff(names(ames_train), "MEDV_log")
set.seed(123)
m2 <- tuneRF(
  x = ames_train[features],
  y = ames_train$MEDV_log,
  ntreeTry = 500,
  mtryStart = 5,
  stepFactor = 1.5,
  improve = 0.01,
  trace = FALSE # do not show real-time progress
)
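# tuneRF returns a matrix of the mtry values it tried and their OOB errors, so
# we can pull out the best value programmatically (a small convenience sketch)
best_mtry <- m2[which.min(m2[, "OOBError"]), "mtry"]
best_mtry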
##### Full grid search with ranger #####
# To perform the grid search, first we want to construct our grid of hyperparameters.
# We’re going to search across 96 different models with varying mtry, minimum node size, and sample size.
# hyperparameter grid search
hyper_grid <- expand.grid(
  mtry = seq(2, 13, by = 2),
  node_size = seq(3, 9, by = 2),
  sample_size = c(.55, .632, .70, .80),
  OOB_RMSE = 0
)
# total number of combinations
nrow(hyper_grid)
# We loop through each hyperparameter combination and apply 500 trees since our previous examples
# illustrated that 500 was plenty to achieve a stable error rate.
# Also note that we set the random number generator seed.
# This allows us to consistently sample the same observations for each sample size
# and make it more clear the impact that each change makes
for(i in 1:nrow(hyper_grid)) {
  # train model
  model <- ranger(
    formula = MEDV_log ~ .,
    data = ames_train,
    num.trees = 500,
    mtry = hyper_grid$mtry[i],
    min.node.size = hyper_grid$node_size[i],
    sample.fraction = hyper_grid$sample_size[i],
    seed = 123
  )
  # add OOB error to grid
  hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}
hyper_grid %>%
  dplyr::arrange(OOB_RMSE) %>%
  head(10)
# As we can see, the best model so far has mtry = 6, node_size = 3, sample_size = 0.8.
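# Rather than reading them off the table, we can also pull the best settings out
# of the grid programmatically (a small convenience sketch):
best_params <- hyper_grid[which.min(hyper_grid$OOB_RMSE), ]
best_params
# Because the OOB error depends on which observations get sampled, we repeat the
# best model 100 times to see the distribution of its OOB RMSE.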
OOB_RMSE <- vector(mode = "numeric", length = 100)
for(i in seq_along(OOB_RMSE)) {
  optimal_ranger <- ranger(
    formula = MEDV_log ~ .,
    data = ames_train,
    num.trees = 500,
    mtry = 6,
    min.node.size = 3,
    sample.fraction = .8,
    importance = 'impurity'
  )
  OOB_RMSE[i] <- sqrt(optimal_ranger$prediction.error)
}
hist(OOB_RMSE, breaks = 20)
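# Summarize the spread of the OOB RMSE across the 100 repeated fits
summary(OOB_RMSE)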
# Furthermore, you may have noticed we set importance = 'impurity' in the models above,
# which allows us to assess variable importance.
# Variable importance is measured by recording the decrease in MSE each time a variable
# is used for a node split in a tree.
vi <- data.frame(imp = optimal_ranger$variable.importance)
# Transform the data frame and re-order it
vi <- vi %>%
  tibble::rownames_to_column() %>%
  dplyr::rename("variable" = rowname) %>%
  dplyr::arrange(imp) %>%
  dplyr::mutate(variable = forcats::fct_inorder(variable))
# Plot the importance
ggplot2::ggplot(vi) +
  geom_col(aes(x = variable, y = imp, fill = variable),
           show.legend = FALSE) +
  coord_flip() +
  theme_bw()
##### Grid search with H2O #####
# h2o is a powerful and efficient Java-based platform that provides parallel distributed algorithms.
# Moreover, h2o supports different search strategies in its grid search,
# which lets us tune our models more efficiently.
# Here, I demonstrate how to tune a random forest model with h2o.
# Let's go ahead and start up h2o.
# start up h2o
h2o.no_progress()
h2o.init(max_mem_size = "5g")
# A comprehensive (full cartesian) search would examine every combination of the
# hyperparameter settings specified in hyper_grid.h2o. Since that grid is large,
# we instead use a random discrete search with early-stopping criteria.
# create feature names
y <- "MEDV_log"
x <- setdiff(names(ames_train), y)
# turn training set into h2o object
train.h2o <- as.h2o(ames_train)
# hyperparameter grid
hyper_grid.h2o <- list(
  ntrees = seq(200, 500, by = 150),
  mtries = seq(2, 13, by = 2),
  max_depth = seq(20, 40, by = 5),
  min_rows = seq(1, 5, by = 2),
  nbins = seq(10, 30, by = 5),
  sample_rate = c(.55, .632, .75)
)
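# How big is the full grid? The product of the option counts below is the number
# of models an exhaustive search would have to fit, which motivates the random
# search with early stopping that follows
sapply(hyper_grid.h2o, length)
prod(sapply(hyper_grid.h2o, length)) # total number of combinations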
# random grid search criteria
search_criteria <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "mse",
  stopping_tolerance = 0.005,
  stopping_rounds = 10,
  max_runtime_secs = 10 * 60
)
# build grid search
# The run time is capped at 10 minutes (max_runtime_secs above),
# so please be patient :)
random_grid <- h2o.grid(
  algorithm = "randomForest",
  grid_id = "rf_grid2",
  x = x,
  y = y,
  training_frame = train.h2o,
  hyper_params = hyper_grid.h2o,
  search_criteria = search_criteria
)
# collect the results and sort by our model performance metric of choice
grid_perf2 <- h2o.getGrid(
  grid_id = "rf_grid2",
  sort_by = "mse",
  decreasing = FALSE
)
print(grid_perf2)
# Once we've identified the best model, we can retrieve it
# and apply it to our hold-out test set to compute the final test error.
# We see that we've been able to reduce our RMSE to about 0.1378.
# Grab the model_id for the top model, chosen by validation error
best_model_id <- grid_perf2@model_ids[[1]]
best_model <- h2o.getModel(best_model_id)
# Now let’s evaluate the model performance on a test set
ames_test.h2o <- as.h2o(ames_test)
best_model_perf <- h2o.performance(model = best_model, newdata = ames_test.h2o)
# RMSE of best model
h2o.mse(best_model_perf) %>% sqrt()
# Note that the RMSE on the test dataset is very close to the RMSE of the best model,
# which suggests the model generalizes well and does not over-fit.
##### Predicting #####
# To predict on new data we will use the model generated by H2O;
# just for testing purposes we will use ames_test.h2o as the new dataset.
pred_h2o <- predict(best_model, ames_test.h2o)
# Then we exponentiate the predicted values to bring them back to the original scale:
# the median value of owner-occupied homes in $1000s.
pred_h2o <- exp(pred_h2o)
head(pred_h2o)
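# As a final check (a hedged sketch; the variable names below are our own),
# compare the back-transformed predictions with the actual values in $1000s
actual <- exp(ames_test$MEDV_log)   # actual median values, $1000s
pred_vec <- as.vector(pred_h2o)     # predictions pulled back into R
sqrt(mean((pred_vec - actual)^2))   # test RMSE in $1000s
# When finished, the local H2O cluster can be shut down with:
# h2o.shutdown(prompt = FALSE)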