forked from zygmuntz/kaggle-happiness
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path validation.r
91 lines (59 loc) · 1.47 KB
/
validation.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Random forest and naive bayes on the Show of Hands data:
# load the training CSV and report which columns were NOT read as factors.
library( randomForest )
library( e1071 )
library( caTools )

# NOTE(review): setwd() in scripts is discouraged — prefer absolute paths or here::here().
setwd( '/path/to/showofhands/data' )
data <- read.csv( 'train.csv' )

# Which columns are not factors? (read.csv turned the string columns into
# factors; the remaining integer columns are listed below.)
cols <- colnames( data )
for ( i in seq_along( cols )) {
  col_class <- class( data[, i] )
  if ( col_class != 'factor' ) {
    cat( cols[i], col_class, "\n" )
  }
}
# Observed output:
#   UserID integer
#   YOB integer
#   Happy integer
#   votes integer
# (This was a bare top-level string in the original; as a string it would be
# auto-printed when run with Rscript, so it is a comment now.)
# Clean-up: drop the ID column (no predictive value) and make the target a factor.
drops <- c( 'UserID' )
data <- data[, !( names( data ) %in% drops )]
data$Happy <- as.factor( data$Happy )

# Clean up YOB: zero out missing values FIRST. In the original order the range
# comparisons ran before the NA replacement, so a logical index containing NA
# (e.g. data$YOB[data$YOB < 1930]) would raise
# "NAs are not allowed in subscripted assignments" whenever YOB has NAs.
# Implausible years (outside 1930..2004) are also zeroed.
data$YOB[is.na( data$YOB )] <- 0
data$YOB[data$YOB < 1930] <- 0
data$YOB[data$YOB > 2004] <- 0
# Train / test split: shuffle the row indices once, then take the first 80%
# as the training set and the remainder as the held-out test set.
p_train <- 0.8
n <- nrow( data )
n_train <- round( n * p_train )

shuffled <- sample.int( n )
train <- data[shuffled[1:n_train], ]
test <- data[shuffled[( n_train + 1 ):n], ]
# Random forest: fit on the training split, score the held-out set with AUC,
# and plot the 20 most important variables.
y_test <- as.factor( test$Happy )
ntree <- 100
rf <- randomForest( as.factor( Happy ) ~ ., data = train, ntree = ntree, do.trace = 10 )

# Column 2 of the probability matrix is the second factor level
# (presumably Happy == 1 — verify against the factor levels).
rf_probs <- predict( rf, test, type = 'prob' )[, 2]
rf_auc <- colAUC( rf_probs, y_test )[1]

cat( "Random forest AUC:", rf_auc )
varImpPlot( rf, n.var = 20 )
# Naive Bayes: fit on the same training split and report AUC on the test set.
nb <- naiveBayes( Happy ~ ., data = train )

# Predict on features only — strip the target column before predicting.
drops <- c( 'Happy' )
x_test <- test[, !( names( test ) %in% drops )]

# type = 'raw' returns class probabilities; column 2 is the second level.
nb_probs <- predict( nb, x_test, type = 'raw' )[, 2]
nb_auc <- colAUC( nb_probs, y_test )[1]

cat( "\n\n" )
cat( "Naive Bayes AUC:", nb_auc, "\n" )