-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScript.py
300 lines (231 loc) · 10.1 KB
/
Script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script used to generate the results for the KDD Kaggle competition.
@authors: António, Gabriel
"""
#General Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Preprocessing libraries
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
#ML libraries
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
def analyze_dataset(df_train, df_test):
    """
    Method used to analyze the dataset and plot useful data BEFORE messing with the data and
    using any type of machine learning

    Parameters
    ----------
    df_train : Pandas Dataframe
        Contains all the examples that will be used when training the algorithm.
        Must contain the "y" target column.
    df_test : Pandas Dataframe
        Contains all the examples that will be used when testing the algorithm, does not include "y" column.
        Currently unused here; kept for interface stability.

    Returns
    -------
    None.
    """
    # Print number of missing values for each column
    for i in range(df_train.shape[1]):
        print("Column: ", df_train.columns[i], "NAs: ", df_train.iloc[:, i].isnull().sum())
    # Check data imbalancement
    print("-----------------------------------")
    print(df_train["y"].value_counts())
    # NOTE(review): assumes the positive/minority class is labelled 1 — confirm.
    print("Percentage of minority class: ", df_train["y"].value_counts()[1]/sum(df_train["y"].value_counts()))
    # Create a correlation matrix heatmap.
    # Restrict the correlation to numeric columns explicitly: DataFrame.corr()
    # raises a TypeError on non-numeric (e.g. category) columns in pandas >= 2.0,
    # and this function is called after the c*/o* columns are cast to category.
    # Older pandas silently dropped them, so this preserves the old output.
    # TODO: also check correlation between y and each variable; right now we
    # are only looking at the X variables.
    corr = df_train.select_dtypes(include=np.number).corr()
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr, cmap="Greens", annot=False)
def change_types(df_train, df_test):
    """
    Cast the categorical ('c*') and ordinal ('o*') columns of both dataframes
    to the pandas 'category' dtype, and the training target "y" as well.

    Parameters
    ----------
    df_train : Pandas Dataframe
        Training data; must contain the "y" column.
    df_test : Pandas Dataframe
        Test data; shares the feature columns of df_train.

    Returns
    -------
    (df_train_correct_type, df_test_correct_type) : tuple of Pandas Dataframe
        Copies of the inputs with the corrected dtypes.
    """
    # Columns whose name contains 'c' or 'o' are treated as categorical.
    categorical_cols = (list(df_train.filter(like='c', axis=1).columns)
                        + list(df_train.filter(like='o', axis=1).columns))
    cast_map = {name: 'category' for name in categorical_cols}
    train_typed = df_train.astype(cast_map).astype({'y': "category"})
    test_typed = df_test.astype(cast_map)
    return train_typed, test_typed
def preprocess_data(df_train, df_test, verbose = True):
    """
    Method that will be used for all the data preprocessing steps.

    Imputes missing values with the per-column means learned from the TRAINING
    set only, then restores the categorical dtypes on the c*/o* columns.

    Parameters
    ----------
    df_train : Pandas Dataframe
        Contains all the examples that will be used when training the algorithm.
        Must contain the "y" column.
    df_test : Pandas Dataframe
        Contains all the examples that will be used when testing the algorithm, does not include "y" column.
        Assumed to share the training feature columns.
    verbose : By default True. It shows the no. of NAs in the converted data.

    Returns
    -------
    df_train_noNAs : Pandas Dataframe
        A transformed version of the initial pandas dataframe after all the preprocessing steps have been done
    df_test_noNAs : Pandas Dataframe
        A transformed version of the initial pandas dataframe after all the preprocessing steps have been done
    """
    # Fit the imputer on the training FEATURES only. The previous version
    # called fit_transform on each dataframe separately, which (a) imputed the
    # test set with its own column means — test-set statistics leaking into
    # the pipeline — and (b) mean-imputed the target "y" itself.
    feature_cols = [c for c in df_train.columns if c != "y"]
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_mean.fit(df_train[feature_cols])
    df_train_noNAs = df_train.copy()
    df_train_noNAs[feature_cols] = imp_mean.transform(df_train[feature_cols])
    df_test_noNAs = df_test.copy()
    df_test_noNAs[feature_cols] = imp_mean.transform(df_test[feature_cols])
    if verbose:
        for i in range(df_train_noNAs.shape[1]):
            print("Column ", df_train_noNAs.columns[i], "NAs: ", df_train_noNAs.iloc[:, i].isnull().sum())
    # Restore the categorical dtypes: imputation turns the c*/o* columns into
    # plain float columns, so cast them back (same convention as change_types).
    col_c = list(df_train.filter(like='c', axis=1).columns)
    col_o = list(df_train.filter(like='o', axis=1).columns)
    dict_cat = {col: 'category' for col in col_c + col_o}
    df_train_noNAs = df_train_noNAs.astype(dict_cat)
    df_train_noNAs = df_train_noNAs.astype({'y': "category"})
    df_test_noNAs = df_test_noNAs.astype(dict_cat)
    return df_train_noNAs, df_test_noNAs
def compute_metrics(y_pred, y_real):
    """
    Compute standard classification metrics for a set of predictions.

    Parameters
    ----------
    y_pred : List
        Results predicted by the algorithm
    y_real : List
        True values

    Returns
    -------
    metrics : dict
        Maps the label strings "F1: ", "Precision: ", "Recall: " and
        "Accuracy: " to their scores, ready to print.
    """
    # sklearn's metric signature is metric(y_true, y_pred). The previous
    # version passed (y_pred, y_real), which silently SWAPPED precision and
    # recall (f1 and accuracy are symmetric in their arguments, so they were
    # unaffected).
    metrics = {"F1: ": f1_score(y_real, y_pred),
               "Precision: ": precision_score(y_real, y_pred),
               "Recall: ": recall_score(y_real, y_pred),
               "Accuracy: ": accuracy_score(y_real, y_pred)}
    return metrics
def cv_metrics_summary(model, X, y, no_cv=10, scoring=('f1', 'precision', 'recall', 'accuracy')):
    """
    Cross-validate a model and summarise the per-fold scores.

    Parameters
    ----------
    model : sklearn estimator
        The (unfitted) model to evaluate.
    X : array-like
        Feature matrix.
    y : array-like
        Target vector.
    no_cv : int, optional
        Number of cross-validation folds. The default is 10.
    scoring : sequence of str, optional
        sklearn scorer names. A tuple is used as the default instead of a
        list to avoid a mutable default argument.

    Returns
    -------
    metrics_cv : Pandas Dataframe
        One row per fold plus a trailing 'mean' summary row.
    """
    metrics_cv = pd.DataFrame(cross_validate(model, X, y, cv=no_cv, scoring=list(scoring)))
    metrics_cv.loc['mean'] = metrics_cv.mean()
    return metrics_cv
def save_predictions(preds, name = "preds"):
    """
    Saves the predictions in a format that allows them to be uploaded to the kaggle competition.

    Parameters
    ----------
    preds : List
        Contains the predictions obtained by the machine learning algorithms
    name : String, optional
        Name with which to save the predictions file. The default is "preds".

    Returns
    -------
    None.
    """
    # Recover the example ids from the raw test file so each prediction is
    # paired with its id, then write "<name>.csv" with columns i,y.
    ids = pd.read_csv("Data/test.csv").loc[:, "i"]
    submission = pd.concat([ids, pd.DataFrame(preds).astype(int)], axis=1)
    submission.columns = ["i", "y"]
    submission.to_csv(name + ".csv", index=False)
# TODO: flesh out the documentation of the one-hot encoding helpers below.
def create_encoder(X):
    """Fit a one-hot encoder on the 'category'-typed columns of a dataset.

    Args:
        X : Multilabel dataset. Must include categorical columns.
    Returns:
        enc: a fitted sklearn OneHotEncoder (unknown categories are ignored
        at transform time).
    """
    categorical_part = X.select_dtypes(include=['category'])
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(categorical_part)
    return encoder
def encode_using_training(X, encoder):
    """Transform data given a previously-fitted encoder. If new categories
    appear they are ignored (behaviour of the encoder passed in).

    Args:
        X: dataset, similar to the one used to create the encoder.
        encoder: previously fitted encoder.
    Returns:
        X_onehot: X with its 'category' columns replaced by the encoder's
        one-hot columns; the non-categorical columns are kept as-is.
    """
    cat_col = X.select_dtypes(include=['category']).columns
    encoded = encoder.transform(X[cat_col]).toarray()
    # Build the one-hot frame WITH X's index: the previous version used a
    # fresh RangeIndex, so the concat below misaligned and filled NaN rows
    # whenever X's index was not 0..n-1 (e.g. after a train/test split).
    one_hot = pd.DataFrame(encoded,
                           columns=encoder.get_feature_names_out(),
                           index=X.index)
    X_onehot = pd.concat([X.drop(cat_col, axis=1), one_hot], axis=1)
    return X_onehot
def to_one_hot(currentData, trainingData):
    """Transform currentData to one-hot encoding, fitting the encoder on
    trainingData so the two datasets share the same dummy columns.
    """
    fitted_encoder = create_encoder(trainingData)
    return encode_using_training(currentData, fitted_encoder)
if __name__ == '__main__':
    # Read and reshape the data initially: drop the id column and move the
    # target "y" to the last position of the training frame.
    df_train = pd.read_csv("Data/train.csv")
    df_test = pd.read_csv("Data/test.csv")
    df_train = df_train.drop("i", axis=1)
    y = df_train.pop("y")
    df_train.insert(54, "y", y)
    df_test = df_test.drop(["y", "i"], axis=1)
    # Changing variable data types: c1..c12 and o1/o2 are categorical.
    df_test.loc[:, "c1":"c12"] = df_test.loc[:, "c1":"c12"].astype("category")
    df_test.loc[:, ["o1", "o2"]] = df_test.loc[:, ["o1", "o2"]].astype("category")
    df_train.loc[:, "c1":"c12"] = df_train.loc[:, "c1":"c12"].astype("category")
    df_train.loc[:, ["o1", "o2", "y"]] = df_train.loc[:, ["o1", "o2", "y"]].astype("category")
    # Call relevant methods until we have results
    analyze_dataset(df_train, df_test)
    df_train, df_test = preprocess_data(df_train, df_test)
    # Train on the FEATURES only. The previous version fitted the models on
    # df_train with "y" still inside it, which (a) leaked the target into the
    # model as a feature and (b) gave the training matrix one more column
    # than df_test, making ensemble.predict(df_test) fail with a
    # feature-count mismatch.
    X_train = df_train.drop("y", axis=1)
    # Ensemble: a random forest voting together with a standardised SVM.
    estimators = []
    model1 = RandomForestClassifier()
    estimators.append(('Random Forest', model1))
    model2 = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    estimators.append(('svm', model2))
    # create the ensemble model
    ensemble = VotingClassifier(estimators)
    results = cv_metrics_summary(ensemble, X_train, y)
    print(results)
    ensemble.fit(X_train, y)
    preds = ensemble.predict(df_test)
    # Save results in the Kaggle submission format
    save_predictions(preds)