-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathlstm_keras_stateful.py
250 lines (225 loc) · 10.6 KB
/
lstm_keras_stateful.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import sys, os
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Flatten
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from graph_tool.all import *
import prep_time_series_input
import scenario_info
import create_graph
from metrics import *
# Disable print statements
def blockPrint():
    """Redirect sys.stdout to os.devnull so subsequent prints are suppressed."""
    # If a previous blockPrint() call already pointed stdout at devnull,
    # close that handle first so repeated calls don't leak file descriptors.
    # The name check ensures we never close a stream we did not open.
    if getattr(sys.stdout, 'name', None) == os.devnull:
        try:
            sys.stdout.close()
        except (IOError, OSError):
            pass
    sys.stdout = open(os.devnull, 'w')
# Enable print statements
def enablePrint():
    """Restore sys.stdout to the interpreter's original stdout."""
    # Close the devnull handle opened by blockPrint (if that is what stdout
    # currently is) so the file descriptor is not leaked. The name check
    # ensures we never close a stream we did not open.
    if getattr(sys.stdout, 'name', None) == os.devnull:
        try:
            sys.stdout.close()
        except (IOError, OSError):
            pass
    sys.stdout = sys.__stdout__
'''
Trains the stateful LSTM model
Parameters:
x_train - NumPy array for x training set
y_train - NumPy array for y training set
num_samples - number of samples (vertices) in the training set
windows_per_sample - number of time windows per sample (used as batch size)
save_model - True if model is saved in an h5 file
savefile - name of file that the model is saved to
'''
def create_model(x_train, y_train, num_samples, windows_per_sample, \
save_model=True, savefile="model.h5"):
print "Starting the creation of the model"
model = Sequential()
# Input arrays of shape (num_vertices, 12) and
# output arrays of shape (num_vertices, 1)
# len(x_train) = number of samples/vertices
# len(x_train[0]) = number of time_steps/graphs,
# len(x_train[0][0]) = number of features
'''
# Adding batch size screws up the program since it has to match batch
# size later...it's necessary for stateful LSTM but not for stateless
model.add(LSTM(32, batch_input_shape=(len(x_train), len(x_train[0]), \
len(x_train[0][0])), return_sequences=True, stateful=False))
'''
# Dropout: Randomly set half (arbitrarily fraction) of the input units
# to 0 at each update during training, which helps prevent overfitting.
# Perhaps lower the rate if accuracy on the training or validation set
# is low and increase if training set worked well but test set did not
"""
# One layer:
model.add(LSTM(64, batch_input_shape=(windows_per_sample, len(x_train[0]), \
len(x_train[0][0])), return_sequences=True, stateful=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
"""
# Two layers:
model.add(LSTM(64, batch_input_shape=(windows_per_sample, len(x_train[0]), \
len(x_train[0][0])), return_sequences=True, stateful=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
"""
# Three layers:
model.add(LSTM(64, batch_input_shape=(windows_per_sample, len(x_train[0]), \
len(x_train[0][0])), return_sequences=True, stateful=True))
model.add(Dropout(0.5))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
"""
model.compile(optimizer='rmsprop', loss='mean_squared_error', \
metrics=['accuracy', true_positives, true_negatives, \
false_positives, false_negatives, true_positive_rate, \
true_negative_rate, false_positive_rate, false_negative_rate])
for i in range(num_samples):
model.fit(x_train, y_train, epochs=20, \
batch_size=windows_per_sample, shuffle = False)
model.reset_states()
if save_model == True:
try:
model.save(savefile)
print "Saved model as " + str(savefile)
except:
print "Couldn't save the model"
return model
'''
Evaluates the model given x_test and y_test
Parameters:
model - model generated by create_model or loaded from h5 file
x_test - NumPy array for x test set
y_test - NumPy array for y test set
windows_per_sample - number of time windows per sample (used as batch size)
'''
def evaluate_model(model, x_test, y_test, windows_per_sample):
# Stateful LSTM:
score = model.evaluate(x_test, y_test, batch_size=windows_per_sample)
model.reset_states()
loss, accuracy, true_positives, true_negatives, false_positives, \
false_negatives, true_positive_rate, true_negative_rate, \
false_positive_rate, false_negative_rate = score
print "\n"
print "Loss: " + str(loss)
print "Accuracy: " + str(accuracy * 100) + "%"
print "True positives: " + str(true_positives)
print "True positive rate: " + str(true_positive_rate * 100) + "%"
print "True negatives: " + str(true_negatives)
print "True negative rate: " + str(true_negative_rate * 100) + "%"
print "False positives: " + str(false_positives)
print "False positive rate: " + str(false_positive_rate * 100) + "%"
print "False negatives: " + str(false_negatives)
print "False negative rate: " + str(false_negative_rate * 100) + "%"
'''
Displays the Receiver Operator Characteristic (ROC) curve with the area
under its curve given the parameter model and x and y data arrays
'''
def generate_roc_curve(model, x_test, y_test, windows_per_sample, data_scenario, model_scenario):
    """Show the ROC curve (with AUC) for the model on the given test data."""
    # Probability that each y is a 1.
    # NOTE: predict_proba has been observed to fail on the stateful LSTM.
    probabilities = model.predict_proba(x_test, batch_size=windows_per_sample) # THIS LINE CAUSES THE STATEFUL LSTM TO FAIL
    # ROC points and the area under the curve
    fpr, tpr, _ = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic of scenario ' \
        + str(model_scenario) + '\'s model on scenario ' \
        + str(data_scenario) + '\'s data')
    # The diagonal marks a no-skill classifier
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.plot(fpr, tpr, color='darkorange',
        lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.legend(loc="lower right")
    #plt.savefig("roc_curves/stateful_model_" + str(model_scenario) + "_data_" + \
    #   str(data_scenario) + ".png")
    plt.show()
def main():
    """
    Pipeline: build/load input arrays from a pcap, balance and split them,
    window them for the stateful LSTM, train a model, then evaluate it and
    plot its ROC curve.
    Usage: script is invoked with the pcap file path as argv[1].
    """
    # Windowing parameters (seconds)
    step_length = 15
    interval_length = 60
    # CTU scenario numbers: which scenario's model and which scenario's data
    model_scenario = 11
    data_scenario = 11 # scenario 9's data has good results for several models
    pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario) # * 0.1
    # Cache files for the generated input arrays and the trained model
    savefile_x = 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_savefile = 'stateful_lstm_model_scenario_' + str(model_scenario) + '.h5'
    '''
    Note that it's important that the original x and y are processed in the
    following order: balanced (maintain a certain ratio between postive and
    negative samples), separated into training and testing sets, and then
    broken into time windows (for stateful LSTM). This is because balancing
    data before keeps the entire initial time interval for the chosen samples
    and then the testing and training sets each contain of fewer samples with
    their entire time intervals. Finally we break each set's samples' entire
    time interval into time windows (as opposed to breaking into time windows
    and haphazardly choosing time windows from the entire time interval)
    '''
    # x and y contain the entire dataset in these NumPy arrays
    x, y = prep_time_series_input.generate_input_arrays(pcap_file, botnet_nodes, pcap_duration, \
        step_length = step_length, interval_length = interval_length, \
        do_save=True, savefile_x=savefile_x, savefile_y=savefile_y, \
        verbose = True)
    '''
    '''
    # NOTE(review): x, y generated above are immediately overwritten by this
    # reload of the files just saved -- looks like a leftover of toggling
    # generate vs. load with the empty ''' ''' marker above; confirm intent.
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, filename_y=savefile_y)
    # Balanced x and y arrays maintain a certain ratio; each sample contains
    # its entire time interval
    balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio = 10)
    # Pre-(x/y)-(train/test) separate the balanced x and y arrays based on a
    # certain ratio -> each sample still contains its entire time interval
    '''
    # Note that the test set contains all the data so obviously it includes the
    # training data
    _, _, pre_x_test, pre_y_test = \
        separate_into_sets(x, y, training_proportion = 0)
    '''
    # Active split: train on ALL balanced samples, test on the ENTIRE
    # (unbalanced) dataset -- so the test set includes the training data.
    pre_x_train, pre_y_train, _, _ = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 1)
    _, _, pre_x_test, pre_y_test = prep_time_series_input. \
        separate_into_sets(x, y, training_proportion = 0)
    '''
    pre_x_train, pre_y_train, pre_x_test, pre_y_test = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
    '''
    # (x,y)_(train/test) contains the chosen samples (balanced and broken into
    # time windows)
    x_train, y_train, num_training_samples, windows_per_training_sample \
        = prep_time_series_input.time_window_data(pre_x_train, pre_y_train, 5, 2, \
        interval_length, step_length, data_scenario)
    x_test, y_test, num_testing_samples, windows_per_testing_sample \
        = prep_time_series_input.time_window_data(pre_x_test, pre_y_test, 5, 2, \
        interval_length, step_length, data_scenario)
    # Sanity-check the shapes produced by the windowing step
    print "Original x, y shapes: ", x.shape, y.shape
    print "Number of training samples: ", str(num_training_samples)
    print "Number of windows per training sample: ", str(windows_per_training_sample)
    print "Number of testing samples: ", str(num_testing_samples)
    print "Number of windows per testing sample: ", str(windows_per_testing_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape
    # Up-weight positive labels (1 -> 6) to counter class imbalance.
    # NOTE(review): weighted_y_test is computed but never used below --
    # evaluation runs on the raw y_test; confirm whether that is intended.
    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = 6
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    model = create_model(x_train, weighted_y_train, num_training_samples, \
        windows_per_training_sample, save_model=False, savefile=model_savefile)
    """
    model = load_model(model_savefile, custom_objects = \
        {'true_positives': true_positives, 'false_positives': false_positives, \
        'true_negatives': true_negatives, 'false_negatives': false_negatives, \
        'true_positive_rate': true_positive_rate, \
        'false_positive_rate': false_positive_rate, \
        'true_negative_rate': true_negative_rate, \
        'false_negative_rate': false_negative_rate})
    """
    evaluate_model(model, x_test, y_test, windows_per_testing_sample)
    generate_roc_curve(model, x_test, y_test, windows_per_testing_sample, \
        data_scenario, model_scenario)
main()