-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathlstm_keras_stateful.py
250 lines (225 loc) · 10.6 KB
/
lstm_keras_stateful.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import sys, os
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Flatten
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from graph_tool.all import *
import prep_time_series_input
import scenario_info
import create_graph
from metrics import *
# Disable print statements
def blockPrint():
    """Redirect sys.stdout to os.devnull so subsequent prints are suppressed."""
    # If a previous blockPrint() call already pointed stdout at devnull,
    # close that handle first so repeated calls don't leak file descriptors.
    # The name check ensures we never close a stream we did not open.
    if getattr(sys.stdout, 'name', None) == os.devnull:
        try:
            sys.stdout.close()
        except (IOError, OSError):
            pass
    sys.stdout = open(os.devnull, 'w')
# Enable print statements
def enablePrint():
    """Restore sys.stdout to the interpreter's original stdout."""
    # Close the devnull handle opened by blockPrint (if that is what stdout
    # currently is) so the file descriptor is not leaked. The name check
    # ensures we never close a stream we did not open.
    if getattr(sys.stdout, 'name', None) == os.devnull:
        try:
            sys.stdout.close()
        except (IOError, OSError):
            pass
    sys.stdout = sys.__stdout__
'''
Trains the stateful LSTM model
Parameters:
x_train - NumPy array for x training set
y_train - NumPy array for y training set
num_samples - number of samples (vertices) in the training set
windows_per_sample - number of time windows per sample (used as batch size)
save_model - True if model is saved in an h5 file
savefile - name of file that the model is saved to
'''
def create_model(x_train, y_train, num_samples, windows_per_sample, \
save_model=True, savefile="model.h5"):
print "Starting the creation of the model"
model = Sequential()
# Input arrays of shape (num_vertices, 12) and
# output arrays of shape (num_vertices, 1)
# len(x_train) = number of samples/vertices
# len(x_train[0]) = number of time_steps/graphs,
# len(x_train[0][0]) = number of features
'''
# Adding batch size screws up the program since it has to match batch
# size later...it's necessary for stateful LSTM but not for stateless
model.add(LSTM(32, batch_input_shape=(len(x_train), len(x_train[0]), \
len(x_train[0][0])), return_sequences=True, stateful=False))
'''
# Dropout: Randomly set half (arbitrarily fraction) of the input units
# to 0 at each update during training, which helps prevent overfitting.
# Perhaps lower the rate if accuracy on the training or validation set
# is low and increase if training set worked well but test set did not
"""
# One layer:
model.add(LSTM(64, batch_input_shape=(windows_per_sample, len(x_train[0]), \
len(x_train[0][0])), return_sequences=True, stateful=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
"""
# Two layers:
model.add(LSTM(64, batch_input_shape=(windows_per_sample, len(x_train[0]), \
len(x_train[0][0])), return_sequences=True, stateful=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
"""
# Three layers:
model.add(LSTM(64, batch_input_shape=(windows_per_sample, len(x_train[0]), \
len(x_train[0][0])), return_sequences=True, stateful=True))
model.add(Dropout(0.5))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
"""
model.compile(optimizer='rmsprop', loss='mean_squared_error', \
metrics=['accuracy', true_positives, true_negatives, \
false_positives, false_negatives, true_positive_rate, \
true_negative_rate, false_positive_rate, false_negative_rate])
for i in range(num_samples):
model.fit(x_train, y_train, epochs=20, \
batch_size=windows_per_sample, shuffle = False)
model.reset_states()
if save_model == True:
try:
model.save(savefile)
print "Saved model as " + str(savefile)
except:
print "Couldn't save the model"
return model
'''
Evaluates the model given x_test and y_test
Parameters:
model - model generated by create_model or loaded from h5 file
x_test - NumPy array for x test set
y_test - NumPy array for y test set
windows_per_sample - number of time windows per sample (used as batch size)
'''
def evaluate_model(model, x_test, y_test, windows_per_sample):
# Stateful LSTM:
score = model.evaluate(x_test, y_test, batch_size=windows_per_sample)
model.reset_states()
loss, accuracy, true_positives, true_negatives, false_positives, \
false_negatives, true_positive_rate, true_negative_rate, \
false_positive_rate, false_negative_rate = score
print "\n"
print "Loss: " + str(loss)
print "Accuracy: " + str(accuracy * 100) + "%"
print "True positives: " + str(true_positives)
print "True positive rate: " + str(true_positive_rate * 100) + "%"
print "True negatives: " + str(true_negatives)
print "True negative rate: " + str(true_negative_rate * 100) + "%"
print "False positives: " + str(false_positives)
print "False positive rate: " + str(false_positive_rate * 100) + "%"
print "False negatives: " + str(false_negatives)
print "False negative rate: " + str(false_negative_rate * 100) + "%"
'''
Displays the Receiver Operator Characteristic (ROC) curve with the area
under its curve given the parameter model and x and y data arrays
'''
def generate_roc_curve(model, x_test, y_test, windows_per_sample, data_scenario, model_scenario):
    """Show the ROC curve (with AUC) for the model on the given test data."""
    # Probability that each y is a 1.
    # NOTE: predict_proba has been observed to fail on the stateful LSTM.
    probabilities = model.predict_proba(x_test, batch_size=windows_per_sample) # THIS LINE CAUSES THE STATEFUL LSTM TO FAIL
    # ROC points and the area under the curve
    fpr, tpr, _ = roc_curve(y_test, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic of scenario ' \
        + str(model_scenario) + '\'s model on scenario ' \
        + str(data_scenario) + '\'s data')
    # The diagonal marks a no-skill classifier
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.plot(fpr, tpr, color='darkorange',
        lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.legend(loc="lower right")
    #plt.savefig("roc_curves/stateful_model_" + str(model_scenario) + "_data_" + \
    #   str(data_scenario) + ".png")
    plt.show()
def main():
    """
    Pipeline: build/load input arrays from a pcap, balance and split them,
    window them for the stateful LSTM, train a model, then evaluate it and
    plot its ROC curve.
    Usage: script is invoked with the pcap file path as argv[1].
    """
    # Windowing parameters (seconds)
    step_length = 15
    interval_length = 60
    # CTU scenario numbers: which scenario's model and which scenario's data
    model_scenario = 11
    data_scenario = 11 # scenario 9's data has good results for several models
    pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario) # * 0.1
    # Cache files for the generated input arrays and the trained model
    savefile_x = 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_savefile = 'stateful_lstm_model_scenario_' + str(model_scenario) + '.h5'
    '''
    Note that it's important that the original x and y are processed in the
    following order: balanced (maintain a certain ratio between postive and
    negative samples), separated into training and testing sets, and then
    broken into time windows (for stateful LSTM). This is because balancing
    data before keeps the entire initial time interval for the chosen samples
    and then the testing and training sets each contain of fewer samples with
    their entire time intervals. Finally we break each set's samples' entire
    time interval into time windows (as opposed to breaking into time windows
    and haphazardly choosing time windows from the entire time interval)
    '''
    # x and y contain the entire dataset in these NumPy arrays
    x, y = prep_time_series_input.generate_input_arrays(pcap_file, botnet_nodes, pcap_duration, \
        step_length = step_length, interval_length = interval_length, \
        do_save=True, savefile_x=savefile_x, savefile_y=savefile_y, \
        verbose = True)
    '''
    '''
    # NOTE(review): x, y generated above are immediately overwritten by this
    # reload of the files just saved -- looks like a leftover of toggling
    # generate vs. load with the empty ''' ''' marker above; confirm intent.
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, filename_y=savefile_y)
    # Balanced x and y arrays maintain a certain ratio; each sample contains
    # its entire time interval
    balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio = 10)
    # Pre-(x/y)-(train/test) separate the balanced x and y arrays based on a
    # certain ratio -> each sample still contains its entire time interval
    '''
    # Note that the test set contains all the data so obviously it includes the
    # training data
    _, _, pre_x_test, pre_y_test = \
        separate_into_sets(x, y, training_proportion = 0)
    '''
    # Active split: train on ALL balanced samples, test on the ENTIRE
    # (unbalanced) dataset -- so the test set includes the training data.
    pre_x_train, pre_y_train, _, _ = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 1)
    _, _, pre_x_test, pre_y_test = prep_time_series_input. \
        separate_into_sets(x, y, training_proportion = 0)
    '''
    pre_x_train, pre_y_train, pre_x_test, pre_y_test = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
    '''
    # (x,y)_(train/test) contains the chosen samples (balanced and broken into
    # time windows)
    x_train, y_train, num_training_samples, windows_per_training_sample \
        = prep_time_series_input.time_window_data(pre_x_train, pre_y_train, 5, 2, \
        interval_length, step_length, data_scenario)
    x_test, y_test, num_testing_samples, windows_per_testing_sample \
        = prep_time_series_input.time_window_data(pre_x_test, pre_y_test, 5, 2, \
        interval_length, step_length, data_scenario)
    # Sanity-check the shapes produced by the windowing step
    print "Original x, y shapes: ", x.shape, y.shape
    print "Number of training samples: ", str(num_training_samples)
    print "Number of windows per training sample: ", str(windows_per_training_sample)
    print "Number of testing samples: ", str(num_testing_samples)
    print "Number of windows per testing sample: ", str(windows_per_testing_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape
    # Up-weight positive labels (1 -> 6) to counter class imbalance.
    # NOTE(review): weighted_y_test is computed but never used below --
    # evaluation runs on the raw y_test; confirm whether that is intended.
    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = 6
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    model = create_model(x_train, weighted_y_train, num_training_samples, \
        windows_per_training_sample, save_model=False, savefile=model_savefile)
    """
    model = load_model(model_savefile, custom_objects = \
        {'true_positives': true_positives, 'false_positives': false_positives, \
        'true_negatives': true_negatives, 'false_negatives': false_negatives, \
        'true_positive_rate': true_positive_rate, \
        'false_positive_rate': false_positive_rate, \
        'true_negative_rate': true_negative_rate, \
        'false_negative_rate': false_negative_rate})
    """
    evaluate_model(model, x_test, y_test, windows_per_testing_sample)
    generate_roc_curve(model, x_test, y_test, windows_per_testing_sample, \
        data_scenario, model_scenario)
main()