train_asp.py
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
import chardet
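# Train a webshell detector for ASP source files: build a word 2-gram
# bag-of-words representation with CountVectorizer, re-weight it with TF-IDF,
# fit a Gaussian Naive Bayes classifier, and pickle both the vocabulary and
# the trained model for later reuse.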
max_features = 50000
webshell_dir = "Data/WebShell/asp/"
normal_dir = "Data/normal/asp/"
white_count = 0
black_count = 0
def check_style(filepath):
    # Detect the file's character encoding from its raw bytes with chardet.
    with open(filepath, mode='rb') as f:
        data = f.read()
    return chardet.detect(data)['encoding']
def load_str(filepath):
    # Read a source file into a single string, joining lines together.
    t = ""
    try:
        style = check_style(filepath)
        with open(filepath, encoding=style) as f:
            for line in f:
                t += line.strip('\r\n')
    except (UnicodeDecodeError, LookupError):
        # Fall back to a lossy decode when the detected encoding fails
        # or is not a codec Python recognises.
        with open(filepath, mode='rb') as f:
            t = f.read().decode('utf-8', errors='ignore')
    return t
def load_files(dirname):
    # Recursively collect the contents of every .asp file under dirname.
    files_list = []
    for path, _, filelist in os.walk(dirname):
        for filename in filelist:
            if filename.endswith('.asp'):
                fullpath = os.path.join(path, filename)
                print("Load %s" % fullpath)
                files_list.append(load_str(fullpath))
    return files_list
def get_feature_by_wordbag_tfidf():
    global max_features
    global white_count
    global black_count
    print("max_features = %d" % max_features)
    # Label webshell samples as 1 and normal samples as 0.
    webshell_files_list = load_files(webshell_dir)
    y1 = [1] * len(webshell_files_list)
    black_count = len(webshell_files_list)
    normal_files_list = load_files(normal_dir)
    y2 = [0] * len(normal_files_list)
    white_count = len(normal_files_list)
    x = webshell_files_list + normal_files_list
    y = y1 + y2
    # Bag-of-words over word 2-grams, capped at max_features terms.
    CV = CountVectorizer(ngram_range=(2, 2), decode_error='ignore',
                         max_features=max_features, token_pattern=r'\b\w+\b',
                         min_df=1, max_df=1.0)
    x = CV.fit_transform(x).toarray()
    # Persist the fitted vocabulary so the same feature space can be rebuilt later.
    vocabulary = CV.vocabulary_
    with open('vocabulary_asp.pickle', 'wb') as f:
        pickle.dump(vocabulary, f)
    # Re-weight the raw counts with TF-IDF.
    transformer = TfidfTransformer(smooth_idf=False)
    x = transformer.fit_transform(x).toarray()
    return x, y
def do_metrics(y_test, y_pred):
    print("metrics.accuracy_score:")
    print(metrics.accuracy_score(y_test, y_pred))
    print("metrics.confusion_matrix:")
    print(metrics.confusion_matrix(y_test, y_pred))
    print("metrics.precision_score:")
    print(metrics.precision_score(y_test, y_pred))
    print("metrics.recall_score:")
    print(metrics.recall_score(y_test, y_pred))
def do_GNB(x, y):
    # Hold out 40% of the samples for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    # GaussianNB requires dense input, which get_feature_by_wordbag_tfidf already provides.
    clf = GaussianNB()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Persist the trained classifier alongside the pickled vocabulary.
    with open('GNB_asp.pickle', 'wb') as f:
        pickle.dump(clf, f)
    do_metrics(y_test, y_pred)
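# --- Inference sketch (not part of the original training flow) --------------
# A minimal sketch of how the pickled artifacts could be reused to score a new
# ASP file. Note that the script saves only the vocabulary and the classifier;
# the fitted TfidfTransformer would also have to be pickled (the
# 'tfidf_asp.pickle' name below is hypothetical) so that inference-time
# weighting matches training.
def predict_asp_file(filepath):
    with open('vocabulary_asp.pickle', 'rb') as f:
        vocabulary = pickle.load(f)
    with open('GNB_asp.pickle', 'rb') as f:
        clf = pickle.load(f)
    with open('tfidf_asp.pickle', 'rb') as f:  # hypothetical artifact, see note above
        transformer = pickle.load(f)
    # Rebuild the exact training feature space from the saved vocabulary.
    cv = CountVectorizer(ngram_range=(2, 2), decode_error='ignore',
                         vocabulary=vocabulary, token_pattern=r'\b\w+\b')
    counts = cv.transform([load_str(filepath)])
    features = transformer.transform(counts).toarray()
    return clf.predict(features)[0]  # 1 = webshell, 0 = normal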
if __name__ == '__main__':
    x, y = get_feature_by_wordbag_tfidf()
    print("Loaded %d white files and %d black files" % (white_count, black_count))
    do_GNB(x, y)