-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
155 lines (116 loc) · 5.52 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import pandas as pd
from tqdm import tqdm
import os.path
import json
import re
import math
def get_data_from_saved_file(file_info_name, need_pl=False):
    """Load previously cached dataset splits from a JSON info file.

    Parameters
    ----------
    file_info_name : str
        Path to a JSON file with keys 'url_data', 'label_data',
        'url_to_pl' and 'url_to_label' (as written by get_data).
    need_pl : bool
        When True, also return the url->PL and url->label mappings.

    Returns
    -------
    tuple
        (url_data, label_data) or, with need_pl=True,
        (url_data, label_data, url_to_pl, url_to_label).
    """
    # json.load streams from the file object directly instead of the
    # redundant json.loads(reader.read()); explicit UTF-8 avoids
    # platform-dependent default encodings.
    with open(file_info_name, 'r', encoding='utf-8') as reader:
        data = json.load(reader)
    if need_pl:
        return data['url_data'], data['label_data'], data['url_to_pl'], data['url_to_label']
    else:
        return data['url_data'], data['label_data']
def _append_unique(url, label, urls, labels, seen):
    """Append (url, label) to the parallel lists once per url.

    `seen` is a set mirroring `urls`, giving O(1) duplicate detection
    instead of the O(n) `url not in urls` list scan per row.
    """
    if url not in seen:
        seen.add(url)
        urls.append(url)
        labels.append(label)


def get_data(dataset_name, need_pl=False):
    """Read the commit dataset CSV and split it into partition url/label lists.

    Results are cached in 'info_<dataset_name>.json' next to the CSV; if
    that cache exists it is loaded instead of re-parsing the CSV.

    Parameters
    ----------
    dataset_name : str
        Path to a CSV with columns commit_id, repo, partition, PL, label.
    need_pl : bool
        When True, also return url->PL and url->label mappings.

    Returns
    -------
    tuple
        (url_data, label_data) dicts keyed by 'train', 'val', 'val_java',
        'val_python', 'test_java', 'test_python'; with need_pl=True, also
        (url_to_pl, url_to_label).

    Raises
    ------
    ValueError
        If a row's partition is not 'train', 'val' or 'test'.
        (The original code built an Exception here but never raised it,
        silently dropping such rows — that was a bug.)
    """
    file_info_name = 'info_' + dataset_name + '.json'
    if os.path.isfile(file_info_name):
        return get_data_from_saved_file(file_info_name, need_pl)
    print("Reading dataset...")
    df = pd.read_csv(dataset_name)
    df = df[['commit_id', 'repo', 'partition', 'PL', 'label']]
    items = df.to_numpy().tolist()
    url_train, url_val, url_val_java, url_val_python, url_test_java, url_test_python = [], [], [], [], [], []
    label_train, label_val, label_val_java, label_val_python, label_test_java, label_test_python = [], [], [], [], [], []
    # One dedup set per output list (see _append_unique).
    seen = {key: set() for key in
            ('train', 'val', 'val_java', 'val_python', 'test_java', 'test_python')}
    url_to_pl = {}
    url_to_label = {}
    for item in tqdm(items):
        commit_id, repo, partition, pl, label = item[0], item[1], item[2], item[3], item[4]
        url = repo + '/commit/' + commit_id
        url_to_pl[url] = pl
        url_to_label[url] = label
        if partition == 'train':
            _append_unique(url, label, url_train, label_train, seen['train'])
        elif partition == 'val':
            _append_unique(url, label, url_val, label_val, seen['val'])
            # Language-specific val lists are filled independently of the
            # combined val list (matching the original independent ifs).
            if pl == 'java':
                _append_unique(url, label, url_val_java, label_val_java, seen['val_java'])
            if pl == 'python':
                _append_unique(url, label, url_val_python, label_val_python, seen['val_python'])
        elif partition == 'test':
            if pl == 'java':
                _append_unique(url, label, url_test_java, label_test_java, seen['test_java'])
            elif pl == 'python':
                _append_unique(url, label, url_test_python, label_test_python, seen['test_python'])
        else:
            raise ValueError("Invalid partition: {}".format(partition))
    print("Finish reading dataset")
    url_data = {'train': url_train, 'val': url_val, 'val_java': url_val_java, 'val_python': url_val_python,
                'test_java': url_test_java, 'test_python': url_test_python}
    label_data = {'train': label_train, 'val': label_val, 'val_java': label_val_java, 'val_python': label_val_python,
                  'test_java': label_test_java, 'test_python': label_test_python}
    data = {'url_data': url_data, 'label_data': label_data, 'url_to_pl': url_to_pl, 'url_to_label': url_to_label}
    # Context manager closes the cache file; the original passed an open()
    # result straight to json.dump and leaked the handle.
    with open(file_info_name, 'w', encoding='utf-8') as writer:
        json.dump(data, writer)
    if need_pl:
        return url_data, label_data, url_to_pl, url_to_label
    else:
        return url_data, label_data
def extract_security_dataset(dataset_name, output_path):
    """Write the test-partition rows that look security-related to a CSV.

    A row is kept when its label is 1, or when its commit URL matched the
    security-keyword filter (either language) applied to commit messages.

    Parameters
    ----------
    dataset_name : str
        Path to the full dataset CSV.
    output_path : str
        Destination path for the filtered CSV.
    """
    java_sec_url_set, python_sec_url_set = filter_security_changes_by_keywords(dataset_name)
    print(len(java_sec_url_set))
    print(len(python_sec_url_set))
    print("Reading dataset....")
    columns = ['commit_id', 'repo', 'partition', 'diff', 'label', 'PL', 'LOC_MOD', 'filename', 'msg']
    df = pd.read_csv(dataset_name)
    df = df[columns]
    df = df[df.partition == 'test']
    # Membership in either language's keyword-matched set qualifies a row.
    keyword_urls = java_sec_url_set | python_sec_url_set
    sec_items = [
        row for row in df.to_numpy().tolist()
        if row[4] == 1 or (row[1] + '/commit/' + row[0]) in keyword_urls
    ]
    sec_df = pd.DataFrame(sec_items, columns=columns)
    sec_df.to_csv(output_path, encoding='utf-8')
def filter_security_changes_by_keywords(dataset_name):
    """Scan test-partition, label-0 commit messages for security keywords.

    Parameters
    ----------
    dataset_name : str
        Path to the full dataset CSV (needs commit_id, repo, partition,
        PL, label, msg columns).

    Returns
    -------
    tuple[set, set]
        (java_sec_url_set, python_sec_url_set): commit URLs whose message
        matched either keyword pattern, bucketed by PL ('java' vs. any
        other PL value, which lands in the python set).
    """
    print("Reading dataset....")
    df = pd.read_csv(dataset_name)
    df = df[['commit_id', 'repo', 'partition', 'PL', 'label', 'msg']]
    df = df[df.label == 0]
    df = df[df.partition == 'test']
    items = df.to_numpy().tolist()
    python_sec_url_set = set()
    java_sec_url_set = set()
    # NOTE(review): patterns kept byte-identical to the original, including
    # the unusual char classes (e.g. [z|s] also matches '|') — confirm
    # before "fixing" them, since that would narrow the match set.
    strong_regex = re.compile(r'(?i)(denial.of.service|remote.code.execution|\bopen.redirect|OSVDB|\bXSS\b|\bReDoS\b|\bNVD\b|malicious|x−frame−options|attack|cross.site|exploit|directory.traversal|\bRCE\b|\bdos\b|\bXSRF\b|clickjack|session.fixation|hijack|advisory|insecure|security|\bcross−origin\b|unauthori[z|s]ed|infinite.loop)')
    medium_regex = re.compile(r'(?i)(authenticat(e|ion)|bruteforce|bypass|constant.time|crack|credential|\bDoS\b|expos(e|ing)|hack|harden|injection|lockout|overflow|password|\bPoC\b|proof.of.concept|poison|privelage|\b(in)?secur(e|ity)|(de)?serializ|spoof|timing|traversal)')
    for item in tqdm(items):
        message = item[5]
        url = item[1] + '/commit/' + item[0]
        pl = item[3]
        # Skip any non-string message. The original test
        # (not isinstance(message, str) and math.isnan(message)) raised
        # TypeError for non-str, non-float values such as None; it only
        # intended to skip pandas NaN placeholders.
        if not isinstance(message, str):
            continue
        # Short-circuit: the medium pattern is only evaluated when the
        # strong one did not already match.
        if strong_regex.search(message) or medium_regex.search(message):
            if pl == 'java':
                java_sec_url_set.add(url)
            else:
                python_sec_url_set.add(url)
    return java_sec_url_set, python_sec_url_set
if __name__ == '__main__':
    # Script entry point: build the security-focused test subset from the
    # full ASE dataset. Filenames preserved verbatim from the original
    # (including the 'surity' spelling, which downstream code may expect).
    source_csv = 'ase_dataset_sept_19_2021.csv'
    security_csv = 'ase_surity_sub_dataset.csv'
    extract_security_dataset(source_csv, security_csv)