# parse_log.py
import json
import os
from shutil import copyfile, move
import glob
import csv
import sys
import hashlib
# Anti-error: https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
#csv.field_size_limit(sys.maxsize)
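
# The functions below walk a sandbox behavior report. A minimal sketch of the
# expected layout, showing only the keys that are actually read here (the sample
# values are illustrative, not taken from a real report):
#
#   {
#       "behavior": {
#           "processes": [
#               {"calls": [{"api": "NtOpenFile", "status": 1, "return_value": 0}, ...]},
#               ...
#           ]
#       }
#   }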


def cls():
    """Clear console"""
    os.system('cls' if os.name == 'nt' else 'clear')


def numProcs(reportname):
    """Number of processes created by the executable in a report"""
    with open(reportname, "rb") as f:
        data = json.load(f)
    numProcesses = len(data["behavior"]["processes"])
    return numProcesses


def isThereApiCalls(report):
    """Check whether any process in the report performed API calls"""
    with open(report, "rb") as r:
        data = json.load(r)
    for proc in data["behavior"]["processes"]:
        if len(proc["calls"]) > 0:
            return True
    return False


def sort(leg=True, reportsDir="./reports/"):
    """
    Sort legitimate and malicious file reports.
    param leg: sort legitimate reports if True, malicious otherwise
    param reportsDir: reports directory with the necessary structure
    """
    if leg:
        directory = reportsDir + "leg/"
    else:
        directory = reportsDir + "mal/"
    dirFiles = glob.glob(directory + "*.json")
    num, broken = 0, 0
    for i, f in enumerate(dirFiles):
        try:
            if numProcs(f) == 1:
                if isThereApiCalls(f):
                    num += 1
                    move(f, directory + "oneproc/")
                else:
                    broken += 1
                    move(f, directory + "broken/")
            elif not isThereApiCalls(f):
                broken += 1
                move(f, directory + "broken/")
        except KeyError:
            move(f, directory + "broken/")
            broken += 1
        cls()
        print("Filename: ", f)
        print("Progress: {0}/{1}. One process files: {2}. Broken: {3}".format(i + 1, len(dirFiles), num, broken))
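
# Expected directory layout for sort(); the subfolders appear to be assumed to
# exist already, since shutil.move will not create them:
#
#   reports/
#       leg/            multi-process legitimate reports stay here
#           oneproc/    single-process reports that made API calls
#           broken/     reports with no API calls or missing keys
#       mal/            same structure for malicious reports
#           oneproc/
#           broken/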


def dataStats(reportsDir="./reports/"):
    """Statistics about the number of reports"""
    legMulti = glob.glob(reportsDir + "/leg/*.json")
    legOne = glob.glob(reportsDir + "/leg/oneproc/*.json")
    legBroken = glob.glob(reportsDir + "/leg/broken/*.json")
    malMulti = glob.glob(reportsDir + "/mal/*.json")
    malOne = glob.glob(reportsDir + "/mal/oneproc/*.json")
    malBroken = glob.glob(reportsDir + "/mal/broken/*.json")
    print("""Legal files:
    Total: {0}, One-proc: {1}, Multi-proc: {2}, Broken: {3}"""
          .format(len(legBroken + legMulti + legOne), len(legOne), len(legMulti), len(legBroken)))
    print("""Malicious files:
    Total: {0}, One-proc: {1}, Multi-proc: {2}, Broken: {3}"""
          .format(len(malBroken + malMulti + malOne), len(malOne), len(malMulti), len(malBroken)))
    print("Working samples: {0}".format(len(malMulti + malOne + legMulti + legOne)))


def parseReport(report, numAPIs=None):
    """Yield a space-separated string of API call names for each process in the report
    (at most numAPIs calls per process if a limit is given)"""
    numProcesses = numProcs(report)
    with open(report, "rb") as r:
        data = json.load(r)
    for i in range(numProcesses):
        apis = []
        # statuses = []
        # returns = []
        for call in data["behavior"]["processes"][i]["calls"]:
            apis.append(call["api"])
            # statuses.append(str(call["status"]))
            # returns.append(str(call["return_value"]))
        apis = " ".join(apis[:numAPIs])
        # statuses = " ".join(statuses)
        # returns = " ".join(returns)
        yield apis  # , statuses, returns
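
# Illustrative example (hypothetical report contents): for a report whose single
# process made the calls [{"api": "NtOpenFile"}, {"api": "NtReadFile"}, {"api": "NtClose"}],
# parseReport(path, numAPIs=2) would yield one string, "NtOpenFile NtReadFile".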


def collectDataset(saveto='data.csv', reportsDir="./reports/", append=False, limitNumAPI=None):
    """Write a CSV dataset of API call sequences labelled 0 (legitimate) or 1 (malicious)"""
    csvData = [['Malicious', 'API Calls'], ]  # , 'Statuses', 'Returns'],]
    # newline='' prevents the csv module from inserting blank lines on Windows
    csvFile = open(saveto, 'a' if append else 'w', newline='')
    writer = csv.writer(csvFile)
    if not append:
        writer.writerows(csvData)
    legalFiles = glob.glob(reportsDir + "/leg/*.json")
    for i, f in enumerate(legalFiles):
        cls()
        print("""\tProcessing: {0}
        Legal files progress: {1}/{2}""".format(f, i + 1, len(legalFiles)))
        for apis in parseReport(f, limitNumAPI):  # , statuses, returns in parseReport(f):
            if len(apis) == 0:
                continue
            writer.writerows([["0", apis]])  # , statuses, returns]])
    maliciousFiles = glob.glob(reportsDir + "/mal/*.json")
    for i, f in enumerate(maliciousFiles):
        cls()
        print("""\tProcessing: {0}
        Malicious files progress: {1}/{2}""".format(f, i + 1, len(maliciousFiles)))
        for apis in parseReport(f, limitNumAPI):
            if len(apis) == 0:
                continue
            writer.writerows([["1", apis]])
    csvFile.close()
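
# The resulting CSV has one row per process (hypothetical rows for illustration):
#
#   Malicious,API Calls
#   0,NtOpenFile NtReadFile NtClose
#   1,CreateProcessInternalW WriteProcessMemory ResumeThread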


def countProcesses(reportsDir="./reports/", printProcNum=4, skipLegal=False):
    """Count processes in the reports and print the reports that have printProcNum processes.
    Also write the malicious files grouped by process count to reports/malprocs.json for manual analysis."""
    legalProcNum = {}
    if not skipLegal:
        legalFiles = glob.glob(reportsDir + "/leg/*.json")
        for i, r in enumerate(legalFiles):
            cls()
            print("Legal progress: {0}/{1}".format(i + 1, len(legalFiles)))
            print(legalProcNum)
            num = numProcs(r)
            try:
                legalProcNum[num] += 1
            except KeyError:
                legalProcNum[num] = 1
            if num == printProcNum:
                print("{0}: {1}".format(r, num))
    maliciousFiles = glob.glob(reportsDir + "/mal/*.json")
    malProcNum = {}
    procsToMalFiles = {}
    for i, r in enumerate(maliciousFiles):
        cls()
        print("Malicious progress: {0}/{1}".format(i + 1, len(maliciousFiles)))
        print(malProcNum)
        num = numProcs(r)
        try:
            procsToMalFiles[num].append(r)
            malProcNum[num] += 1
        except KeyError:
            procsToMalFiles[num] = [r]
            malProcNum[num] = 1
        if num == printProcNum:
            print("{0}: {1}".format(r, num))
    with open("./reports/malprocs.json", "w+") as f:
        json.dump(procsToMalFiles, f, indent=4)
    if not skipLegal:
        print("In Legal reports: {0}".format(legalProcNum))
    print("In Malicious reports: {0}".format(malProcNum))
    print("Total: {0}".format({k: legalProcNum.get(k, 0) + malProcNum.get(k, 0)
                               for k in set(legalProcNum) | set(malProcNum)}))
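
# malprocs.json maps each process count to the reports that produced it; note that
# json.dump turns the integer keys into strings (the paths below are illustrative):
#
#   {
#       "1": ["./reports/mal/sample_a.json"],
#       "3": ["./reports/mal/sample_b.json", "./reports/mal/sample_c.json"]
#   }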


def hashSortAPI(data, saveto):
    """Deduplicate dataset rows by the SHA-1 hash of their API call sequence"""
    csvData = [['Malicious', 'API Calls'], ]
    saveFile = open(saveto, mode='w+', newline='')
    writer = csv.writer(saveFile)
    writer.writerows(csvData)
    with open(data, mode='r') as f:
        print("Counting number of lines")
        line, lines = 0, sum(1 for row in f) - 1  # exclude the header row
    rdata = open(data, mode='r')
    data_reader = csv.DictReader(rdata)
    hashes = set()
    for row in data_reader:
        h = hashlib.sha1(row["API Calls"].encode("ASCII")).hexdigest()
        if h not in hashes:
            hashes.add(h)
            writer.writerows([[row['Malicious'], row["API Calls"]]])
        line += 1
        cls()
        print(f'Processed {line}/{lines} lines.')
    rdata.close()
    saveFile.close()
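
# Deduplication keys on the SHA-1 of the raw API string alone, e.g.
#   hashlib.sha1("NtOpenFile NtClose".encode("ASCII")).hexdigest()
# so identical call sequences collapse to a single row, keeping the label of the
# first row encountered.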


if __name__ == "__main__":
    # sort(leg=True, reportsDir="./reports/")
    # sort(leg=False, reportsDir="./reports/")
    # dataStats()
    # countProcesses(skipLegal=True)
    collectDataset(limitNumAPI=50, saveto="data50.csv")
    hashSortAPI(data='data50.csv', saveto='hashed_data50.csv')
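
# Typical pipeline, judging by the calls above: sort() the leg and mal reports,
# optionally inspect dataStats() / countProcesses(), then build the dataset with
# collectDataset() and deduplicate it with hashSortAPI().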