Trace_Parser(old).py
import os
import time
import json
import pydot
import numpy as np
import pandas as pd
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout
import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 200
def get_dataframe_from_jsontrace(trace_path):
    start_time = time.time()
    print(start_time)
    data = []
    with open(trace_path, 'rb') as f:
        for line in f:
            # print(str(json_lines.index(line)) + '-th line of ' + str(len(json_lines)) + ' has been processed!')
            try:
                data.append(json.loads(line))
            except:
                continue
                # The code below is intentionally unreachable: the traces contain too many
                # problematic lines to patch them one by one. Some values in the original
                # traces are not properly quoted; so far we have identified "-inf", "inf", and "nan".
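                # Illustrative example only (the exact field layout is an assumption, not taken
                # from a real trace): a raw line such as
                #     {"threadID": 1, "event": 0, "method": "@0x75e15e70", "pre1": -inf}
                # fails json.loads because a bare -inf token is not valid JSON; quoting it as a
                # string ("-inf"), as the dead code below does, would make the line parseable again.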
                if '-inf' in line:
                    # print('-inf')
                    line = line.replace('-inf', '"-inf"')
                    data.append(json.loads(line))
                elif 'inf' in line:
                    # print('inf')
                    line = line.replace('inf', '"inf"')
                    data.append(json.loads(line))
                elif 'nan' in line:
                    # print('nan')
                    line = line.replace('nan', '"nan"')
                    data.append(json.loads(line))
    print(str(np.round(time.time() - start_time, 2)) + ' seconds consumed for parsing trace: ' + trace_path)
    # First check: the trace count of the original file must equal the sum of the trace counts of the TEN events
    original_trace_df = pd.DataFrame(data=data)
    if check_traces_equal(original_trace_df):
        print('First check passed: traces of the original file equal the sum of traces of TEN events!')
    else:
        raise ValueError('First check failed!')
    # Second check: the trace count of the threadID 1 segment must equal the sum of the trace counts of the TEN events in that segment
    thread_one_df = original_trace_df[original_trace_df['threadID'] == 1]
    if check_traces_equal(thread_one_df):
        print('Second check passed: traces of thread one equal the sum of traces of TEN events!')
    else:
        raise ValueError('Second check failed!')
    # Drop rows whose "event" value is 5 (i.e. whose "method" value is "null").
    # So far, this removal does not appear to distort the call relations.
    thread_one_df = thread_one_df[thread_one_df['event'] != 5]
    # Normalise differently formatted missing values into a single 'N/A' marker
    thread_one_df = thread_one_df.replace(np.nan, 'N/A', regex=True)
    thread_one_df = thread_one_df.replace('nan', 'N/A', regex=True)
    event_eight_df = original_trace_df[original_trace_df['event'] == 8]
    event_nine_df = original_trace_df[original_trace_df['event'] == 9]
    return thread_one_df, event_eight_df, event_nine_df
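
# Minimal usage sketch (the path is hypothetical; real paths follow the data/<category>/<app>/<trace>
# layout that the __main__ block below walks):
#   thread_one_df, event_eight_df, event_nine_df = get_dataframe_from_jsontrace('data/Category/App/example.trace')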

def check_traces_equal(my_trace_df):
    check_flag = False
    event_zero_df = my_trace_df[my_trace_df['event'] == 0]
    event_one_df = my_trace_df[my_trace_df['event'] == 1]
    event_two_df = my_trace_df[my_trace_df['event'] == 2]
    event_three_df = my_trace_df[my_trace_df['event'] == 3]
    event_four_df = my_trace_df[my_trace_df['event'] == 4]
    event_five_df = my_trace_df[my_trace_df['event'] == 5]
    event_six_df = my_trace_df[my_trace_df['event'] == 6]
    event_seven_df = my_trace_df[my_trace_df['event'] == 7]
    event_eight_df = my_trace_df[my_trace_df['event'] == 8]
    event_nine_df = my_trace_df[my_trace_df['event'] == 9]
    if event_six_df.shape[0] > 0 or event_seven_df.shape[0] > 0:
        raise ValueError('Found event 6 or event 7 traces, for which there is no processing logic!')
    all_events_traces = event_zero_df.shape[0] + event_one_df.shape[0] + event_two_df.shape[0] + event_three_df.shape[0] \
        + event_four_df.shape[0] + event_five_df.shape[0] + event_six_df.shape[0] + event_seven_df.shape[0] \
        + event_eight_df.shape[0] + event_nine_df.shape[0]
    if my_trace_df.shape[0] == all_events_traces:
        check_flag = True
    return check_flag

def getMethodName(method_name_df, method_id):
    return method_name_df.method[method_name_df.retVal == method_id].to_string().split(' ')[-1]
    # return method_name_df.method[method_name_df.retVal == method_id].to_string()
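
# Usage sketch for getMethodName (the id below appears further down in this file, but whether it
# exists in any given trace is an assumption): it selects the event 8 row whose retVal equals the
# given id and returns the last whitespace-separated token of the rendered "method" column, i.e.
# the method name without the pandas index that to_string() prepends.
#   name = getMethodName(event_eight_df, '@0x75e15e70')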

if __name__ == '__main__':
    archive_url = os.getcwd() + '/data'
    category_names = os.listdir(archive_url)
    for category in category_names:
        if category.startswith('.'):
            continue
        app_names = os.listdir(archive_url + '/' + category)
        for app in app_names:
            trace_names = os.listdir(archive_url + '/' + category + '/' + app)
            for trace in trace_names:
                if not trace.endswith('.trace'):
                    continue
                trace_url = archive_url + '/' + category + '/' + app + '/' + trace
                thread_one_df, event_eight_df, event_nine_df = get_dataframe_from_jsontrace(trace_url)
                start_method_df = thread_one_df[thread_one_df.event == 0]
                start_method_df.index = np.arange(1, len(start_method_df) + 1)
                end_method_df = thread_one_df[(thread_one_df.event == 1) | (thread_one_df.event == 2)]
                end_method_df.index = np.arange(1, len(end_method_df) + 1)
                # Find the unique method names and construct the method name list
                method_names = []
                methods = list(np.unique(start_method_df.method)) + list(np.unique(start_method_df.pre1))
                try:
                    methods.remove('N/A')
                except ValueError:
                    pass
                for method in methods:
                    method_name = getMethodName(event_eight_df, method)
                    if method_name not in method_names:
                        method_names.append(method_name)
                '''
                =============================================================
                Dynamic Call Tree
                =============================================================
                '''
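                # Sketch of the idea behind the loop below (a description of this code, not of an
                # external algorithm): start events (event 0) are consumed in order and each one
                # becomes a new tree node; a stack-like list of "alive" methods is scanned backwards
                # to find the caller (matched via the pre1 id) and draw the edge; end events
                # (event 1 or 2) pop the matching method off the alive list again.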
                G = nx.DiGraph()
                # Generate the tree
                alive_methods = []
                next_end = None
                node_dict = {'retVals': [], 'nameIds': []}
                i = j = 0
                while j < end_method_df.shape[0]:
                    # Read the next start method
                    if i < start_method_df.shape[0]:
                        method = start_method_df.method.iloc[i]
                        callee = start_method_df.pre1.iloc[i]
                        i += 1
                        method_name = getMethodName(event_eight_df, method)
                        method_idx = method_names.index(method_name)
                        # Add node
                        node_idx = len(G.nodes)
                        G.add_node(node_idx)
                        node_dict['retVals'].append(method)
                        node_dict['nameIds'].append(method_idx)
                        alive_methods.append({'node_idx': node_idx, 'retVal': method})
                        # Add edge
                        if callee == 'N/A':
                            continue
                        found_alive = False
                        for k in np.arange(len(alive_methods) - 1, -1, -1):
                            if alive_methods[k]['retVal'] == callee:
                                found_alive = True
                                G.add_edge(alive_methods[k]['node_idx'], node_idx)
                                break
                        if not found_alive:
                            # Add a node for the higher-level (caller) method
                            callee_name = getMethodName(event_eight_df, callee)
                            callee_idx = method_names.index(callee_name)
                            node_idx = len(G.nodes)
                            G.add_node(node_idx)
                            node_dict['retVals'].append(callee)
                            node_dict['nameIds'].append(callee_idx)
                            G.add_edge(node_idx, node_idx - 1)
                    # Check whether an alive node has ended
                    if next_end is None:
                        next_end = end_method_df.method.iloc[j]
                        j += 1
                    for k in np.arange(len(alive_methods) - 1, -1, -1):
                        if alive_methods[k]['retVal'] == next_end:
                            alive_methods.remove(alive_methods[k])
                            next_end = None
                            break
                labels = {n: node_dict['nameIds'][n] for n in range(len(node_dict['nameIds']))}
                pos = graphviz_layout(G, prog="dot")
                fig = plt.figure(figsize=(15, 4), dpi=200)
                # nx.draw(graph, pos, **options)
                nx.draw_networkx_edges(G, pos, edge_color='dimgray', width=1)
                nx.draw_networkx_nodes(G, pos, node_color='skyblue', node_size=200)
                nx.draw_networkx_labels(G, pos, labels, font_size=8)
                nx.draw_networkx_edge_labels(G, pos, edge_labels=nx.get_edge_attributes(G, 'weight'))
                plt.title('%s %s' % (app, trace))
                fig.savefig('%s_%s_DCT.pdf' % (app, trace), dpi=200)
                plt.show()
                plt.close(fig)
                '''
                =============================================================
                Call Graph
                =============================================================
                '''
                G = nx.DiGraph()
                labels = {}
                # Add vertices
                for method_name in method_names:
                    method_idx = method_names.index(method_name)
                    G.add_node(method_idx)
                    # Count how often the method occurs in the start events
                    retVals = list(event_eight_df.retVal[event_eight_df.method == method_name])
                    occurrence = len(start_method_df[start_method_df.method.isin(retVals)])
                    if occurrence == 0:
                        occurrence = len(start_method_df[start_method_df.pre1.isin(retVals)])
                    labels[method_idx] = '%d:%d' % (method_idx, occurrence)
                # Add weighted edges: count each (method, pre1) pair as a caller/callee relation
                pairs = start_method_df[start_method_df.pre1 != 'N/A'].groupby(['method', 'pre1']).size().reset_index().rename(columns={0: 'weight'})
                for idx, row in pairs.iterrows():
                    caller = method_names.index(getMethodName(event_eight_df, row.pre1))
                    callee = method_names.index(getMethodName(event_eight_df, row.method))
                    if G.has_edge(caller, callee):
                        G[caller][callee]['weight'] += row.weight
                    else:
                        G.add_edge(caller, callee, weight=row.weight)
                pos = graphviz_layout(G, prog="dot")
                fig = plt.figure(figsize=(10, 4), dpi=200)
                # nx.draw(graph, pos, **options)
                nx.draw_networkx_edges(G, pos, edge_color='dimgray', width=1)
                nx.draw_networkx_nodes(G, pos, node_color='skyblue', node_size=200)
                nx.draw_networkx_labels(G, pos, labels, font_size=6)
                # nx.draw_networkx_edge_labels(G, pos, edge_labels=nx.get_edge_attributes(G, 'weight'))
                plt.title('%s %s' % (app, trace))
                plt.show()
                fig.savefig('%s_%s_CG.pdf' % (app, trace), dpi=200)
                plt.close(fig)
                method_names_df = pd.DataFrame(method_names)
                method_names_df.to_csv('%s_%s.csv' % (app, trace), index=True, header=False)

# Ad-hoc inspection snippets: re-select one specific category/app/trace and poke at the dataframes
# produced by the last loop iteration. This only makes sense when run interactively after the
# __main__ block above has executed in the same session.
archive_url = os.getcwd() + '/data'
category_names = os.listdir(archive_url)
category = category_names[3]
app_names = os.listdir(archive_url + '/' + category)
app = app_names[0]
trace_names = os.listdir(archive_url + '/' + category + '/' + app)
trace = trace_names[1]
method_name = method_names[61]
retVal = event_eight_df[event_eight_df.method == method_name].retVal.to_string().split(' ')[-1]
traces = thread_one_df.iloc[np.where(thread_one_df.method == retVal)]
event_eight_df[event_eight_df.method == '@0x75e15e70']