-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDecisionTreeAlgorithm.py
342 lines (288 loc) · 10.7 KB
/
DecisionTreeAlgorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
#Written by Darcie Howley & Niamh Hennigan
#Student ID= 17321006 & 17418134
import csv
import random
import sys
import easygui
import null as null
import pymsgbox
from easygui import *
#Darcie-start
print("Results printed to results.txt file in project folder ")
#Creates the file for results to printed too
# NOTE(review): from this point on every print() in the program goes to
# results.txt; the handle is closed only by sys.stdout.close() at the end.
sys.stdout = open("results.txt", "w")
#user inputs
# Dialog prompt shown to the user.
text = "Please Select which column to use for classification"
# window title
title = "Select Column"
# default integer
d_int = 3
# lower bound
lower = 0
# upper bound
upper = 99999
# creating a integer box
# NOTE(review): integerbox returns None if the user cancels, and int(None)
# below would raise TypeError — assumes the user confirms a value.
output = integerbox(text, title, d_int, lower, upper)
# Index of the column to classify on; used globally throughout the script.
class_column = int(output) #3
pymsgbox.alert('Please Select The Full Dataset \n \t (beer)', 'File Selector')
# Path to the full dataset; re-read later in __main__.
training_data_name = easygui.fileopenbox()
#Darcie-end
#Niamh-start
#returns all values in the column col
def get_column(rows, col):
    """Return a list of the values found at index *col* in every row.

    rows: list of indexable rows; col: column index.
    """
    # Comprehension replaces the manual append loop.
    return [row[col] for row in rows]
#determines how many of each classification are in the rows passed to the function
def class_counts(rows, label_column=None):
    """Count how many rows carry each classification label.

    rows: list of rows (indexable).
    label_column: index of the label column; defaults to the module-level
        class_column the user selected at startup (backward compatible —
        existing call sites pass rows only).
    Returns a dict mapping label -> count.
    """
    if label_column is None:
        label_column = class_column  # module-level, chosen by the user at startup
    counts = {}
    for row in rows:
        label = row[label_column]
        # dict.get replaces the explicit "if label not in counts" initialisation.
        counts[label] = counts.get(label, 0) + 1
    return counts
#Niamh end
#Darcie start
#checks if input is numeric
def is_numeric(value):
    """Return True if *value* parses as a float, else False.

    Fix: the original popped a blocking pymsgbox GUI alert on every
    non-numeric value — but compare() relies on a False result here to
    route the value to its categorical (equality) branch, so every
    legitimate categorical attribute value raised a spurious alert.
    A False return now simply means "treat as categorical".
    TypeError is also caught so a None cell no longer crashes the program.
    """
    try:
        float(value)
        return True
    except (ValueError, TypeError):
        return False
#compare the current row with the question value being tested against - return true if greater than
def compare(row, feature_column, feature_value):
    """Answer the split question for one row.

    Numeric features: True when row's value >= feature_value.
    Categorical features: True when the values are equal.
    Missing values (None or empty string) answer False, sending the row
    down the false branch.

    Fix: the original guard was `row[feature_column] == null`, where `null`
    is the imported *module* named null — that comparison can never be true
    for CSV string data, so missing values were never actually skipped.
    """
    value = row[feature_column]
    # Real missing-value check (None, or the empty cell a CSV reader yields).
    if value is None or value == "":
        return False
    if is_numeric(value):
        # Numeric attribute: split on a >= threshold.
        return float(value) >= float(feature_value)
    # Categorical attribute: split on equality.
    return value == feature_value
# NOTE(review): stray debug print. If this sits at module level it writes a
# lone "x" into results.txt; if it sits inside compare() it is unreachable
# (both branches above it return). Candidate for deletion — confirm against
# the original file's indentation.
print("x")
#create two lists true_results and false_results based on true/false (greater than/less than) from compare method
def split_data(rows, feature_column, value_to_test_against):
    """Partition *rows* into (matching, non_matching) lists.

    A row lands in the first list when compare() answers True for it
    against value_to_test_against, otherwise in the second.
    """
    matching, non_matching = [], []
    for current_row in rows:
        # Pick the destination list once, then append.
        bucket = matching if compare(current_row, feature_column, value_to_test_against) else non_matching
        bucket.append(current_row)
    return matching, non_matching
#calculate gini of the rows inputted
def calculate_gini(rows):
    """Gini impurity of the class-label distribution over *rows*.

    1 minus the sum of squared label probabilities; an empty input
    yields 1 (no labels to subtract).
    """
    total = float(len(rows))
    impurity = 1
    # Iterate counts directly instead of indexing the dict by key.
    for count in class_counts(rows).values():
        impurity -= (count / total) ** 2
    return impurity
#calculate the information gain by getting the gini of left and right and taking that away from overall gini of system at that node
def calculate_info_gain(left_side, right_side, current_gini):
    """Information gain of splitting a node into left_side / right_side.

    current_gini is the impurity of the unsplit node; each child's gini is
    subtracted weighted by its share of the rows.
    """
    left_weight = float(len(left_side) / (len(left_side) + len(right_side)))
    # Same subtraction order as before so float results match exactly.
    gain = current_gini - left_weight * calculate_gini(left_side)
    gain -= (1 - left_weight) * calculate_gini(right_side)
    return gain
#iterate through attributes to find best attribute and best attribute value to split data on
def find_best_split(rows):
    """Find the best split over all features and candidate values.

    Returns (best_gain, best_attribute_index, best_attribute_value);
    (0, 0, 0) when no split improves on the current impurity.

    Improvements over the original:
    - duplicate candidate values in a column are now tested once:
      dict.fromkeys dedupes while preserving first-occurrence order, and
      since only a strictly greater gain replaces the current best, the
      winner on ties is unchanged;
    - the column list is no longer materialised just to take its length.
    """
    best_gain = 0
    best_attribute = 0
    best_attribute_value = 0
    # Impurity of the node before any split.
    current_gini = calculate_gini(rows)
    no_features = len(rows[0])
    for feature in range(no_features):
        # Skip the column we are trying to predict (chosen by the user at startup).
        if feature == class_column:
            continue
        # Each distinct value in this column is a candidate split point.
        for candidate_value in dict.fromkeys(get_column(rows, feature)):
            true_rows, false_rows = split_data(rows, feature, candidate_value)
            gain = calculate_info_gain(true_rows, false_rows, current_gini)
            # Strictly greater: first value reaching a given gain wins.
            if gain > best_gain:
                best_gain = gain
                best_attribute = feature
                best_attribute_value = candidate_value
    return best_gain, best_attribute, best_attribute_value
#recursively find best attribute to split on until leaf nodes reached
def iterate_through_tree(rows):
    """Recursively build the decision tree over *rows*.

    Returns a Leaf when no split yields information gain, otherwise a
    Decision_Node holding the split question and both subtrees.
    """
    info_gain, feature_column, feature_value = find_best_split(rows)
    # Zero gain: the rows cannot be usefully split further — leaf reached.
    if info_gain == 0:
        return Leaf(rows)
    true_rows, false_rows = split_data(rows, feature_column, feature_value)
    # Arguments evaluate left to right, so the true branch is built first,
    # exactly as before.
    return Decision_Node(
        feature_column,
        feature_value,
        iterate_through_tree(true_rows),
        iterate_through_tree(false_rows),
    )
#class to define leaf node reached - contains the classifications at that leaf
class Leaf:
    """Terminal node of the decision tree.

    self.predictions holds the dict returned by class_counts: each class
    label present in *rows* mapped to its count at this leaf.
    """
    def __init__(self,rows):
        self.predictions = class_counts(rows)
#class to define a decision node - contains the feature and feature value (to be able to split data again) and two child branches (true and false)
class Decision_Node:
    """Internal tree node: a split question plus its two subtrees."""

    def __init__(self, feature, feature_value, true_branch, false_branch):
        # The question asked at this node: row[feature] vs feature_value.
        self.feature, self.feature_value = feature, feature_value
        # Subtrees for rows answering True / False to the question.
        self.true_branch, self.false_branch = true_branch, false_branch
#method used to classify test data once tree is formed
def classify(row, node):
    """Walk the tree with *row* until a Leaf is reached.

    Returns the leaf's predictions dict (label -> count).
    """
    # Iterative descent replaces the original tail recursion; the path
    # taken through the tree is identical.
    while not isinstance(node, Leaf):
        # Follow whichever branch matches this row's answer to the question.
        if compare(row, node.feature, node.feature_value):
            node = node.true_branch
        else:
            node = node.false_branch
    return node.predictions
#Niamh end
#Darcie start
depth = 0  # module-level recursion depth used by print_tree for indentation
#used for cleaner print statement
def check_indent(depth):
    """Return the tree-drawing indent prefix for the given depth.

    Replaces the original six-branch if/elif ladder with the closed form it
    encoded: depth d -> d copies of "| ", a "|", then 2*d + 4 dashes
    (special case: depth 0 is "|--"). Depths beyond 5 reuse the depth-5
    string, exactly as the ladder's final else branch did.
    """
    d = min(depth, 5)  # the ladder capped out at depth 5
    if d == 0:
        return "|--"
    return "| " * d + "|" + "-" * (2 * d + 4)
#print out of tree
def print_tree(node, spacing="", equals="|--"):
    """Recursively pretty-print the tree to stdout (redirected to results.txt).

    Uses the module-level `depth` counter to choose the indent prefix and the
    module-level `attributes` list for feature names. `spacing` and `equals`
    are retained only for call compatibility: `equals` was never read, and
    `spacing` is merely passed down, never printed.
    Fix: the original recomputed the identical indent twice in a row before
    printing a decision node (and again after a recursive call that cannot
    change it); the redundant calls are removed — output is unchanged.
    """
    global depth
    # Leaf: print its label->count predictions and stop recursing.
    if isinstance(node, Leaf):
        print(check_indent(depth) + "--", node.predictions)
        return
    # Decision node: print the question, then both branches one level deeper.
    print(check_indent(depth) + "Is " + attributes[node.feature] + " > " + str(node.feature_value) + " ?")
    depth += 1
    indent = check_indent(depth)
    print(indent + '> True:')
    print_tree(node.true_branch, spacing + " ")
    # depth is restored by each recursive call, so `indent` is still valid here.
    print(indent + "> False:")
    print_tree(node.false_branch, spacing + " ")
    depth -= 1
#Darcie end
#Niamh start
#print out of leaf
def print_leaf(counts):
    """Convert a leaf's label->count dict into label->"NN%" strings."""
    total = sum(counts.values())
    # int() truncates toward zero, matching the original output (2/3 -> "66%").
    return {label: str(int(count / total * 100)) + "%" for label, count in counts.items()}
#read in rows from file
def read_in_file(file_name):
    """Read a tab-separated data file, skipping the attribute-name header.

    Returns a list of rows, each a list of strings.
    The original tracked a manual linecount solely to skip line 0;
    next() on the reader does that directly (the None default makes an
    empty file yield [] instead of raising StopIteration).
    """
    with open(file_name, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the header row
        return list(reader)
#read attributes to be used in printing
def read_attributes(file_name):
    """Return the header row (attribute names) of a tab-separated file.

    The original looped over the reader and broke after the first row;
    next() expresses that directly. An empty file yields [] exactly as
    the original's untouched initial value did.
    """
    with open(file_name) as f:
        return next(csv.reader(f, delimiter='\t'), [])
#Niamh End
#Darcie/Niamh- Start
if __name__ == "__main__":
    average_accuracy = 0
    i = 0
    # Prompt the user for the manually pre-split files used to compare
    # against Weka's output.
    pymsgbox.alert('Please Select Training Dataset \n (comparison_training)', 'File Selector')
    comparison_training_data = read_in_file(easygui.fileopenbox())
    pymsgbox.alert('Please Select Testing Dataset \n (comparison_testing)', 'File Selector')
    comparison_testing_data = read_in_file(easygui.fileopenbox())
    # Full dataset (path chosen at startup) used for the 10 random splits below.
    input_data = read_in_file(training_data_name)
    attributes = read_attributes(training_data_name)
    print("Manually divided 1/3 and 2/3 test file for Weka comparison")
    #build and print tree for Weka comparison
    comparison_tree = iterate_through_tree(comparison_training_data)
    print_tree(comparison_tree)
    #classify testing data for Weka comparison and calculate accuracy
    right2 =0
    wrong2 = 0
    for row in comparison_testing_data:
        print("Actual: %s. Predicted: %s"% (row[class_column],print_leaf(classify(row,comparison_tree))))
        # NOTE(review): a leaf can hold several labels; every label in the
        # prediction dict is scored separately, so one row can increment both
        # right2 and wrong2 — confirm this is the intended accuracy measure.
        # classify() is also called twice per row (once above for printing).
        for key, value in classify(row,comparison_tree).items():
            if row[class_column]==key:
                right2+=1
            else:
                wrong2+=1
    print('\n\nPercentage Correctly Classified')
    print(right2/(right2+wrong2)*100 )
    print('Percentage Incorrectly Classified')
    print(wrong2/(right2+wrong2)*100 )
    print("End of weka comparison set \n \n \n \n")
    print("The 10 itererations of randomly split data for average accuarcy result")
    # Ten independent runs: shuffle, hold out one third, train on two thirds.
    while i < 10:
        # randomise input data into one third and two thirds
        random.shuffle(input_data)
        no_samples = (len(input_data))//3
        testing_third, training_two_thirds = [], []
        testing_third = input_data[: no_samples]
        training_two_thirds = input_data[no_samples:]
        #build and print tree
        tree = iterate_through_tree(training_two_thirds)
        print_tree(tree)
        #classify testing data for Weka comparison and calculate accuracy
        right = 0
        wrong = 0
        for row in testing_third:
            print("Actual: %s. Predicted: %s"% (row[class_column],print_leaf(classify(row,tree))))
            for key, value in classify(row,tree).items():
                if row[class_column]==key:
                    right+=1
                # NOTE(review): `null` here is the imported *module* named null,
                # so this comparison can never match a CSV string value —
                # effectively a dead branch; presumably a None/missing check
                # was intended. Confirm and fix.
                elif row[class_column]==null:
                    print(0)
                else:
                    wrong+=1
        print('\nPercentage Correctly Classified')
        print(right/(right+wrong)*100 )
        print('Percentage Incorrectly Classified')
        print(wrong/(right+wrong)*100 )
        i += 1
        # Accumulate this run's accuracy fraction; averaged after the loop.
        average_accuracy += right/(right+wrong)
    print("\n\nAverage Accuracy over 10 iterations:")
    print(average_accuracy/10*100)
    #writing results to file
    # Closes results.txt, which has been sys.stdout since startup.
    sys.stdout.close()
#Darcie/Niamh End