NaiveBayes.py
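"""
Multinomial Naive Bayes classifier for a 20-class text data set: estimates
log2 class priors and Dirichlet-smoothed (beta = 1/|V|) word likelihoods from
training.csv, then writes predictions for testing.csv to classified.csv.
Also includes an entropy-weighted ranking of the most class-indicative
vocabulary words and (commented out) confusion-matrix evaluation code.
"""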
import math
import numpy as np
import csv
from scipy.sparse import csr_matrix
import datetime
from collections import defaultdict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from pprint import pprint
import matplotlib.pyplot as plt
from scipy.stats import entropy
import array as arr
def my_func(array_O_words,words_txt,MAPS):
"""
Rank the vocabulary by an entropy-weighted MAP score and print the
top-100 words.
Parameters
----------
array_O_words : array
Dense array holding the original data set, including the true classes.
words_txt : str
Path to the vocabulary.txt file holding the ordered set of vocabulary.
MAPS : list
List of calculated MAP approximations of P(X_i|Y).
Returns
-------
None.
"""
print("Start")
ans =[]
#ordered words from vocabulary.txt
words = np.loadtxt(words_txt, usecols=0, skiprows=1, dtype='str')
Ys = []
#num_of_rows = []
Hs =[]
counts = arr.array('i', [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])
#exampleCount = 0
for col in range(1,61189):
for row in range(0,12000):
#If word x_i appears in example j (count > 0), look up that example's
#class and increment counts at index (class number - 1), giving a count
#of documents containing x_i for each class
if array_O_words[row][col] >0:
counts[int(array_O_words[row][-1])-1] = counts[int(array_O_words[row][-1])-1]+1
A = counts.tolist()
#get class probabilities by dividing each count by the sum of all class counts
A = [x/(sum(A)) if sum(A) >0 else x for x in A]
Ys.append(A)
for i in range(0,20):
counts[i] =0
print(np.shape(Ys))
for i in range(0,len(Ys)):
Hs.append(entropy(Ys[i],base =2))
print(np.shape(Hs),np.shape(MAPS))
Hs = [-1*Hs[i]*MAPS[i] for i in range(0,len(Hs))]
#take the indices of the 100 largest scores via argsort
Hs =np.array(Hs)
n = 100
indices = (-Hs).argsort()[:n]
for idx in indices:
ans.append(idx)
print(ans,"\n")
sol =[]
for i in ans:
sol.append(words[i])
print(sol)
"""
Pseudo-code
-Entropy(P(Y|x_i)) * MAP est
"""
def csv_to_sparse(csvdoc, colnum, skipstart=0, stopafter=0):
"""
Function to turn a csv file into a sparse (CSR) matrix
input csvdoc: path to the csv file
input colnum: number of columns of the returned matrix
input skipstart: 1-based row number to start reading at (earlier rows are skipped; 0 keeps all rows)
input stopafter: maximum number of rows to keep (0 means no limit)
output: returns the sparse matrix and a row count
"""
col_count = []
row_count = []
data_count = []
rowcount = 0
#code incorporated from https://docs.python.org/3/library/csv.html
#This code helps create a csr matrix from a csv file
with open(csvdoc, newline='', encoding='utf-8-sig') as csvfile:
csvreader = csv.reader(csvfile, delimiter=',')
filecount = 0
for row in csvreader:
filecount = filecount+1
if filecount < skipstart:
continue
if rowcount >= stopafter and stopafter > 0:
break
for idx, el in enumerate(row):
if int(el) == 0:
continue
else:
row_count.append(rowcount)
col_count.append(idx)
data_count.append(int(el))
rowcount = rowcount + 1
new_matrix = csr_matrix((data_count, (row_count, col_count)), shape=(rowcount, colnum))
return new_matrix, rowcount
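# A minimal usage sketch (the file name mirrors the call made further below;
# the keyword values are illustrative):
#   train_matrix, n_rows = csv_to_sparse('training.csv', 61190, skipstart=0, stopafter=0)
# For the training file, train_matrix is an n_rows x 61190 CSR matrix in which
# column 0 holds the document id and column 61189 holds the class label.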
def dropcols_coo(M, idx_to_drop):
"""
Function that drops an index from the csr matrix
Taken from the website
https://stackoverflow.com/questions/23966923/delete-columns-of-matrix-of-csr-format-in-python
"""
idx_to_drop = np.unique(idx_to_drop)
C = M.tocoo()
keep = ~np.in1d(C.col, idx_to_drop)
C.data, C.row, C.col = C.data[keep], C.row[keep], C.col[keep]
C.col -= idx_to_drop.searchsorted(C.col) # decrement column indices
C._shape = (C.shape[0], C.shape[1] - len(idx_to_drop))
return C.tocsr()
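# Usage sketch, mirroring the commented-out confusion-matrix setup below:
#   features_only = dropcols_coo(class_matrix, 61189)
# drops the class-label column so held-out examples can be classified without
# seeing their true labels.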
def classify_conf(class_matrix, p_v, probs_calc):
"""
Function to classify held-out examples, used for the confusion matrix
input class_matrix: sparse matrix of the examples to be classified
input p_v: array of log2 class priors, log2 P(Y=k)
input probs_calc: per-class arrays of log2 word likelihoods, log2 P(X_i|Y=k)
output: a list of predicted class labels for the confusion matrix
"""
max_prob = -1000000
max_idx = 0
Map_calc = 0
row_count = 0
array_classified = []
for i in range(0, 2400):
row_mat = class_matrix.getrow(i)
row_dense = row_mat.toarray()
row_dense = list(row_dense)
row_dense = row_dense[0]
testid = row_dense[0]
row_dense = row_dense[1:]
for idx, row in enumerate(probs_calc):
for idxrow, el in enumerate(row_dense):
if el == 0:
continue
else:
Map_calc = Map_calc + row[idxrow] * int(el)
Map_calc = Map_calc + p_v[row_count]
row_count = row_count + 1
#Find the maximum probability
if Map_calc > max_prob:
max_prob = Map_calc
max_idx = idx + 1
Map_calc = 0
row_count = 0
array_classified.append(max_idx)
#file_object.write(str(testid) + ","+str(max_idx)+'\n')
max_prob = -1000000
max_idx = 0
return array_classified
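# Decision rule shared by classify_conf above and classify below, written out
# as a formula (log base 2 throughout):
#   y_hat = argmax_k [ log2 P(Y = k) + sum_i x_i * log2 P(X_i | Y = k) ]
# where x_i is the count of word i in the document, log2 P(Y = k) comes from
# p_v and log2 P(X_i | Y = k) from probs_calc. Working in log space turns the
# product of per-word probabilities into a sum and avoids underflow.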
def classify(csvdoc, p_v, probs_calc):
"""
Function to classify the test set
input csvdoc: path to a csv file containing the examples to be classified
input p_v: array of log2 class priors, log2 P(Y=k)
input probs_calc: per-class arrays of log2 word likelihoods, log2 P(X_i|Y=k)
output: writes a csv file called classified.csv with the predicted class for each test id
"""
testid = 0
max_prob = -10000000
max_idx = 0
Map_calc = 0
row_count = 0
file_object = open('classified.csv', 'w+')
file_object.write("id,class\n")
with open(csvdoc, newline='', encoding='utf-8-sig') as csvfile:
csvreader = csv.reader(csvfile, delimiter=',')
for training_row in csvreader:
testid = training_row[0]
del training_row[0]
for idx, row in enumerate(probs_calc):
for idxrow, el in enumerate(training_row):
if int(el) == 0:
continue
else:
#calculate probabilities for each feature
Map_calc = Map_calc + row[idxrow] * int(el)
#Add the log class prior to complete the (log) posterior score
Map_calc = Map_calc + p_v[row_count]
row_count = row_count + 1
#find the class that provides the highest probability
if Map_calc > max_prob:
max_prob = Map_calc
max_idx = idx + 1
Map_calc = 0
row_count = 0
#write max class to file
file_object.write(str(testid) + ","+str(max_idx)+'\n')
max_prob = -10000000
max_idx = 0
Vocabulary = 61188
beta = 1/Vocabulary
#beta= .0015
#cols_comp = []
class_wrong = defaultdict(int)
#array for MLE P(Yk) for each class 1 x 20
p_v = []
#array of MAP estimates of P(X|Y) for each class, size 20 x 61188
prob_calcs = []
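# The training loop below fills these two containers. As a formula sketch
# (beta is the smoothing prior set above, |V| = 61188):
#   p_v[k]           = log2( num_docs_in_class_k / total_docs )
#   prob_calcs[k][i] = log2( (count of word i in class k + beta) / (total_words_in_class_k + 1) )
# With beta = 1/|V|, the denominator total_words + 1 equals
# total_words + beta*|V|, i.e. the usual Dirichlet-smoothed MAP estimate.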
text_matrix, total_rows = csv_to_sparse('training.csv', 61190)
#These are used for the confusion matrix calculations
#text_matrix, total_rows= csv_to_sparse('training.csv', 61190, 0, 9600)
#class_matrix, total_rows_class= csv_to_sparse('training.csv', 61190, 9601, 12000)
#cols = class_matrix.getcol(61189)
#cols = cols.toarray()
#for i in cols:
# cols_comp.append(i.item())
#new_class_matrix = dropcols_coo(class_matrix, 61189)
for classnum in range(1, 21):
filtered_matrix = text_matrix[text_matrix[:, 61189] == classnum].tolist()[0]
#code from https://stackoverflow.com/questions/4918425/subtract-a-value-from-every-number-in-a-list-in-python
#Subtract 1 from each id to convert to 0-based row indices
filtered_matrix_list = [x - 1 for x in filtered_matrix]
num_rows = len(filtered_matrix_list)
#Take the base-2 logarithm of the class prior for future calculations
probs = math.log2(num_rows/total_rows)
p_v.append(probs)
man_data = text_matrix[filtered_matrix_list, :].sum(axis=0).tolist()[0]
del man_data[0]
del man_data[-1]
total_words = sum(man_data)
#used code from https://stackoverflow.com/questions/54629298/how-to-use-vectorized-numpy-operations-on-lambda-functions-that-return-constant
#Calculate the smoothed log probability of each word given this class
func = lambda x: math.log2((x + beta)/(total_words + 1))
vfunc = np.vectorize(func)
prob_bayes = vfunc(man_data)
#append this class's 1 x 61188 vector of log probabilities (prob_calcs ends up 20 x 61188)
prob_calcs.append(prob_bayes)
#my_func(text_matrix.toarray(), '/home/jared/Downloads/vocabulary.txt', prob_bayes)
#This section is used to create the confusion matrix but is commented out upon submission
#spec_array = classify_conf(new_class_matrix, p_v, prob_calcs)
#actual = cols_comp
#predicted = spec_array
#disp_label = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
#matrix = confusion_matrix(actual,predicted, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,18, 19, 20])
#print(matrix)
#disp = ConfusionMatrixDisplay(confusion_matrix= matrix, display_labels=disp_label)
#disp.plot()
#plt.show()
classify('testing.csv', p_v, prob_calcs)