-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHW1.py
100 lines (85 loc) · 4.42 KB
/
HW1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# -*- coding: utf-8 -*-
# Problem 3
# Train a Gaussian Naive Bayes classifier on bag-of-words data and report its
# accuracy on a second, labelled file.
#
# Each line of an input ".tf" file is parsed as
#     <label> <word number>:<word frequency> <word number>:<word frequency> ...
import numpy as np
import pandas as pd

DATA_DIR = 'C:/Onedrive/OneDrive - Knights - University of Central Florida/UCF/Courses/Smart City/HW1/data_task_a/'
TRAIN_PATH = DATA_DIR + 'task_a_labeled_train.tf'
TEST_PATH = DATA_DIR + 'task_a_u00_tune.tf'


def parse_tf_lines(lines):
    """Split an iterable of ".tf" text lines into labels, words and counts.

    Args:
        lines: iterable of strings, one sample per line (an open text file
            works, as does a list of strings).

    Returns:
        A tuple ``(label_list, word_matrix, count_matrix)`` of three lists
        with one entry per non-blank line: the label (kept as a string), the
        list of integer word numbers, and the list of integer word
        frequencies in the same order as the word numbers.

    Raises:
        IndexError: if a cell has no ``:`` separator.
        ValueError: if a word number or frequency is not an integer.
    """
    label_list = []
    word_matrix = []
    count_matrix = []
    for line in lines:
        # split() with no argument tolerates trailing spaces, tabs and the
        # final newline; split(' ') produced a stray '\n' cell in those cases.
        fields = line.split()
        if not fields:
            continue  # blank line: no label, so not a sample
        word_list = []
        count_list = []
        for cell in fields[1:]:
            pieces = cell.split(':')
            word_list.append(int(pieces[0]))   # left of ':' is the word number
            count_list.append(int(pieces[1]))  # right of ':' is the frequency
        label_list.append(fields[0])
        word_matrix.append(word_list)
        count_matrix.append(count_list)
    return label_list, word_matrix, count_matrix


def read_tf_file(path):
    """Open *path* as text and return ``parse_tf_lines`` applied to it."""
    with open(path) as handle:
        return parse_tf_lines(handle)


def build_vocabulary(word_matrix):
    """Return the sorted list of distinct word numbers found in *word_matrix*."""
    return sorted({word for words in word_matrix for word in words})


def build_count_frame(word_matrix, count_matrix, vocabulary):
    """Build a dense samples-by-vocabulary DataFrame of word frequencies.

    Args:
        word_matrix: list of per-sample lists of word numbers.
        count_matrix: list of per-sample lists of frequencies, aligned with
            *word_matrix*.
        vocabulary: sequence of word numbers used as the column labels, in
            column order.

    Returns:
        A DataFrame with one row per sample (default 0..n-1 index) and one
        column per vocabulary entry. Cells default to 0. Words that are not
        in *vocabulary* are ignored, and if a word repeats within a sample
        the last frequency wins.
    """
    # One dict lookup per (sample, word) pair replaces scanning the whole
    # vocabulary for every sample.
    column_of = {word: col for col, word in enumerate(vocabulary)}
    dense = np.zeros((len(word_matrix), len(vocabulary)), dtype=np.int64)
    for row, (words, counts) in enumerate(zip(word_matrix, count_matrix)):
        for word, count in zip(words, counts):
            col = column_of.get(word)
            if col is not None:
                # Write straight into the array; chained ``df.loc[i][j] = v``
                # is not guaranteed to modify the frame.
                dense[row, col] = count
    return pd.DataFrame(dense, columns=list(vocabulary))


if __name__ == "__main__":
    # scikit-learn is only needed by the driver, so it is imported here and
    # the parsing helpers above stay importable without it.
    from sklearn.metrics import accuracy_score
    from sklearn.naive_bayes import GaussianNB

    # generate three 2D lists for label, word number, and word frequency
    label_list, word_matrix, count_matrix = read_tf_file(TRAIN_PATH)
    print(label_list, word_matrix, count_matrix)

    # the training vocabulary defines the feature columns for both datasets
    vocabulary = build_vocabulary(word_matrix)

    # training features (the original notes report 4000*41675 for this data)
    count_output = build_count_frame(word_matrix, count_matrix, vocabulary)
    print(count_output)

    # for the test dataset, follow the same procedure as the training dataset
    label_list_tst, word_matrix_tst, count_matrix_tst = read_tf_file(TEST_PATH)
    print(label_list_tst, word_matrix_tst, count_matrix_tst)

    # test features use the training columns; unseen words are dropped
    count_output_tst = build_count_frame(
        word_matrix_tst, count_matrix_tst, vocabulary)
    print(count_output_tst)

    # fitting the Naive Bayes classifier based on the training dataset
    # (.values replaces DataFrame.as_matrix(), which newer pandas removed)
    x = count_output.values
    y = np.array(label_list)
    nbc = GaussianNB()
    nbc.fit(x, y)

    # predicting the outcome of test dataset based on the fitted classifier
    x_tst = count_output_tst.values
    pred = nbc.predict(x_tst)

    # accuracy = fraction of test labels predicted correctly; print it so the
    # result is visible when the file is run as a script
    accuracy = accuracy_score(np.array(label_list_tst), pred, normalize=True)
    print(accuracy)