-
Notifications
You must be signed in to change notification settings - Fork 0
/
onion.py
52 lines (43 loc) · 1.83 KB
/
onion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# from nltk.text import TextCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential
from tensorflow.keras.losses import binary_crossentropy
# from tensorflow_addons.metrics import F1Score
# from tensorflow.keras.metrics import F1Score
# Load the headline dataset. The script reads two columns from it:
# 'text' (the raw headline) and 'label' (the classification target).
data = pd.read_csv('onion-or-not.csv')
y = data['label']

# Split every headline into a list of word tokens.
data['tokens'] = data['text'].apply(word_tokenize)

# Reduce each token to its Porter stem.
stemmer = PorterStemmer()
data['stemmed'] = data['tokens'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

# Drop English stop words and re-join the remaining stems into one
# space-separated string per headline, the input format TfidfVectorizer expects.
# NOTE(review): the filter runs on already-stemmed tokens, so a stop word whose
# stem differs from its dictionary form is not removed. Filtering before
# stemming would change the vocabulary (and the feature count), so the order is
# deliberately left as is here -- confirm whether that is intended.
stop = set(stopwords.words('english'))
data['stop'] = data['stemmed'].apply(
    lambda tokens: ' '.join(word for word in tokens if word not in stop)
)

# Fit TF-IDF over the whole corpus; `v` is a sparse document-term matrix.
tf = TfidfVectorizer()
v = tf.fit_transform(data['stop'].to_numpy())

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on releases that predate it.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix);
# the resulting DataFrame is the same either way.
dense = v.toarray()
df = pd.DataFrame(dense)
print(df)
# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_X, val_X, train_y, val_y = train_test_split(
    df, y, train_size=0.75, test_size=0.25, random_state=0
)

# Small feed-forward binary classifier: two hidden ReLU layers and a single
# sigmoid output unit.
model = Sequential()
# Derive the input width from the feature matrix instead of hard-coding it, so
# the network still builds when the TF-IDF vocabulary size changes.
model.add(Dense(100, input_dim=df.shape[1], activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(
    loss=binary_crossentropy,
    optimizer='adam',
    metrics=['accuracy', 'Precision', 'Recall'],  # TODO: add an F1 score metric
)
model.fit(train_X, train_y)

# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability at
# 0.5, which keeps the (n_samples, 1) int32 result shape.
prediction = (model.predict(val_X) > 0.5).astype('int32')

print(precision_recall_fscore_support(val_y, prediction, average='macro'))
print(prediction)
# Arguments are in scikit-learn's (y_true, y_pred) order; accuracy itself is
# symmetric, so the printed value is unchanged.
print(accuracy_score(val_y, prediction))