accuracy_checker.py
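
"""Accuracy checker for a Korean fact-checking experiment.

For each labeled input file (label 1, "사실"/fact, through label 5,
"전혀 사실 아님"/not fact at all), every line of text is keyword-extracted
with KeyBERT over KoBERT, the keywords are searched against the Naver
News API, and the TF-IDF cosine similarity between the aggregated article
descriptions and the original line is bucketed into a 1-5 score. A
prediction counts as a match when it lands within one step of the file's
label.
"""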

import os
import re
import threading
import time

import requests
from keybert import KeyBERT
from kiwipiepy import Kiwi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import BertModel

def request_to_naver(url, headers, params, client_id, client_secret):
    headers['X-Naver-Client-Id'] = client_id
    headers['X-Naver-Client-Secret'] = client_secret
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 429:
        # Retry-After arrives as a string header; default to 1 second when absent.
        wait_time = float(response.headers.get('Retry-After', 1))
        print(f"Rate limit exceeded. Waiting for {wait_time} seconds.")
        time.sleep(wait_time)
        return request_to_naver(url, headers, params, client_id, client_secret)
    return response
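
# Note: the 429 retry above recurses with no depth cap, so a sustained
# rate limit keeps retrying until the API recovers.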

# Create the analyzer once; constructing Kiwi per call reloads its model.
kiwi = Kiwi()

def noun_extractor(text):
    results = []
    result = kiwi.analyze(text)
    for token, pos, _, _ in result[0][0]:
        # Keep multi-character tokens tagged as nouns (N*) or foreign words (SL).
        if len(token) != 1 and (pos.startswith('N') or pos.startswith('SL')):
            results.append(token)
    return results

def preprocess(text):
    nouns = noun_extractor(text)
    return ' '.join(nouns)

def get_keywords(kw_model, text):
    preprocessed_text = preprocess(text)
    # Top three 1-2 word keyphrases from the noun-only text.
    keywords = kw_model.extract_keywords(preprocessed_text, keyphrase_ngram_range=(1, 2), stop_words=None, top_n=3)
    return [item[0] for item in keywords]
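
# Illustrative only: for an input like "정부가 새 부동산 대책을 발표했다"
# (hypothetical sentence), get_keywords returns up to three noun-based
# keyphrases, e.g. something like ['부동산 대책', '정부', '발표'];
# the actual output depends on the KoBERT embeddings.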

# Labeled evaluation files, ordered from label 1 ("사실" / fact) to
# label 5 ("전혀 사실 아님" / not fact at all).
data = ["data/사실.txt", "data/대체로 사실.txt", "data/절반의 사실.txt", "data/대체로 사실 아님.txt", "data/전혀 사실 아님.txt"]

# This validation pass is more verbose than it needs to be, but it reports
# every missing or empty file before the run starts.
non_existent_files = []
empty_files = []
for file in data:
    if not os.path.exists(file):
        non_existent_files.append(file)
    elif os.path.getsize(file) == 0:
        empty_files.append(file)
for file in non_existent_files:
    print(f"{file} does not exist.")
for file in empty_files:
    print(f"{file} is empty.")
if non_existent_files:
    # The progress-bar setup below opens every file, so stop early here.
    raise SystemExit("Cannot continue with missing data files.")

# Naver API credentials are rotated across worker threads (one per file).
# In a real deployment these belong in environment variables, not source.
client_ids = ["iRRD0P6q5HghHv94n8K3", "DbXP8DUYGIe5PJWyCwJT", "YR9tV08g62h7FjrqYHDr"]
client_secrets = ["GBfTG8vmuU", "37HyAhWvgE", "ULJapPIC3z"]
url = "https://openapi.naver.com/v1/search/news.json"

model = BertModel.from_pretrained('skt/kobert-base-v1')
kw_model = KeyBERT(model)

# Korean stopword list, fetched once at startup.
stopwords_url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ko/master/stopwords-ko.txt"
response = requests.get(stopwords_url)
stopwords = response.text.splitlines()
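
# Note: the stopword download above runs at startup, and from_pretrained
# may download model weights on first use, so the script needs network
# access before any Naver queries are made.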

def process_text(extracted_text, client_id, client_secret):
    keywords = get_keywords(kw_model, extracted_text)
    all_descriptions = []
    for token in keywords:
        if token not in stopwords:
            params = {
                'query': token,
                'display': 100,
                'sort': 'sim'
            }
            headers = {}
            response = request_to_naver(url, headers, params, client_id, client_secret)
            payload = response.json()  # avoid shadowing the global `data` list
            descriptions = [item['description'] for item in payload['items']]
            # Strip the <b>...</b> highlighting the API puts around matches.
            descriptions = [re.sub('</?b>', '', description) for description in descriptions]
            all_descriptions.extend(descriptions)
    all_descriptions_combined = ' '.join(all_descriptions)
    all_descriptions_combined = all_descriptions_combined.replace('[사진]', '')
    all_descriptions_combined = all_descriptions_combined.replace('"', '')
    # Cosine similarity between the aggregated news text and the input,
    # bucketed into a 1 (most supported) to 5 (least supported) score.
    documents = [all_descriptions_combined, extracted_text]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    cos_similar = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    if cos_similar[0][0] > 0.20:
        result = 1
    elif cos_similar[0][0] > 0.15:
        result = 2
    elif cos_similar[0][0] > 0.10:
        result = 3
    elif cos_similar[0][0] > 0.05:
        result = 4
    else:
        result = 5
    return result, cos_similar[0][0]

match_count = 0
total_titles = 0
total_cos_sim = 0
# All worker threads mutate these counters, so guard updates with a lock.
counter_lock = threading.Lock()

def process_file(file_path, label, pbar, client_id, client_secret, file_results):
    global match_count, total_titles, total_cos_sim
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().split('\n')
    for j in text:
        # Skip blank lines: TfidfVectorizer raises on an empty document.
        if j.strip():
            result, cos_sim = process_text(j, client_id, client_secret)
            with counter_lock:
                # A prediction matches when it is within one step of the label.
                if abs(result - label) <= 1:
                    match_count += 1
                total_titles += 1
                total_cos_sim += cos_sim
                file_results[file_path]['total_titles'] += 1
                file_results[file_path]['total_cos_sim'] += cos_sim
        pbar.update(1)

def print_accuracy(file_results):
    # Poll while any worker thread is still running, reporting every minute.
    while any(thread.is_alive() for thread in threads):
        if total_titles > 0:
            accuracy = (match_count / total_titles) * 100
            avg_cos_sim = total_cos_sim / total_titles
            print(f"\nCurrent accuracy: {accuracy:.2f}%")
            print(f"Average cosine similarity: {avg_cos_sim:.4f}")
            for file_path, results in file_results.items():
                if results['total_titles'] > 0:
                    file_avg_cos_sim = results['total_cos_sim'] / results['total_titles']
                    print(f"File: {file_path}, Average cosine similarity: {file_avg_cos_sim:.4f}")
        time.sleep(60)

threads = []

# Total line count across all files, for the shared progress bar.
total_lines = 0
for file_path in data:
    with open(file_path, 'r', encoding='utf-8') as f:
        total_lines += sum(1 for _ in f)
pbar = tqdm(total=total_lines)

# Per-file accumulators for the periodic accuracy report.
file_results = {file_path: {'total_titles': 0, 'total_cos_sim': 0} for file_path in data}

# One worker thread per file; the file's position in `data` is its label.
for i, file_path in enumerate(data):
    label = i + 1
    client_id = client_ids[i % len(client_ids)]
    client_secret = client_secrets[i % len(client_secrets)]
    thread = threading.Thread(target=process_file, args=(file_path, label, pbar, client_id, client_secret, file_results))
    threads.append(thread)
    thread.start()

accuracy_thread = threading.Thread(target=print_accuracy, args=(file_results,))
accuracy_thread.start()

for thread in threads:
    thread.join()
accuracy_thread.join()
pbar.close()

print(match_count, "/", total_titles)
print("Done")  # originally printed "완료"
input()  # keep the console window open until Enter is pressed