# -*- coding: utf-8 -*-
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import logging
import configparser
import numpy as np
import jieba
import os
import sys
from collections import Counter
import threadpool
from random import randint
import time
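# Module-level lists shared by the threadpool workers used in drop_stopwords().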
drop_stopwords_sentences_list = []
stopwords_list = []
class DataPreprocess(object):
def __init__(self):
config_path = "sentiment_analysis.config"
self.config = configparser.ConfigParser()
self.config.read(config_path, encoding='utf-8-sig')
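        # The keys read throughout this class assume a config file laid out roughly like
        # the sketch below (section and key names are taken from the code; the example
        # values are placeholders, not the actual project settings):
        #
        #   [input_file_path]
        #   word2vec_bin_path = path/to/word2vec.bin
        #   word2vec_npy_path = path/to/word2vec.npy
        #   neg_file_path = path/to/neg_corpus_folder
        #   pos_file_path = path/to/pos_corpus_folder
        #   stopwords_path = path/to/stopwords.txt
        #
        #   [word2vec_parameter]
        #   dimension = 300
        #   labels_path = path/to/labels.txt
        #   total_comment_matrix_path = path/to/total_comment_matrix.txt
        #
        #   [threshold_parameter]
        #   most_percentage = 0.9
        #
        #   [lstm_hyperparameter]
        #   batch_size = 64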
# self.word2vec_model = KeyedVectors.load_word2vec_format(self.config.get("input_file_path", "word2vec_bin_path"),
# binary=True)
self.labels_path = self.config.get("word2vec_parameter", "labels_path")
self.total_comment_matrix_path = self.config.get("word2vec_parameter", "total_comment_matrix_path")
# model, vector, word_list, neg_tokenized_content_list, pos_tokenized_content_list, max_sen_length = self.load_file()
# self.neg_content_len, self.pos_content_len, self.total_content_matrix, self.model, self.max_sen_length = self.build_sen2vec_matrix(
# model, neg_tokenized_content_list, pos_tokenized_content_list, max_sen_length)
        # Generate the embedding matrix (vectorized text) and labels
# total_content_matrix, labels = self.generatewordembeddings(
# model, neg_tokenized_content_list, pos_tokenized_content_list, max_sen_length)
def duplicate_remove(self, sentences_list):
sentences_list = list(set(sentences_list))
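        # Note: set() removes duplicates but does not preserve the original sentence order.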
return sentences_list
def tokenize(self, sentence_list):
tokenized_sentences_list = []
for sentence in sentence_list:
one_sentence = []
cut_sentence = jieba.cut(sentence)
for word in cut_sentence:
one_sentence.append(word)
tokenized_sentences_list.append(one_sentence)
return tokenized_sentences_list
def threadpool_drop_stopwords(self, sentence):
drop_stopwords_one_sentence = ""
for word in sentence:
if word not in stopwords_list:
if word != "\t":
drop_stopwords_one_sentence += word
drop_stopwords_sentences_list.append(drop_stopwords_one_sentence)
def drop_stopwords(self, sentences_list):
tokenized_sentences_list = self.tokenize(sentences_list)
with open(self.config.get("input_file_path", "stopwords_path"), 'r') as stop_words_file:
for stop_words in stop_words_file.readlines():
stopwords_list.append(stop_words.replace("\n", ""))
pool = threadpool.ThreadPool(20)
requests = threadpool.makeRequests(self.threadpool_drop_stopwords, tokenized_sentences_list)
[pool.putRequest(req) for req in requests]
pool.wait()
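        # Each tokenized sentence is processed by one worker; the workers append to the
        # module-level drop_stopwords_sentences_list, so the output order may differ
        # from the input order.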
return drop_stopwords_sentences_list
def data_preprocess(self, sentences_list, remove_duplicate=True, stopwords_drop=False):
if remove_duplicate:
sentences_list = self.duplicate_remove(sentences_list)
if stopwords_drop:
            # Keep the stopword-filtered sentences instead of discarding the return value.
            sentences_list = self.drop_stopwords(sentences_list)
return sentences_list
"""
read negative and positive corpus in corpus folder, and collect these corpus in one list dividually.
Args:
file_path = negative or positive corpus foleder path
Return:
file_content_list which includes negative or positive all corpus
"""
def read_file_name(self, file_dir):
file_content_list = []
for root, dirs, files in os.walk(file_dir):
for file_name in files:
with open(os.path.join(root, file_name), 'r') as corpus_file:
                    # Accumulate the lines from every file instead of overwriting the list.
                    file_content_list.extend(corpus_file.readlines())
return file_content_list
"""
1. this function decides the max length of sentences we need to select
2. we use the 'most_common' function to adjust how many common sentences ought to be considered, where most_common ratio can be set in a config file.
3. we will select the max length of sentence which are in common sentences as the max_sen_length
arg:
tokenized_content_list: these sentences have been tokenized
return:
max_sen_length: this is a 'int' format
"""
def calculate_len_sen(self, tokenized_content_list):
sen_len_list = []
print("tokenized_content_list 的长度为 {0}".format(len(tokenized_content_list)))
for sen in tokenized_content_list:
sen_len_list.append(len(sen))
sen_counter = Counter(sen_len_list)
most_sen_tuple = sen_counter.most_common(
int(float(self.config.get("threshold_parameter", "most_percentage")) * len(sen_len_list)))
len_collection_list = []
for len_tuple in most_sen_tuple:
len_collection_list.append(len_tuple[0])
max_sen_length = max(len_collection_list)
return max_sen_length
"""
this function is used for tokenizing sentences with jieba tool
for example:
input = ["今天天气不错","上海今天温度是多少度?"]
output = [["今天","天气","不错"],["上海","今天","温度","是","多少","度","?"]]
args:
sen_list: these sentences is used to be tokenized
return:
tokenized_content_list: these sentences have been tokenized
"""
    def do_tokenize(self, sen_list):
        tokenized_content_list = []
        for sen in sen_list:
            # Use a fresh name so the input argument sen_list is not shadowed.
            tokenized_sen = []
            cut_sen = jieba.cut(sen)
            for word in cut_sen:
                tokenized_sen.append(word)
            tokenized_content_list.append(tokenized_sen)
        return tokenized_content_list
"""
1. chiefly this function is aim to load file, where detail steps are shown belown:
a. load the word2vec bin model
b. load the word2vec npy model
c. load the negative/positive comment text file
2. then the loaded comment files will be preprocessed in data_preprocess
a. remove_duplicate argument call help users to remove duplicate comment
b. stopwords_drop argument can help users to remove stopwords
3. the last step is tokenizing these sentences and calculate the max length of sentences
args:
null
return:
neg_tokenized_content_list: negative content which have tokenized list
pos_tokenized_content_lsit: positive content which have tokenized list
max_sen_length: the max_length of sentences that we need to select
"""
def load_word2vec(self):
start = time.time()
word2vec_model = KeyedVectors.load_word2vec_format(self.config.get("input_file_path", "word2vec_bin_path"),
binary=True)
tmp = time.time()
print("加载 word2vec_model 花费时间为: ", tmp - start)
logging.info("loading word2vec bin file...")
logging.info("loading word2vec npy file ...")
vector = np.load(self.config.get("input_file_path", "word2vec_npy_path"))
        # KeyedVectors exposes the vocabulary directly (on gensim >= 4 use .key_to_index).
        word_list = word2vec_model.vocab.keys()
end = time.time()
print("加载vector ,word_list花费时间为: " , end - tmp)
return word2vec_model, vector, word_list
def load_corpus(self):
logging.info("loading nagetive and positive file ...")
neg_content_list = self.read_file_name(self.config.get("input_file_path", "neg_file_path"))
pos_path = self.config.get("input_file_path", "pos_file_path")
print("pos_content_list 路径为: ", pos_path)
pos_content_list = self.read_file_name(pos_path)
print("neg_content_list 的长度为 {0}".format(len(neg_content_list)))
print("pos_content_list 的长度为 {0}".format(len(pos_content_list)))
logging.info("data preprocessing ...")
neg_content_list = self.data_preprocess(sentences_list=neg_content_list, remove_duplicate=True,
stopwords_drop=False)
pos_content_list = self.data_preprocess(sentences_list=pos_content_list, remove_duplicate=True,
stopwords_drop=False)
print("neg_content_list 的长度为 {0}".format(len(neg_content_list)))
print("pos_content_list 的长度为 {0}".format(len(pos_content_list)))
logging.info("sentence tokenized ...")
neg_tokenized_content_list = self.do_tokenize(neg_content_list)
pos_tokenized_content_list = self.do_tokenize(pos_content_list)
max_sen_length = self.calculate_len_sen(pos_tokenized_content_list + neg_tokenized_content_list)
neg_content_len = len(neg_tokenized_content_list)
pos_content_len = len(pos_tokenized_content_list)
return neg_tokenized_content_list, pos_tokenized_content_list, max_sen_length, neg_content_len, pos_content_len
    def build_train_sen(self, sen_list, max_sen_length, model):
        total_sen_vec_list = []
        word_dimension = int(self.config.get("word2vec_parameter", "dimension"))
        for record_num, sen in enumerate(sen_list):
            one_sen_vec_list = []
            if len(sen) >= max_sen_length:
                # Truncate long sentences to the first max_sen_length words.
                for num in range(0, max_sen_length):
                    try:
                        one_sen_vec_list.append(model[sen[num]])
                    except KeyError:
                        # Out-of-vocabulary words are mapped to a zero vector.
                        one_sen_vec_list.append(np.zeros(word_dimension))
            else:
                for word in sen:
                    try:
                        one_sen_vec_list.append(model[word])
                    except KeyError:
                        one_sen_vec_list.append(np.zeros(word_dimension))
                # Pad short sentences with zero vectors up to max_sen_length.
                for num in range(len(sen), max_sen_length):
                    one_sen_vec_list.append(np.zeros(word_dimension))
            total_sen_vec_list.append(np.array(one_sen_vec_list))
        return np.array(total_sen_vec_list)
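    # Note: build_train_sen returns an array of shape
    # (len(sen_list), max_sen_length, word_dimension).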
def get_train_batch(self, total_content_matrix, neg_content_len, pos_content_len, max_sen_length):
batch_size = int(self.config.get("lstm_hyperparameter", "batch_size"))
word_dimension = int(self.config.get("word2vec_parameter", "dimension"))
labels = []
batch_matrix = np.zeros([batch_size, max_sen_length, word_dimension])
for i in range(batch_size):
if (i % 2 == 0):
num = randint(1, neg_content_len - 1000)
labels.append([0, 1])
else:
num = randint(neg_content_len + 2, neg_content_len + pos_content_len - 1000)
labels.append([1, 0])
batch_matrix[i] = total_content_matrix[num - 1:num]
return batch_matrix, labels
def get_test_batch(self, total_content_matrix, neg_content_len, pos_content_len, max_sen_length):
batch_size = int(self.config.get("lstm_hyperparameter", "batch_size"))
word_dimension = int(self.config.get("word2vec_parameter", "dimension"))
labels = []
batch_matrix = np.zeros([batch_size, max_sen_length, word_dimension])
for i in range(batch_size):
if (i % 2 == 0):
                # randint is inclusive on both ends, so stop at neg_content_len to stay
                # inside the negative block (rows 0 .. neg_content_len - 1).
                num = randint(neg_content_len - 1999, neg_content_len)
labels.append([0, 1])
else:
num = randint(neg_content_len + pos_content_len - 1999, neg_content_len + pos_content_len)
labels.append([1, 0])
batch_matrix[i] = total_content_matrix[num - 1:num]
return batch_matrix, labels
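    # get_train_batch / get_test_batch rely on the row layout produced by
    # generatewordembeddings: the first neg_content_len rows of total_content_matrix are
    # negative comments and the remaining pos_content_len rows are positive. Training
    # batches sample away from the tail of each block, while test batches sample from
    # roughly the last 2000 rows of each block.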
def batch_iter(self, data, batch_size, num_epochs, shuffle=True):
"""
Generates a batch iterator for a dataset.
"""
data = np.array(data)
data_size = len(data)
num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
for epoch in range(num_epochs):
# Shuffle the data at each epoch
if shuffle:
shuffle_indices = np.random.permutation(np.arange(data_size))
shuffled_data = data[shuffle_indices]
else:
shuffled_data = data
for batch_num in range(num_batches_per_epoch):
start_index = batch_num * batch_size
end_index = min((batch_num + 1) * batch_size, data_size)
yield shuffled_data[start_index:end_index]
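    # Usage sketch (batch_size and num_epochs are illustrative values): iterate over
    # shuffled index batches and slice the embedding matrix with them.
    #   for idx_batch in data.batch_iter(np.arange(len(labels)), batch_size=64, num_epochs=5):
    #       x_batch = total_content_matrix[idx_batch]
    #       y_batch = labels[idx_batch]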
def generatewordembeddings(self, model, neg_tokenized_content_list, pos_tokenized_content_list, max_sen_length,
):
neg_content_len = len(neg_tokenized_content_list)
pos_content_len = len(pos_tokenized_content_list)
neg_content_array = self.build_train_sen(neg_tokenized_content_list, max_sen_length, model)
pos_content_array = self.build_train_sen(pos_tokenized_content_list, max_sen_length, model)
total_content_matrix = np.concatenate((neg_content_array, pos_content_array), axis=0)
labels = np.ones((len(total_content_matrix), 1))
labels[:neg_content_len] = 0
if not os.path.isfile(self.total_comment_matrix_path):
file = open(self.total_comment_matrix_path, 'w')
file.close()
if not os.path.isfile(self.labels_path):
file = open(self.labels_path, 'w')
file.close()
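        # Note: np.savetxt creates the output files itself, so the pre-creation above is
        # not strictly required.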
"""
import numpy as np
保存多维数组的方式
a = np.random.random((2, 3, 4, 5))
header = ','.join(map(str, a.shape))
np.savetxt('test.txt', a.reshape(-1, a.shape[-1]), header=header,
delimiter=',')
加载保存多维数组的方式
with open('test.txt') as f:
shape = map(int, f.next()[1:].split(','))
b = np.genfromtxt(f, delimiter=',').reshape(shape)
# 在python2时, 使用 f.next()
# python3
#map(function, iterable, ...) # 返回的不是list 而是 map object
#list(map(function, iterable, ...)) # 转成list
"""
total_content_matrix_header = ','.join(map(str, total_content_matrix.shape))
np.savetxt(self.total_comment_matrix_path, total_content_matrix.reshape(-1, total_content_matrix.shape[-1]),
header=total_content_matrix_header,
delimiter=',')
labels_header = ','.join(map(str, labels.shape))
np.savetxt(self.labels_path, labels.reshape(-1, labels.shape[-1]), header=labels_header,
delimiter=',')
# print("total_content_matrix_header: " + total_content_matrix)
# print("labels_header: " + labels_header)
return total_content_matrix, labels
def loadembeddings(self):
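        # The '# <shape>' header written by np.savetxt in generatewordembeddings is read
        # back here to restore the original array shapes.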
with open(self.total_comment_matrix_path) as f:
shape = list(map(int, f.readline()[1:].split(',')))
total_content_matrix = np.genfromtxt(f, delimiter=',').reshape(shape)
with open(self.labels_path) as f:
shape = list(map(int, f.readline()[1:].split(',')))
labels = np.genfromtxt(f, delimiter=',').reshape(shape)
return total_content_matrix, labels
if __name__ == '__main__':
data = DataPreprocess()
    # Load the corpus
neg_tokenized_content_list, pos_tokenized_content_list, max_sen_length, neg_content_len, pos_content_len = data.load_corpus()
    # Load the word2vec model
word2vec_model, vector, word_list = data.load_word2vec()
    # Generate the comment embedding matrix
total_content_matrix, labels = data.generatewordembeddings(word2vec_model, neg_tokenized_content_list,
pos_tokenized_content_list, max_sen_length)
    # Load the comment embedding matrix
total_content_matrix, labels = data.loadembeddings()
print(total_content_matrix.shape)
print(labels.shape)