import os
import gensim
import nltk
import numpy as np
import pandas as pd
import preprocessor
import scipy.sparse as sp
from spellchecker import SpellChecker
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from definitions import ROOT_DIR


class Parser:
    ENGINE_ADDR = 'postgresql+psycopg2://postgres:password@localhost:5432/iot_tweet'  # 'postgresql+psycopg2://postgres:password@/iot_tweet?host=/cloudsql/iot-tweet:europe-west3:main-instance'

    def __init__(self):
        self.load_nltk()
        self.model = None  # word2vec model, lazily loaded by load_w2v_model()
        self.abbreviations = None  # slang dictionary, lazily loaded by load_abbreviations()
        self.spell_check = None  # spell checker, lazily loaded by load_spell_check()
        self.session = None
        # Strip URLs, mentions, reserved words (RT, FAV), emojis and smileys from tweets
        preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.MENTION, preprocessor.OPT.RESERVED,
                                 preprocessor.OPT.EMOJI, preprocessor.OPT.SMILEY)
        self.load_db_tweets()

    def load_db_tweets(self):
        engine = create_engine(Parser.ENGINE_ADDR, echo=True)
        Session = sessionmaker(bind=engine)
        self.session = Session()

    def clean_tweet(self, tweet_text):
        """
        Take a raw tweet and return a cleaned list of tokens
        :param tweet_text: raw text of the tweet
        :return: list of token words
        """
        tweet = preprocessor.clean(tweet_text)
        # Keep hashtag words but drop the leading '#'
        tokens = [word[1:] if word.startswith('#') else word for word in tweet.split(' ')]
        tokens = self.replace_abbreviations(tokens)
        tokens = self.remove_stopwords_spelling_mistakes(tokens)
        tokens = gensim.utils.simple_preprocess(' '.join(tokens))
        return tokens
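
    # Illustrative example (the exact output depends on the slang and stopword lists):
    #   clean_tweet("OMG my #smarthome hub is offline http://t.co/abc @vendor")
    #   -> something like ['oh', 'my', 'god', 'smarthome', 'hub', 'offline']
    # Note that the 'my' inside the expanded "Oh My God" survives, because stopword
    # removal runs before simple_preprocess splits the expansion into single words.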

    def load_spell_check(self):
        if self.spell_check is not None:
            return
        self.spell_check = SpellChecker()

    def load_abbreviations(self):
        if self.abbreviations is not None:
            return
        file_name = os.path.join(ROOT_DIR, "corpus/slang.txt")
        self.abbreviations = {}
        with open(file_name, 'r') as file:
            for line in file:
                parts = line.rstrip('\n').split('=', 1)
                if len(parts) == 2:  # skip malformed lines instead of crashing
                    self.abbreviations[parts[0].upper()] = parts[1]
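
    # slang.txt is expected to hold one "ABBREVIATION=expansion" pair per line, e.g.:
    #   OMG=Oh My God
    # (the second example below is illustrative, not necessarily in the file)
    #   BRB=Be Right Back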

    def replace_abbreviations(self, tokens):
        """
        Replace abbreviations (e.g. OMG -> Oh My God) based on the dictionary in slang.txt
        :param tokens: words of the tweet
        :return: words with abbreviations replaced by their meaning
        """
        self.load_abbreviations()
        # Keys are stored upper-cased, so look tokens up case-insensitively
        return [self.abbreviations.get(token.upper(), token) for token in tokens]

    def remove_stopwords_spelling_mistakes(self, tokens):
        """
        Remove English stopwords (spelling correction is currently disabled)
        :param tokens: words of the tweet
        :return: words with stopwords removed
        """
        # self.load_spell_check()
        # Build the stopword set once instead of once per token, and compare
        # case-insensitively since the NLTK list is lower-case
        stopwords = set(nltk.corpus.stopwords.words('english'))
        return [token for token in tokens if token.lower() not in stopwords]

    def get_vector(self, tweet_id, as_np_array=False):
        """
        Return the vector of a specific tweet
        :param tweet_id: id of the tweet
        :param as_np_array: convert the vector into a numpy.array
        :return: vector as a list or numpy.array, or None if the tweet is unknown
        """
        from models.tweet import Tweet
        result = self.session.query(Tweet.vector).filter_by(id=int(tweet_id)).first()  # query once, not twice
        if result is None:
            return None
        vector = result[0]
        if as_np_array:
            vector = np.array(vector)
        return vector

    def get_all_vectors(self, tweet_ids=None, limit=None):
        """
        Return all the vectors
        :param tweet_ids: if specified, only return the vectors of these tweet ids
        :param limit: maximum number of results
        :return: dict tweet_id -> vector (list)
        """
        from models.tweet import Tweet
        query = self.session.query(Tweet.id, Tweet.vector)
        if tweet_ids is not None:
            query = query.filter(Tweet.id.in_(tweet_ids))
        if limit is not None:
            query = query.limit(limit)
        return dict(query.all())
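
    # Illustrative usage (ids and values are made up):
    #   parser.get_all_vectors(tweet_ids=[123, 456], limit=10)
    #   -> {123: [0.12, -0.03, ...], 456: [...]}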

    @staticmethod
    def parsing_vector_corpus_pandas(corpus_path, separator='\t', categorize=False, vector_asarray=True):
        """
        Parse a corpus that contains a Vector column and return a Pandas DataFrame
        :param corpus_path: path of the corpus
        :param separator: column separator
        :param categorize: make the tweet and user ids start at 0
        :param vector_asarray: convert the Vector column from string to numpy.array
        :return: pandas.DataFrame
        """
        # Reuse the base parser instead of duplicating its logic
        df = Parser.parsing_base_corpus_pandas(corpus_path, separator=separator, categorize=categorize)
        # Converting the vectors takes about 20 seconds
        if vector_asarray:
            df['Vector'] = df.apply(lambda row: Parser.vector_string_to_array(row['Vector']), axis=1)
        return df
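
    # The corpus is expected to be a separator-delimited file with at least TweetID,
    # User_ID and Vector columns, where Vector is serialized as '[0.1, 0.2, ...]'
    # (inferred from the column names and parsing used above).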

    @staticmethod
    def parsing_base_corpus_pandas(corpus_path, separator='\t', categorize=False):
        """
        Parse the corpus and return a Pandas DataFrame
        :param corpus_path: path of the corpus
        :param separator: column separator
        :param categorize: make the tweet and user ids start at 0
        :return: pandas.DataFrame
        """
        df = pd.read_csv(corpus_path, sep=separator, dtype={'User_ID': object})  # , index_col="TweetID"
        df = df.dropna(subset=['User_ID'])  # remove tweets without users
        if categorize:
            df['User_ID_u'] = df.User_ID.astype('category').cat.codes.values
            df['TweetID_u'] = df.TweetID.astype('category').cat.codes.values
            df = df[df.User_ID_u >= 0]
        return df

    @staticmethod
    def corpus_to_sparse_matrix(corpus_path):
        # The categorized ids are needed here: the raw User_ID/TweetID values are
        # strings or very large integers and cannot serve as matrix indices
        corpus = Parser.parsing_vector_corpus_pandas(corpus_path, categorize=True)
        num_users = corpus.User_ID_u.max() + 1
        num_tweets = corpus.TweetID_u.max() + 1
        print(num_users, 'users')
        print(num_tweets, 'tweets')
        # Construct a user x tweet interaction matrix, 1 where the user posted the tweet
        mat = sp.dok_matrix((num_users, num_tweets), dtype=np.float32)
        for index, tweet in corpus.iterrows():
            mat[int(tweet.User_ID_u), int(tweet.TweetID_u)] = 1.
        return mat

    @staticmethod
    def vector_string_to_array(vector):
        """Convert a serialized vector string back into a numpy.array"""
        return np.asarray([float(x) for x in vector[1:-1].split(', ')])
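
    # e.g. Parser.vector_string_to_array('[0.5, -1.0]') -> array([ 0.5, -1. ])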

    def tweet2vec(self, tweet_text):
        """Average the word2vec vectors of the tweet's words"""
        sentence_vector = []
        self.load_w2v_model()
        for word in tweet_text:
            try:
                sentence_vector.append(self.model.wv[word])
            except KeyError:
                # words that are not in the model's vocabulary are skipped
                pass
        # if none of the tweet's words appear in the model, fall back to a zero vector
        # ("tax" is just an arbitrary in-vocabulary word used to get the right dimensionality)
        if len(sentence_vector) == 0:
            sentence_vector.append(np.zeros_like(self.model.wv["tax"]))
        return np.mean(sentence_vector, axis=0, dtype=float)
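
    # With GoogleNews-vectors-negative300 the result is a 300-dimensional vector
    # (the mean of the word vectors), regardless of the tweet's length.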

    def load_w2v_model(self,
                       path_to_pretrained_model=os.path.join(ROOT_DIR, 'corpus/GoogleNews-vectors-negative300.bin')):
        if self.model is not None:
            return
        self.model = gensim.models.KeyedVectors.load_word2vec_format(path_to_pretrained_model, binary=True)
        print('GoogleNews-vectors LOADED')
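
    # GoogleNews-vectors-negative300.bin is Google's pre-trained word2vec model
    # (300-dimensional vectors); given its size it presumably has to be downloaded
    # separately and placed in corpus/.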

    @staticmethod
    def add_vector_to_corpus(corpus_path, new_corpus_path, write_every=1000):
        """
        Create a new corpus with CleanedText and Vector columns added,
        and the URLs joined by spaces if there are several
        :param corpus_path: path of the input corpus
        :param new_corpus_path: path of the corpus to write
        :param write_every: flush to the output file every x lines
        :return:
        """
        parser = Parser()
        parser.load_w2v_model()
        corpus = open(corpus_path, 'r', encoding='utf-8')
        new_corpus = open(new_corpus_path, 'w', encoding='utf-8')
        lines = corpus.readlines()
        corpus.close()
        new_lines = []
        last_written = -1
        # Extend the header line with the two new columns
        new_lines.append(lines[0][:-1] + '\tCleanedText\tVector\n')
        for i in range(1, len(lines)):
            parts = lines[i][:-1].split('\t')
            cleaned_tweet = parser.clean_tweet(parts[-6])
            urls = parts[5:-6]
            new_lines.append(
                '\t'.join(parts[:5]) + '\t' +  # TweetID Sentiment TopicID Country Gender
                ' '.join(urls) + '\t' +  # URLs separated by space
                parts[-6] + '\t' +  # Text
                parts[-5] + '\t' +  # User_ID
                parts[-4][1:-1] + '\t' +  # User_Name without quotes
                parts[-3][1:-1] + '\t' +  # Date without quotes
                '\t'.join(parts[-2:]) + '\t' +  # Hashtags Indication
                ' '.join(cleaned_tweet) + '\t' +  # CleanedText
                str(list(parser.tweet2vec(cleaned_tweet)))  # Vector
                + '\n')
            if i % write_every == 0:
                new_corpus.write(''.join(new_lines[(last_written + 1):]))
                last_written = i
                print(str(last_written) + '/' + str(len(lines)) + ' processed')
        new_corpus.write(''.join(new_lines[(last_written + 1):]))
        new_corpus.close()
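
    # Expected input columns (tab-separated), as assumed by the slicing above:
    # TweetID, Sentiment, TopicID, Country, Gender, one or more URL columns,
    # Text, User_ID, User_Name, Date, Hashtags, Indication.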

    def load_nltk(self):
        # todo : find another solution for nltk download !
        # Work around SSL certificate errors when downloading NLTK data
        import ssl
        try:
            _create_unverified_https_context = ssl._create_unverified_context
        except AttributeError:
            pass
        else:
            ssl._create_default_https_context = _create_unverified_https_context
        nltk.download('stopwords')


if __name__ == '__main__':
    p = Parser()
    print(p.get_all_vectors(limit=20))
    exit()
    # Unreachable while the exit() above is in place; kept as a second example
    vector = p.get_vector(80434341692663808089, as_np_array=True)
    print(vector)
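
    # To regenerate a corpus with CleanedText/Vector columns (paths are illustrative,
    # not taken from the repository):
    #   Parser.add_vector_to_corpus('corpus/tweets.tsv', 'corpus/tweets_vectors.tsv')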