# Load libraries
import os
import unicodedata
import re
import pandas as pd
import numpy as np
import gensim
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
import pickle
import argparse

# Load functions:
# Compiled regular expressions used by the cleaning functions below
symbols_to_space = re.compile(r"[/\|\n(\; )|(\: )|( \()|(\) )|( \")|(\" )|( \')|(\' )|\t]")  # separators replaced by a space
symbols_to_remove = re.compile(r"[\"\'\$\€\£\(\)\:\[\]\.\,\>\<\?\-\_]")  # punctuation and currency symbols dropped
space_repetition = re.compile(r" {2,}")  # collapse runs of spaces
words_with_numbers = re.compile(r"\w*\d\w*")  # drop tokens that contain digits


def transmission_categories_files(main_path):
    """
    Read the category folders found under main_path.
    """
    cats = []
    try:
        cats = os.listdir(main_path)
        num_cats = len(cats)
        if num_cats > 0:
            print("---There are {} categories to train---".format(num_cats))
        else:
            print("---There are no category folders, please add some!---")
    except FileNotFoundError:
        print("---That path does not exist, create it or change it!---")
    return cats


def files_corrector(main_path, cats):
    """
    Normalize odd file names inside each category folder.
    """
    print("---Formatting file names---")
    for i_cat in cats:
        cat_path = os.path.join(main_path, i_cat)
        os.chdir(cat_path)
        txt_cat_files = os.listdir(cat_path)
        for i_file in txt_cat_files:
            if re.search(r"^.!", i_file):
                prev_name = re.search(r'^.![^!]+!', i_file).group(0)
                new_name = re.sub(r'\.|\!', '', prev_name)
                os.rename(i_file, new_name)
    os.chdir(main_path)
    print("DONE")


def text_to_dict(main_path, cats):
    """
    Read every text file and store its lines in a list of {category: lines} dictionaries.
    """
    pruebas_dict = []
    try:
        for i_cat in cats:
            print("---Loading cat {} ---".format(i_cat))
            cat_path = os.path.join(main_path, i_cat)
            os.chdir(cat_path)
            txt_cat_files = os.listdir(cat_path)
            for i_file in txt_cat_files:
                with open(i_file) as file:
                    text_data = file.readlines()
                aux_text_dict = {i_cat: text_data}  # accessible later via .keys() / .values()
                pruebas_dict.append(aux_text_dict)
    except OSError:
        print("---Still no category folders in that path. Please add the data or change the path---")
    os.chdir(main_path)
    return pruebas_dict


def canonize_language(text):
    """
    Lower-case the text, strip accents and normalize symbols and spaces.
    """
    text = strip_accents(text.strip().lower())
    text = symbols_to_space.sub(" ", text)
    text = symbols_to_remove.sub("", text)
    text = space_repetition.sub(" ", text)
    text = words_with_numbers.sub("", text)
    return text.strip()


def simple_tokenizer(text, min_token_length=0):
    """
    Split text on spaces and drop tokens shorter than min_token_length.
    """
    tokens = text.split(" ")
    if min_token_length > 0:
        tokens = filter(lambda x: len(x) >= min_token_length, tokens)
    return tokens


def strip_accents(input_str):
    """
    Remove accents: decompose to NFKD and drop the combining marks.
    """
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return "".join(ch for ch in nfkd_form if not unicodedata.combining(ch))


# lemmer = WordNetLemmatizer()
def lem_word(txt_list, lemmer):
    """
    Lemmatize a word (as a verb).
    """
    return lemmer.lemmatize(txt_list, "v")


# stemmer = PorterStemmer()
def stem_word(txt_list, stemmer):
    """
    Stem a word.
    """
    return stemmer.stem(txt_list)


def join_lines_txt(txt_list, separator=' '):
    """
    Join the elements of a list into one string.
    """
    return separator.join(txt_list)


def annotation_weight_representation(venue_annotation):
    """
    Look up each token of the annotation in the word2vec vocabulary and return
    the average of the embeddings that were found (NaN if none were found).
    """
    venue_vectors = []
    count_model_included = 0
    count_model_nonincluded = 0
    idx_token_vectors = 0
    tags = venue_annotation.split()
    word_vectors = np.empty(shape=(len(tags), 300))
    idx_word_vectors = 0
    for tag in tags:
        tag = tag.replace('_', ' ')
        if tag in model.vocab:  # `model` is the word2vec KeyedVectors loaded in the main block (gensim < 4 API)
            word_vectors[idx_word_vectors] = model[tag]
            idx_word_vectors += 1
        else:
            # Multi-word tag: average the embeddings of its individual tokens
            tokens = tag.split()
            token_vectors = np.empty(shape=(len(tokens), 300))
            idx_token_vectors = 0
            for token in tokens:
                if token in model.vocab:
                    token_vectors[idx_token_vectors] = model[token]
                    idx_token_vectors += 1
            if idx_token_vectors > 0:
                word_vectors[idx_word_vectors] = np.average(token_vectors[:idx_token_vectors], axis=0)
                idx_word_vectors += 1
    if idx_word_vectors != 0 or idx_token_vectors != 0:
        count_model_included += 1
        venue_vectors.append(np.average(word_vectors[:idx_word_vectors], axis=0))
    else:
        count_model_nonincluded += 1
        venue_vectors.append(np.nan)
    return venue_vectors[0]
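

# Illustrative example (assumes the Google News vectors are loaded into `model`
# as in the main block below): annotation_weight_representation("engine failure")
# would return the element-wise average of the 300-d vectors for "engine" and "failure".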


def parse_args():
    """Parse and return command-line arguments."""
    parser = argparse.ArgumentParser(description='Train Classifier')
    parser.add_argument(
        '--dataset',
        dest='dataset_folder',
        default='dataset',
        help='Folder containing one sub-folder of text files per category')
    return parser.parse_args()
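

# Example invocation (assuming the script is run from a directory that contains
# both the dataset folder and the pretrained GoogleNews-vectors-negative300.bin file):
#   python train.py --dataset dataset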


if __name__ == '__main__':
    args = parse_args()
    # Read data
    name_folder = args.dataset_folder
    initial_path = os.getcwd()
    main_path = os.path.join(os.getcwd(), name_folder)
    cats = transmission_categories_files(main_path)
    files_corrector(main_path, cats)
    data_files = text_to_dict(main_path, cats)
    # Clean data
    clean_data = []
    n_chars = 4  # minimum token length kept by simple_tokenizer
    stemmer = PorterStemmer()
    lemmer = WordNetLemmatizer()
    other_words = ["@", "that", "than", "=", "*", "^", "+", "-", "these"]  # tokens containing these are dropped
    # Iterate through each doc
    for i_txt in np.arange(0, len(data_files)):
        txt = list(data_files[i_txt].values())[0]
        clean_txt = []
        # Iterate over the lines of each doc (skipping the first one) and join them into one string
        for i_sub_txt in np.arange(1, len(txt)):
            i_line = txt[i_sub_txt]
            txt_f1 = canonize_language(i_line)
            txt_f2 = simple_tokenizer(txt_f1, n_chars)
            txt_f2 = list(txt_f2)
            lemmed_vector = [lem_word(element_txt, lemmer) for element_txt in txt_f2]  # look up in the WordNet dictionary
            # Drop tokens containing any of the characters/words in other_words
            # (e.g. "@", since e-mail addresses may not correlate with the categories)
            lemmed_vector = [ele for ele in lemmed_vector if all(ch not in ele for ch in other_words)]
            lemmed_vector = join_lines_txt(lemmed_vector)
            clean_txt.append(lemmed_vector)
        clean_merged_txt = join_lines_txt(clean_txt)
        # Second cleaning step after merging: trim and collapse repeated spaces
        clean_merged_txt = clean_merged_txt.strip()
        clean_merged_txt = space_repetition.sub(" ", clean_merged_txt)
        aux_clean_data = {list(data_files[i_txt].keys())[0]: clean_merged_txt}
        clean_data.append(aux_clean_data)
    # Tabular data: one row per document with its cleaned text and category label
    rows = []
    for irow in np.arange(0, len(clean_data)):  # start at 0 so the first document is included
        auxrow = clean_data[irow]
        target = list(auxrow.keys())[0]
        transmissions = list(auxrow.values())[0]
        rows.append(pd.DataFrame([[transmissions, target]], columns=["transmissions", "target"]))
    transmissions_df = pd.concat(rows, ignore_index=True)
    # Prepare data: shuffle the rows and load the pretrained word2vec model
    transmissions_df = transmissions_df.reindex(np.random.permutation(transmissions_df.index)).reset_index(drop=True)
    model = gensim.models.KeyedVectors.load_word2vec_format(
        os.path.join(initial_path, "GoogleNews-vectors-negative300.bin"), binary=True)
    transmissions_df["vector_rep"] = transmissions_df["transmissions"].apply(
        lambda x: annotation_weight_representation(x))
    # Expand each 300-d embedding into its own columns and attach the target label to build the master table
    embeddings_df = transmissions_df['vector_rep'].apply(pd.Series)
    embeddings_df = embeddings_df.rename(columns=lambda x: 'element_' + str(x))
    embeddings_df = pd.merge(embeddings_df, transmissions_df["target"], left_index=True, right_index=True)
    # Split into train/test (80/20) and drop rows with no embedding
    sz = embeddings_df.shape
    train = embeddings_df.iloc[:int(sz[0] * 0.8), :]
    test = embeddings_df.iloc[int(sz[0] * 0.8):, :]
    train = train.dropna()
    test = test.dropna()
    # Separate the 300 embedding columns (features) from the label column
    x_train = train.iloc[:, :300]
    y_train = train.iloc[:, 300]
    x_test = test.iloc[:, :300]
    y_test = test.iloc[:, 300]
    # Train a random forest on the averaged embeddings
    rf_clf = RandomForestClassifier(n_estimators=1500, class_weight="balanced", max_depth=5, random_state=655321)
    clf_model = rf_clf.fit(x_train, y_train)
    # Save model
    with open(os.path.join(initial_path, 'classifier.pkl'), 'wb') as output:
        pickle.dump(clf_model, output)
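
# Minimal sketch (illustrative only, not executed during training) of how the saved
# classifier could be reloaded for inference, assuming the same word2vec model and
# preprocessing used above are available:
#   with open('classifier.pkl', 'rb') as f:
#       clf = pickle.load(f)
#   vec = annotation_weight_representation(canonize_language("some transmission text"))
#   print(clf.predict([vec]))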