-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathpreprocessor.py
63 lines (54 loc) · 1.62 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import spacy
from os import listdir
from os.path import isfile, join
import numpy as np
# Load the spaCy pipeline once at import time; Dataset uses it to obtain
# per-token and per-sentence vectors.
nlp = spacy.load('en')
print("Loaded Vectorizer.")

# Directory expected to hold one "<label>.txt" file per intent class.
data_path = 'data/intent_classes/'

# Class labels are the stems of the "*.txt" files found in data_path.
# Only files that really end in ".txt" are considered, and exactly that
# suffix is stripped, so that data_path + label + '.txt' (the path Dataset
# opens) always names an existing file. Stray entries such as ".DS_Store"
# or "README" are skipped, and a dotted stem such as "a.b.txt" is kept
# whole as "a.b".
# NOTE: os.listdir returns entries in arbitrary order, so the position of a
# label in this list (and therefore its one-hot index in
# pad_class_sequence) is not guaranteed to be the same on every machine.
labels = [
    f[:-len('.txt')]
    for f in listdir(data_path)
    if f.endswith('.txt') and isfile(join(data_path, f))
]
class Dataset(object):
    """In-memory intent dataset built from the per-label text files.

    For every entry of the module-level ``labels`` list, the file
    ``data_path + label + '.txt'`` is read and each non-empty line is
    treated as one example sentence, which is run through ``nlp``.

    The four attributes below are parallel lists (index ``i`` refers to the
    same example in each of them):

    * ``X_all_sent``    -- the raw sentence string.
    * ``X_all_vec_seq`` -- a list with the ``.vector`` of every token.
    * ``X_all_doc_vec`` -- the ``.vector`` of the whole parsed sentence.
    * ``Y_all``         -- the label whose file the sentence came from.
    """

    def __init__(self):
        """Read every label file and vectorize its non-empty lines.

        Raises whatever ``open`` raises (e.g. ``FileNotFoundError``) if a
        label has no matching ``.txt`` file under ``data_path``.
        """
        X_all_sent = []
        X_all_vec_seq = []
        X_all_doc_vec = []
        Y_all = []

        for label in labels:
            # Read the whole file inside a context manager so the handle is
            # closed even if a later step raises.
            with open(data_path + label + '.txt') as x_file:
                x_sents = x_file.read().split('\n')

            for x_sent in x_sents:
                # Blank lines (including the one after a trailing newline)
                # carry no example.
                if len(x_sent) > 0:
                    x_doc = nlp(x_sent)
                    x_doc_vec = x_doc.vector
                    x_vec_seq = [word.vector for word in x_doc]

                    X_all_sent.append(x_sent)
                    X_all_doc_vec.append(x_doc_vec)
                    X_all_vec_seq.append(x_vec_seq)
                    Y_all.append(label)

        self.X_all_sent = X_all_sent
        self.X_all_vec_seq = X_all_vec_seq
        self.X_all_doc_vec = X_all_doc_vec
        self.Y_all = Y_all
def pad_vec_sequences(sequences, maxlen=50):
    """Force every vector sequence to exactly ``maxlen`` rows.

    Sequences shorter than ``maxlen`` are padded with zero rows at the
    front; sequences of ``maxlen`` rows or more keep only their last
    ``maxlen`` rows.

    Args:
        sequences: iterable of 2-D sequences, each either a NumPy array or
            a plain list of equal-length vectors (rows = items,
            columns = vector components).
        maxlen: number of rows every returned sequence will have.

    Returns:
        ``np.array`` built from the list of padded/truncated sequences.
        When all inputs share the same vector length this is a 3-D array
        of shape ``(len(sequences), maxlen, vec_len)``.

    Raises:
        ValueError: if a sequence is not 2-D (for example an empty list,
            whose vector length cannot be determined).
    """
    new_sequences = []
    for sequence in sequences:
        # A plain list of vectors does not support the 2-D slice used in the
        # truncation branch, so normalise to an ndarray up front.
        sequence = np.asarray(sequence)
        orig_len, vec_len = sequence.shape
        if orig_len < maxlen:
            # Left-pad: the real rows occupy the tail of the zero block.
            new = np.zeros((maxlen, vec_len))
            new[maxlen - orig_len:, :] = sequence
        else:
            # Left-truncate: keep only the last `maxlen` rows.
            new = sequence[orig_len - maxlen:, :]
        new_sequences.append(new)
    return np.array(new_sequences)
def pad_class_sequence(sequence, nb_classes):
    """One-hot encode a sequence of labels.

    Each label becomes a list of ``nb_classes`` floats that is ``0.0``
    everywhere except for a ``1.0`` at the label's position in the
    module-level ``labels`` list.

    Args:
        sequence: iterable of labels, each of which must occur in ``labels``.
        nb_classes: length of every one-hot row.

    Returns:
        A list with one one-hot row (list of floats) per input label.

    Raises:
        ValueError: if a label is not present in ``labels``.
        IndexError: if a label's position is not below ``nb_classes``.
    """
    def _one_hot(label):
        # Build the zero row first, then switch on the label's slot.
        row = [0.0] * nb_classes
        row[labels.index(label)] = 1.0
        return row

    return [_one_hot(label) for label in sequence]