# data_pre_process.py
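"""Pre-processing utilities for sentence classification.

Builds a vocabulary over a pandas DataFrame with `sentence` and `Label`
columns, converts each sentence to a padded vector of vocabulary indices,
and loads matching GloVe embedding weights for an LSTM model.
"""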
import re
from collections import Counter
import spacy
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset
import numpy as np
from torchnlp.word_to_vector import GloVe
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Reserved vocabulary indices for the padding and unknown-word tokens.
PAD = 0
UNK = 1


class ProcessDataset(Dataset):
    def __init__(self, df, word2idx=None, idx2word=None, max_vocab_size=50000):
        print('Processing Data')
        self._df = df
        print('Removing white space...')
        # Strip leading/trailing whitespace from each sentence.
        self._df.sentence = self._df.sentence.str.strip()
        self._nlp = spacy.load("en_core_web_sm", disable=['parser', 'tagger', 'ner'])
        if word2idx is None:
            print('Building Counter...')
            word_counter = self.build_counter()
            print('Building Vocab...')
            self._word2idx, self._idx2word = self.build_vocab(word_counter, max_vocab_size)
        else:
            self._word2idx, self._idx2word = word2idx, idx2word
        print('*' * 100)
        print('Dataset info:')
        print(f'Number of sentences: {self._df.shape[0]}')
        print(f'Vocab size: {len(self._word2idx)}')
        print('*' * 100)

    def __len__(self):
        return self._df.shape[0]

    def __getitem__(self, idx):
        sent = self._df.sentence.iloc[idx]
        tokens = [w.text.lower() for w in self._nlp(self.process(sent))]
        vec = self.vectorize(tokens, self._word2idx)
        return vec, self._df.Label.iloc[idx]

    @staticmethod
    def process(text):
        text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace into a single space
        # text = re.sub(r'@[A-Za-z0-9]+', ' ', text)  # remove @ mentions
        text = re.sub(r'https?://\S+', ' ', text)  # remove links
        text = re.sub(r'[^A-Za-z0-9]+', ' ', text)  # remove non-alphanumeric characters
        return text.strip()
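
    # A quick illustration of `process` (the example string is ours, not from
    # the source data):
    #   process("Check   this: https://t.co/abc123!!")  ->  "Check this"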

    def build_counter(self):
        words_counter = Counter()
        for sent in tqdm(self._df.sentence.values):
            words_counter.update(w.text.lower() for w in self._nlp(self.process(sent)))
        return words_counter

    @staticmethod
    def build_vocab(words_counter, max_vocab_size):
        # Indices 0 and 1 are reserved for <PAD> and <UNK>, so the resulting
        # vocabulary holds at most max_vocab_size + 2 entries.
        word2idx = {'<PAD>': PAD, '<UNK>': UNK}
        word2idx.update(
            {word: i + 2 for i, (word, count) in tqdm(enumerate(words_counter.most_common(max_vocab_size)))})
        idx2word = {idx: word for word, idx in tqdm(word2idx.items())}
        return word2idx, idx2word
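
    # Sketch of the resulting mappings (the concrete words are illustrative;
    # actual order depends on corpus counts):
    #   word2idx = {'<PAD>': 0, '<UNK>': 1, 'the': 2, 'a': 3, ...}
    #   idx2word = {0: '<PAD>', 1: '<UNK>', 2: 'the', 3: 'a', ...}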

    @staticmethod
    def vectorize(tokens, word2idx):
        # Map each token to its vocabulary index, falling back to UNK for
        # out-of-vocabulary words. Indices are kept as integers so they can
        # be used directly for embedding lookups.
        vec = np.asarray([word2idx.get(token, UNK) for token in tokens], dtype=np.int64)
        return vec


def pad_features(sent, seq_length=135):
    # Left-pad with PAD (0) up to seq_length, or truncate to the first
    # seq_length tokens.
    feature = sent
    if len(sent) < seq_length:
        feature = np.pad(sent, [seq_length - len(sent), 0], mode='constant')
    if len(sent) > seq_length:
        feature = sent[0:seq_length]
    return feature
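
# Example of the padding behaviour (values are illustrative):
#   pad_features(np.array([5, 6, 7]), seq_length=5)  ->  array([0, 0, 5, 6, 7])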


def get_lstm_parsed_sentences_and_embeddings(data):
    processed_data = ProcessDataset(data, max_vocab_size=len(data))
    words_counter = processed_data.build_counter()
    vocab, index_to_vocab = processed_data.build_vocab(words_counter=words_counter, max_vocab_size=len(data))
    # Convert every sentence to a fixed-length vector of vocabulary indices.
    sentences = np.array([pad_features(processed_data[i][0]) for i in range(len(data))])
    # Load 300-d GloVe vectors, keeping only words that appear in the vocabulary.
    pretrained_embedding = GloVe(name='6B', dim=300, is_include=lambda w: w in vocab)
    # Build the embedding matrix row by row; torchnlp's GloVe returns a zero
    # vector for words it does not cover (e.g. <PAD> and <UNK>).
    embedding_weights = torch.Tensor(len(vocab), pretrained_embedding.dim)
    for num, word in index_to_vocab.items():
        embedding_weights[num] = pretrained_embedding[word]
    return sentences, embedding_weights
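

# Minimal usage sketch (hypothetical data; the column names `sentence` and
# `Label` match what ProcessDataset expects):
if __name__ == '__main__':
    import pandas as pd

    df = pd.DataFrame({
        'sentence': ['A short example sentence.', 'Another one, with a link https://example.com'],
        'Label': [0, 1],
    })
    sentences, embedding_weights = get_lstm_parsed_sentences_and_embeddings(df)
    print(sentences.shape)          # (2, 135): padded index vectors
    print(embedding_weights.shape)  # (vocab_size, 300): one GloVe row per vocab index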