-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathword2vecReader.py
268 lines (219 loc) · 12.1 KB
/
word2vecReader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import logging
import word2vecReaderUtils as utils
from numpy import exp, dot, zeros, outer, random, dtype, float32 as REAL,\
uint32, seterr, array, uint8, vstack, argsort, fromstring, sqrt, newaxis,\
ndarray, empty, sum as np_sum, prod
from six import string_types
from gensim import matutils
class Vocab(object):
"""A single vocabulary item, used internally for constructing binary trees (incl. both word leaves and inner nodes)."""
def __init__(self, **kwargs):
self.count = 0
self.__dict__.update(kwargs)
def __lt__(self, other): # used for sorting in a priority queue
return self.count < other.count
def __str__(self):
vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')]
return "<" + ', '.join(vals) + ">"
class Word2Vec:
"""
Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/
The model can be stored/loaded via its `save()` and `load()` methods, or stored/loaded in a format
compatible with the original word2vec implementation via `save_word2vec_format()` and `load_word2vec_format()`.
"""
def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
sample=0, seed=1, workers=1, min_alpha=0.0001, sg=1, hs=1, negative=0, cbow_mean=0):
"""
Initialize the model from an iterable of `sentences`. Each sentence is a
list of words (unicode strings) that will be used for training.
The `sentences` iterable can be simply a list, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in
this module for such examples.
If you don't supply `sentences`, the model is left uninitialized -- use if
you plan to initialize it in some other way.
`sg` defines the training algorithm. By default (`sg=1`), skip-gram is used. Otherwise, `cbow` is employed.
`size` is the dimensionality of the feature vectors.
`window` is the maximum distance between the current and predicted word within a sentence.
`alpha` is the initial learning rate (will linearly drop to zero as training progresses).
`seed` = for the random number generator.
`min_count` = ignore all words with total frequency lower than this.
`sample` = threshold for configuring which higher-frequency words are randomly downsampled;
default is 0 (off), useful value is 1e-5.
`workers` = use this many worker threads to train the model (=faster training with multicore machines).
`hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0).
`negative` = if > 0, negative sampling will be used, the int for negative
specifies how many "noise words" should be drawn (usually between 5-20).
`cbow_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
Only applies when cbow is used.
"""
self.vocab = {} # mapping from a word (string) to a Vocab object
self.index2word = [] # map from a word's matrix index (int) to word (string)
self.sg = int(sg)
self.table = None # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
self.layer1_size = int(size)
#if size % 4 != 0:
# logger.warning("consider setting layer size to a multiple of 4 for greater performance")
self.alpha = float(alpha)
self.window = int(window)
self.seed = seed
self.min_count = min_count
self.sample = sample
self.workers = workers
self.min_alpha = min_alpha
self.hs = hs
self.negative = negative
self.cbow_mean = int(cbow_mean)
if sentences is not None:
self.build_vocab(sentences)
self.train(sentences)
@classmethod
def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
"""
Load the input-hidden weight matrix from the original C word2vec-tool format.
Note that the information stored in the file is incomplete (the binary tree is missing),
so while you can query for word similarity etc., you cannot continue training
with a model loaded this way.
`binary` is a boolean indicating whether the data is in binary word2vec format.
`norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
Word counts are read from `fvocab` filename, if set (this is the file generated
by `-save-vocab` flag of the original C tool).
"""
counts = None
if fvocab is not None:
#logger.info("loading word counts from %s" % (fvocab))
counts = {}
with utils.smart_open(fvocab) as fin:
for line in fin:
word, count = utils.to_unicode(line).strip().split()
counts[word] = int(count)
#logger.info("loading projection weights from %s" % (fname))
with utils.smart_open(fname) as fin:
header = utils.to_unicode(fin.readline())
vocab_size, layer1_size = map(int, header.split()) # throws for invalid file format
result = Word2Vec(size=layer1_size)
result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
if binary:
binary_len = dtype(REAL).itemsize * layer1_size
for line_no in xrange(vocab_size):
# mixed text and binary: read text first, then binary
word = []
while True:
ch = fin.read(1)
if ch == b' ':
break
if ch != b'\n': # ignore newlines in front of words (some binary files have newline, some don't)
word.append(ch)
word = utils.to_unicode(b''.join(word),encoding='latin-1')
if counts is None:
result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
elif word in counts:
result.vocab[word] = Vocab(index=line_no, count=counts[word])
else:
#logger.warning("vocabulary file is incomplete")
result.vocab[word] = Vocab(index=line_no, count=None)
result.index2word.append(word)
result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
else:
for line_no, line in enumerate(fin):
parts = utils.to_unicode(line).split()
if len(parts) != layer1_size + 1:
raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
word, weights = parts[0], map(REAL, parts[1:])
if counts is None:
result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
elif word in counts:
result.vocab[word] = Vocab(index=line_no, count=counts[word])
else:
#logger.warning("vocabulary file is incomplete")
result.vocab[word] = Vocab(index=line_no, count=None)
result.index2word.append(word)
result.syn0[line_no] = weights
#logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
result.init_sims(norm_only)
return result
def init_sims(self, replace=False):
"""
Precompute L2-normalized vectors.
If `replace` is set, forget the original vectors and only keep the normalized
ones = saves lots of memory!
Note that you **cannot continue training** after doing a replace. The model becomes
effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`.
"""
if getattr(self, 'syn0norm', None) is None or replace:
#logger.info("precomputing L2-norms of word weight vectors")
if replace:
for i in xrange(self.syn0.shape[0]):
self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
self.syn0norm = self.syn0
if hasattr(self, 'syn1'):
del self.syn1
else:
self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)
def __getitem__(self, word):
return self.syn0[self.vocab[word].index]
def __contains__(self, word):
return word in self.vocab
def most_similar(self, positive=[], negative=[], topn=10):
if isinstance(positive, string_types) and not negative:
# allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
positive = [positive]
# add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
positive = [(word, 1.0) if isinstance(word, string_types + (ndarray,))
else word for word in positive]
negative = [(word, -1.0) if isinstance(word, string_types + (ndarray,))
else word for word in negative]
# compute the weighted average of all words
all_words, mean = set(), []
for word, weight in positive + negative:
if isinstance(word, ndarray):
mean.append(weight * word)
elif word in self.vocab:
mean.append(weight * self.syn0norm[self.vocab[word].index])
all_words.add(self.vocab[word].index)
else:
raise KeyError("word '%s' not in vocabulary" % word)
if not mean:
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
dists = dot(self.syn0norm, mean)
if not topn:
return dists
best = argsort(dists)[::-1][:topn + len(all_words)]
# ignore (don't return) words from the input
result = [(self.index2word[sim], float(dists[sim]), self.syn0[sim]) for sim in best if sim not in all_words]
return result[:topn]
def most_similar_cosmul(self, positive=[], negative=[], topn=10):
self.init_sims()
if isinstance(positive, string_types) and not negative:
# allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
positive = [positive]
all_words = set()
def word_vec(word):
if isinstance(word, ndarray):
return word
elif word in self.vocab:
all_words.add(self.vocab[word].index)
return self.syn0norm[self.vocab[word].index]
else:
raise KeyError("word '%s' not in vocabulary" % word)
positive = [word_vec(word) for word in positive]
negative = [word_vec(word) for word in negative]
if not positive:
raise ValueError("cannot compute similarity with no input")
# equation (4) of Levy & Goldberg "Linguistic Regularities...",
# with distances shifted to [0,1] per footnote (7)
pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)
if not topn:
return dists
best = argsort(dists)[::-1][:topn + len(all_words)]
# ignore (don't return) words from the input
result = [(self.index2word[sim], float(dists[sim],)) for sim in best if sim not in all_words]
return result[:topn]
if __name__ == "__main__":
model_path = "./word2vec_twitter_model.bin"
print("Loading the model, this can take some time...")
model = Word2Vec.load_word2vec_format(model_path, binary=True)
print("The vocabulary size is: "+str(len(model.vocab)))