-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGenerate_Basic_Files_Gensim.py
52 lines (46 loc) · 1.77 KB
/
Generate_Basic_Files_Gensim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from django.utils.encoding import smart_str
from gensim.models import Word2Vec, KeyedVectors
from decimal import Decimal
import plac
import pickle
import os
import process_data
@plac.annotations(
    gensim_model_path=("Location of gensim's .bin file"),
    out_dir=("Location of output directory"),
)
def main(gensim_model_path, out_dir):
    """
    Generate the words file, the word-embeddings file, and the weight4ind
    files from a saved gensim Word2Vec model.

    Files written into ``out_dir``:

    * ``weightfile.txt`` -- one ``<word> <count>`` line per vocabulary entry.
    * ``vectors`` -- pickled list of embedding vectors, in vocabulary
      iteration order.
    * ``words`` -- pickled dict mapping each word to its position in
      ``vectors``.
    * ``weight4ind_weightpara_<a>`` -- one pickled result of
      ``process_data.getWeight`` per value of ``a`` in
      ``[1e-2, 1e-3, 1e-4]``.

    Args:
        gensim_model_path: Path passed to ``Word2Vec.load``.
        out_dir: Directory the output files are written to. It is not
            created here, so it must already exist.
    """
    gensim_model = Word2Vec.load(gensim_model_path)
    # NOTE(review): `wv.vocab` and `wv.syn0` look like the pre-4.0 gensim
    # API -- confirm the installed gensim version before upgrading.
    keyed_vectors = gensim_model.wv

    words = {}
    vectors = []
    weightfile_name = os.path.join(out_dir, "weightfile.txt")

    # The weight file is re-read by path further down, so it must be closed
    # (and therefore flushed) before process_data.getWordWeight is called.
    with open(weightfile_name, "w") as weightfile:
        for n, string in enumerate(keyed_vectors.vocab):
            vocab = keyed_vectors.vocab[string]
            weightfile.write(
                smart_str(string) + " " + smart_str(vocab.count) + "\n"
            )
            vectors.append(keyed_vectors.syn0[vocab.index])
            # `n` is the word's position in `vectors`, not gensim's own index.
            words[string] = n

    # pickle needs binary-mode files; text mode raises TypeError on Python 3.
    with open(os.path.join(out_dir, "vectors"), "wb") as vector_file:
        pickle.dump(vectors, vector_file)
    with open(os.path.join(out_dir, "words"), "wb") as words_file:
        pickle.dump(words, words_file)

    weightpara = [1e-2, 1e-3, 1e-4]
    for a in weightpara:
        print("calculating word2weight with a = {}.".format(a))
        word2weight = process_data.getWordWeight(weightfile_name, a)
        print("calculating weight4ind with a = {}.".format(a))
        weight4ind = process_data.getWeight(words, word2weight)
        weight4ind_path = os.path.join(
            out_dir, "weight4ind_weightpara_%.E" % Decimal(a)
        )
        with open(weight4ind_path, "wb") as weight4ind_file:
            pickle.dump(weight4ind, weight4ind_file)
# Script entry point: plac builds the command-line interface from main's
# signature and annotations, then calls it with the parsed arguments.
if __name__ == "__main__":
    plac.call(main)