word2sent_v3.py
import operator
import pickle
import string

import numpy as np
import plac

import process_data
def scoring_function_v3(word_embeddings, pairs_id, pairs_weight):
    """
    Compute the similarity score of two input sentences as the weighted
    average of the similarity scores of the selected candidate pairs.
    :param word_embeddings: an array of word vectors for the vocabulary, one embedding per row.
    :param pairs_id: a list of all uni-gram index pairs from the input sequences.
    :param pairs_weight: a list of all uni-gram weight pairs from the input sequences.
    :return: the overall similarity score of the two input sentences.
    """
    word_embeddings = np.array(word_embeddings)
    candidate_pairs, candidate_scores = get_candidate_pairs_v3(word_embeddings, pairs_id, pairs_weight)
    overall_score = sum(candidate_scores) / len(candidate_scores)
    # print('the resulting pairs are {}'.format(candidate_pairs))
    # print('the resulting scores are {}'.format(candidate_scores))
    return overall_score
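def _demo_scoring_toy_data():
    """
    A minimal sketch of calling scoring_function_v3 on toy data. This helper
    is illustrative only: the 3-dimensional embeddings and the id/weight
    pairs below are invented, not taken from any real model. Real inputs
    come from the pickled files loaded in main(), and the scoring still
    relies on process_data.compute_each_score_weighted being available.
    """
    toy_embeddings = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.7, 0.7, 0.0]]
    # One uni-gram pair per word combination across two toy "sentences",
    # with the matching per-word weights.
    toy_pairs_id = [[0, 1], [0, 2], [1, 2]]
    toy_pairs_weight = [[0.5, 0.4], [0.5, 0.3], [0.4, 0.3]]
    return scoring_function_v3(toy_embeddings, toy_pairs_id, toy_pairs_weight)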
def get_candidate_pairs_v3(word_embeddings, pairs_id, pairs_weight):
    """
    Build all candidate pairs and compute the corresponding similarity scores.
    Pairs are ranked by score and then selected greedily, so that no word
    index participates in more than one accepted pair.
    :param word_embeddings: an array of word vectors for the vocabulary, one embedding per row.
    :param pairs_id: a list of all uni-gram index pairs from the input sequences.
    :param pairs_weight: a list of all uni-gram weight pairs from the input sequences.
    :return: all accepted candidate pairs and their similarity scores.
    """
    candidate_pairs = []
    candidate_scores = []
    score4pair = {}
    # Score every (id1, id2, weight1, weight2) bundle.
    for each_id_pair, each_weight_pair in zip(pairs_id, pairs_weight):
        each_bundle = (each_id_pair[0], each_id_pair[1], each_weight_pair[0], each_weight_pair[1])
        score4pair[each_bundle] = process_data.compute_each_score_weighted(word_embeddings, each_id_pair, each_weight_pair)
    # Visit bundles in descending score order; keep a pair only if neither of
    # its word indices already appears in an accepted pair.
    sorted_score4pair = sorted(score4pair.items(), key=operator.itemgetter(1), reverse=True)
    for bundle, score in sorted_score4pair:
        pairs = [bundle[0], bundle[1]]
        weights = [bundle[2], bundle[3]]
        already_matched = any(set(pairs).intersection(accepted) for accepted in candidate_pairs)
        if not already_matched:
            candidate_pairs.append(pairs)
            # Normalize each accepted score by the mean weight of its two words.
            candidate_scores.append(score / np.mean(weights))
    return candidate_pairs, candidate_scores
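# Worked example of the greedy selection above, with hypothetical word ids and
# scores: given scored bundles (a, b) -> 0.9, (a, c) -> 0.8, and (d, e) -> 0.7,
# the pairs (a, b) and (d, e) are accepted, while (a, c) is skipped because
# word 'a' already belongs to the accepted pair (a, b). Each accepted score is
# then divided by the mean weight of its two words before averaging.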
def main():
    """
    A demo for Word2Sent-V3: measuring the similarity score of two given
    sentences with the weighted scheme.
    :return: the similarity score.
    """
    # Step 1: Choose the vocabulary, word embedding, and word weight files.
    # path4words = '/shared/data_WordSentenceVector/model_lawinsider/SentenceVectorNoTagging/words_beagle'
    # path4words = '/shared/data_WordSentenceVector/model_lawinsider_full/lawinsider_full_tagged_OnlyWord/words_beagle'
    # path4words = '/shared/data_WordSentenceVector/model_googlenews/SentenceVector/words_beagle'
    path4words = '/shared/data_WordSentenceVector/model_wiki_glove/SentenceVector/words_glove'
    # path4emb = '/shared/data_WordSentenceVector/model_lawinsider/SentenceVectorNoTagging/vectors_beagle'
    # path4emb = '/shared/data_WordSentenceVector/model_lawinsider_full/lawinsider_full_tagged_OnlyWord/vectors_beagle'
    # path4emb = '/shared/data_WordSentenceVector/model_googlenews/SentenceVector/vectors_beagle'
    path4emb = '/shared/data_WordSentenceVector/model_wiki_glove/SentenceVector/vectors_glove'
    # path4weight = '/shared/data_WordSentenceVector/model_lawinsider/SentenceVectorNoTagging/weight4ind_weightpara_1e-03'
    # path4weight = '/shared/data_WordSentenceVector/model_lawinsider_full/lawinsider_full_tagged_OnlyWord/weight4ind_weightpara_1E-03'
    # path4weight = '/shared/data_WordSentenceVector/model_googlenews/SentenceVector/weight4ind_weightpara_1e-03'
    path4weight = '/shared/data_WordSentenceVector/model_wiki_glove/SentenceVector/weight4ind_glove_1e-03'
    # Step 2: Load the vocabulary, the word embeddings, and the word weights.
    print("loading words file from {}".format(path4words.split('/')[3]))
    with open(path4words, 'rb') as f:
        words = pickle.load(f)
    print("loading word embeddings file from {}".format(path4emb.split('/')[3]))
    with open(path4emb, 'rb') as f:
        word_embeddings = pickle.load(f)
    print("loading weight4ind file from {}".format(path4weight.split('/')[3]))
    with open(path4weight, 'rb') as f:
        weight4ind = pickle.load(f)
    sentence1 = 'children in red shirts are playing with leaves'
    sentence2 = 'three kids are sitting in the leaves'
    # Step 3: Preprocess the input sentences by removing punctuation marks.
    punct_table = str.maketrans('', '', string.punctuation)
    sentence1 = sentence1.translate(punct_table)
    sentence2 = sentence2.translate(punct_table)
    # Step 4: Convert the two input sentences into index sequences and
    # per-word weights, then build all uni-gram pairs across the sentences.
    seq1, seq2 = process_data.getSeqs(sentence1, sentence2, words)
    id1, m1 = process_data.prepare_data(seq1)
    id2, m2 = process_data.prepare_data(seq2)
    weight1 = process_data.seq2weight(id1, m1, weight4ind)
    weight2 = process_data.seq2weight(id2, m2, weight4ind)
    uni_pairs_weight = process_data.unigram_pairs(weight1, weight2)  # list of lists
    uni_pairs_id = process_data.unigram_pairs(id1, id2)
    # Step 5: Compute the similarity score with the local weighted scoring function.
    similarity_score = scoring_function_v3(word_embeddings, uni_pairs_id, uni_pairs_weight)
    print('the overall score is {}'.format(similarity_score))
if __name__ == '__main__':
    plac.call(main)
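# Example invocation (assuming the pickled vocabulary, embedding, and weight
# files exist at the paths configured in main()):
#
#   python word2sent_v3.py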