make_embeddings.py
from transformers import AutoTokenizer, AutoModel
from torch import Tensor
import torch
import numpy as np
import os
import json

model_name = "intfloat/multilingual-e5-base"
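
# average_pool below computes a masked mean over the sequence dimension:
# padded positions are zeroed out via the attention mask before summing, and
# the sum is divided by the number of real tokens. For example, with two real
# tokens whose hidden states are [1, 3] and [3, 5] plus one padded position,
# the pooled vector is [2, 4]. This is the pooling shown in the E5 model card.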
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def create_embeddings(string_dict, device="cuda"):
    embeddings_raw_name = "embeddings.npy"
    embeddings_text_name = "embeddings.txt"
    if not os.path.exists(embeddings_raw_name):
        print("Generating embeddings")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)
        embeddings = []
        string_list = list(string_dict.values())
        # Drop None and empty strings.
        string_list = [i for i in string_list if i]
        # E5 models expect a "passage: " prefix on indexed documents.
        string_list = ["passage: " + i for i in string_list]
        with torch.no_grad():
            for line in string_list:
                batch_dict = tokenizer(
                    line,
                    max_length=512,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                ).to(device)
                outputs = model(**batch_dict)
                embedding = average_pool(
                    outputs.last_hidden_state, batch_dict["attention_mask"]
                ).cpu()
                embeddings.append(embedding[0])
        embeddings = torch.stack(embeddings).numpy()
        np.save(embeddings_raw_name, embeddings)
        with open(embeddings_text_name, "w", encoding="utf-8") as f:
            for line in string_list:
                f.write(line + "\n")
        embeddings_raw = embeddings
        embeddings_text = string_list
    else:
        print("Loading precomputed embeddings")
        embeddings_raw = np.load(embeddings_raw_name)
        with open(embeddings_text_name, "r", encoding="utf-8") as f:
            # Strip trailing newlines so the loaded texts match a freshly
            # generated list.
            embeddings_text = [line.rstrip("\n") for line in f]
    return embeddings_raw, embeddings_text
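
# The loop above encodes one passage per forward pass. A batched variant is
# usually much faster on GPU; the sketch below is illustrative and not wired
# into the pipeline (the helper name and the batch size of 32 are assumptions,
# tune the batch size to available memory).
def encode_batched(texts, tokenizer, model, device="cuda", batch_size=32):
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start : start + batch_size]
            batch_dict = tokenizer(
                batch,
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            outputs = model(**batch_dict)
            # Pool each sequence in the batch, then move the result off the GPU.
            chunks.append(
                average_pool(outputs.last_hidden_state, batch_dict["attention_mask"]).cpu()
            )
    return torch.cat(chunks).numpy()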

def get_embedding(text: str):
    # Reloads the model on every call; fine for one-off queries, wasteful in a loop.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    batch_dict = tokenizer(
        text, max_length=512, padding=True, truncation=True, return_tensors="pt"
    )
    with torch.no_grad():
        outputs = model(**batch_dict)
    embedding = (
        average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
        .cpu()
        .numpy()
    )
    return embedding
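
# Sketch of how the stored embeddings could be queried. E5 models expect a
# "query: " prefix on search queries (passages were prefixed with "passage: "
# above). The helper name `find_nearest` and the `top_k` parameter are
# illustrative, not part of the original pipeline.
def find_nearest(query: str, embeddings_raw, embeddings_text, top_k=5):
    query_emb = get_embedding("query: " + query)[0]
    # Cosine similarity: normalise both sides, then take dot products.
    corpus = embeddings_raw / np.linalg.norm(embeddings_raw, axis=1, keepdims=True)
    query_emb = query_emb / np.linalg.norm(query_emb)
    scores = corpus @ query_emb
    best = np.argsort(scores)[::-1][:top_k]
    return [(embeddings_text[i], float(scores[i])) for i in best]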

def get_parts_texts(generated_data_path: str):
    with open(generated_data_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    # Keep only the leaf-level short texts.
    paragraphs = []
    for topic in data:
        for subtopic in data[topic]:
            for point in data[topic][subtopic]:
                paragraphs.append(point)
    string_dict = {i: text for i, text in enumerate(paragraphs)}
    return string_dict
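
# The traversal above implies generated_data.json has the nested shape
# (inferred from the loops; the file itself is not shown):
# {"topic": {"subtopic": ["paragraph 1", "paragraph 2", ...], ...}, ...}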

if __name__ == "__main__":
    # TODO: add an argument parser for the data path and device.
    generated_data = "./generated_data.json"
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    small_texts_parts = get_parts_texts(generated_data)
    embeddings_raw, embeddings_text = create_embeddings(small_texts_parts, device)