TALNT.py
import logging

import torch
import torch.nn.functional as F


# This should work for any HuggingFace transformers model and tokenizer.
def add_token(model, tokenizer, token, description):
    """Add `token` to the vocabulary and initialize its embedding as the sum of
    the embeddings of the tokens in `description`."""
    # First attempt to add the token to the tokenizer.
    tokens_before = len(tokenizer)
    tokenizer.add_tokens([token])
    if tokens_before == len(tokenizer):
        logging.info("Token already in tokenizer")
        return model, tokenizer
    # Next expand the model's embedding matrix to the new vocabulary size
    # (NOTE: this also resizes the final linear layer / LM head).
    new_token_embeddings = model.resize_token_embeddings(len(tokenizer))
    # Tokenize the description, look up the embedding of each token, and sum them.
    description_tokens = torch.tensor(tokenizer(description)["input_ids"])
    embeddings_sum = new_token_embeddings(description_tokens).sum(dim=0)
    # Do the same with the LM head weights.
    new_lm_head = model.get_output_embeddings()
    lm_head_embeddings_sum = F.embedding(
        description_tokens, new_lm_head.weight).sum(dim=0)
    # Set the new token's embedding (the last row) to the sum of the description's
    # token embeddings, for both the input embeddings and the LM head.
    new_token_embeddings_module = model.get_input_embeddings()
    with torch.no_grad():
        new_token_embeddings_module.weight[-1, :] = embeddings_sum
        new_lm_head.weight[-1, :] = lm_head_embeddings_sum
    model.set_input_embeddings(new_token_embeddings)
    model.set_output_embeddings(new_lm_head)
    return model, tokenizer


def add_tokens(model, tokenizer, tokens, descriptions):
    for token, description in zip(tokens, descriptions):
        add_token(model, tokenizer, token, description)
    return model, tokenizer


def add_token_norm_weighted(model, tokenizer, token, description):
    """Same as `add_token`, but each description embedding is weighted by its norm."""
    # First attempt to add the token to the tokenizer.
    tokens_before = len(tokenizer)
    tokenizer.add_tokens([token])
    if tokens_before == len(tokenizer):
        logging.info("Token already in tokenizer")
        return model, tokenizer
    # Next expand the model's embedding matrix to the new vocabulary size
    # (NOTE: this also resizes the final linear layer / LM head).
    new_token_embeddings = model.resize_token_embeddings(len(tokenizer))
    # Tokenize the description, look up the embedding of each token, and take
    # a norm-weighted sum.
    description_tokens = torch.tensor(tokenizer(description)["input_ids"])
    embeddings_sum = norm_weighted_sum(
        new_token_embeddings(description_tokens))
    # Do the same with the LM head weights.
    new_lm_head = model.get_output_embeddings()
    lm_head_embeddings_sum = norm_weighted_sum(
        F.embedding(description_tokens, new_lm_head.weight))
    # Set the new token's embedding (the last row) to the norm-weighted sum of the
    # description's token embeddings, for both the input embeddings and the LM head.
    new_token_embeddings_module = model.get_input_embeddings()
    with torch.no_grad():
        new_token_embeddings_module.weight[-1, :] = embeddings_sum
        new_lm_head.weight[-1, :] = lm_head_embeddings_sum
    model.set_input_embeddings(new_token_embeddings)
    model.set_output_embeddings(new_lm_head)
    return model, tokenizer


def add_tokens_norm_weighted(model, tokenizer, tokens, descriptions):
    for token, description in zip(tokens, descriptions):
        add_token_norm_weighted(model, tokenizer, token, description)
    return model, tokenizer


def norm_weighted_sum(embeddings):
    # Weighted sum of the rows of `embeddings`, where each row is weighted by
    # its own L2 norm: sum_i ||e_i|| * e_i.
    # Accumulator with the same dtype and device as the embeddings.
    weighted_sum = torch.zeros_like(embeddings[0])
    for embedding in embeddings:
        weighted_sum += torch.norm(embedding) * embedding
    return weighted_sum
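

# Usage sketch (illustrative only): the "gpt2" checkpoint, the example tokens, and
# the descriptions below are assumptions, not part of the original file; any
# HuggingFace causal LM and its tokenizer should work the same way.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model = AutoModelForCausalLM.from_pretrained("gpt2")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # Sum-initialized embedding for a single new token.
    model, tokenizer = add_token(
        model, tokenizer,
        token="<new_token>",
        description="a short natural-language description of the new token",
    )

    # Norm-weighted initialization for a batch of new tokens.
    model, tokenizer = add_tokens_norm_weighted(
        model, tokenizer,
        tokens=["<token_a>", "<token_b>"],
        descriptions=["description of token a", "description of token b"],
    )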