"""neural_note_linking.py

Helpers for linking notes by semantic similarity: embed texts with a
sentence transformer, compute pairwise distances, and plot a 2-d PCA map.
"""
import itertools

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from tqdm import tqdm

def get_distance_matrix(vectors, metric="cosine", triangular=True, verbose=False):
    """Compute the distance between all pairs of vectors in a list and return them as a matrix.

    Nothing fancy here, so the number of operations is quadratic in len(vectors).

    Args:
        vectors (list of 1-d arrays): Vectors to compare. All vectors must have the same length.
        metric (str): Distance metric to use.
            Currently supported: "l2", "cosine". (Cosine distance is computed as
            (1 - cosine similarity) / 2, so it always lies between 0 and 1.)
            Default: "cosine".
        triangular (bool): Whether to fill only the lower triangle of the matrix
            instead of returning a symmetric matrix.
            Default: True.
        verbose (bool): Whether to show a progress bar.
            Default: False.

    Returns:
        np.array: Distance matrix of shape (len(vectors), len(vectors)).
    """
    distance_functions = {
        "l2": lambda x, y: np.linalg.norm(x - y),
        "cosine": lambda x, y: 1 - (np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) + 1) / 2,
        # Alternative, log-scaled variant of the same normalized similarity:
        # "cosine": lambda x, y: -np.log((np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) + 1) / 2),
    }
    if metric not in distance_functions:
        raise ValueError(f"Unrecognized metric: {metric}.")
    distance_function = distance_functions[metric]
    n = len(vectors)
    pairs = list(itertools.combinations(range(n), 2))
    matrix = np.zeros((n, n))
    # Both metrics are 0 for identical vectors, so the diagonal can stay at 0
    # and we never compute the distance between a vector and itself.
    for i, j in tqdm(pairs, disable=not verbose):
        dist = distance_function(vectors[i], vectors[j])
        matrix[j, i] = dist
        if not triangular:
            matrix[i, j] = dist
    return matrix
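
# A minimal usage sketch for get_distance_matrix; the toy vectors below are
# made up for illustration:
#
#     vecs = [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([1.0, 1.0])]
#     dmat = get_distance_matrix(vecs, metric="cosine", triangular=False)
#
# dmat[i, j] then holds the normalized cosine distance between vecs[i] and
# vecs[j]: 0.0 for identical directions, 0.5 for orthogonal vectors, and 1.0
# for opposite directions.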

def get_text_distances(texts, names=None, sentence_transformer=None, metric="cosine", skip_duplicates=True):
    """Get distances between texts as a pd.Series.

    Args:
        texts (list of strings): Texts to compare.
        names (list of strings): Names of the texts. If None, use integers.
            Default: None.
        sentence_transformer (SentenceTransformer): Transformer for computing sentence embeddings.
            If None, the "distiluse-base-multilingual-cased" model is loaded automatically.
            Default: None.
        metric (str): Distance metric to use.
            Currently supported: "l2", "cosine". (Cosine distance is computed as
            (1 - cosine similarity) / 2, so it always lies between 0 and 1.)
            Default: "cosine".
        skip_duplicates (bool): Whether to keep only one row per pair of texts.
            If False, each distance is represented twice, as srs.loc[a, b] and srs.loc[b, a].
            Default: True.

    Returns:
        pd.Series: Series with a MultiIndex representing pairs of texts, and distances as values.
    """
    if sentence_transformer is None:
        sentence_transformer = SentenceTransformer("distiluse-base-multilingual-cased")
    if names is None:
        names = list(range(len(texts)))
    embeddings = sentence_transformer.encode(texts)
    dist = pd.DataFrame(
        get_distance_matrix(embeddings, metric=metric, triangular=skip_duplicates),
        index=names,
        columns=names,
    ).unstack()
    if skip_duplicates:
        # Keep only one row for each pair, which also excludes the diagonal.
        dist = dist.loc[list(itertools.combinations(names, 2))]
    else:
        # Exclude only the diagonal.
        dist = dist.drop([(name, name) for name in names])
    return dist
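
# A minimal usage sketch for get_text_distances; the note texts and names are
# made up for illustration:
#
#     notes = ["Gradient descent basics", "Stochastic optimization", "Sourdough starter"]
#     dist = get_text_distances(notes, names=["gd", "sgd", "bread"])
#     dist.sort_values().head()  # closest pairs first; e.g. dist.loc[("gd", "sgd")]
#
# With skip_duplicates=True (the default), the Series has exactly one row per
# unordered pair of names.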

def get_small_dimensional_embeddings(texts, sentence_transformer=None, dimension=2):
    """Embed texts in a small dimension using PCA.

    Args:
        texts (list of strings): Texts to embed.
        sentence_transformer (SentenceTransformer): Transformer for computing sentence embeddings.
            If None, the "distiluse-base-multilingual-cased" model is loaded automatically.
            Default: None.
        dimension (int): How many dimensions to keep.
            Default: 2.

    Returns:
        np.array with shape (n_texts, dimension).
    """
    if sentence_transformer is None:
        sentence_transformer = SentenceTransformer("distiluse-base-multilingual-cased")
    embeddings = sentence_transformer.encode(texts)
    small_dim_embeddings = PCA(n_components=dimension).fit_transform(embeddings)
    return small_dim_embeddings
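
# Usage sketch: the default model embeds each text as a 512-dimensional
# vector, and PCA projects those down, e.g. to 2 dimensions for plotting.
# (notes is the hypothetical list of texts from the sketch above.)
#
#     coords = get_small_dimensional_embeddings(notes, dimension=2)
#     coords.shape  # (len(notes), 2)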

def scatter_with_annotations(coords, annotations):
    """Create a scatter plot from coords and add a text annotation to each point.

    Args:
        coords (array-like): Coordinates to scatter and annotate.
            Each element must be 2-dimensional.
        annotations (list of strings): Texts to add to the plot.
            Must have the same length as coords.

    Returns:
        plt.Axes: Axes object for the plot.
    """
    coords = np.asarray(coords)  # Allow plain lists of points as input.
    fig, ax = plt.subplots(figsize=(16, 9))
    ax.scatter(coords[:, 0], coords[:, 1], alpha=0.4)
    for name, vect in zip(annotations, coords):
        ax.annotate(name, vect)
    return ax
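
# Usage sketch with hypothetical 2-d points:
#
#     pts = np.array([[0.0, 0.0], [1.0, 2.0], [2.0, 1.0]])
#     ax = scatter_with_annotations(pts, ["a", "b", "c"])
#     plt.show()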

def embed_and_plot(texts, names=None, sentence_transformer=None):
    """Embed texts in 2 dimensions and plot them, labeled with their names.

    Args:
        texts (list of strings): Texts to embed.
        names (list of strings): Names of the texts. If None, use integers.
            Default: None.
        sentence_transformer (SentenceTransformer): Transformer for computing sentence embeddings.
            If None, the "distiluse-base-multilingual-cased" model is loaded automatically.
            Default: None.

    Returns:
        plt.Axes: Axes object for the plot.
    """
    if sentence_transformer is None:
        sentence_transformer = SentenceTransformer("distiluse-base-multilingual-cased")
    if names is None:
        names = list(range(len(texts)))
    embeddings = get_small_dimensional_embeddings(texts, sentence_transformer=sentence_transformer, dimension=2)
    return scatter_with_annotations(embeddings, names)
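
# A small, self-contained demo sketch; the note texts and names below are made
# up for illustration, and the first run downloads the default
# sentence-transformers model.
if __name__ == "__main__":
    demo_notes = [
        "Backlinks turn a pile of notes into a graph.",
        "Spaced repetition schedules reviews just before you forget.",
        "Sentence embeddings map similar texts to nearby vectors.",
    ]
    demo_names = ["links", "repetition", "embeddings"]
    # Load the model once and reuse it for both calls.
    model = SentenceTransformer("distiluse-base-multilingual-cased")
    print(get_text_distances(demo_notes, names=demo_names, sentence_transformer=model))
    embed_and_plot(demo_notes, names=demo_names, sentence_transformer=model)
    plt.show()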