-
-
Notifications
You must be signed in to change notification settings - Fork 46.2k
/
Copy pathsimilarity_search.py
162 lines (131 loc) · 5.33 KB
/
similarity_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
Similarity Search : https://en.wikipedia.org/wiki/Similarity_search
Similarity search is a search algorithm for finding, among a set of vectors,
the one nearest to a given query vector; it is used in natural language
processing.
This implementation measures distance with the Euclidean distance and
returns, for each query vector, a list containing two items:
1. the nearest vector
2. the distance between the query vector and that nearest vector (float)
"""
from __future__ import annotations
import math
import numpy as np
from numpy.linalg import norm
def euclidean(input_a: np.ndarray, input_b: np.ndarray) -> float:
    """
    Calculates euclidean distance between two data.

    Both inputs are converted to float64 before they are subtracted, so
    narrow or unsigned integer dtypes (uint8, int8, ...) cannot wrap around
    and silently produce a wrong distance.

    :param input_a: ndarray of first vector.
    :param input_b: ndarray of second vector.
    :return: Euclidean distance of input_a and input_b. By using math.sqrt(),
        result will be float.
    :raises ValueError: if input_a and input_b do not have the same shape.

    >>> euclidean(np.array([0]), np.array([1]))
    1.0
    >>> euclidean(np.array([0, 1]), np.array([1, 1]))
    1.0
    >>> euclidean(np.array([0, 0, 0]), np.array([0, 0, 1]))
    1.0
    >>> euclidean(np.array([0], dtype=np.uint8), np.array([20], dtype=np.uint8))
    20.0
    >>> euclidean(np.array([0, 0]), np.array([1]))
    Traceback (most recent call last):
        ...
    ValueError: Wrong input data's shape... input_a : (2,), input_b : (1,)
    """
    vector_a = np.asarray(input_a, dtype=np.float64)
    vector_b = np.asarray(input_b, dtype=np.float64)

    # Pairing elements of differently sized vectors would either drop the
    # surplus components or broadcast them; neither is a meaningful distance.
    if vector_a.shape != vector_b.shape:
        msg = (
            "Wrong input data's shape... "
            f"input_a : {vector_a.shape}, input_b : {vector_b.shape}"
        )
        raise ValueError(msg)

    difference = vector_a - vector_b
    return math.sqrt(float(np.sum(np.square(difference))))
def similarity_search(
    dataset: np.ndarray, value_array: np.ndarray
) -> list[list[list[float] | float]]:
    """
    Find, for every vector in value_array, the nearest vector in dataset.

    Both inputs must be 2-D arrays in which each row is one vector. Distances
    are computed in float64, so narrow or unsigned integer dtypes cannot
    overflow. When several dataset vectors are equally near, the one that
    appears first in dataset is returned.

    :param dataset: Set containing the vectors. Should be ndarray.
    :param value_array: vector/vectors we want to know the nearest vector from dataset.
    :return: Result will be a list containing
        1. the nearest vector
        2. distance from the vector
    :raises ValueError: if the inputs differ in number of dimensions or in
        vector length, or if dataset has no vectors to search.
    :raises TypeError: if the inputs are not 2-D or differ in dtype.

    >>> dataset = np.array([[0], [1], [2]])
    >>> value_array = np.array([[0]])
    >>> similarity_search(dataset, value_array)
    [[[0], 0.0]]

    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
    >>> value_array = np.array([[0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0], 1.0]]

    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> value_array = np.array([[0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0, 0], 1.0]]

    >>> dataset = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    [[[0, 0, 0], 0.0], [[0, 0, 0], 1.0]]

    These are the errors that might occur:

    1. If dimensions are different.
    For example, dataset has 2d array and value_array has 1d array:
    >>> dataset = np.array([[1]])
    >>> value_array = np.array([1])
    >>> similarity_search(dataset, value_array)
    Traceback (most recent call last):
        ...
    ValueError: Wrong input data's dimensions... dataset : 2, value_array : 1

    2. If data's shapes are different.
    For example, dataset has shape of (3, 2) and value_array has (2, 3).
    We are expecting same shapes of two arrays, so it is wrong.
    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]])
    >>> value_array = np.array([[0, 0, 0], [0, 0, 1]])
    >>> similarity_search(dataset, value_array)
    Traceback (most recent call last):
        ...
    ValueError: Wrong input data's shape... dataset : 2, value_array : 3

    3. If data types are different.
    When trying to compare, we are expecting same types so they should be same.
    If not, it'll come up with errors.
    >>> dataset = np.array([[0, 0], [1, 1], [2, 2]], dtype=np.float32)
    >>> value_array = np.array([[0, 0], [0, 1]], dtype=np.int32)
    >>> similarity_search(dataset, value_array)  # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
        ...
    TypeError: Input data have different datatype...
    dataset : float32, value_array : int32

    4. If the inputs are not 2d arrays of vectors.
    >>> similarity_search(np.array([1, 2]), np.array([1]))
    Traceback (most recent call last):
        ...
    TypeError: Wrong shape
    """
    if dataset.ndim != value_array.ndim:
        msg = (
            "Wrong input data's dimensions... "
            f"dataset : {dataset.ndim}, value_array : {value_array.ndim}"
        )
        raise ValueError(msg)

    # Every row is treated as one vector, so anything but a 2-D array has no
    # well-defined vector length to compare below.
    if dataset.ndim != 2:
        raise TypeError("Wrong shape")

    if dataset.shape[1] != value_array.shape[1]:
        msg = (
            "Wrong input data's shape... "
            f"dataset : {dataset.shape[1]}, value_array : {value_array.shape[1]}"
        )
        raise ValueError(msg)

    if dataset.dtype != value_array.dtype:
        msg = (
            "Input data have different datatype... "
            f"dataset : {dataset.dtype}, value_array : {value_array.dtype}"
        )
        raise TypeError(msg)

    # With nothing to search there is no nearest vector to report; an empty
    # value_array is still fine and simply yields an empty result.
    if len(value_array) > 0 and len(dataset) == 0:
        raise ValueError("dataset must contain at least one vector")

    # Do the arithmetic in float64 so that small integer dtypes cannot wrap
    # around; the returned vectors are still taken from the original dataset.
    points = dataset.astype(np.float64)
    queries = value_array.astype(np.float64)

    answer = []
    for query in queries:
        # Squared distance from this query to every dataset row at once.
        squared_distances = np.sum(np.square(points - query), axis=1)
        # argmin reports the first minimum, so ties go to the earliest row.
        nearest = int(np.argmin(squared_distances))
        distance = math.sqrt(float(squared_distances[nearest]))
        answer.append([dataset[nearest].tolist(), distance])
    return answer
def cosine_similarity(input_a: np.ndarray, input_b: np.ndarray) -> float:
    """
    Calculates cosine similarity between two data.

    The dot product of the two vectors is divided by the product of their
    norms, and the quotient is converted to a built-in float.

    NOTE: there is no guard for a zero vector; a zero norm makes the
    denominator zero.

    :param input_a: ndarray of first vector.
    :param input_b: ndarray of second vector.
    :return: Cosine similarity of input_a and input_b as a float.

    >>> cosine_similarity(np.array([1]), np.array([1]))
    1.0
    >>> cosine_similarity(np.array([1, 2]), np.array([6, 32]))
    0.9615239476408232
    """
    dot_product = np.dot(input_a, input_b)
    magnitude_product = norm(input_a) * norm(input_b)
    similarity = dot_product / magnitude_product
    return float(similarity)
if __name__ == "__main__":
    # When the file is executed directly, run the doctests embedded in the
    # docstrings of this module.
    from doctest import testmod

    testmod()