-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmean_shift.py
168 lines (138 loc) · 5.71 KB
/
mean_shift.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#https://pythonprogramming.net/mean-shift-from-scratch-python-machine-learning-tutorial/
#http://homepages.inf.ed.ac.uk/rbf/CVonline/LOCAL_COPIES/AV0405/MISHRA/kde.html
from toy_clustering_datasets import *
from pylab import plot, grid, show
import math
# Opcoes para facilitar os testes
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-b", "--bandwidth", dest="bandwidth", help="define bandwidth: -b 2.1", metavar="BANDWIDTH", default="2.1")
parser.add_option("-k", "--kernel", dest="kernel", help="choose kernel: -k [0:euclidian,1:gaussian]", metavar="KERNEL", default="1")
parser.add_option("-d", "--data", dest="data", help="choose data: [0,1]", metavar="DATA", default="0")
(opt, args) = parser.parse_args()
KERNEL = int(opt.kernel)
DATA = int(opt.data)
BANDWIDTH = float(opt.bandwidth)
#python mean_shift.py -d 0 -b 2.1 -k 0
def mean_shift_old(S,r):
P = []
C = S
while P != C:
P = C
C = sorted(set([centroid(p, r, [q for q in C if distance(p,q)<r], KERNEL) for p in C]))
return C
def distance(p,q):
return hypot(p[0]-q[0],p[1]-q[1])
"""
mean_shift(S,r) : Retorna um array com um centroid para cada ponto recebido por S, usando r como janela.
Paramentros:
S: array de pontos
r: janela
"""
def mean_shift(S,r):
C = S # C: armazena as centroids. Inicia com cada ponto sendo a sua propria centroid.
lastC = [] # lastC: armazena as centroid da iteração anterior. Inicializada vazia.
# Deve executar até que as centroids não sejam alteradas.
while lastC != C:
lastC = C # Guarda as centroids atuais para comparação futura.
newC = [] # newC: ira armazenar as novas centroids. Util pois durante a iteração os arrays lastC e C não devem ser alterados.
# Itera em todas as centroids.
for cc in C: # cc: centroid atual.
# recupera a lista de pontos pertencentes ao dataSet que estão a uma distancia menor ou igual a r a partir de uma centroid cc.
for pointsOnWindow in [[q for q in C if distance(cc,q)<=r]]:
c = centroid(cc, r, pointsOnWindow, KERNEL) # determina a centroid a partir dos pontos pertencentes a janela.
newC.append(c) # armazena a centroid obtida
#print ("centroid: %s=>%s, pontos pertencentes a janela: %s"%(cc, c, pointsOnWindow)) # Util para debug
C = newC # Fim da iteração, C pode ser atualizada.
print("C: ", C)
return C
# Gera um L a partir das centroids
def buidL(C):
NL=C[:]
for idx, value in enumerate(sorted(set(C))):
for i, l in enumerate(NL):
if l==value:
NL[i]=idx
return NL
# https://mccormickml.com/2013/08/15/the-gaussian-kernel/
'''
Given two D dimensional vectors x_i and x_j. The Gaussian kernel is defined as
k(x_i,x_j)=exp(-|| x_i - x_j ||^2 / sigma^2)
where ||x_i - x_j|| is the Euclidean distance given by
||x_i - x_j||=((x_i1-x_j1)^2 + (x_i2-x_j2)^2 + ... + (x_iD-x_jD)^2)^.5
and sigma^2 is the bandwidth of the kernel.
Note that the Gaussian kernel is a measure of similarity between x_i and x_j.
It evalues to 1 if the x_i and x_j are identical, and approaches 0 as x_i and x_j move further apart.
The function relies on the dist function in the stats package for an initial estimate of the euclidean distance.
'''
def gaussian_kernel(p, q, r):
return math.exp(-distance(p, q)**2/r**2)
def calc_gaussian(p, S, h):
o=0
m=0
for q in S:
w = gaussian_kernel(p, q, h)
o += w
m += w*q[0]
return m/o
def gaussian_centroid_by_axis(p, axis_data, h):
o=0
m=0
for q in axis_data:
w = math.exp(-math.fabs(p-q)**2/h**2)
o += w
m += w*q
return m/o
def centroid(p, r, S, kernelId):
m = 0
X = [s[0] for s in S]
Y = [s[1] for s in S]
if (kernelId==0):
m = (sum(X)/len(X), sum(Y)/len(Y)) # Calcula a media euclidiana para cada eixos
else:
m=(gaussian_centroid_by_axis(p[0], X, r), gaussian_centroid_by_axis(p[1], Y, r)) # calcula a distancia gaussiana para cada eixo
return m
def get_data_sample(id):
S=[]
L=[]
if id==0:
S = [(1,1), (1,2), (2,2), # cluster 0
(2,5), (3,4), (3,6), (4,5), # cluster 1
(4,1), (5,3), (6,1) # cluster 2
]
L = [0, 0, 0,
1, 1, 1, 1,
2, 2, 2]
elif id==1:
S = [(1.0,1.0), (1.0,2.0), (1.5,1.5), (2.0,1.0), (2.0,2.0), # cluster 0
(3.5,1.5), (4.0,1.0), (4.0,2.0), (4.5,1.5), (4.5,2.5), # cluster 1
(5.0,1.0), (5.0,2.0), (5.0,3.0), (5.5,1.5), (5.5,2.5),
(6.0,2.0), (6.0,3.0), (6.5,2.5),
(7.5,1.5), (8.0,1.0), (8.0,1.5), (8.0,2.0), (8.5,1.5), # cluster 2
(9.0,9.0) # noise
]
L = [0, 0, 0, 0, 0, # labels
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2,
3]
return S,L
def main():
S, L = get_data_sample(DATA)
show_clusters('Original clusters',S,L)
C = mean_shift(S, BANDWIDTH)
L = buidL(C)
title('Centroids found by Mean-Shift')
for i,p in enumerate(S): scatter([p[0]],[p[1]],color=colour(L[i]),marker=marker(L[i]),s=25)
for c in C: scatter([c[0]],[c[1]],color='k',marker='*',s=150)
grid(True)
show()
def base():
S, L = make_circles(50,0)
show_clusters('Original clusters',S,L)
C = mean_shift(S, BANDWIDTH)
L = buidL(C)
for i,p in enumerate(S): scatter([p[0]],[p[1]],color=colour(L[i]),marker=marker(L[i]),s=25)
for c in C: scatter([c[0]],[c[1]],color='k',marker='*',s=150)
grid(True)
show()
base()