forked from jjaranda13/HD-python-cluster
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHDexample.py
131 lines (126 loc) · 5.03 KB
/
HDexample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: cp1252 -*-
###############################################################################
# High Dimensionality problem example
# Authors:
# 2015 Jose Javier Garcia Aranda , Juan Ramos Diaz
#
###############################################################################
# This High Dimensionality example creates N items (which are "users").
# Each user is defined by his profile.
# A profile is a tuple of 10 pairs of keyword and weight ( 20 fields in total)
# weights are floating numbers and belong to 0..1
# The summation of weights of a profile is normalized to 1
# we consider 1000 diferent keywords
# A profile takes 8 keywords from first 200 keywords (the "popular" keywords)
# Each keyword is a dimension. Therefore there are 1000 possible dimensions
# A single user only have 10 dimensions
# Different users can have different dimensions.
# A new distance and equality function are defined for this use case
#
# cl = KMeansClustering(users,HDdistItems,HDequals);
#
# Additionally, now the number of iterations can be limited in order to save time
# Experimentally, we have concluded that 10 iterations is enough accurate for most cases.
# The new HDgetClusters() function is linear. Avoid the recalculation of centroids
# whereas original function getClusters() is N*N complex, because recalculate the
# centroid when move an item from one cluster to another.
# This new function can be used for low and high dimensionality problems, increasing
# performance in both cases
#
# solution = cl.HDgetclusters(numclusters,max_iterations);
#
# Other new available optimization inside HDcentroid() function in is the use of mean instead median at centroid calculation.
# median is more accurate but involves more computations when N is huge.
# The function HDcentroid() is invoked internally by HDgetclusters()
#
# The optional invocation of HDcomputeSSE() assist the computation of the optimal number or clusters.
#
#
from cluster import KMeansClustering
from cluster import ClusteringError
from cluster import util
from cluster.util import HDcentroid
from cluster.HDdistances import HDdistItems, HDequals, HDcomputeSSE
import time
import datetime
import random
def createProfile():
num_words=1000
total_weight=0;
marked_word=[0]*num_words
repeated_word=False
list_profile=[]
returned_profile=();
profile_aux=[];
#10 pairs word, weight.
#Don't repeated words.
for i in range(8):
partial_weight=random.uniform(0,1)
total_weight+=partial_weight
repeated_word=False
while repeated_word==False:
random_word=random.randint(0,299)
if marked_word[random_word]==0:
marked_word[random_word]=1
repeated_word=True
random_word= str(random_word)
tupla=[random_word,partial_weight]
list_profile.append(tupla)
for i in range(2):
partial_weight=random.uniform(0,1)
total_weight+=partial_weight
repeated_word=False
while repeated_word==False:
random_word=random.randint(300,999)
if marked_word[random_word]==0:
marked_word[random_word]=1
repeated_word=True
random_word= str(random_word)
tupla=[random_word,partial_weight]
list_profile.append(tupla)
#Normalization of the profile
for i in range(5):
a=list_profile[i][0]
b=list_profile[i][1]
b=b/total_weight; #the sum of the weights must be 1
profile_aux=([a,b])
returned_profile+=tuple(profile_aux)
return returned_profile
####################################################
# MAIN #
####################################################
sses=[0]*10 #stores the sse metric for each number of clusters from 5 to 50
num_users=100
numsse=0
numclusters=5 # starts at 5
max_iteraciones=10
ts = time.time()
start_time=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
while numclusters<=50: # compute SSE from num_clusters=5 to 50
supersol=0#supersolucion, distancias entre el clusters y los usuarios.
users=[] # users are the items of this example
for i in range(num_users):#en el range el numero de usuarios
user = createProfile()
users.append(user)
#x=0;
print " inicializing kmeans..."
cl = KMeansClustering(users,HDdistItems,HDequals);
print " executing...",numclusters
ts = time.time()
st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print st
numclusters=numclusters
solution = cl.HDgetclusters(numclusters,max_iteraciones);
for i in range(numclusters):
a = solution[i]
print util.HDcentroid(a),","
ts = time.time()
st=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
sses[numsse]=HDcomputeSSE(solution,numclusters)
numsse+=1
numclusters+=5
ts = time.time()
end_time=datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print "start_time:",start_time
print "end_time:",end_time
print "sses:",sses