new functionalities for High Dimensionality problem and improved performance #19

Open

wants to merge 17 commits into master
6 changes: 5 additions & 1 deletion AUTHORS
@@ -1,2 +1,6 @@
Michel Albert ([email protected])
Sam Sandberg (@LoisaidaSam)

high dimensionality functionalities:
Jose J. GarciaAranda (@jjaranda13)
Juan Ramos Diaz (@juanrd0088)
128 changes: 128 additions & 0 deletions HDexample.py
@@ -0,0 +1,128 @@
# -*- coding: cp1252 -*-
###############################################################################
# High Dimensionality problem example
# Authors:
# 2015 Jose Javier Garcia Aranda , Juan Ramos Diaz
#
###############################################################################
# This High Dimensionality example creates N items (which are "users").
# Each user is defined by his profile.
# A profile is a tuple of 10 pairs of keyword and weight (20 fields in total).
# Weights are floating-point numbers in the range 0..1.
# The sum of the weights of a profile is normalized to 1.
# We consider 1000 different keywords.
# A profile takes 8 keywords from the first 300 keywords (the "popular"
# keywords) and 2 from the remaining 700.
# Each keyword is a dimension, so there are 1000 possible dimensions,
# but a single user only has 10 of them.
# Different users can have different dimensions.
# A new distance function and equality function are defined for this use case:
#
# cl = KMeansClustering(users, HDdistItems, HDequals)
#
# Additionally, the number of iterations can now be limited in order to save
# time. Experimentally, we have concluded that 10 iterations is accurate
# enough for most cases.
# The new HDgetclusters() function is linear: it avoids recalculating
# centroids, whereas the original getClusters() function is N*N complex,
# because it recalculates the centroid every time an item moves from one
# cluster to another.
# This new function can be used for low- and high-dimensionality problems,
# increasing performance in both cases:
#
# solution = cl.HDgetclusters(numclusters, max_iterations)
#
# Another available optimization, inside the HDcentroid() function, is the use
# of the mean instead of the median in the centroid calculation.
# The median is more accurate but involves more computations when N is huge.
# HDcentroid() is invoked internally by HDgetclusters().
#
# The optional invocation of HDcomputeSSE() assists in computing the optimal
# number of clusters.
#
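# For illustration only (the values in this comment are invented), a profile
# with 4 pairs would look like the flat tuple below; real profiles produced
# by createProfile() carry 10 keyword/weight pairs:
#
#   ("17", 0.22, "142", 0.31, "88", 0.29, "803", 0.18)
#
# Keywords are stored as strings; after normalization the weights sum to 1.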
from __future__ import print_function
from cluster import KMeansClustering
from cluster import util
from cluster.HDdistances import HDdistItems, HDequals, HDcomputeSSE

import datetime
import random

def createProfile():
    """Create a profile composed of 10 dimensions chosen from 1000 dimensions."""
    num_words = 1000
    total_weight = 0
    marked_word = [0] * num_words
    list_profile = []
    returned_profile = ()
    # 10 pairs (word, weight) per profile: 8 "popular" words from 0..299 plus
    # 2 words from 300..999.  Words are not repeated within a profile.
    for i in range(8):
        partial_weight = random.uniform(0, 1)
        total_weight += partial_weight
        word_is_new = False
        while not word_is_new:
            random_word = random.randint(0, 299)
            if marked_word[random_word] == 0:
                marked_word[random_word] = 1
                word_is_new = True
        list_profile.append([str(random_word), partial_weight])
    for i in range(2):
        partial_weight = random.uniform(0, 1)
        total_weight += partial_weight
        word_is_new = False
        while not word_is_new:
            random_word = random.randint(300, 999)
            if marked_word[random_word] == 0:
                marked_word[random_word] = 1
                word_is_new = True
        list_profile.append([str(random_word), partial_weight])
    # Normalization of the profile: after dividing every weight by
    # total_weight, the weights of the whole profile sum to 1.
    for i in range(len(list_profile)):
        word = list_profile[i][0]
        weight = list_profile[i][1] / total_weight
        returned_profile += (word, weight)
    return returned_profile

####################################################
#                       MAIN                       #
####################################################
sses = [0] * 10  # stores the SSE metric for each number of clusters from 5 to 50
num_users = 100
numsse = 0
numclusters = 5  # starts at 5
max_iterations = 10
start_time = datetime.datetime.now()
while numclusters <= 50:  # compute the SSE from numclusters=5 to numclusters=50
    users = []  # the users are the items of this example
    for i in range(num_users):
        users.append(createProfile())
    print("initializing kmeans...")
    cl = KMeansClustering(users, HDdistItems, HDequals)
    print("executing...", numclusters)
    st = datetime.datetime.now()
    print(st)
    solution = cl.HDgetclusters(numclusters, max_iterations)
    for i in range(numclusters):
        print(util.HDcentroid(solution[i]), ",")
    st = datetime.datetime.now()

    sses[numsse] = HDcomputeSSE(solution, numclusters)
    numsse += 1
    numclusters += 5
end_time = datetime.datetime.now()
print("start_time:", start_time)
print("end_time:", end_time)
print("sses:", sses)

45 changes: 45 additions & 0 deletions README.rst
@@ -53,3 +53,48 @@ The parameter passed to getclusters is the count of clusters generated.
.. image:: https://readthedocs.org/projects/python-cluster/badge/?version=latest
:target: http://python-cluster.readthedocs.org
:alt: Documentation Status



2015/07/20 NEW FUNCTIONALITIES FOR HIGH AND LOW DIMENSIONALITY PROBLEMS
=======================================================================
Authors of the newly added functionalities:
- Garcia Aranda, Jose Javier [email protected]
- Ramos Diaz, Juan [email protected]

Acknowledgements:
The authors want to thank the Spanish Ministry of Economy and Competitiveness, which funded this research through the "INNPACTO" innovation programme IPT-2012-0839-430000.

High dimensionality (HD) problems are those whose items have a large number of dimensions.
There are two types of HD problems:

a) a set of items, each with a large number of dimensions.
b) a set of items, each with a limited number of dimensions drawn from a large
   pool of available dimensions. For example, with dimensions X, Y, Z, K, L, M::

    item1 = (X=2, Z=5, L=7)
    item2 = (X=6, Y=5, M=7)

HD problems involve a high computation cost, because their distance functions take more operations than those of low-dimensionality problems.

For case "b" (also valid for case "a"), a new distance for HD problems is available: HDdistItems(), together with HDequals().
This distance function compares the dimensions of two items:
each dimension of item1 is searched for in item2; if it is found, the distance takes the difference of the weights into account (Manhattan style); if the dimension does not exist in item2, a maximum value (the dimension's full weight) is added to the total distance between item1 and item2.

There is no difference with the current usage::

    >>> cl = KMeansClustering(users, HDdistItems, HDequals)
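
As a hedged illustration (the profile values below are invented for this example), the distance combines shared and missing dimensions like this::

    >>> a = ("X", 0.6, "Z", 0.4)  # flat tuple of keyword/weight pairs
    >>> b = ("X", 0.5, "M", 0.5)
    >>> # "X" is shared, so it contributes |0.6 - 0.5| = 0.1 (Manhattan style);
    >>> # "Z" is missing from b and "M" from a, so each adds its full weight
    >>> 0.1 + 0.4 + 0.5
    1.0

HDdistItems(a, b) then rescales this raw sum by the factor 2.0 * HD_profile_dimensions / (len(a)/2 + len(b)/2), so that profiles with fewer pairs than usual do not get artificially small distances.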


Additionally, the number of iterations can now be limited in order to save time.
Experimentally, we have concluded that 10 iterations is accurate enough for most cases.
The new HDgetclusters() function is linear: it avoids recalculating centroids,
whereas the original getClusters() function is N*N complex, because it
recalculates the centroid every time an item moves from one cluster to another.
This new function can be used for low- and high-dimensionality problems,
increasing performance in both cases::

    >>> solution = cl.HDgetclusters(numclusters, max_iterations)
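
To choose *numclusters*, the optional HDcomputeSSE() helper added in this pull request can be used. A condensed sketch, adapted from HDexample.py and assuming ``users`` holds the items to cluster::

    from cluster import KMeansClustering
    from cluster.HDdistances import HDdistItems, HDequals, HDcomputeSSE

    sses = []
    for numclusters in range(5, 55, 5):  # try 5, 10, ..., 50 clusters
        cl = KMeansClustering(users, HDdistItems, HDequals)
        solution = cl.HDgetclusters(numclusters, max_iterations=10)
        sses.append(HDcomputeSSE(solution, numclusters))
    # pick the cluster count where the SSE curve stops improving noticeably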

Another available optimization, inside the HDcentroid() function, is the use of the mean instead of the median in the centroid calculation.
The median is more accurate but involves more computations when N is huge.
The function HDcentroid() is invoked internally by HDgetclusters().
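
HDcentroid() itself lives in cluster/util.py, which this diff does not show. As a rough sketch only, assuming a centroid is the same flat keyword/weight tuple as a profile, a mean-based centroid could average each keyword's weight over the cluster size (treating absent keywords as weight 0)::

    def mean_centroid(cluster):
        # hypothetical sketch, not the actual cluster.util.HDcentroid
        sums = {}
        for profile in cluster:
            for i in range(len(profile) // 2):
                word = profile[i * 2]
                sums[word] = sums.get(word, 0.0) + profile[i * 2 + 1]
        flat = []
        for word, total in sums.items():
            flat.extend([word, total / len(cluster)])  # mean weight; absent counts as 0
        return tuple(flat)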

74 changes: 74 additions & 0 deletions cluster/HDdistances.py
@@ -0,0 +1,74 @@

""" This file provides functionalities for High dimensionality problems but also for low dimensionality problems

added functionalities:
- New Distance computation
- SSE metric computation for assist the computation of the optimal number of clusters

Authors:
Jose Javier Garcia Aranda
Juan Ramos Diaz
"""
import util

HD_profile_dimensions = 10  # dimensions per profile, default value is 10

def HDdistItems(profile1, profile2):
    """Distance function. The distance between two profiles is defined as follows:
    for each keyword of user A, if the keyword is not present in user B, the
    distance for this keyword is its weight in user A.
    If the keyword exists in both users, the weights are compared and the
    distance is the absolute difference.
    For each keyword present in the union of the keywords of both profiles, the
    distance is computed and added to the total distance between both users.
    """
    len1 = len(profile1) // 2  # len(profile1) is always even: each dimension has a weight
    len2 = len(profile2) // 2  # len(profile2) is always even: each dimension has a weight
    total_len = len1 + len2   # this value is usually 20
    # scale factor; this only works if the profile has HD_profile_dimensions keys or fewer
    factor_len = 2.0 * HD_profile_dimensions / total_len
    distance = 0.0
    marked = [0] * (total_len * 2)
    for i in range(len1):
        found = False
        for j in range(len2):
            if profile1[i * 2] == profile2[j * 2]:
                distance += abs(profile1[i * 2 + 1] - profile2[j * 2 + 1])
                found = True
                marked[j * 2] = 1
                break
        if not found:
            distance += profile1[i * 2 + 1]

    # keywords of profile2 that were never matched contribute their full weight
    for i in range(len2):
        if marked[i * 2] == 1:
            continue
        distance += profile2[i * 2 + 1]

    distance = distance * factor_len
    return distance

def HDequals(profile1, profile2):
    """Return True when both profiles carry the same keywords and weights, pair by pair."""
    for i in range(HD_profile_dimensions):
        # compare the i-th keyword and its weight positionally
        if profile1[i * 2] != profile2[i * 2]:
            return False
        if profile1[i * 2 + 1] != profile2[i * 2 + 1]:
            return False
    return True


def HDcomputeSSE(solution, numclusters):
    """This metric measures the cohesion of the users inside a cluster and the
    separation among clusters at the same time."""
    total_solution = 0
    for i in range(numclusters):
        partial_solution = 0
        cluster_centroid = util.HDcentroid(solution[i])  # compute the centroid once per cluster
        for j in solution[i]:
            dist = HDdistItems(cluster_centroid, j)
            partial_solution += dist * dist
        total_solution += partial_solution
    return total_solution
102 changes: 100 additions & 2 deletions cluster/method/kmeans.py
@@ -14,9 +14,12 @@
# along with this library; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# new functions HDgetclusters() and HDassign_item() by:
# 2015 Jose Javier Garcia Aranda, Juan Ramos Diaz


from cluster.util import ClusteringError, centroid, minkowski_distance, HDcentroid
import time
import datetime


class KMeansClustering(object):
@@ -166,3 +169,98 @@ def initialise_clusters(self, input_, clustercount):
        for item in input_:
            self.__clusters[count % clustercount].append(item)
            count += 1


    def HDgetclusters(self, count, max_iterations):
        """
        Generates *count* clusters.

        :param count: The amount of clusters that should be generated. count
            must be greater than ``1``.
        :param max_iterations: iteration limit; when it is reached, the
            clustering stops even if items are still moving between clusters.
        :raises ClusteringError: if *count* is out of bounds.
        """

        # only proceed if we got sensible input
        if count <= 1:
            raise ClusteringError("When clustering, you need to ask for at "
                                  "least two clusters! "
                                  "You asked for %d" % count)

        # return the data straight away if there is nothing to cluster
        if (self.__data == [] or len(self.__data) == 1 or
                count == self.__initial_length):
            return self.__data

        # It makes no sense to ask for more clusters than data-items available
        if count > self.__initial_length:
            raise ClusteringError(
                "Unable to generate more clusters than "
                "items available. You supplied %d items, and asked for "
                "%d clusters." % (self.__initial_length, count))

        self.initialise_clusters(self.__data, count)

        items_moved = True  # tells us if any item moved between the clusters;
                            # as we initialised the clusters, we assume that
                            # is the case

        iteration = 0

        # The number of iterations is limited to max_iterations. When this
        # limit is reached, items_moved is forced to False.
        while items_moved is True:
            items_moved = False
            st = datetime.datetime.now()  # timestamp of this iteration, for debug purposes
            iteration = iteration + 1

            # computation of the centroids: each one is computed only once per
            # iteration, which is what makes this function linear
            my_centroids = {}
            for cluster in self.__clusters:
                one_centroid = HDcentroid(cluster)
                my_centroids[one_centroid] = cluster

            # now we scan the N items without recalculating the centroids
            for cluster in self.__clusters:
                for centroid_aux, cluster_aux in my_centroids.iteritems():
                    if cluster_aux == cluster:
                        centroid_cluster = centroid_aux
                        break
                for item in cluster:
                    res = self.HDassign_item(item, cluster, centroid_cluster,
                                             my_centroids)
                    if items_moved is False:
                        items_moved = res

            if iteration == max_iterations:
                items_moved = False
        return self.__clusters

    def HDassign_item(self, item, origin, origin_centroid, my_centroids):
        """
        Assigns an item from a given cluster to the closest located cluster.

        :param item: the item to be moved.
        :param origin: the originating cluster.
        :param origin_centroid: centroid of the originating cluster.
        :param my_centroids: dictionary mapping each centroid to its cluster.
        """
        closest_cluster = origin
        closest_centroid = origin_centroid

        for center in my_centroids.keys():
            if self.distance(item, center) < self.distance(item, closest_centroid):
                closest_cluster = my_centroids[center]
                closest_centroid = center  # keep the reference distance in sync

        if closest_cluster is not origin:
            self.move_item(item, origin, closest_cluster)
            return True
        else:
            return False