Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug] Fix clustering result #28

Merged
merged 1 commit into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 4 additions & 36 deletions pygeoda/clustering/azp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
__author__ = "Xun Li <[email protected]>"

from pygeoda.clustering.utils import calculate_clustering_statistics
from ..libgeoda import VecVecDouble, VecPair, VecDouble, VecInt, Pair
from ..libgeoda import gda_azp_greedy, gda_azp_sa, gda_azp_tabu
from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters
Expand Down Expand Up @@ -74,18 +75,7 @@ def azp_greedy(p, w, data, **kwargs):

cluster_ids = gda_azp_greedy(p, w.gda_w, in_data, scale_method, inits, min_bounds, max_bounds, in_init_regions, distance_method, random_seed)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
ratio = between_ss / total_ss
within_ss = gda_withinsumofsquare(cluster_ids, in_data)

return {
"Total sum of squares" : total_ss,
"Within-cluster sum of squares" : within_ss,
"Total within-cluster sum of squares" : between_ss,
"The ratio of between to total sum of squares" : ratio,
"Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
}
return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)

def azp_sa(p, w, data, cooling_rate=0.85, **kwargs):
''' A simulated annealing algorithm to solve the AZP problem
Expand Down Expand Up @@ -151,18 +141,7 @@ def azp_sa(p, w, data, cooling_rate=0.85, **kwargs):

cluster_ids = gda_azp_sa(p, w.gda_w, in_data, scale_method, inits, cooling_rate, sa_maxit, min_bounds, max_bounds, in_init_regions, distance_method, random_seed)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
ratio = between_ss / total_ss
within_ss = gda_withinsumofsquare(cluster_ids, in_data)

return {
"Total sum of squares" : total_ss,
"Within-cluster sum of squares" : within_ss,
"Total within-cluster sum of squares" : between_ss,
"The ratio of between to total sum of squares" : ratio,
"Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
}
return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)

def azp_tabu(p, w, data, tabu_length, **kwargs):
''' A tabu-search algorithm to solve the AZP problem
Expand Down Expand Up @@ -225,15 +204,4 @@ def azp_tabu(p, w, data, tabu_length, **kwargs):

cluster_ids = gda_azp_tabu(p, w.gda_w, in_data, scale_method, inits, tabu_length, conv_tabu, min_bounds, max_bounds, in_init_regions, distance_method, random_seed)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
ratio = between_ss / total_ss
within_ss = gda_withinsumofsquare(cluster_ids, in_data)

return {
"Total sum of squares" : total_ss,
"Within-cluster sum of squares" : within_ss,
"Total within-cluster sum of squares" : between_ss,
"The ratio of between to total sum of squares" : ratio,
"Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
}
return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
99 changes: 40 additions & 59 deletions pygeoda/clustering/maxp.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
__author__ = "Xun Li <[email protected]>"

from ..libgeoda import VecVecDouble, VecPair, VecDouble, VecInt, Pair
from pygeoda.clustering.utils import calculate_clustering_statistics
from ..libgeoda import VecVecDouble, VecPair, VecInt
from ..libgeoda import gda_maxp_greedy, gda_maxp_sa, gda_maxp_tabu
from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters

'''
Changes:
1/20/2021 Add maxp_greedy, maxp_sa, maxp_tabu
'''


def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
''' A greedy algorithm to solve the max-p-region problem

The max-p-region problem is a special case of constrained clustering in which a finite number of geographical areas are aggregated into the maximum number of regions (max-p-regions), such that each region is geographically connected, each region satisfies the minimum bound (min_bound) on bound_variable, and the internal homogeneity of the regions is maximized.

Arguments:
Expand All @@ -29,11 +30,12 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
Returns:
dict: A dict with keys {"Clusters", "TotalSS", "Within-clusterSS", "TotalWithin-clusterSS", "Ratio"}
'''

iterations = 99 if 'iterations' not in kwargs else kwargs['iterations']
init_regions = [] if 'init_regions' not in kwargs else kwargs['init_regions']
scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method']
distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs[
'distance_method']
random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']

Expand All @@ -46,16 +48,17 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
# check if bound_variable is pandas.core.series.Series, if so, convert to list
if type(bound_variable).__name__ == "Series":
bound_variable = bound_variable.values.tolist()

if len(bound_variable) != w.num_obs:
raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
raise ValueError(
"The bound_variable has to be a list of numeric values, e.g. a column of input table.")

if min_bound <= 0:
raise ValueError("The min_bound has to be a positive numeric value.")

if type(data).__name__ == "DataFrame":
data = data.values.transpose().tolist()

in_data = VecVecDouble()
for d in data:
in_data.push_back(d)
Expand All @@ -67,26 +70,17 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):

in_init_regions = VecInt(list(init_regions))

cluster_ids = gda_maxp_greedy(w.gda_w, in_data, scale_method, iterations, min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)
cluster_ids = gda_maxp_greedy(w.gda_w, in_data, scale_method, iterations, min_bounds,
max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
ratio = between_ss / total_ss
within_ss = gda_withinsumofsquare(cluster_ids, in_data)
return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)

return {
"Total sum of squares" : total_ss,
"Within-cluster sum of squares" : within_ss,
"Total within-cluster sum of squares" : between_ss,
"The ratio of between to total sum of squares" : ratio,
"Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
}

def maxp_sa(w, data, bound_variable, min_bound, cooling_rate=0.85, **kwargs):
''' A simulated annealing algorithm to solve the max-p-region problem

The max-p-region problem is a special case of constrained clustering in which a finite number of geographical areas are aggregated into the maximum number of regions (max-p-regions), such that each region is geographically connected, each region satisfies the minimum bound (min_bound) on bound_variable, and the internal homogeneity of the regions is maximized.

Arguments:
w (Weight): an instance of Weight class
data (list or dataframe): A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
Expand All @@ -105,15 +99,16 @@ def maxp_sa(w, data, bound_variable, min_bound, cooling_rate=0.85, **kwargs):
dict: A dict with keys {"Clusters", "TotalSS", "Within-clusterSS", "TotalWithin-clusterSS", "Ratio"}
'''

sa_maxit = 1 if 'sa_maxit' not in kwargs else kwargs['sa_maxit']
sa_maxit = 1 if 'sa_maxit' not in kwargs else kwargs['sa_maxit']
iterations = 99 if 'iterations' not in kwargs else kwargs['iterations']
init_regions = [] if 'init_regions' not in kwargs else kwargs['init_regions']
scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method']
distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs[
'distance_method']
random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']

if cooling_rate <=0 or cooling_rate >=1:
if cooling_rate <= 0 or cooling_rate >= 1:
raise ValueError("Cooling rate should be in range (0,1).")

if w.num_obs < 1:
Expand All @@ -123,11 +118,12 @@ def maxp_sa(w, data, bound_variable, min_bound, cooling_rate=0.85, **kwargs):
raise ValueError("The data from selected variable is empty.")

if len(bound_variable) != w.num_obs:
raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
raise ValueError(
"The bound_variable has to be a list of numeric values, e.g. a column of input table.")

if min_bound <= 0:
raise ValueError("The min_bound has to be a positive numeric value.")

in_data = VecVecDouble()

if type(data).__name__ == "DataFrame":
Expand All @@ -143,26 +139,17 @@ def maxp_sa(w, data, bound_variable, min_bound, cooling_rate=0.85, **kwargs):

in_init_regions = VecInt(list(init_regions))

cluster_ids = gda_maxp_sa(w.gda_w, in_data, scale_method, iterations, cooling_rate, sa_maxit, min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)
cluster_ids = gda_maxp_sa(w.gda_w, in_data, scale_method, iterations, cooling_rate, sa_maxit,
min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
ratio = between_ss / total_ss
within_ss = gda_withinsumofsquare(cluster_ids, in_data)
return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)

return {
"Total sum of squares" : total_ss,
"Within-cluster sum of squares" : within_ss,
"Total within-cluster sum of squares" : between_ss,
"The ratio of between to total sum of squares" : ratio,
"Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
}

def maxp_tabu(w, data, bound_variable, min_bound, tabu_length=10, **kwargs):
''' A tabu-search algorithm to solve the max-p-region problem

The max-p-region problem is a special case of constrained clustering in which a finite number of geographical areas are aggregated into the maximum number of regions (max-p-regions), such that each region is geographically connected, each region satisfies the minimum bound (min_bound) on bound_variable, and the internal homogeneity of the regions is maximized.

Arguments:
w (Weight): an instance of Weight class
data (list or dataframe): A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
Expand All @@ -180,12 +167,13 @@ def maxp_tabu(w, data, bound_variable, min_bound, tabu_length=10, **kwargs):
Returns:
dict: A dict with keys {"Clusters", "TotalSS", "Within-clusterSS", "TotalWithin-clusterSS", "Ratio"}
'''

conv_tabu = 10 if 'conv_tabu' not in kwargs else kwargs['conv_tabu']
iterations = 99 if 'iterations' not in kwargs else kwargs['iterations']
init_regions = [] if 'init_regions' not in kwargs else kwargs['init_regions']
scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method']
distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs[
'distance_method']
random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']

Expand All @@ -196,11 +184,12 @@ def maxp_tabu(w, data, bound_variable, min_bound, tabu_length=10, **kwargs):
raise ValueError("The data from selected variable is empty.")

if len(bound_variable) != w.num_obs:
raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
raise ValueError(
"The bound_variable has to be a list of numeric values, e.g. a column of input table.")

if min_bound <= 0:
raise ValueError("The min_bound has to be a positive numeric value.")

in_data = VecVecDouble()

if type(data).__name__ == "DataFrame":
Expand All @@ -216,17 +205,9 @@ def maxp_tabu(w, data, bound_variable, min_bound, tabu_length=10, **kwargs):

in_init_regions = VecInt(list(init_regions))

cluster_ids = gda_maxp_tabu(w.gda_w, in_data, scale_method, iterations, tabu_length, conv_tabu, min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
ratio = between_ss / total_ss
within_ss = gda_withinsumofsquare(cluster_ids, in_data)
cluster_ids = gda_maxp_tabu(
w.gda_w, in_data, scale_method, iterations, tabu_length, conv_tabu,
min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads
)

return {
"Total sum of squares" : total_ss,
"Within-cluster sum of squares" : within_ss,
"Total within-cluster sum of squares" : between_ss,
"The ratio of between to total sum of squares" : ratio,
"Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
}
return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
17 changes: 3 additions & 14 deletions pygeoda/clustering/redcap.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from ..libgeoda import VecVecDouble, VecDouble
from pygeoda.clustering.utils import calculate_clustering_statistics
from ..libgeoda import VecVecDouble
from ..libgeoda import gda_redcap
from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters

__author__ = "Xun Li <[email protected]>, "

Expand Down Expand Up @@ -65,15 +65,4 @@ def redcap(k, w, data, method, **kwargs):

cluster_ids = gda_redcap(k, w.gda_w, in_data, scale_method, method, distance_method, bound_variable, min_bound, random_seed, cpu_threads)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
ratio = between_ss / total_ss
within_ss = gda_withinsumofsquare(cluster_ids, in_data)

return {
"Total sum of squares" : total_ss,
"Within-cluster sum of squares" : list(within_ss) + [0]*(len(cluster_ids) - len(within_ss)),
"Total within-cluster sum of squares" : between_ss,
"The ratio of between to total sum of squares" : ratio,
"Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
}
return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
17 changes: 3 additions & 14 deletions pygeoda/clustering/schc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from ..libgeoda import VecVecDouble, VecDouble
from pygeoda.clustering.utils import calculate_clustering_statistics
from ..libgeoda import VecVecDouble
from ..libgeoda import gda_schc
from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters

__author__ = "Xun Li <[email protected]>, "

Expand Down Expand Up @@ -57,15 +57,4 @@ def schc(k, w, data, linkage_method, **kwargs):

cluster_ids = gda_schc(k, w.gda_w, in_data, scale_method, linkage_method, distance_method, bound_variable, min_bound)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
ratio = between_ss / total_ss
within_ss = gda_withinsumofsquare(cluster_ids, in_data)

return {
"Total sum of squares" : total_ss,
"Within-cluster sum of squares" : within_ss,
"Total within-cluster sum of squares" : between_ss,
"The ratio of between to total sum of squares" : ratio,
"Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
}
return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
33 changes: 13 additions & 20 deletions pygeoda/clustering/skater.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
__author__ = "Xun Li <[email protected]>"

from ..libgeoda import VecVecDouble, VecDouble
from pygeoda.clustering.utils import calculate_clustering_statistics
from ..libgeoda import VecVecDouble
from ..libgeoda import gda_skater
from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters

def skater(k, w, data, **kwargs):
''' Spatial C(K)luster Analysis by Tree Edge Removal
Expand All @@ -27,40 +27,33 @@ def skater(k, w, data, **kwargs):
min_bound = 0 if 'min_bound' not in kwargs else kwargs['min_bound']
bound_variable = [] if 'bound_variable' not in kwargs else kwargs['bound_variable']
scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method']
distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs[
'distance_method']
random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']

# check if bound_variable is pandas.core.series.Series, if so, convert to list
if type(bound_variable).__name__ == "Series":
bound_variable = bound_variable.values.tolist()

# if bound_variable is not empty, check if it has the same length as the number of observations
if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")

raise ValueError(
"The bound_variable has to be a list of numeric values, e.g. a column of input table.")

# check if min_bound is available when bound_variable is not empty
if len(bound_variable) > 0 and min_bound == 0:
raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.")
raise ValueError(
"min_bound is required and greater than 0 when bound_variable is not empty.")

if type(data).__name__ == "DataFrame":
data = data.values.transpose().tolist()

in_data = VecVecDouble()
for d in data:
in_data.push_back(d)

cluster_ids = gda_skater(k, w.gda_w, in_data, scale_method, distance_method, bound_variable, min_bound, random_seed, cpu_threads)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
ratio = between_ss / total_ss
within_ss = gda_withinsumofsquare(cluster_ids, in_data)
cluster_ids = gda_skater(k, w.gda_w, in_data, scale_method, distance_method,
bound_variable, min_bound, random_seed, cpu_threads)

return {
"Total sum of squares" : total_ss,
"Within-cluster sum of squares" : list(within_ss) + [0]*(len(cluster_ids) - len(within_ss)),
"Total within-cluster sum of squares" : between_ss,
"The ratio of between to total sum of squares" : ratio,
"Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
}
return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
Loading