GeoDaCenter · lixun910 · Jan 30, 2025 · Jan 29, 2025
diff --git a/pygeoda/clustering/azp.py b/pygeoda/clustering/azp.py
@@ -1,5 +1,6 @@
 __author__ = "Xun Li <[email protected]>"
 
+from pygeoda.clustering.utils import calculate_clustering_statistics
 from ..libgeoda import VecVecDouble, VecPair, VecDouble, VecInt, Pair
 from ..libgeoda import gda_azp_greedy, gda_azp_sa, gda_azp_tabu
 from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters
@@ -74,18 +75,7 @@ def azp_greedy(p, w, data, **kwargs):
 
     cluster_ids = gda_azp_greedy(p, w.gda_w, in_data, scale_method, inits, min_bounds, max_bounds, in_init_regions, distance_method, random_seed)
 
-    between_ss = gda_betweensumofsquare(cluster_ids, in_data)
-    total_ss = gda_totalsumofsquare(in_data)
-    ratio = between_ss / total_ss
-    within_ss = gda_withinsumofsquare(cluster_ids, in_data)
-
-    return {
-        "Total sum of squares" : total_ss,
-        "Within-cluster sum of squares" : within_ss,
-        "Total within-cluster sum of squares" : between_ss,
-        "The ratio of between to total sum of squares" : ratio,
-        "Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
-    }
+    return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
 
 def azp_sa(p, w, data, cooling_rate=0.85, **kwargs):
     ''' A simulated annealing algorithm to solve the AZP problem 
@@ -151,18 +141,7 @@ def azp_sa(p, w, data, cooling_rate=0.85, **kwargs):
 
     cluster_ids = gda_azp_sa(p, w.gda_w, in_data, scale_method, inits, cooling_rate, sa_maxit, min_bounds, max_bounds, in_init_regions, distance_method, random_seed)
 
-    between_ss = gda_betweensumofsquare(cluster_ids, in_data)
-    total_ss = gda_totalsumofsquare(in_data)
-    ratio = between_ss / total_ss
-    within_ss = gda_withinsumofsquare(cluster_ids, in_data)
-
-    return {
-        "Total sum of squares" : total_ss,
-        "Within-cluster sum of squares" : within_ss,
-        "Total within-cluster sum of squares" : between_ss,
-        "The ratio of between to total sum of squares" : ratio,
-        "Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
-    }
+    return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
 
 def azp_tabu(p, w, data, tabu_length, **kwargs):
     ''' A tabu-search algorithm to solve the AZP problem 
@@ -225,15 +204,4 @@ def azp_tabu(p, w, data, tabu_length, **kwargs):
 
     cluster_ids = gda_azp_tabu(p, w.gda_w, in_data, scale_method, inits, tabu_length, conv_tabu, min_bounds, max_bounds, in_init_regions, distance_method, random_seed)
 
-    between_ss = gda_betweensumofsquare(cluster_ids, in_data)
-    total_ss = gda_totalsumofsquare(in_data)
-    ratio = between_ss / total_ss
-    within_ss = gda_withinsumofsquare(cluster_ids, in_data)
-
-    return {
-        "Total sum of squares" : total_ss,
-        "Within-cluster sum of squares" : within_ss,
-        "Total within-cluster sum of squares" : between_ss,
-        "The ratio of between to total sum of squares" : ratio,
-        "Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
-    }
+    return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
diff --git a/pygeoda/clustering/maxp.py b/pygeoda/clustering/maxp.py
@@ -1,17 +1,18 @@
 __author__ = "Xun Li <[email protected]>"
 
-from ..libgeoda import VecVecDouble, VecPair, VecDouble, VecInt, Pair
+from pygeoda.clustering.utils import calculate_clustering_statistics
+from ..libgeoda import VecVecDouble, VecPair, VecInt
 from ..libgeoda import gda_maxp_greedy, gda_maxp_sa, gda_maxp_tabu
-from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters
 
 '''
 Changes:
 1/20/2021 Add maxp_greedy, maxp_sa, maxp_tabu
 '''
 
+
 def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
     ''' A greedy algorithm to solve the max-p-region problem 
-    
+
     The max-p-region problem is a special case of constrained clustering where a finite number of geographical areas are aggregated into the maximum number of regions (max-p-regions), such that each region is geographically connected and the clusters could maximize internal homogeneity.
 
     Arguments:
@@ -29,11 +30,12 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
     Returns:
         dict: A dict with keys {"Clusters", "TotalSS", "Within-clusterSS", "TotalWithin-clusterSS", "Ratio"}
     '''
-    
+
     iterations = 99 if 'iterations' not in kwargs else kwargs['iterations']
     init_regions = [] if 'init_regions' not in kwargs else kwargs['init_regions']
     scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
-    distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method'] 
+    distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs[
+        'distance_method']
     random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
     cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']
 
@@ -46,16 +48,17 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
     # check if bound_variable is pandas.core.series.Series, if so, convert to list
     if type(bound_variable).__name__ == "Series":
         bound_variable = bound_variable.values.tolist()
-        
+
     if len(bound_variable) != w.num_obs:
-        raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
+        raise ValueError(
+            "The bound_variable has to be a list of numeric values, e.g. a column of input table.")
 
     if min_bound <= 0:
         raise ValueError("The min_bound has to be a positive numeric value.")
-    
+
     if type(data).__name__ == "DataFrame":
         data = data.values.transpose().tolist()
- 
+
     in_data = VecVecDouble()
     for d in data:
         in_data.push_back(d)
@@ -67,26 +70,17 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
 
     in_init_regions = VecInt(list(init_regions))
 
-    cluster_ids = gda_maxp_greedy(w.gda_w, in_data, scale_method, iterations, min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)
+    cluster_ids = gda_maxp_greedy(w.gda_w, in_data, scale_method, iterations, min_bounds,
+                                  max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)
 
-    between_ss = gda_betweensumofsquare(cluster_ids, in_data)
-    total_ss = gda_totalsumofsquare(in_data)
-    ratio = between_ss / total_ss
-    within_ss = gda_withinsumofsquare(cluster_ids, in_data)
+    return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
 
-    return {
-        "Total sum of squares" : total_ss,
-        "Within-cluster sum of squares" : within_ss,
-        "Total within-cluster sum of squares" : between_ss,
-        "The ratio of between to total sum of squares" : ratio,
-        "Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
-    }
 
 def maxp_sa(w, data, bound_variable, min_bound, cooling_rate=0.85, **kwargs):
     ''' A simulated annealing algorithm to solve the max-p-region problem 
-    
+
     The max-p-region problem is a special case of constrained clustering where a finite number of geographical areas are aggregated into the maximum number of regions (max-p-regions), such that each region is geographically connected and the clusters could maximize internal homogeneity.
-        
+
     Arguments:
         w (Weight): an instance of Weight class
         data (list or dataframe):   A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
@@ -105,15 +99,16 @@ def maxp_sa(w, data, bound_variable, min_bound, cooling_rate=0.85, **kwargs):
         dict: A dict with keys {"Clusters", "TotalSS", "Within-clusterSS", "TotalWithin-clusterSS", "Ratio"}
     '''
 
-    sa_maxit = 1 if 'sa_maxit'  not in kwargs else kwargs['sa_maxit']
+    sa_maxit = 1 if 'sa_maxit' not in kwargs else kwargs['sa_maxit']
     iterations = 99 if 'iterations' not in kwargs else kwargs['iterations']
     init_regions = [] if 'init_regions' not in kwargs else kwargs['init_regions']
     scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
-    distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method'] 
+    distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs[
+        'distance_method']
     random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
     cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']
 
-    if cooling_rate <=0 or cooling_rate >=1:
+    if cooling_rate <= 0 or cooling_rate >= 1:
         raise ValueError("Cooling rate should be in range (0,1).")
 
     if w.num_obs < 1:
@@ -123,11 +118,12 @@ def maxp_sa(w, data, bound_variable, min_bound, cooling_rate=0.85, **kwargs):
         raise ValueError("The data from selected variable is empty.")
 
     if len(bound_variable) != w.num_obs:
-        raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
+        raise ValueError(
+            "The bound_variable has to be a list of numeric values, e.g. a column of input table.")
 
     if min_bound <= 0:
         raise ValueError("The min_bound has to be a positive numeric value.")
-    
+
     in_data = VecVecDouble()
 
     if type(data).__name__ == "DataFrame":
@@ -143,26 +139,17 @@ def maxp_sa(w, data, bound_variable, min_bound, cooling_rate=0.85, **kwargs):
 
     in_init_regions = VecInt(list(init_regions))
 
-    cluster_ids = gda_maxp_sa(w.gda_w, in_data, scale_method, iterations, cooling_rate, sa_maxit, min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)
+    cluster_ids = gda_maxp_sa(w.gda_w, in_data, scale_method, iterations, cooling_rate, sa_maxit,
+                              min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)
 
-    between_ss = gda_betweensumofsquare(cluster_ids, in_data)
-    total_ss = gda_totalsumofsquare(in_data)
-    ratio = between_ss / total_ss
-    within_ss = gda_withinsumofsquare(cluster_ids, in_data)
+    return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
 
-    return {
-       "Total sum of squares" : total_ss,
-        "Within-cluster sum of squares" : within_ss,
-        "Total within-cluster sum of squares" : between_ss,
-        "The ratio of between to total sum of squares" : ratio,
-        "Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
-    }
 
 def maxp_tabu(w, data, bound_variable, min_bound, tabu_length=10, **kwargs):
     ''' A tabu-search algorithm to solve the max-p-region problem 
-    
+
     The max-p-region problem is a special case of constrained clustering where a finite number of geographical areas are aggregated into the maximum number of regions (max-p-regions), such that each region is geographically connected and the clusters could maximize internal homogeneity.
-        
+
     Arguments:
         w (Weight): an instance of Weight class
         data (list or dataframe):   A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
@@ -180,12 +167,13 @@ def maxp_tabu(w, data, bound_variable, min_bound, tabu_length=10, **kwargs):
     Returns:
         dict: A dict with keys {"Clusters", "TotalSS", "Within-clusterSS", "TotalWithin-clusterSS", "Ratio"}
     '''
-    
+
     conv_tabu = 10 if 'conv_tabu' not in kwargs else kwargs['conv_tabu']
     iterations = 99 if 'iterations' not in kwargs else kwargs['iterations']
     init_regions = [] if 'init_regions' not in kwargs else kwargs['init_regions']
     scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
-    distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method'] 
+    distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs[
+        'distance_method']
     random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
     cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']
 
@@ -196,11 +184,12 @@ def maxp_tabu(w, data, bound_variable, min_bound, tabu_length=10, **kwargs):
         raise ValueError("The data from selected variable is empty.")
 
     if len(bound_variable) != w.num_obs:
-        raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
+        raise ValueError(
+            "The bound_variable has to be a list of numeric values, e.g. a column of input table.")
 
     if min_bound <= 0:
         raise ValueError("The min_bound has to be a positive numeric value.")
-    
+
     in_data = VecVecDouble()
 
     if type(data).__name__ == "DataFrame":
@@ -216,17 +205,9 @@ def maxp_tabu(w, data, bound_variable, min_bound, tabu_length=10, **kwargs):
 
     in_init_regions = VecInt(list(init_regions))
 
-    cluster_ids = gda_maxp_tabu(w.gda_w, in_data, scale_method, iterations, tabu_length, conv_tabu, min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads)
-
-    between_ss = gda_betweensumofsquare(cluster_ids, in_data)
-    total_ss = gda_totalsumofsquare(in_data)
-    ratio = between_ss / total_ss
-    within_ss = gda_withinsumofsquare(cluster_ids, in_data)
+    cluster_ids = gda_maxp_tabu(
+        w.gda_w, in_data, scale_method, iterations, tabu_length, conv_tabu,
+        min_bounds, max_bounds, in_init_regions, distance_method, random_seed, cpu_threads
+    )
 
-    return {
-        "Total sum of squares" : total_ss,
-        "Within-cluster sum of squares" : within_ss,
-        "Total within-cluster sum of squares" : between_ss,
-        "The ratio of between to total sum of squares" : ratio,
-        "Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
-    }
+    return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
diff --git a/pygeoda/clustering/redcap.py b/pygeoda/clustering/redcap.py
@@ -1,6 +1,6 @@
-from ..libgeoda import VecVecDouble, VecDouble
+from pygeoda.clustering.utils import calculate_clustering_statistics
+from ..libgeoda import VecVecDouble
 from ..libgeoda import gda_redcap
-from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters
 
 __author__ = "Xun Li <[email protected]>, "
 
@@ -65,15 +65,4 @@ def redcap(k, w, data, method, **kwargs):
 
     cluster_ids = gda_redcap(k, w.gda_w, in_data, scale_method, method, distance_method, bound_variable, min_bound, random_seed, cpu_threads)
 
-    between_ss = gda_betweensumofsquare(cluster_ids, in_data)
-    total_ss = gda_totalsumofsquare(in_data)
-    ratio = between_ss / total_ss
-    within_ss = gda_withinsumofsquare(cluster_ids, in_data)
-
-    return {
-        "Total sum of squares" : total_ss,
-        "Within-cluster sum of squares" : list(within_ss) + [0]*(len(cluster_ids) - len(within_ss)),
-        "Total within-cluster sum of squares" : between_ss,
-        "The ratio of between to total sum of squares" : ratio,
-        "Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
-    }
+    return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
diff --git a/pygeoda/clustering/schc.py b/pygeoda/clustering/schc.py
@@ -1,6 +1,6 @@
-from ..libgeoda import VecVecDouble, VecDouble
+from pygeoda.clustering.utils import calculate_clustering_statistics
+from ..libgeoda import VecVecDouble
 from ..libgeoda import gda_schc
-from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters
 
 __author__ = "Xun Li <[email protected]>, "
 
@@ -57,15 +57,4 @@ def schc(k, w, data, linkage_method, **kwargs):
 
     cluster_ids = gda_schc(k, w.gda_w, in_data, scale_method, linkage_method, distance_method, bound_variable, min_bound)
 
-    between_ss = gda_betweensumofsquare(cluster_ids, in_data)
-    total_ss = gda_totalsumofsquare(in_data)
-    ratio = between_ss / total_ss
-    within_ss = gda_withinsumofsquare(cluster_ids, in_data)
-
-    return {
-        "Total sum of squares" : total_ss,
-        "Within-cluster sum of squares" : within_ss,
-        "Total within-cluster sum of squares" : between_ss,
-        "The ratio of between to total sum of squares" : ratio,
-        "Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
-    }
+    return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)
diff --git a/pygeoda/clustering/skater.py b/pygeoda/clustering/skater.py
@@ -1,8 +1,8 @@
 __author__ = "Xun Li <[email protected]>"
 
-from ..libgeoda import VecVecDouble, VecDouble
+from pygeoda.clustering.utils import calculate_clustering_statistics
+from ..libgeoda import VecVecDouble
 from ..libgeoda import gda_skater
-from ..libgeoda import gda_betweensumofsquare, gda_totalsumofsquare, gda_withinsumofsquare, flat_2dclusters
 
 def skater(k, w, data, **kwargs):
     ''' Spatial C(K)luster Analysis by Tree Edge Removal
@@ -27,40 +27,33 @@ def skater(k, w, data, **kwargs):
     min_bound = 0 if 'min_bound' not in kwargs else kwargs['min_bound']
     bound_variable = [] if 'bound_variable' not in kwargs else kwargs['bound_variable']
     scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
-    distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method'] 
+    distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs[
+        'distance_method']
     random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
     cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']
 
     # check if bound_variable is pandas.core.series.Series, if so, convert to list
     if type(bound_variable).__name__ == "Series":
         bound_variable = bound_variable.values.tolist()
-       
+
     # if bound_variable is not empty, check if it has the same length as the number of observations
     if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
-        raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
-
+        raise ValueError(
+            "The bound_variable has to be a list of numeric values, e.g. a column of input table.")
+
     # check if min_bound is available when bound_variable is not empty
     if len(bound_variable) > 0 and min_bound == 0:
-        raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.")
+        raise ValueError(
+            "min_bound is required and greater than 0 when bound_variable is not empty.")
 
     if type(data).__name__ == "DataFrame":
         data = data.values.transpose().tolist()
 
     in_data = VecVecDouble()
     for d in data:
         in_data.push_back(d)
-
-    cluster_ids = gda_skater(k, w.gda_w, in_data, scale_method, distance_method, bound_variable, min_bound, random_seed, cpu_threads)
 
-    between_ss = gda_betweensumofsquare(cluster_ids, in_data)
-    total_ss = gda_totalsumofsquare(in_data)
-    ratio = between_ss / total_ss
-    within_ss = gda_withinsumofsquare(cluster_ids, in_data)
+    cluster_ids = gda_skater(k, w.gda_w, in_data, scale_method, distance_method,
+                             bound_variable, min_bound, random_seed, cpu_threads)
 
-    return {
-        "Total sum of squares" : total_ss,
-        "Within-cluster sum of squares" : list(within_ss) + [0]*(len(cluster_ids) - len(within_ss)),
-        "Total within-cluster sum of squares" : between_ss,
-        "The ratio of between to total sum of squares" : ratio,
-        "Clusters" : flat_2dclusters(w.num_obs, cluster_ids),
-    }
+    return calculate_clustering_statistics(cluster_ids, in_data, w.num_obs)