[Fix] discrepancy in SCHC; improve bound_variable and min_bound arguments (#27)
GeoDaCenter · Jan 25, 2025 · 8da0441 · 8da0441
1 parent 1850cd9
commit 8da0441
Show file tree

Hide file tree

Showing 10 changed files with 69 additions and 24 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,5 @@ temp/
 test.ipynb
 .ipynb_checkpoints/
 Untitled.ipynb
+wheelhouse/
+.venv/
diff --git a/README.md b/README.md
@@ -75,7 +75,7 @@ lisa = pygeoda.local_moran(w, gda['Crm_prs'])
 ```
 
 
-## Current version 0.0.8
+## Current version 0.1.0
 
 * Spatial Weights
     * Queen

diff --git a/libgeoda b/libgeoda
diff --git a/pygeoda/clustering/azp.py b/pygeoda/clustering/azp.py
@@ -19,7 +19,7 @@ def azp_greedy(p, w, data, **kwargs):
         p (int): The number of spatially constrained clusters
         w (Weight): an instance of Weight class
         data (tuple):  A list of numeric vectors of selected variable
-        bound_variable (tuple, optional): A numeric vector of selected bounding variable
+        bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable
         min_bound (float, optional): A minimum value that the sum value of bounding variable in each cluster should be greater than
         inits (int, optional): The number of construction re-runs, which is for ARiSeL "automatic regionalization with initial seed location"
         init_regions (tuple, optional): The initial regions that the local search starts with. Default is empty, which means the local search starts with a random process to "grow" clusters
@@ -49,11 +49,18 @@ def azp_greedy(p, w, data, **kwargs):
     if len(data) < 1:
         raise ValueError("The data from selected variable is empty.")
 
-    in_data = VecVecDouble()
-
+    # check if bound_variable is pandas.core.series.Series, if so, convert to list
+    if type(bound_variable).__name__ == "Series":
+        bound_variable = bound_variable.values.tolist()
+
+    # if bound_variable is not empty, check if it has the same length as the number of observations
+    if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
+        raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
+
     if type(data).__name__ == "DataFrame":
         data = data.values.transpose().tolist()
 
+    in_data = VecVecDouble()
     for d in data:
         in_data.push_back(d)
 

diff --git a/pygeoda/clustering/maxp.py b/pygeoda/clustering/maxp.py
@@ -17,7 +17,7 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
     Arguments:
         w (Weight): an instance of Weight class
         data (list or dataframe):   A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
-        bound_variable (tuple): A numeric vector of selected bounding variable
+        bound_variable (tuple or pandas.core.series.Series): A numeric vector of selected bounding variable
         min_bound (float): A minimum value that the sum value of bounding variable in each cluster should be greater than
         iterations (int, optional): The number of iterations of greedy algorithm. Defaults to 99.
         init_regions (tuple, optional): The initial regions that the local search starts with. Default is empty, which means the local search starts with a random process to "grow" clusters
@@ -43,17 +43,20 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
     if len(data) < 1:
         raise ValueError("The data from selected variable is empty.")
 
+    # check if bound_variable is pandas.core.series.Series, if so, convert to list
+    if type(bound_variable).__name__ == "Series":
+        bound_variable = bound_variable.values.tolist()
+
     if len(bound_variable) != w.num_obs:
         raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
 
     if min_bound <= 0:
         raise ValueError("The min_bound has to be a positive numeric value.")
 
-    in_data = VecVecDouble()
-
     if type(data).__name__ == "DataFrame":
         data = data.values.transpose().tolist()
-
+
+    in_data = VecVecDouble()
     for d in data:
         in_data.push_back(d)
 

diff --git a/pygeoda/clustering/redcap.py b/pygeoda/clustering/redcap.py
@@ -21,7 +21,7 @@ def redcap(k, w, data, method, **kwargs):
         k (int): number of clusters
         w (Weight): An instance of Weight class
         data (list or dataframe):   A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
-        bound_variable (tuple, optional): A numeric vector of selected bounding variable
+        bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable
         min_bound (float, optional): a minimum value that the sum value of bounding variable in each cluster should be greater than
         scale_method (str, optional): One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization).
         distance_method (str, optional): {"euclidean", "manhattan"} the distance method used to compute the distance between observations i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan"
@@ -42,14 +42,25 @@ def redcap(k, w, data, method, **kwargs):
     if method not in ['firstorder-singlelinkage', 'fullorder-singlelinkage', 'fullorder-averagelinkage', 'fullorder-completelinkage', 'fullorder-wardlinkage']:
         raise ValueError('The method has to be one of {"firstorder-singlelinkage", "fullorder-completelinkage", "fullorder-averagelinkage","fullorder-singlelinkage", "fullorder-wardlinkage"}')
 
-    in_data = VecVecDouble()
-
+    # check if bound_variable is pandas.core.series.Series, if so, convert to list
+    if type(bound_variable).__name__ == "Series":
+        bound_variable = bound_variable.values.tolist()
+
+    # if bound_variable is not empty, check if it has the same length as the number of observations
+    if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
+        raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
+
+    # check if min_bound is available when bound_variable is not empty
+    if len(bound_variable) > 0 and min_bound == 0:
+        raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.")
+
     if type(data).__name__ == "DataFrame":
         data = data.values.transpose().tolist()
 
+    in_data = VecVecDouble()
     for d in data:
         in_data.push_back(d)
-
+     
     #in_bound_variable = VecDouble(bound_variable)
 
     cluster_ids = gda_redcap(k, w.gda_w, in_data, scale_method, method, distance_method, bound_variable, min_bound, random_seed, cpu_threads)

diff --git a/pygeoda/clustering/schc.py b/pygeoda/clustering/schc.py
@@ -19,7 +19,7 @@ def schc(k, w, data, linkage_method, **kwargs):
         w (Weight): An instance of Weight class
         data (tuple): A list of numeric vectors of selected variable
         linkage_method (str): The method of agglomerative hierarchical clustering: {"single", "complete", "average","ward"}. Defaults to "ward".
-        bound_variable (tuple, optional): A numeric vector of selected bounding variable
+        bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable
         min_bound (float, optional): a minimum value that the sum value of bounding variable in each cluster should be greater than
         scale_method (str, optional): One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization).
         distance_method (str, optional): {"euclidean", "manhattan"} the distance method used to compute the distance between observations i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan"
@@ -33,18 +33,29 @@ def schc(k, w, data, linkage_method, **kwargs):
     scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
     distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method'] 
 
-    if linkage_method not in ["single", "complete", "average","ward"]:
-        raise ValueError('The method has to be one of {"single", "complete", "average","ward"}')
+    if linkage_method not in ["single", "complete", "average", "ward"]:
+        raise ValueError('The method has to be one of {"single", "complete", "average", "ward"}')
 
-    in_data = VecVecDouble()
+    # check if bound_variable is pandas.core.series.Series, if so, convert to list
+    if type(bound_variable).__name__ == "Series":
+        bound_variable = bound_variable.values.tolist()
+
+    # if bound_variable is not empty, check if it has the same length as the number of observations
+    if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
+        raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
 
+    # check if min_bound is available when bound_variable is not empty
+    if len(bound_variable) > 0 and min_bound == 0:
+        raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.")
+
     if type(data).__name__ == "DataFrame":
         data = data.values.transpose().tolist()
 
+    in_data = VecVecDouble()
     for d in data:
         in_data.push_back(d)
-    
-    cluster_ids = gda_schc(k, w.gda_w, in_data, linkage_method, scale_method, distance_method, bound_variable, min_bound)
+
+    cluster_ids = gda_schc(k, w.gda_w, in_data, scale_method, linkage_method, distance_method, bound_variable, min_bound)
 
     between_ss = gda_betweensumofsquare(cluster_ids, in_data)
     total_ss = gda_totalsumofsquare(in_data)

diff --git a/pygeoda/clustering/skater.py b/pygeoda/clustering/skater.py
@@ -13,7 +13,7 @@ def skater(k, w, data, **kwargs):
         k (int): number of clusters
         w (Weight): An instance of Weight class
         data (list or dataframe):   A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
-        bound_variable (tuple, optional): A numeric vector of selected bounding variable
+        bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable
         min_bound (float, optional): a minimum value that the sum value of bounding variable in each cluster should be greater than
         scale_method (str, optional): One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization).
         distance_method (str, optional): {"euclidean", "manhattan"} the distance method used to compute the distance between observations i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan"
@@ -31,14 +31,25 @@ def skater(k, w, data, **kwargs):
     random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
     cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']
 
-    in_data = VecVecDouble()
+    # check if bound_variable is pandas.core.series.Series, if so, convert to list
+    if type(bound_variable).__name__ == "Series":
+        bound_variable = bound_variable.values.tolist()
+
+    # if bound_variable is not empty, check if it has the same length as the number of observations
+    if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
+        raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")
+
+    # check if min_bound is available when bound_variable is not empty
+    if len(bound_variable) > 0 and min_bound == 0:
+        raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.")
 
     if type(data).__name__ == "DataFrame":
         data = data.values.transpose().tolist()
 
+    in_data = VecVecDouble()
     for d in data:
         in_data.push_back(d)
-
+        
     cluster_ids = gda_skater(k, w.gda_w, in_data, scale_method, distance_method, bound_variable, min_bound, random_seed, cpu_threads)
 
     between_ss = gda_betweensumofsquare(cluster_ids, in_data)

diff --git a/setup.py b/setup.py
@@ -204,7 +204,7 @@
                         extra_objects=EXTRA_OBJECTS),]
 
 setup (name = 'pygeoda',
-       version = '0.0.9',
+       version = '0.1.0',
        author = "Xun Li",
        author_email = "[email protected]",
        url = "https://github.com/geodacenter/pygeoda",

diff --git a/tests/test_clustering.py b/tests/test_clustering.py
@@ -196,4 +196,4 @@ def test_SCHC(self):
         clusters = pygeoda.schc(p, self.queen_w, self.data, linkage_method)
 
         self.assertAlmostEqual(
-            clusters["The ratio of between to total sum of squares"], 0.2147711255)
+            clusters["The ratio of between to total sum of squares"], 0.4241728843)
-Original file line number
+Diff line change
@@ Expand Up / @@ -75,7 +75,7 @@ lisa = pygeoda.local_moran(w, gda['Crm_prs']) @@
     ```
-    ## Current version 0.0.8
+    ## Current version 0.1.0
     * Spatial Weights
         * Queen
@@ Expand Down @@
+9 −4		clustering/redcap_wrapper.cpp
+16 −7		clustering/schc_wrapper.cpp
+14 −8		gda_clustering.cpp