diff --git a/.gitignore b/.gitignore index c5578b9..1c127c2 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ temp/ test.ipynb .ipynb_checkpoints/ Untitled.ipynb +wheelhouse/ +.venv/ \ No newline at end of file diff --git a/README.md b/README.md index cb6ed07..95abd19 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ lisa = pygeoda.local_moran(w, gda['Crm_prs']) ``` -## Current version 0.0.8 +## Current version 0.1.0 * Spatial Weights * Queen diff --git a/libgeoda b/libgeoda index c175643..718902b 160000 --- a/libgeoda +++ b/libgeoda @@ -1 +1 @@ -Subproject commit c175643772634dc2193bb9072549552c7bb41135 +Subproject commit 718902b22dd4e367369206fa161eb404d90844d0 diff --git a/pygeoda/clustering/azp.py b/pygeoda/clustering/azp.py index 7823afe..d500b26 100644 --- a/pygeoda/clustering/azp.py +++ b/pygeoda/clustering/azp.py @@ -19,7 +19,7 @@ def azp_greedy(p, w, data, **kwargs): p (int): The number of spatially constrained clusters w (Weight): an instance of Weight class data (tuple): A list of numeric vectors of selected variable - bound_variable (tuple, optional): A numeric vector of selected bounding variable + bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable min_bound (float, optional): A minimum value that the sum value of bounding variable int each cluster should be greater than inits (int, optional): The number of construction re-runs, which is for ARiSeL "automatic regionalization with initial seed location" init_regions (tuple, optional): The initial regions that the local search starts with. Default is empty. 
means the local search starts with a random process to "grow" clusters @@ -49,11 +49,18 @@ def azp_greedy(p, w, data, **kwargs): if len(data) < 1: raise ValueError("The data from selected variable is empty.") - in_data = VecVecDouble() - + # check if bound_variable is pandas.core.series.Series, if so, convert to list + if type(bound_variable).__name__ == "Series": + bound_variable = bound_variable.values.tolist() + + # if bound_variable is not empty, check if it has the same length as the number of observations + if len(bound_variable) > 0 and len(bound_variable) != w.num_obs: + raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.") + if type(data).__name__ == "DataFrame": data = data.values.transpose().tolist() + in_data = VecVecDouble() for d in data: in_data.push_back(d) diff --git a/pygeoda/clustering/maxp.py b/pygeoda/clustering/maxp.py index 2acd38f..097a50e 100644 --- a/pygeoda/clustering/maxp.py +++ b/pygeoda/clustering/maxp.py @@ -17,7 +17,7 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs): Arguments: w (Weight): an instance of Weight class data (list or dataframe): A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']] - bound_variable (tuple): A numeric vector of selected bounding variable + bound_variable (tuple or pandas.core.series.Series): A numeric vector of selected bounding variable min_bound (float): A minimum value that the sum value of bounding variable int each cluster should be greater than iterations (int, optional): The number of iterations of greedy algorithm. Defaults to 99. init_regions (tuple, optional): The initial regions that the local search starts with. Default is empty. 
means the local search starts with a random process to "grow" clusters @@ -43,17 +43,20 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs): if len(data) < 1: raise ValueError("The data from selected variable is empty.") + # check if bound_variable is pandas.core.series.Series, if so, convert to list + if type(bound_variable).__name__ == "Series": + bound_variable = bound_variable.values.tolist() + if len(bound_variable) != w.num_obs: raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.") if min_bound <= 0: raise ValueError("The min_bound has to be a positive numeric value.") - in_data = VecVecDouble() - if type(data).__name__ == "DataFrame": data = data.values.transpose().tolist() - + + in_data = VecVecDouble() for d in data: in_data.push_back(d) diff --git a/pygeoda/clustering/redcap.py b/pygeoda/clustering/redcap.py index 5b02033..a02f4cd 100644 --- a/pygeoda/clustering/redcap.py +++ b/pygeoda/clustering/redcap.py @@ -21,7 +21,7 @@ def redcap(k, w, data, method, **kwargs): k (int): number of clusters w (Weight): An instance of Weight class data (list or dataframe): A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']] - bound_variable (tuple, optional): A numeric vector of selected bounding variable + bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable min_bound (float, optional): a minimum value that the sum value of bounding variable int each cluster should be greater than scale_method (str, optional): One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization). distance_method (str, optional): {"euclidean", "manhattan"} the distance method used to compute the distance betwen observation i and j. Defaults to "euclidean". 
Options are "euclidean" and "manhattan" @@ -42,14 +42,25 @@ def redcap(k, w, data, method, **kwargs): if method not in ['firstorder-singlelinkage', 'fullorder-singlelinkage', 'fullorder-averagelinkage', 'fullorder-completelinkage', 'fullorder-wardlinkage']: raise ValueError('The method has to be one of {"firstorder-singlelinkage", "fullorder-completelinkage", "fullorder-averagelinkage","fullorder-singlelinkage", "fullorder-wardlinkage"}') - in_data = VecVecDouble() - + # check if bound_variable is pandas.core.series.Series, if so, convert to list + if type(bound_variable).__name__ == "Series": + bound_variable = bound_variable.values.tolist() + + # if bound_variable is not empty, check if it has the same length as the number of observations + if len(bound_variable) > 0 and len(bound_variable) != w.num_obs: + raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.") + + # check if min_bound is available when bound_variable is not empty + if len(bound_variable) > 0 and min_bound == 0: + raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.") + if type(data).__name__ == "DataFrame": data = data.values.transpose().tolist() + in_data = VecVecDouble() for d in data: in_data.push_back(d) - + #in_bound_variable = VecDouble(bound_variable) cluster_ids = gda_redcap(k, w.gda_w, in_data, scale_method, method, distance_method, bound_variable, min_bound, random_seed, cpu_threads) diff --git a/pygeoda/clustering/schc.py b/pygeoda/clustering/schc.py index 0e4fdcb..7a5c431 100644 --- a/pygeoda/clustering/schc.py +++ b/pygeoda/clustering/schc.py @@ -19,7 +19,7 @@ def schc(k, w, data, linkage_method, **kwargs): w (Weight): An instance of Weight class data (tuple): A list of numeric vectors of selected variable linkage_method (str): The method of agglomerative hierarchical clustering: {"single", "complete", "average","ward"}. Defaults to "ward". 
- bound_variable (tuple, optional): A numeric vector of selected bounding variable + bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable min_bound (float, optional): a minimum value that the sum value of bounding variable int each cluster should be greater than scale_method (str, optional): One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization). distance_method (str, optional): {"euclidean", "manhattan"} the distance method used to compute the distance betwen observation i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan" @@ -33,18 +33,29 @@ def schc(k, w, data, linkage_method, **kwargs): scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method'] distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method'] - if linkage_method not in ["single", "complete", "average","ward"]: - raise ValueError('The method has to be one of {"single", "complete", "average","ward"}') + if linkage_method not in ["single", "complete", "average", "ward"]: + raise ValueError('The method has to be one of {"single", "complete", "average", "ward"}') - in_data = VecVecDouble() + # check if bound_variable is pandas.core.series.Series, if so, convert to list + if type(bound_variable).__name__ == "Series": + bound_variable = bound_variable.values.tolist() + + # if bound_variable is not empty, check if it has the same length as the number of observations + if len(bound_variable) > 0 and len(bound_variable) != w.num_obs: + raise ValueError("The bound_variable has to be a list of numeric values, e.g. 
a column of input table.") + # check if min_bound is available when bound_variable is not empty + if len(bound_variable) > 0 and min_bound == 0: + raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.") + if type(data).__name__ == "DataFrame": data = data.values.transpose().tolist() + in_data = VecVecDouble() for d in data: in_data.push_back(d) - - cluster_ids = gda_schc(k, w.gda_w, in_data, linkage_method, scale_method, distance_method, bound_variable, min_bound) + + cluster_ids = gda_schc(k, w.gda_w, in_data, scale_method, linkage_method, distance_method, bound_variable, min_bound) between_ss = gda_betweensumofsquare(cluster_ids, in_data) total_ss = gda_totalsumofsquare(in_data) diff --git a/pygeoda/clustering/skater.py b/pygeoda/clustering/skater.py index d6dcf36..d6cfb98 100644 --- a/pygeoda/clustering/skater.py +++ b/pygeoda/clustering/skater.py @@ -13,7 +13,7 @@ def skater(k, w, data, **kwargs): k (int): number of clusters w (Weight): An instance of Weight class data (list or dataframe): A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']] - bound_variable (tuple, optional): A numeric vector of selected bounding variable + bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable min_bound (float, optional): a minimum value that the sum value of bounding variable int each cluster should be greater than scale_method (str, optional): One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization). distance_method (str, optional): {"euclidean", "manhattan"} the distance method used to compute the distance betwen observation i and j. Defaults to "euclidean". 
Options are "euclidean" and "manhattan" @@ -31,14 +31,25 @@ def skater(k, w, data, **kwargs): random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed'] cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads'] - in_data = VecVecDouble() + # check if bound_variable is pandas.core.series.Series, if so, convert to list + if type(bound_variable).__name__ == "Series": + bound_variable = bound_variable.values.tolist() + + # if bound_variable is not empty, check if it has the same length as the number of observations + if len(bound_variable) > 0 and len(bound_variable) != w.num_obs: + raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.") + + # check if min_bound is available when bound_variable is not empty + if len(bound_variable) > 0 and min_bound == 0: + raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.") if type(data).__name__ == "DataFrame": data = data.values.transpose().tolist() + in_data = VecVecDouble() for d in data: in_data.push_back(d) - + cluster_ids = gda_skater(k, w.gda_w, in_data, scale_method, distance_method, bound_variable, min_bound, random_seed, cpu_threads) between_ss = gda_betweensumofsquare(cluster_ids, in_data) diff --git a/setup.py b/setup.py index 7f34e34..20a6ab9 100644 --- a/setup.py +++ b/setup.py @@ -204,7 +204,7 @@ extra_objects=EXTRA_OBJECTS),] setup (name = 'pygeoda', - version = '0.0.9', + version = '0.1.0', author = "Xun Li", author_email = "lixun910@gmail.com", url = "https://github.com/geodacenter/pygeoda", diff --git a/tests/test_clustering.py b/tests/test_clustering.py index c7bef5b..4af2c92 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -196,4 +196,4 @@ def test_SCHC(self): clusters = pygeoda.schc(p, self.queen_w, self.data, linkage_method) self.assertAlmostEqual( - clusters["The ratio of between to total sum of squares"], 0.2147711255) + clusters["The ratio of between to 
total sum of squares"], 0.4241728843)