Skip to content

Commit

Permalink
[Fix] discrepancy in SCHC; improve bound_variable and min_bound argum…
Browse files Browse the repository at this point in the history
…ents (#27)
  • Loading branch information
lixun910 authored Jan 25, 2025
1 parent 1850cd9 commit 8da0441
Show file tree
Hide file tree
Showing 10 changed files with 69 additions and 24 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ temp/
test.ipynb
.ipynb_checkpoints/
Untitled.ipynb
wheelhouse/
.venv/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ lisa = pygeoda.local_moran(w, gda['Crm_prs'])
```


## Current version 0.0.8
## Current version 0.1.0

* Spatial Weights
* Queen
Expand Down
2 changes: 1 addition & 1 deletion libgeoda
13 changes: 10 additions & 3 deletions pygeoda/clustering/azp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def azp_greedy(p, w, data, **kwargs):
p (int): The number of spatially constrained clusters
w (Weight): an instance of Weight class
data (tuple): A list of numeric vectors of selected variable
bound_variable (tuple, optional): A numeric vector of selected bounding variable
bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable
min_bound (float, optional): A minimum value that the sum value of bounding variable in each cluster should be greater than
inits (int, optional): The number of construction re-runs, which is for ARiSeL "automatic regionalization with initial seed location"
init_regions (tuple, optional): The initial regions that the local search starts with. Default is empty, which means the local search starts with a random process to "grow" clusters
Expand Down Expand Up @@ -49,11 +49,18 @@ def azp_greedy(p, w, data, **kwargs):
if len(data) < 1:
raise ValueError("The data from selected variable is empty.")

in_data = VecVecDouble()

# check if bound_variable is pandas.core.series.Series, if so, convert to list
if type(bound_variable).__name__ == "Series":
bound_variable = bound_variable.values.tolist()

# if bound_variable is not empty, check if it has the same length as the number of observations
if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")

if type(data).__name__ == "DataFrame":
data = data.values.transpose().tolist()

in_data = VecVecDouble()
for d in data:
in_data.push_back(d)

Expand Down
11 changes: 7 additions & 4 deletions pygeoda/clustering/maxp.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
Arguments:
w (Weight): an instance of Weight class
data (list or dataframe): A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
bound_variable (tuple): A numeric vector of selected bounding variable
bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable
min_bound (float): A minimum value that the sum value of bounding variable in each cluster should be greater than
iterations (int, optional): The number of iterations of greedy algorithm. Defaults to 99.
init_regions (tuple, optional): The initial regions that the local search starts with. Default is empty, which means the local search starts with a random process to "grow" clusters
Expand All @@ -43,17 +43,20 @@ def maxp_greedy(w, data, bound_variable, min_bound, **kwargs):
if len(data) < 1:
raise ValueError("The data from selected variable is empty.")

# check if bound_variable is pandas.core.series.Series, if so, convert to list
if type(bound_variable).__name__ == "Series":
bound_variable = bound_variable.values.tolist()

if len(bound_variable) != w.num_obs:
raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")

if min_bound <= 0:
raise ValueError("The min_bound has to be a positive numeric value.")

in_data = VecVecDouble()

if type(data).__name__ == "DataFrame":
data = data.values.transpose().tolist()


in_data = VecVecDouble()
for d in data:
in_data.push_back(d)

Expand Down
19 changes: 15 additions & 4 deletions pygeoda/clustering/redcap.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def redcap(k, w, data, method, **kwargs):
k (int): number of clusters
w (Weight): An instance of Weight class
data (list or dataframe): A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
bound_variable (tuple, optional): A numeric vector of selected bounding variable
bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable
min_bound (float, optional): a minimum value that the sum value of bounding variable in each cluster should be greater than
scale_method (str, optional): One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization).
distance_method (str, optional): {"euclidean", "manhattan"} the distance method used to compute the distance between observations i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan"
Expand All @@ -42,14 +42,25 @@ def redcap(k, w, data, method, **kwargs):
if method not in ['firstorder-singlelinkage', 'fullorder-singlelinkage', 'fullorder-averagelinkage', 'fullorder-completelinkage', 'fullorder-wardlinkage']:
raise ValueError('The method has to be one of {"firstorder-singlelinkage", "fullorder-completelinkage", "fullorder-averagelinkage","fullorder-singlelinkage", "fullorder-wardlinkage"}')

in_data = VecVecDouble()

# check if bound_variable is pandas.core.series.Series, if so, convert to list
if type(bound_variable).__name__ == "Series":
bound_variable = bound_variable.values.tolist()

# if bound_variable is not empty, check if it has the same length as the number of observations
if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")

# check if min_bound is available when bound_variable is not empty
if len(bound_variable) > 0 and min_bound == 0:
raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.")

if type(data).__name__ == "DataFrame":
data = data.values.transpose().tolist()

in_data = VecVecDouble()
for d in data:
in_data.push_back(d)

#in_bound_variable = VecDouble(bound_variable)

cluster_ids = gda_redcap(k, w.gda_w, in_data, scale_method, method, distance_method, bound_variable, min_bound, random_seed, cpu_threads)
Expand Down
23 changes: 17 additions & 6 deletions pygeoda/clustering/schc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def schc(k, w, data, linkage_method, **kwargs):
w (Weight): An instance of Weight class
data (tuple): A list of numeric vectors of selected variable
linkage_method (str): The method of agglomerative hierarchical clustering: {"single", "complete", "average","ward"}. Defaults to "ward".
bound_variable (tuple, optional): A numeric vector of selected bounding variable
bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable
min_bound (float, optional): a minimum value that the sum value of bounding variable in each cluster should be greater than
scale_method (str, optional): One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization).
distance_method (str, optional): {"euclidean", "manhattan"} the distance method used to compute the distance between observations i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan"
Expand All @@ -33,18 +33,29 @@ def schc(k, w, data, linkage_method, **kwargs):
scale_method = "standardize" if "scale_method" not in kwargs else kwargs['scale_method']
distance_method = 'euclidean' if 'distance_method' not in kwargs else kwargs['distance_method']

if linkage_method not in ["single", "complete", "average","ward"]:
raise ValueError('The method has to be one of {"single", "complete", "average","ward"}')
if linkage_method not in ["single", "complete", "average", "ward"]:
raise ValueError('The method has to be one of {"single", "complete", "average", "ward"}')

in_data = VecVecDouble()
# check if bound_variable is pandas.core.series.Series, if so, convert to list
if type(bound_variable).__name__ == "Series":
bound_variable = bound_variable.values.tolist()

# if bound_variable is not empty, check if it has the same length as the number of observations
if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")

# check if min_bound is available when bound_variable is not empty
if len(bound_variable) > 0 and min_bound == 0:
raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.")

if type(data).__name__ == "DataFrame":
data = data.values.transpose().tolist()

in_data = VecVecDouble()
for d in data:
in_data.push_back(d)
cluster_ids = gda_schc(k, w.gda_w, in_data, linkage_method, scale_method, distance_method, bound_variable, min_bound)

cluster_ids = gda_schc(k, w.gda_w, in_data, scale_method, linkage_method, distance_method, bound_variable, min_bound)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
total_ss = gda_totalsumofsquare(in_data)
Expand Down
17 changes: 14 additions & 3 deletions pygeoda/clustering/skater.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def skater(k, w, data, **kwargs):
k (int): number of clusters
w (Weight): An instance of Weight class
data (list or dataframe): A list of numeric vectors of selected variable or a data frame of selected variables e.g. guerry[['Crm_prs', 'Literacy']]
bound_variable (tuple, optional): A numeric vector of selected bounding variable
bound_variable (tuple or pandas.core.series.Series, optional): A numeric vector of selected bounding variable
min_bound (float, optional): a minimum value that the sum value of bounding variable in each cluster should be greater than
scale_method (str, optional): One of the scaling methods {'raw', 'standardize', 'demean', 'mad', 'range_standardize', 'range_adjust'} to apply on input data. Default is 'standardize' (Z-score normalization).
distance_method (str, optional): {"euclidean", "manhattan"} the distance method used to compute the distance between observations i and j. Defaults to "euclidean". Options are "euclidean" and "manhattan"
Expand All @@ -31,14 +31,25 @@ def skater(k, w, data, **kwargs):
random_seed = 123456789 if 'random_seed' not in kwargs else kwargs['random_seed']
cpu_threads = 6 if 'cpu_threads' not in kwargs else kwargs['cpu_threads']

in_data = VecVecDouble()
# check if bound_variable is pandas.core.series.Series, if so, convert to list
if type(bound_variable).__name__ == "Series":
bound_variable = bound_variable.values.tolist()

# if bound_variable is not empty, check if it has the same length as the number of observations
if len(bound_variable) > 0 and len(bound_variable) != w.num_obs:
raise ValueError("The bound_variable has to be a list of numeric values, e.g. a column of input table.")

# check if min_bound is available when bound_variable is not empty
if len(bound_variable) > 0 and min_bound == 0:
raise ValueError("min_bound is required and greater than 0 when bound_variable is not empty.")

if type(data).__name__ == "DataFrame":
data = data.values.transpose().tolist()

in_data = VecVecDouble()
for d in data:
in_data.push_back(d)

cluster_ids = gda_skater(k, w.gda_w, in_data, scale_method, distance_method, bound_variable, min_bound, random_seed, cpu_threads)

between_ss = gda_betweensumofsquare(cluster_ids, in_data)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@
extra_objects=EXTRA_OBJECTS),]

setup (name = 'pygeoda',
version = '0.0.9',
version = '0.1.0',
author = "Xun Li",
author_email = "[email protected]",
url = "https://github.com/geodacenter/pygeoda",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,4 +196,4 @@ def test_SCHC(self):
clusters = pygeoda.schc(p, self.queen_w, self.data, linkage_method)

self.assertAlmostEqual(
clusters["The ratio of between to total sum of squares"], 0.2147711255)
clusters["The ratio of between to total sum of squares"], 0.4241728843)

0 comments on commit 8da0441

Please sign in to comment.