Merge pull request #66 from lbluque/tests
unit tests
lbluque authored Feb 7, 2023
2 parents 33ad1e6 + 19f18e7 commit 83ec2aa
Showing 16 changed files with 901 additions and 64 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -30,7 +30,7 @@ repos:
exclude: README.md

- repo: https://github.com/pycqa/isort
- rev: 5.11.4
+ rev: 5.12.0
hooks:
- id: isort
name: isort (python)
5 changes: 1 addition & 4 deletions README.md
@@ -7,9 +7,6 @@ Sparse Linear Regression Models
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/CederGroupHub/sparse-lm/main.svg)](https://results.pre-commit.ci/latest/github/CederGroupHub/sparse-lm/main)
[![pypi version](https://img.shields.io/pypi/v/sparse-lm?color=blue)](https://pypi.org/project/sparse-lm)

- > :warning: this package is currently largely lacking in unit-tests.
- > Use at your own risk!
**sparse-lm** includes several regularized regression estimators that are absent in the
`sklearn.linear_model` module. The estimators in **sparse-lm** are designed to fit right into
[scikit-learn](https://scikit-learn.org/stable/index.html) by inheriting from their base
@@ -42,7 +39,7 @@ model like you would any linear model in **scikit-learn**:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
- from src.sparselm import AdaptiveLasso
+ from sparselm import AdaptiveLasso

X, y = make_regression(n_samples=200, n_features=5000, random_state=0)
alasso = AdaptiveLasso(fit_intercept=False)
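The README snippet above is truncated by the diff view. A plausible continuation, consistent with the GridSearchCV import (the parameter grid and cv value are illustrative, not taken from the README):

param_grid = {"alpha": np.logspace(-5, 1, 10)}
cvsearch = GridSearchCV(alasso, param_grid, cv=5)
cvsearch.fit(X, y)
print(cvsearch.best_params_)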
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -31,7 +31,7 @@ classifiers = [

[project.optional-dependencies]
dev = ["pre-commit", "black", "isort", "flake8", "pylint", "pydocstyle", "flake8-pyproject"]
- tests = ["pytest >=7.2.0", "pytest-cov >=4.0.0", "coverage"]
+ tests = ["pytest >=7.2.0", "pytest-cov >=4.0.0", "coverage", "gurobipy"]
docs = ["sphinx>=5.3", "furo", "m2r2"]
optional = ["gurobipy"]

@@ -72,3 +72,7 @@ skip = "*.c,./.*"
count = ''
quiet-level = 3
ignore-words-list = ['nd', 'tread']

+ [tool.coverage.run]
+ source = ["src/sparselm"]
+ omit = ["*/__init__.py"]
2 changes: 1 addition & 1 deletion src/sparselm/model/_base.py
@@ -112,7 +112,7 @@ def fit(
)

if sample_weight is not None:
- X, y = _rescale_data(X, y, sample_weight)
+ X, y, _ = _rescale_data(X, y, sample_weight)

self.coef_ = self._solve(X, y, *args, **kwargs)
self._set_intercept(X_offset, y_offset, X_scale)
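For context: in newer scikit-learn versions _rescale_data returns the square-root sample weights as a third value, hence the added underscore. The rescaling itself is equivalent to this sketch (an illustration of the behavior, not scikit-learn's actual private code):

import numpy as np

def rescale_data(X, y, sample_weight):
    # Scale rows by sqrt(w) so that the ordinary least-squares objective
    # on the rescaled data equals the weighted objective sum_i w_i * r_i**2.
    sqrt_sw = np.sqrt(sample_weight)
    return X * sqrt_sw[:, None], y * sqrt_sw, sqrt_sw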
7 changes: 4 additions & 3 deletions src/sparselm/model/_lasso.py
@@ -155,15 +155,16 @@ def __init__(
See docs in CVXEstimator for more information.
"""
self.groups = np.asarray(groups)
- self.group_masks = [self.groups == i for i in np.unique(groups)]
+ group_ids = np.sort(np.unique(groups))
+ self.group_masks = [self.groups == i for i in group_ids]
self.standardize = standardize
self._group_norms = None

if group_weights is not None:
- if len(group_weights) != len(self.group_masks):
+ if len(group_weights) != len(group_ids):
raise ValueError(
"group_weights must be the same length as the number of "
f"groups: {len(group_weights)} != {len(self.group_masks)}"
f"unique groups: {len(group_weights)} != {len(group_ids)}"
)
self.group_weights = (
group_weights
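The fix matters when group labels are not the contiguous range 0..k-1. A minimal sketch of the corrected mask construction, with made-up labels:

import numpy as np

groups = np.array([3, 3, 7, 1, 7])      # labels need not be contiguous or start at 0
group_ids = np.sort(np.unique(groups))  # array([1, 3, 7])
group_masks = [groups == i for i in group_ids]
# one boolean mask per unique group; group_weights must then have length 3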
8 changes: 5 additions & 3 deletions src/sparselm/model/_miqp/_base.py
@@ -116,18 +116,20 @@ def _gen_constraints(self, X, y):
constraints = []
for i, mask in enumerate(self._group_masks):
constraints += [
- self._big_M * self._z0[i] >= self._beta[mask],
- self._big_M * self._z0[i] >= -self._beta[mask],
+ self._beta[mask] <= self._big_M * self._z0[i],
+ -self._big_M * self._z0[i] <= self._beta[mask],
]

if self.hierarchy is not None:
constraints += self._gen_hierarchy_constraints()

return constraints

def _gen_hierarchy_constraints(self):
"""Generate single feature hierarchy constraints."""
+ group_ids = np.sort(np.unique(self.groups))
return [
self._z0[high_id] <= self._z0[sub_id]
- for high_id, sub_ids in enumerate(self.hierarchy)
+ for high_id, sub_ids in zip(group_ids, self.hierarchy)
for sub_id in sub_ids
]
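Both the old and the rewritten inequalities encode the standard big-M switch: when a group's binary indicator is 0, every coefficient in the group is forced to 0; when it is 1, each coefficient may range within [-M, M], provided big_M truly bounds the optimal coefficients. A minimal standalone cvxpy sketch of the device (not the estimator's actual code):

import cvxpy as cp

M = 100                         # big-M; must upper-bound the true coefficient magnitudes
beta = cp.Variable(3)           # coefficients of a single group
z0 = cp.Variable(boolean=True)  # on/off indicator for the group
constraints = [beta <= M * z0, -M * z0 <= beta]
# z0 == 0 forces beta == 0; z0 == 1 relaxes the bound to |beta_i| <= M

The hierarchy constraint z0[high_id] <= z0[sub_id] likewise states that a group can be active only if every group it depends on is active.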
6 changes: 4 additions & 2 deletions src/sparselm/model/_miqp/_best_subset.py
@@ -25,7 +25,7 @@ def __init__(
self,
groups,
sparse_bound,
- big_M=1000,
+ big_M=100,
hierarchy=None,
ignore_psd_check=True,
fit_intercept=False,
@@ -89,6 +89,8 @@ def __init__(
solver=solver,
solver_options=solver_options,
)
+ if sparse_bound <= 0:
+     raise ValueError("sparse_bound must be > 0")

self._bound = cp.Parameter(nonneg=True, value=sparse_bound)

@@ -119,7 +121,7 @@ def __init__(
groups,
sparse_bound,
eta=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
tikhonov_w=None,
ignore_psd_check=True,
16 changes: 8 additions & 8 deletions src/sparselm/model/_miqp/_regularized_l0.py
@@ -49,7 +49,7 @@ def __init__(
self,
groups,
alpha=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
ignore_psd_check=True,
fit_intercept=False,
@@ -112,22 +112,22 @@ def __init__(
solver=solver,
solver_options=solver_options,
)
- self._eta = cp.Parameter(nonneg=True, value=alpha)
+ self._alpha = cp.Parameter(nonneg=True, value=alpha)

@property
def alpha(self):
"""Get alpha hyperparameter value."""
- return self._eta.value
+ return self._alpha.value

@alpha.setter
def alpha(self, val):
"""Set alpha hyperparameter value."""
- self._eta.value = val
+ self._alpha.value = val

def _gen_objective(self, X, y):
"""Generate the quadratic form and l0 regularization portion of objective."""
c0 = 2 * X.shape[0] # keeps hyperparameter scale independent
- objective = super()._gen_objective(X, y) + c0 * self._eta * cp.sum(self._z0)
+ objective = super()._gen_objective(X, y) + c0 * self._alpha * cp.sum(self._z0)
return objective


@@ -139,7 +139,7 @@ def __init__(
groups,
alpha=1.0,
eta=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
ignore_psd_check=True,
fit_intercept=False,
@@ -250,7 +250,7 @@ def __init__(
groups,
alpha=1.0,
eta=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
ignore_psd_check=True,
fit_intercept=False,
@@ -364,7 +364,7 @@ def __init__(
groups,
alpha=1.0,
eta=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
tikhonov_w=None,
ignore_psd_check=True,
14 changes: 10 additions & 4 deletions src/sparselm/tools.py
@@ -16,9 +16,9 @@ def constrain_coefficients(indices, high=None, low=None):
and below the supplied value.
Use this as a standard decorator with parameters:
- - At runtime: coefs = constrain_dielectric(indices, high, low)(fit_method)(X, y)
+ - At runtime: coefs = constrain_coefficients(indices, high, low)(fit_method)(X, y)
- In fit_method definitions:
- @constrain_dielectric(indices, high, low)
+ @constrain_coefficients(indices, high, low)
def your_fit_method(X, y):
Args:
@@ -32,12 +32,18 @@ def your_fit_method(X, y):
indices = np.array(indices)
if high is not None:
high = (
- high * np.ones(len(indices)) if isinstance(high, float) else np.array(high)
+ high * np.ones(len(indices))
+ if isinstance(high, (int, float))
+ else np.array(high)
)
else:
high = np.inf * np.ones(len(indices))
if low is not None:
- low = low * np.ones(len(indices)) if isinstance(low, float) else np.array(low)
+ low = (
+     low * np.ones(len(indices))
+     if isinstance(low, (int, float))
+     else np.array(low)
+ )
else:
low = -np.inf * np.ones(len(indices))

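A short usage sketch of the decorator pattern documented in the docstring above; the data and the wrapped fit function are illustrative, only constrain_coefficients itself comes from the module:

import numpy as np
from sparselm.tools import constrain_coefficients

X = np.random.rand(20, 5)
y = X @ np.array([2.0, -0.5, 3.0, 0.1, 0.0])

@constrain_coefficients(indices=[0, 2], high=1.0, low=-1.0)
def fit_ols(X, y):
    # any fit method that returns a coefficient array can be wrapped
    return np.linalg.lstsq(X, y, rcond=None)[0]

coefs = fit_ols(X, y)  # coefficients 0 and 2 are kept within [-1.0, 1.0]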
105 changes: 91 additions & 14 deletions tests/conftest.py
@@ -1,29 +1,106 @@
import numpy as np
import pytest
from sklearn.datasets import make_regression, make_sparse_coded_signal

SEED = 0

# A few solvers to test for convex problems
# ECOS sometimes fails for Adaptive group estimators, but is fast
# SCS and CVXOPT are reliable, but slower
# GUROBI is best
CONVEX_SOLVERS = ["GUROBI", "SCS", "CVXOPT"]

# ECOS_BB is an open-source alternative, but much slower, and can get things wrong
MIQP_SOLVERS = ["GUROBI"]

# Set to small values because the non-commercial Gurobi license cannot solve large models.
- n_features = 30
- n_samples = 40
- n_nonzeros = 20
N_FEATURES = [20, 30] # an overdetermined and underdetermined case
N_SAMPLES = 25
N_INFORMATIVE = 10


@pytest.fixture(scope="package")
- def random_model():
-     femat = np.random.rand(n_samples, n_features)
-     ecis = np.zeros(n_features)
-     non_zero_ids = np.random.choice(n_features, size=n_nonzeros, replace=False)
def rng():
"""Seed and return an RNG for test reproducibility"""
return np.random.default_rng(SEED)


@pytest.fixture(params=CONVEX_SOLVERS)
def solver(request):
return request.param


@pytest.fixture(params=MIQP_SOLVERS)
def miqp_solver(request):
return request.param


@pytest.fixture(scope="package", params=N_FEATURES)
def random_model(rng, request):
"""Returns a fully random set of X, y, and beta representing a linear model."""
X, y, beta = make_regression(
n_samples=N_SAMPLES,
n_features=request.param,
n_informative=N_INFORMATIVE,
coef=True,
random_state=rng.integers(0, 2**32 - 1),
bias=10 * rng.random(),
)
return X, y, beta


@pytest.fixture(scope="package", params=N_FEATURES)
def random_energy_model(rng, request):
"""Returns a random set of X, y, and beta with added gaussian noise for a linear
model with sparse coefficients beta decay (on average) exponentially with the index
of the coefficient.
"""
X = rng.random((N_SAMPLES, request.param))
beta = np.zeros(request.param) # coefficients
non_zero_ids = rng.choice(request.param, size=N_INFORMATIVE, replace=False)
non_zero_ids = np.array(np.round(non_zero_ids), dtype=int)

for idx in non_zero_ids:
eci = 0
mag = np.exp(-0.5 * idx)
while np.isclose(eci, 0):
- eci = (np.random.random() - 0.5) * 2 * mag
- ecis[idx] = eci
- energies = femat @ ecis + np.random.normal(size=n_samples) * 2e-3
- return femat, energies, ecis
eci = (rng.random() - 0.5) * 2 * mag
beta[idx] = eci
y = X @ beta + rng.normal(size=N_SAMPLES) * 2e-3 # fake energies
return X, y, beta


@pytest.fixture(scope="package")
- def random_weights():
-     weights = 1000 * np.random.rand(n_features)
-     return np.diag(weights)
def sparse_coded_signal(rng):
n_samples, n_features, n_nonzero = 10, 30, 5
y, X, beta = make_sparse_coded_signal(
n_samples=n_samples,
n_components=n_features,
n_features=n_samples,
n_nonzero_coefs=n_nonzero,
random_state=rng.integers(0, 2**32 - 1),
data_transposed=True,
)
# Make X not of norm 1 for testing
X *= 10
y *= 10
return X, y[:, 0], beta[:, 0]


# TODO: this fixture sometimes causes tests to fail because it may leave some groups with no features assigned
@pytest.fixture(params=[4, 6], scope="package")
def random_model_with_groups(random_model, rng, request):
"""Add a correct set of groups to model."""
X, y, beta = random_model
coef_mask = abs(beta) > 0
n_groups = request.param
n_active_groups = n_groups // 3 + 1
active_group_inds = rng.choice(range(n_groups), size=n_active_groups, replace=False)
inactive_group_inds = np.setdiff1d(range(n_groups), active_group_inds)

groups = np.zeros(len(beta), dtype=int)
for i, c in enumerate(coef_mask):
groups[i] = (
rng.choice(active_group_inds) if c else rng.choice(inactive_group_inds)
)
return X, y, beta, groups
50 changes: 50 additions & 0 deletions tests/test_all_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""General tests for all linear models.
Simply check that they execute successfully on random data.
"""

from inspect import getmembers, isclass, signature

import numpy as np
import pytest

import sparselm.model as spm

ALL_ESTIMATORS = getmembers(spm, isclass)


@pytest.mark.parametrize("estimator_cls", ALL_ESTIMATORS)
def test_general_fit(estimator_cls, random_model, rng):
print(f"\nGeneral test of {estimator_cls[0]}.")
X, y, beta = random_model

# instantiate the estimator
sig = signature(estimator_cls[1])

# check for necessary parameters
args = {}
if "groups" in sig.parameters:
args["groups"] = rng.integers(0, 5, size=len(beta))
if "group_list" in sig.parameters:
args["group_list"] = [
rng.choice(range(5), replace=False, size=rng.integers(1, 5))
for _ in range(len(beta))
]
if "sparse_bound" in sig.parameters:
args["sparse_bound"] = 12

estimator = estimator_cls[1](**args)
estimator.fit(X, y)
# assert a value of coefficients has been set correctly
assert isinstance(estimator.coef_, np.ndarray)
assert len(estimator.coef_) == len(beta)
assert len(estimator.predict(X)) == len(y)
assert estimator.intercept_ == 0.0

estimator = estimator_cls[1](fit_intercept=True, **args)
estimator.fit(X, y)
# assert a value of coefficients has been set correctly
assert isinstance(estimator.coef_, np.ndarray)
assert len(estimator.coef_) == len(beta)
assert len(estimator.predict(X)) == len(y)
assert estimator.intercept_ != 0.0