Merge pull request #66 from lbluque/tests
unit tests
lbluque authored Feb 7, 2023
2 parents 33ad1e6 + 19f18e7 commit 83ec2aa
Showing 16 changed files with 901 additions and 64 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -30,7 +30,7 @@ repos:
exclude: README.md

- repo: https://github.com/pycqa/isort
- rev: 5.11.4
+ rev: 5.12.0
hooks:
- id: isort
name: isort (python)
5 changes: 1 addition & 4 deletions README.md
@@ -7,9 +7,6 @@ Sparse Linear Regression Models
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/CederGroupHub/sparse-lm/main.svg)](https://results.pre-commit.ci/latest/github/CederGroupHub/sparse-lm/main)
[![pypi version](https://img.shields.io/pypi/v/sparse-lm?color=blue)](https://pypi.org/project/sparse-lm)

- > :warning: this package is currently largely lacking in unit-tests.
- > Use at your own risk!
**sparse-lm** includes several regularized regression estimators that are absent in the
`sklearn.linear_model` module. The estimators in **sparse-lm** are designed to fit right into
[scikit-learn](https://scikit-learn.org/stable/index.html) by inheriting from their base
@@ -42,7 +39,7 @@ model like you would any linear model in **scikit-learn**:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
- from src.sparselm import AdaptiveLasso
+ from sparselm import AdaptiveLasso

X, y = make_regression(n_samples=200, n_features=5000, random_state=0)
alasso = AdaptiveLasso(fit_intercept=False)
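The README snippet above is truncated by the diff view. A plausible continuation, consistent with the GridSearchCV import (the parameter grid and cv value are illustrative, not taken from the README):

param_grid = {"alpha": np.logspace(-5, 1, 10)}
cvsearch = GridSearchCV(alasso, param_grid, cv=5)
cvsearch.fit(X, y)
print(cvsearch.best_params_)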
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -31,7 +31,7 @@ classifiers = [

[project.optional-dependencies]
dev = ["pre-commit", "black", "isort", "flake8", "pylint", "pydocstyle", "flake8-pyproject"]
- tests = ["pytest >=7.2.0", "pytest-cov >=4.0.0", "coverage"]
+ tests = ["pytest >=7.2.0", "pytest-cov >=4.0.0", "coverage", "gurobipy"]
docs = ["sphinx>=5.3", "furo", "m2r2"]
optional = ["gurobipy"]

@@ -72,3 +72,7 @@ skip = "*.c,./.*"
count = ''
quiet-level = 3
ignore-words-list = ['nd', 'tread']

+ [tool.coverage.run]
+ source = ["src/sparselm"]
+ omit = ["*/__init__.py"]
2 changes: 1 addition & 1 deletion src/sparselm/model/_base.py
@@ -112,7 +112,7 @@ def fit(
)

if sample_weight is not None:
- X, y = _rescale_data(X, y, sample_weight)
+ X, y, _ = _rescale_data(X, y, sample_weight)

self.coef_ = self._solve(X, y, *args, **kwargs)
self._set_intercept(X_offset, y_offset, X_scale)
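For context: in newer scikit-learn versions _rescale_data returns the square-root sample weights as a third value, hence the added underscore. The rescaling itself is equivalent to this sketch (an illustration of the behavior, not scikit-learn's actual private code):

import numpy as np

def rescale_data(X, y, sample_weight):
    # Scale rows by sqrt(w) so that the ordinary least-squares objective
    # on the rescaled data equals the weighted objective sum_i w_i * r_i**2.
    sqrt_sw = np.sqrt(sample_weight)
    return X * sqrt_sw[:, None], y * sqrt_sw, sqrt_sw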
7 changes: 4 additions & 3 deletions src/sparselm/model/_lasso.py
@@ -155,15 +155,16 @@ def __init__(
See docs in CVXEstimator for more information.
"""
self.groups = np.asarray(groups)
- self.group_masks = [self.groups == i for i in np.unique(groups)]
+ group_ids = np.sort(np.unique(groups))
+ self.group_masks = [self.groups == i for i in group_ids]
self.standardize = standardize
self._group_norms = None

if group_weights is not None:
- if len(group_weights) != len(self.group_masks):
+ if len(group_weights) != len(group_ids):
raise ValueError(
"group_weights must be the same length as the number of "
f"groups: {len(group_weights)} != {len(self.group_masks)}"
f"unique groups: {len(group_weights)} != {len(group_ids)}"
)
self.group_weights = (
group_weights
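The fix matters when group labels are not the contiguous range 0..k-1. A minimal sketch of the corrected mask construction, with made-up labels:

import numpy as np

groups = np.array([3, 3, 7, 1, 7])      # labels need not be contiguous or start at 0
group_ids = np.sort(np.unique(groups))  # array([1, 3, 7])
group_masks = [groups == i for i in group_ids]
# one boolean mask per unique group; group_weights must then have length 3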
8 changes: 5 additions & 3 deletions src/sparselm/model/_miqp/_base.py
@@ -116,18 +116,20 @@ def _gen_constraints(self, X, y):
constraints = []
for i, mask in enumerate(self._group_masks):
constraints += [
- self._big_M * self._z0[i] >= self._beta[mask],
- self._big_M * self._z0[i] >= -self._beta[mask],
+ self._beta[mask] <= self._big_M * self._z0[i],
+ -self._big_M * self._z0[i] <= self._beta[mask],
]

if self.hierarchy is not None:
constraints += self._gen_hierarchy_constraints()

return constraints

def _gen_hierarchy_constraints(self):
"""Generate single feature hierarchy constraints."""
+ group_ids = np.sort(np.unique(self.groups))
return [
self._z0[high_id] <= self._z0[sub_id]
- for high_id, sub_ids in enumerate(self.hierarchy)
+ for high_id, sub_ids in zip(group_ids, self.hierarchy)
for sub_id in sub_ids
]
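Both the old and the rewritten inequalities encode the standard big-M switch: when a group's binary indicator is 0, every coefficient in the group is forced to 0; when it is 1, each coefficient may range within [-M, M], provided big_M truly bounds the optimal coefficients. A minimal standalone cvxpy sketch of the device (not the estimator's actual code):

import cvxpy as cp

M = 100                         # big-M; must upper-bound the true coefficient magnitudes
beta = cp.Variable(3)           # coefficients of a single group
z0 = cp.Variable(boolean=True)  # on/off indicator for the group
constraints = [beta <= M * z0, -M * z0 <= beta]
# z0 == 0 forces beta == 0; z0 == 1 relaxes the bound to |beta_i| <= M

The hierarchy constraint z0[high_id] <= z0[sub_id] likewise states that a group can be active only if every group it depends on is active.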
6 changes: 4 additions & 2 deletions src/sparselm/model/_miqp/_best_subset.py
@@ -25,7 +25,7 @@ def __init__(
self,
groups,
sparse_bound,
- big_M=1000,
+ big_M=100,
hierarchy=None,
ignore_psd_check=True,
fit_intercept=False,
@@ -89,6 +89,8 @@ def __init__(
solver=solver,
solver_options=solver_options,
)
+ if sparse_bound <= 0:
+     raise ValueError("sparse_bound must be > 0")

self._bound = cp.Parameter(nonneg=True, value=sparse_bound)

@@ -119,7 +121,7 @@ def __init__(
groups,
sparse_bound,
eta=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
tikhonov_w=None,
ignore_psd_check=True,
16 changes: 8 additions & 8 deletions src/sparselm/model/_miqp/_regularized_l0.py
@@ -49,7 +49,7 @@ def __init__(
self,
groups,
alpha=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
ignore_psd_check=True,
fit_intercept=False,
@@ -112,22 +112,22 @@ def __init__(
solver=solver,
solver_options=solver_options,
)
- self._eta = cp.Parameter(nonneg=True, value=alpha)
+ self._alpha = cp.Parameter(nonneg=True, value=alpha)

@property
def alpha(self):
"""Get alpha hyperparameter value."""
- return self._eta.value
+ return self._alpha.value

@alpha.setter
def alpha(self, val):
"""Set alpha hyperparameter value."""
- self._eta.value = val
+ self._alpha.value = val

def _gen_objective(self, X, y):
"""Generate the quadratic form and l0 regularization portion of objective."""
c0 = 2 * X.shape[0] # keeps hyperparameter scale independent
- objective = super()._gen_objective(X, y) + c0 * self._eta * cp.sum(self._z0)
+ objective = super()._gen_objective(X, y) + c0 * self._alpha * cp.sum(self._z0)
return objective


@@ -139,7 +139,7 @@ def __init__(
groups,
alpha=1.0,
eta=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
ignore_psd_check=True,
fit_intercept=False,
@@ -250,7 +250,7 @@ def __init__(
groups,
alpha=1.0,
eta=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
ignore_psd_check=True,
fit_intercept=False,
@@ -364,7 +364,7 @@ def __init__(
groups,
alpha=1.0,
eta=1.0,
- big_M=1000,
+ big_M=100,
hierarchy=None,
tikhonov_w=None,
ignore_psd_check=True,
14 changes: 10 additions & 4 deletions src/sparselm/tools.py
@@ -16,9 +16,9 @@ def constrain_coefficients(indices, high=None, low=None):
and below the supplied value.
Use this as a standard decorator with parameters:
- - At runtime: coefs = constrain_dielectric(indices, high, low)(fit_method)(X, y)
+ - At runtime: coefs = constrain_coefficients(indices, high, low)(fit_method)(X, y)
- In fit_method definitions:
- @constrain_dielectric(indices, high, low)
+ @constrain_coefficients(indices, high, low)
def your_fit_method(X, y):
Args:
@@ -32,12 +32,18 @@ def your_fit_method(X, y):
indices = np.array(indices)
if high is not None:
high = (
- high * np.ones(len(indices)) if isinstance(high, float) else np.array(high)
+ high * np.ones(len(indices))
+ if isinstance(high, (int, float))
+ else np.array(high)
)
else:
high = np.inf * np.ones(len(indices))
if low is not None:
- low = low * np.ones(len(indices)) if isinstance(low, float) else np.array(low)
+ low = (
+     low * np.ones(len(indices))
+     if isinstance(low, (int, float))
+     else np.array(low)
+ )
else:
low = -np.inf * np.ones(len(indices))

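A short usage sketch of the decorator pattern documented in the docstring above; the data and the wrapped fit function are illustrative, only constrain_coefficients itself comes from the module:

import numpy as np
from sparselm.tools import constrain_coefficients

X = np.random.rand(20, 5)
y = X @ np.array([2.0, -0.5, 3.0, 0.1, 0.0])

@constrain_coefficients(indices=[0, 2], high=1.0, low=-1.0)
def fit_ols(X, y):
    # any fit method that returns a coefficient array can be wrapped
    return np.linalg.lstsq(X, y, rcond=None)[0]

coefs = fit_ols(X, y)  # coefficients 0 and 2 are kept within [-1.0, 1.0]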
105 changes: 91 additions & 14 deletions tests/conftest.py
@@ -1,29 +1,106 @@
import numpy as np
import pytest
from sklearn.datasets import make_regression, make_sparse_coded_signal

SEED = 0

# A few solvers to test for convex problems
# ECOS sometimes fails for Adaptive group estimators, but is fast
# SCS and CVXOPT are reliable, but slower
# GUROBI is best
CONVEX_SOLVERS = ["GUROBI", "SCS", "CVXOPT"]

# ECOS_BB is an open-source alternative, but much slower, and can get things wrong
MIQP_SOLVERS = ["GUROBI"]

# Set to small values because the non-commercial Gurobi license cannot solve large models.
- n_features = 30
- n_samples = 40
- n_nonzeros = 20
N_FEATURES = [20, 30] # an overdetermined and underdetermined case
N_SAMPLES = 25
N_INFORMATIVE = 10


@pytest.fixture(scope="package")
- def random_model():
-     femat = np.random.rand(n_samples, n_features)
-     ecis = np.zeros(n_features)
-     non_zero_ids = np.random.choice(n_features, size=n_nonzeros, replace=False)
def rng():
"""Seed and return an RNG for test reproducibility"""
return np.random.default_rng(SEED)


@pytest.fixture(params=CONVEX_SOLVERS)
def solver(request):
return request.param


@pytest.fixture(params=MIQP_SOLVERS)
def miqp_solver(request):
return request.param


@pytest.fixture(scope="package", params=N_FEATURES)
def random_model(rng, request):
"""Returns a fully random set of X, y, and beta representing a linear model."""
X, y, beta = make_regression(
n_samples=N_SAMPLES,
n_features=request.param,
n_informative=N_INFORMATIVE,
coef=True,
random_state=rng.integers(0, 2**32 - 1),
bias=10 * rng.random(),
)
return X, y, beta


@pytest.fixture(scope="package", params=N_FEATURES)
def random_energy_model(rng, request):
"""Returns a random set of X, y, and beta with added gaussian noise for a linear
model with sparse coefficients beta decay (on average) exponentially with the index
of the coefficient.
"""
X = rng.random((N_SAMPLES, request.param))
beta = np.zeros(request.param) # coefficients
non_zero_ids = rng.choice(request.param, size=N_INFORMATIVE, replace=False)
non_zero_ids = np.array(np.round(non_zero_ids), dtype=int)

for idx in non_zero_ids:
eci = 0
mag = np.exp(-0.5 * idx)
while np.isclose(eci, 0):
- eci = (np.random.random() - 0.5) * 2 * mag
- ecis[idx] = eci
- energies = femat @ ecis + np.random.normal(size=n_samples) * 2e-3
- return femat, energies, ecis
eci = (rng.random() - 0.5) * 2 * mag
beta[idx] = eci
y = X @ beta + rng.normal(size=N_SAMPLES) * 2e-3 # fake energies
return X, y, beta


@pytest.fixture(scope="package")
- def random_weights():
-     weights = 1000 * np.random.rand(n_features)
-     return np.diag(weights)
def sparse_coded_signal(rng):
n_samples, n_features, n_nonzero = 10, 30, 5
y, X, beta = make_sparse_coded_signal(
n_samples=n_samples,
n_components=n_features,
n_features=n_samples,
n_nonzero_coefs=n_nonzero,
random_state=rng.integers(0, 2**32 - 1),
data_transposed=True,
)
# Make X not of norm 1 for testing
X *= 10
y *= 10
return X, y[:, 0], beta[:, 0]


# TODO: this fixture sometimes causes tests to fail because it may leave some groups with no features assigned
@pytest.fixture(params=[4, 6], scope="package")
def random_model_with_groups(random_model, rng, request):
"""Add a correct set of groups to model."""
X, y, beta = random_model
coef_mask = abs(beta) > 0
n_groups = request.param
n_active_groups = n_groups // 3 + 1
active_group_inds = rng.choice(range(n_groups), size=n_active_groups, replace=False)
inactive_group_inds = np.setdiff1d(range(n_groups), active_group_inds)

groups = np.zeros(len(beta), dtype=int)
for i, c in enumerate(coef_mask):
groups[i] = (
rng.choice(active_group_inds) if c else rng.choice(inactive_group_inds)
)
return X, y, beta, groups
50 changes: 50 additions & 0 deletions tests/test_all_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""General tests for all linear models.
Simply check that they execute successfully on random data.
"""

from inspect import getmembers, isclass, signature

import numpy as np
import pytest

import sparselm.model as spm

ALL_ESTIMATORS = getmembers(spm, isclass)


@pytest.mark.parametrize("estimator_cls", ALL_ESTIMATORS)
def test_general_fit(estimator_cls, random_model, rng):
print(f"\nGeneral test of {estimator_cls[0]}.")
X, y, beta = random_model

# instantiate the estimator
sig = signature(estimator_cls[1])

# check for necessary parameters
args = {}
if "groups" in sig.parameters:
args["groups"] = rng.integers(0, 5, size=len(beta))
if "group_list" in sig.parameters:
args["group_list"] = [
rng.choice(range(5), replace=False, size=rng.integers(1, 5))
for _ in range(len(beta))
]
if "sparse_bound" in sig.parameters:
args["sparse_bound"] = 12

estimator = estimator_cls[1](**args)
estimator.fit(X, y)
# assert a value of coefficients has been set correctly
assert isinstance(estimator.coef_, np.ndarray)
assert len(estimator.coef_) == len(beta)
assert len(estimator.predict(X)) == len(y)
assert estimator.intercept_ == 0.0

estimator = estimator_cls[1](fit_intercept=True, **args)
estimator.fit(X, y)
# assert a value of coefficients has been set correctly
assert isinstance(estimator.coef_, np.ndarray)
assert len(estimator.coef_) == len(beta)
assert len(estimator.predict(X)) == len(y)
assert estimator.intercept_ != 0.0