Skip to content

Commit

Permalink
Beef up test coverage (#23)
Browse files Browse the repository at this point in the history
* Beef up test coverage

* Use GTIL as reference

* Test multi-target XGBoost

* Slim down tests on Windows

* Beef up LightGBM tests

* Beef up scikit-learn tests

* Gracefully handle missing optional deps
  • Loading branch information
hcho3 authored Feb 24, 2024
1 parent 69a8169 commit fc2a5a1
Show file tree
Hide file tree
Showing 8 changed files with 816 additions and 305 deletions.
2 changes: 1 addition & 1 deletion ops/win-python-coverage.bat
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ call micromamba activate dev
if %errorlevel% neq 0 exit /b %errorlevel%
set "PYTHONPATH=./python"
set "PYTEST_TMPDIR=%WORKING_DIR%\temp"
python -m pytest --basetemp="%WORKING_DIR%\temp" --cov=tl2cgen --cov-report xml -v -rxXs --fulltrace --durations=0 tests\python
python -m pytest --basetemp="%WORKING_DIR%\temp" --cov=tl2cgen --cov-report xml -v -rxXs --fulltrace --durations=0 tests\python\test_basic.py
if %errorlevel% neq 0 exit /b %errorlevel%
2 changes: 1 addition & 1 deletion python/.pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ generated-members=np.float32,np.uintc,np.uintp,np.uint32

[MESSAGES CONTROL]
# globally disable pylint checks (comma separated)
disable=fixme,too-few-public-methods,too-many-instance-attributes,missing-raises-doc
disable=fixme,too-few-public-methods,too-many-instance-attributes,missing-raises-doc,import-outside-toplevel,duplicate-code
6 changes: 5 additions & 1 deletion tests/python/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import tempfile

import pytest
from sklearn.datasets import load_svmlight_file

import tl2cgen

Expand All @@ -14,6 +13,11 @@
@pytest.fixture(scope="session")
def annotation():
"""Pre-computed branch annotation information for example datasets"""
try:
from sklearn.datasets import load_svmlight_file
except ImportError:
pytest.skip(reason="scikit-learn is required")

with tempfile.TemporaryDirectory(dir=".") as tmpdir:

def compute_annotation(dataset):
Expand Down
71 changes: 67 additions & 4 deletions tests/python/hypothesis_util.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
"""Utility functions for hypothesis-based testing"""

# pylint: disable=differing-param-doc,missing-type-doc

from math import ceil
from sys import platform as _platform

import numpy as np
from hypothesis import assume
from hypothesis.strategies import composite, integers, just, none
from sklearn.datasets import make_classification, make_regression
from sklearn.datasets import (
make_classification,
make_multilabel_classification,
make_regression,
)

# pylint: disable=differing-param-doc,missing-type-doc


def _get_limits(strategy):
Expand Down Expand Up @@ -265,11 +270,69 @@ def standard_regression_datasets(
return X.astype(np.float32), y.astype(np.float32)


@composite
def standard_multi_target_binary_classification_datasets(
    draw,
    n_samples=integers(min_value=100, max_value=200),
    n_features=integers(min_value=100, max_value=200),
    n_targets=just(5),
    *,
    n_pos_labels=None,
    random_state=None,
):
    """
    Returns a strategy to generate datasets for multi-target binary classification,
    where each target is associated with a binary class label.

    Note:
    This function uses the sklearn.datasets.make_multilabel_classification function to
    generate the multi-label classification problem from the provided search strategies.

    Parameters
    ----------
    draw:
        Callback function, to be used internally by Hypothesis
    n_samples: SearchStrategy[int]
        A search strategy for the number of samples in the X array
    n_features: SearchStrategy[int]
        A search strategy for the total number of features in the X array
    n_targets: SearchStrategy[int], default=just(5)
        A search strategy for the number of targets in the y array.
    n_pos_labels: SearchStrategy[int], default=None
        A search strategy for the expected number of positive class labels per sample.
        If None, n_pos_labels will be set to ceil(0.4 * n_targets).
    random_state: int, RandomState instance or None, default=None
        Pass a random state or integer to determine the random number
        generation for data set generation.

    Returns
    -------
    (X, y): SearchStrategy[array], SearchStrategy[array]
        A tuple of search strategies for arrays subject to the constraints of
        the provided parameters.
    """
    # Draw the number of targets exactly once. Every draw() from a strategy
    # yields an independent value, so drawing a second time for n_classes could
    # disagree with the value used to derive the default n_pos_labels below.
    n_targets_ = draw(n_targets)
    if n_pos_labels is None:
        n_pos_labels = just(int(ceil(0.4 * n_targets_)))
    ret = make_multilabel_classification(
        n_samples=draw(n_samples),
        n_features=draw(n_features),
        n_classes=n_targets_,
        n_labels=draw(n_pos_labels),
        length=50,
        allow_unlabeled=True,
        sparse=False,
        return_indicator="dense",
        return_distributions=False,
        random_state=random_state,
    )
    # With return_distributions=False only (X, y) is returned; both are cast to
    # float32 before being handed back to the caller.
    X, y = ret[0], ret[1]
    return X.astype(np.float32), y.astype(np.float32)


def standard_settings():
"""Default hypothesis settings. Set a smaller max_examples on Windows"""
kwargs = {
"deadline": None,
"max_examples": 20,
"max_examples": 40,
"print_blob": True,
}
if _platform == "win32":
Expand Down
214 changes: 132 additions & 82 deletions tests/python/test_lightgbm_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,44 @@
import pytest
import scipy.sparse
import treelite
from hypothesis import given, settings
from hypothesis.strategies import sampled_from
from sklearn.datasets import load_iris, load_svmlight_file
from sklearn.model_selection import train_test_split

import tl2cgen
from tl2cgen.contrib.util import _libext

from .hypothesis_util import standard_regression_datasets, standard_settings
from .metadata import (
example_model_db,
format_libpath_for_example_model,
load_example_model,
try:
import lightgbm
except ImportError:
pytest.skip("LightGBM not installed; skipping", allow_module_level=True)

try:
from hypothesis import given, settings
from hypothesis.strategies import data as hypothesis_callback
from hypothesis.strategies import integers, just, sampled_from
except ImportError:
pytest.skip("hypothesis not installed; skipping", allow_module_level=True)


try:
from sklearn.model_selection import train_test_split
except ImportError:
pytest.skip("scikit-learn not installed; skipping", allow_module_level=True)


from .hypothesis_util import (
standard_classification_datasets,
standard_regression_datasets,
standard_settings,
)
from .metadata import format_libpath_for_example_model, load_example_model
from .util import (
TemporaryDirectory,
check_predictor,
has_pandas,
os_compatible_toolchains,
os_platform,
to_categorical,
)

try:
import lightgbm
except ImportError:
# skip this test suite if LightGBM is not installed
pytest.skip("LightGBM not installed; skipping", allow_module_level=True)


def _compile_lightgbm_model( # pylint: disable=too-many-arguments
*,
Expand Down Expand Up @@ -109,28 +119,46 @@ def test_lightgbm_regression(toolchain, objective, reg_sqrt, dataset):
np.testing.assert_almost_equal(out_pred, expected_pred, decimal=4)


@pytest.mark.parametrize("toolchain", os_compatible_toolchains())
@pytest.mark.parametrize("boosting_type", ["gbdt", "rf"])
@pytest.mark.parametrize("objective", ["multiclass", "multiclassova"])
@given(
dataset=standard_classification_datasets(
n_classes=integers(min_value=3, max_value=5), n_informative=just(5)
),
objective=sampled_from(["multiclass", "multiclassova"]),
boosting_type=sampled_from(["gbdt", "rf"]),
num_boost_round=integers(min_value=5, max_value=50),
use_categorical=sampled_from([True, False]),
toolchain=sampled_from(os_compatible_toolchains()),
callback=hypothesis_callback(),
)
@settings(**standard_settings())
def test_lightgbm_multiclass_classification(
tmpdir, objective, boosting_type, toolchain
dataset,
objective,
boosting_type,
num_boost_round,
use_categorical,
toolchain,
callback,
):
# pylint: disable=too-many-locals
# pylint: disable=too-many-locals,too-many-arguments
"""Test a multi-class classifier"""
model_path = pathlib.Path(tmpdir) / "iris_lightgbm.txt"

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, shuffle=False
)
dtrain = lightgbm.Dataset(X_train, y_train, free_raw_data=False)
dtest = lightgbm.Dataset(X_test, y_test, reference=dtrain, free_raw_data=False)
X, y = dataset
num_class = np.max(y) + 1
if use_categorical:
n_categorical = callback.draw(integers(min_value=1, max_value=X.shape[1]))
df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1)
dtrain = lightgbm.Dataset(
df, label=y, free_raw_data=False, categorical_feature="auto"
)
else:
dtrain = lightgbm.Dataset(X, label=y, free_raw_data=False)
X_pred = X.copy()
param = {
"task": "train",
"boosting": boosting_type,
"objective": objective,
"metric": "multi_logloss",
"num_class": 3,
"num_class": num_class,
"num_leaves": 31,
"learning_rate": 0.05,
}
Expand All @@ -139,75 +167,97 @@ def test_lightgbm_multiclass_classification(
bst = lightgbm.train(
param,
dtrain,
num_boost_round=10,
valid_sets=[dtrain, dtest],
valid_names=["train", "test"],
)
bst.save_model(model_path)

predictor = _compile_lightgbm_model(
model_path=model_path,
libname=f"iris_{objective}",
prefix=tmpdir,
toolchain=toolchain,
params={"quantize": 1},
verbose=True,
num_boost_round=num_boost_round,
valid_sets=[dtrain],
valid_names=["train"],
)
tl_model = treelite.frontend.from_lightgbm(bst)
expected_prob = treelite.gtil.predict(tl_model, X_pred)
expected_margin = treelite.gtil.predict(tl_model, X_pred, pred_margin=True)

dmat = tl2cgen.DMatrix(X_test, dtype="float64")
out_pred = predictor.predict(dmat)
expected_pred = bst.predict(X_test).reshape((X_test.shape[0], 1, -1))
np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)
with TemporaryDirectory() as tmpdir:
libpath = pathlib.Path(tmpdir) / (f"lgb_multiclass_{objective}" + _libext())
tl2cgen.export_lib(
tl_model,
toolchain=toolchain,
libpath=libpath,
params={"parallel_comp": tl_model.num_tree, "quantize": 1},
)
predictor = tl2cgen.Predictor(libpath)
dmat = tl2cgen.DMatrix(X_pred, dtype="float64")
out_prob = predictor.predict(dmat)
out_margin = predictor.predict(dmat, pred_margin=True)
np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
np.testing.assert_almost_equal(out_margin, expected_margin, decimal=5)


@pytest.mark.parametrize("toolchain", os_compatible_toolchains())
@pytest.mark.parametrize("objective", ["binary", "xentlambda", "xentropy"])
def test_lightgbm_binary_classification(tmpdir, objective, toolchain):
# pylint: disable=too-many-locals
@given(
dataset=standard_classification_datasets(n_classes=just(2)),
objective=sampled_from(["binary", "xentlambda", "xentropy"]),
num_boost_round=integers(min_value=5, max_value=10),
use_categorical=sampled_from([True, False]),
toolchain=sampled_from(os_compatible_toolchains()),
callback=hypothesis_callback(),
)
@settings(**standard_settings())
def test_lightgbm_binary_classification(
dataset,
objective,
num_boost_round,
use_categorical,
toolchain,
callback,
):
# pylint: disable=too-many-locals,too-many-arguments
"""Test a binary classifier"""
dataset = "mushroom"
model_path = pathlib.Path(tmpdir) / "mushroom_lightgbm.txt"
dtest_path = example_model_db[dataset].dtest
X, y = dataset
if use_categorical:
n_categorical = callback.draw(integers(min_value=1, max_value=X.shape[1]))
df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1)
dtrain = lightgbm.Dataset(
df, label=y, free_raw_data=False, categorical_feature="auto"
)
else:
dtrain = lightgbm.Dataset(X, label=y, free_raw_data=False)
X_pred = X.copy()

dtrain = lightgbm.Dataset(example_model_db[dataset].dtrain)
dtest = lightgbm.Dataset(dtest_path, reference=dtrain)
param = {
params = {
"task": "train",
"boosting_type": "gbdt",
"objective": objective,
"metric": "auc",
"num_leaves": 7,
"learning_rate": 0.1,
}

bst = lightgbm.train(
param,
params,
dtrain,
num_boost_round=10,
valid_sets=[dtrain, dtest],
valid_names=["train", "test"],
)
bst.save_model(model_path)

shape = (dtest.num_data(), 1, -1)
expected_prob = bst.predict(dtest_path).reshape(shape)
expected_margin = bst.predict(dtest_path, raw_score=True).reshape(shape)

predictor = _compile_lightgbm_model(
model_path=model_path,
libname=f"agaricus_{objective}",
prefix=tmpdir,
toolchain=toolchain,
params={},
verbose=True,
)
dmat = tl2cgen.DMatrix(
load_svmlight_file(dtest_path, zero_based=True)[0], dtype="float64"
num_boost_round=num_boost_round,
valid_sets=[dtrain],
valid_names=["train"],
)

out_prob = predictor.predict(dmat)
np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
out_margin = predictor.predict(dmat, pred_margin=True)
np.testing.assert_almost_equal(out_margin, expected_margin, decimal=5)
with TemporaryDirectory() as tmpdir:
model_path = pathlib.Path(tmpdir) / f"lgb_binary_{objective}.txt"
bst.save_model(model_path)
tl_model = treelite.frontend.load_lightgbm_model(model_path)
expected_prob = treelite.gtil.predict(tl_model, X_pred)
expected_margin = treelite.gtil.predict(tl_model, X_pred, pred_margin=True)

libpath = pathlib.Path(tmpdir) / (f"lgb_binary_{objective}" + _libext())
tl2cgen.export_lib(
tl_model,
toolchain=toolchain,
libpath=libpath,
params={"parallel_comp": tl_model.num_tree},
verbose=True,
)
predictor = tl2cgen.Predictor(libpath)
dtest = tl2cgen.DMatrix(X_pred)
out_prob = predictor.predict(dtest)
out_margin = predictor.predict(dtest, pred_margin=True)
np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
np.testing.assert_almost_equal(out_margin, expected_margin, decimal=5)


@pytest.mark.parametrize("toolchain", os_compatible_toolchains())
Expand Down
Loading

0 comments on commit fc2a5a1

Please sign in to comment.