Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Beef up test coverage #23

Merged
merged 7 commits into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ops/win-python-coverage.bat
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ call micromamba activate dev
if %errorlevel% neq 0 exit /b %errorlevel%
set "PYTHONPATH=./python"
set "PYTEST_TMPDIR=%WORKING_DIR%\temp"
python -m pytest --basetemp="%WORKING_DIR%\temp" --cov=tl2cgen --cov-report xml -v -rxXs --fulltrace --durations=0 tests\python
python -m pytest --basetemp="%WORKING_DIR%\temp" --cov=tl2cgen --cov-report xml -v -rxXs --fulltrace --durations=0 tests\python\test_basic.py
if %errorlevel% neq 0 exit /b %errorlevel%
2 changes: 1 addition & 1 deletion python/.pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ generated-members=np.float32,np.uintc,np.uintp,np.uint32

[MESSAGES CONTROL]
# globally disable pylint checks (comma separated)
disable=fixme,too-few-public-methods,too-many-instance-attributes,missing-raises-doc
disable=fixme,too-few-public-methods,too-many-instance-attributes,missing-raises-doc,import-outside-toplevel,duplicate-code
6 changes: 5 additions & 1 deletion tests/python/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import tempfile

import pytest
from sklearn.datasets import load_svmlight_file

import tl2cgen

Expand All @@ -14,6 +13,11 @@
@pytest.fixture(scope="session")
def annotation():
"""Pre-computed branch annotation information for example datasets"""
try:
from sklearn.datasets import load_svmlight_file
except ImportError:
pytest.skip(reason="scikit-learn is required")

with tempfile.TemporaryDirectory(dir=".") as tmpdir:

def compute_annotation(dataset):
Expand Down
71 changes: 67 additions & 4 deletions tests/python/hypothesis_util.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
"""Utility functions for hypothesis-based testing"""

# pylint: disable=differing-param-doc,missing-type-doc

from math import ceil
from sys import platform as _platform

import numpy as np
from hypothesis import assume
from hypothesis.strategies import composite, integers, just, none
from sklearn.datasets import make_classification, make_regression
from sklearn.datasets import (
make_classification,
make_multilabel_classification,
make_regression,
)

# pylint: disable=differing-param-doc,missing-type-doc


def _get_limits(strategy):
Expand Down Expand Up @@ -265,11 +270,69 @@ def standard_regression_datasets(
return X.astype(np.float32), y.astype(np.float32)


@composite
def standard_multi_target_binary_classification_datasets(
    draw,
    n_samples=integers(min_value=100, max_value=200),
    n_features=integers(min_value=100, max_value=200),
    n_targets=just(5),
    *,
    n_pos_labels=None,
    random_state=None,
):
    """
    Returns a strategy to generate datasets for multi-target binary classification,
    where each target is associated with a binary class label.
    Note:
    This function uses the sklearn.datasets.make_multilabel_classification function to
    generate the classification problem from the provided search strategies.

    Parameters
    ----------
    draw:
        Callback function, to be used internally by Hypothesis
    n_samples: SearchStrategy[int]
        A search strategy for the number of samples in the X array
    n_features: SearchStrategy[int]
        A search strategy for the total number of features in the X array
    n_targets: SearchStrategy[int], default=just(5)
        A search strategy for the number of targets in the y array.
    n_pos_labels: SearchStrategy[int], default=None
        A search strategy for the expected number of positive class labels per sample.
        If None, n_pos_labels will be set to ceil(0.4 * n_targets).
    random_state: int, RandomState instance or None, default=None
        Pass a random state or integer to determine the random number
        generation for data set generation.
    Returns
    -------
    (X, y): tuple of arrays
        The feature array and the dense binary indicator array produced by
        make_multilabel_classification, both cast to float32.
    """
    # Draw the number of targets exactly once, so that the default for
    # n_pos_labels and the n_classes argument below always agree. Drawing the
    # strategy a second time could produce a different value whenever
    # n_targets is not a constant strategy.
    n_targets_ = draw(n_targets)
    n_samples_ = draw(n_samples)
    n_features_ = draw(n_features)
    if n_pos_labels is None:
        n_pos_labels_ = int(ceil(0.4 * n_targets_))
    else:
        n_pos_labels_ = draw(n_pos_labels)

    ret = make_multilabel_classification(
        n_samples=n_samples_,
        n_features=n_features_,
        n_classes=n_targets_,
        n_labels=n_pos_labels_,
        length=50,
        allow_unlabeled=True,
        sparse=False,
        return_indicator="dense",
        return_distributions=False,
        random_state=random_state,
    )
    X, y = ret[0], ret[1]
    return X.astype(np.float32), y.astype(np.float32)


def standard_settings():
"""Default hypothesis settings. Set a smaller max_examples on Windows"""
kwargs = {
"deadline": None,
"max_examples": 20,
"max_examples": 40,
"print_blob": True,
}
if _platform == "win32":
Expand Down
214 changes: 132 additions & 82 deletions tests/python/test_lightgbm_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,44 @@
import pytest
import scipy.sparse
import treelite
from hypothesis import given, settings
from hypothesis.strategies import sampled_from
from sklearn.datasets import load_iris, load_svmlight_file
from sklearn.model_selection import train_test_split

import tl2cgen
from tl2cgen.contrib.util import _libext

from .hypothesis_util import standard_regression_datasets, standard_settings
from .metadata import (
example_model_db,
format_libpath_for_example_model,
load_example_model,
try:
import lightgbm
except ImportError:
pytest.skip("LightGBM not installed; skipping", allow_module_level=True)

try:
from hypothesis import given, settings
from hypothesis.strategies import data as hypothesis_callback
from hypothesis.strategies import integers, just, sampled_from
except ImportError:
pytest.skip("hypothesis not installed; skipping", allow_module_level=True)


try:
from sklearn.model_selection import train_test_split
except ImportError:
pytest.skip("scikit-learn not installed; skipping", allow_module_level=True)


from .hypothesis_util import (
standard_classification_datasets,
standard_regression_datasets,
standard_settings,
)
from .metadata import format_libpath_for_example_model, load_example_model
from .util import (
TemporaryDirectory,
check_predictor,
has_pandas,
os_compatible_toolchains,
os_platform,
to_categorical,
)

try:
import lightgbm
except ImportError:
# skip this test suite if LightGBM is not installed
pytest.skip("LightGBM not installed; skipping", allow_module_level=True)


def _compile_lightgbm_model( # pylint: disable=too-many-arguments
*,
Expand Down Expand Up @@ -109,28 +119,46 @@ def test_lightgbm_regression(toolchain, objective, reg_sqrt, dataset):
np.testing.assert_almost_equal(out_pred, expected_pred, decimal=4)


@pytest.mark.parametrize("toolchain", os_compatible_toolchains())
@pytest.mark.parametrize("boosting_type", ["gbdt", "rf"])
@pytest.mark.parametrize("objective", ["multiclass", "multiclassova"])
@given(
dataset=standard_classification_datasets(
n_classes=integers(min_value=3, max_value=5), n_informative=just(5)
),
objective=sampled_from(["multiclass", "multiclassova"]),
boosting_type=sampled_from(["gbdt", "rf"]),
num_boost_round=integers(min_value=5, max_value=50),
use_categorical=sampled_from([True, False]),
toolchain=sampled_from(os_compatible_toolchains()),
callback=hypothesis_callback(),
)
@settings(**standard_settings())
def test_lightgbm_multiclass_classification(
tmpdir, objective, boosting_type, toolchain
dataset,
objective,
boosting_type,
num_boost_round,
use_categorical,
toolchain,
callback,
):
# pylint: disable=too-many-locals
# pylint: disable=too-many-locals,too-many-arguments
"""Test a multi-class classifier"""
model_path = pathlib.Path(tmpdir) / "iris_lightgbm.txt"

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, shuffle=False
)
dtrain = lightgbm.Dataset(X_train, y_train, free_raw_data=False)
dtest = lightgbm.Dataset(X_test, y_test, reference=dtrain, free_raw_data=False)
X, y = dataset
num_class = np.max(y) + 1
if use_categorical:
n_categorical = callback.draw(integers(min_value=1, max_value=X.shape[1]))
df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1)
dtrain = lightgbm.Dataset(
df, label=y, free_raw_data=False, categorical_feature="auto"
)
else:
dtrain = lightgbm.Dataset(X, label=y, free_raw_data=False)
X_pred = X.copy()
param = {
"task": "train",
"boosting": boosting_type,
"objective": objective,
"metric": "multi_logloss",
"num_class": 3,
"num_class": num_class,
"num_leaves": 31,
"learning_rate": 0.05,
}
Expand All @@ -139,75 +167,97 @@ def test_lightgbm_multiclass_classification(
bst = lightgbm.train(
param,
dtrain,
num_boost_round=10,
valid_sets=[dtrain, dtest],
valid_names=["train", "test"],
)
bst.save_model(model_path)

predictor = _compile_lightgbm_model(
model_path=model_path,
libname=f"iris_{objective}",
prefix=tmpdir,
toolchain=toolchain,
params={"quantize": 1},
verbose=True,
num_boost_round=num_boost_round,
valid_sets=[dtrain],
valid_names=["train"],
)
tl_model = treelite.frontend.from_lightgbm(bst)
expected_prob = treelite.gtil.predict(tl_model, X_pred)
expected_margin = treelite.gtil.predict(tl_model, X_pred, pred_margin=True)

dmat = tl2cgen.DMatrix(X_test, dtype="float64")
out_pred = predictor.predict(dmat)
expected_pred = bst.predict(X_test).reshape((X_test.shape[0], 1, -1))
np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)
with TemporaryDirectory() as tmpdir:
libpath = pathlib.Path(tmpdir) / (f"lgb_multiclass_{objective}" + _libext())
tl2cgen.export_lib(
tl_model,
toolchain=toolchain,
libpath=libpath,
params={"parallel_comp": tl_model.num_tree, "quantize": 1},
)
predictor = tl2cgen.Predictor(libpath)
dmat = tl2cgen.DMatrix(X_pred, dtype="float64")
out_prob = predictor.predict(dmat)
out_margin = predictor.predict(dmat, pred_margin=True)
np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
np.testing.assert_almost_equal(out_margin, expected_margin, decimal=5)


@pytest.mark.parametrize("toolchain", os_compatible_toolchains())
@pytest.mark.parametrize("objective", ["binary", "xentlambda", "xentropy"])
def test_lightgbm_binary_classification(tmpdir, objective, toolchain):
# pylint: disable=too-many-locals
@given(
dataset=standard_classification_datasets(n_classes=just(2)),
objective=sampled_from(["binary", "xentlambda", "xentropy"]),
num_boost_round=integers(min_value=5, max_value=10),
use_categorical=sampled_from([True, False]),
toolchain=sampled_from(os_compatible_toolchains()),
callback=hypothesis_callback(),
)
@settings(**standard_settings())
def test_lightgbm_binary_classification(
dataset,
objective,
num_boost_round,
use_categorical,
toolchain,
callback,
):
# pylint: disable=too-many-locals,too-many-arguments
"""Test a binary classifier"""
dataset = "mushroom"
model_path = pathlib.Path(tmpdir) / "mushroom_lightgbm.txt"
dtest_path = example_model_db[dataset].dtest
X, y = dataset
if use_categorical:
n_categorical = callback.draw(integers(min_value=1, max_value=X.shape[1]))
df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1)
dtrain = lightgbm.Dataset(
df, label=y, free_raw_data=False, categorical_feature="auto"
)
else:
dtrain = lightgbm.Dataset(X, label=y, free_raw_data=False)
X_pred = X.copy()

dtrain = lightgbm.Dataset(example_model_db[dataset].dtrain)
dtest = lightgbm.Dataset(dtest_path, reference=dtrain)
param = {
params = {
"task": "train",
"boosting_type": "gbdt",
"objective": objective,
"metric": "auc",
"num_leaves": 7,
"learning_rate": 0.1,
}

bst = lightgbm.train(
param,
params,
dtrain,
num_boost_round=10,
valid_sets=[dtrain, dtest],
valid_names=["train", "test"],
)
bst.save_model(model_path)

shape = (dtest.num_data(), 1, -1)
expected_prob = bst.predict(dtest_path).reshape(shape)
expected_margin = bst.predict(dtest_path, raw_score=True).reshape(shape)

predictor = _compile_lightgbm_model(
model_path=model_path,
libname=f"agaricus_{objective}",
prefix=tmpdir,
toolchain=toolchain,
params={},
verbose=True,
)
dmat = tl2cgen.DMatrix(
load_svmlight_file(dtest_path, zero_based=True)[0], dtype="float64"
num_boost_round=num_boost_round,
valid_sets=[dtrain],
valid_names=["train"],
)

out_prob = predictor.predict(dmat)
np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
out_margin = predictor.predict(dmat, pred_margin=True)
np.testing.assert_almost_equal(out_margin, expected_margin, decimal=5)
with TemporaryDirectory() as tmpdir:
model_path = pathlib.Path(tmpdir) / f"lgb_binary_{objective}.txt"
bst.save_model(model_path)
tl_model = treelite.frontend.load_lightgbm_model(model_path)
expected_prob = treelite.gtil.predict(tl_model, X_pred)
expected_margin = treelite.gtil.predict(tl_model, X_pred, pred_margin=True)

libpath = pathlib.Path(tmpdir) / (f"lgb_binary_{objective}" + _libext())
tl2cgen.export_lib(
tl_model,
toolchain=toolchain,
libpath=libpath,
params={"parallel_comp": tl_model.num_tree},
verbose=True,
)
predictor = tl2cgen.Predictor(libpath)
dtest = tl2cgen.DMatrix(X_pred)
out_prob = predictor.predict(dtest)
out_margin = predictor.predict(dtest, pred_margin=True)
np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
np.testing.assert_almost_equal(out_margin, expected_margin, decimal=5)


@pytest.mark.parametrize("toolchain", os_compatible_toolchains())
Expand Down
Loading
Loading