Beef up test coverage
hcho3 committed Feb 22, 2024
1 parent 69a8169 · commit 5de5a9f
Showing 3 changed files with 148 additions and 27 deletions.
2 changes: 1 addition & 1 deletion python/.pylintrc
@@ -25,4 +25,4 @@ generated-members=np.float32,np.uintc,np.uintp,np.uint32

[MESSAGES CONTROL]
# globally disable pylint checks (comma separated)
disable=fixme,too-few-public-methods,too-many-instance-attributes,missing-raises-doc
disable=fixme,too-few-public-methods,too-many-instance-attributes,missing-raises-doc,import-outside-toplevel
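
The new import-outside-toplevel suppression presumably covers the lazy "import pandas as pd" inside to_categorical() in tests/python/util.py below. A minimal sketch of the pattern that pylint would otherwise flag (the function name here is illustrative, not part of the commit):

    # Sketch only: a lazy import keeps pandas an optional test dependency.
    # Without the pylintrc change, pylint would emit import-outside-toplevel here.
    def needs_pandas():
        import pandas as pd  # imported at call time, not at module load time
        return pd.DataFrame({"x": [1, 2, 3]})
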
88 changes: 63 additions & 25 deletions tests/python/test_xgboost_integration.py
@@ -2,11 +2,13 @@

# pylint: disable=R0201, R0915
import os
import pathlib

import numpy as np
import pytest
import treelite
from hypothesis import assume, given, settings
from hypothesis.strategies import data as hypothesis_callback
from hypothesis.strategies import integers, sampled_from
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
@@ -16,7 +18,12 @@

from .hypothesis_util import standard_regression_datasets, standard_settings
from .metadata import format_libpath_for_example_model, load_example_model
from .util import TemporaryDirectory, check_predictor, os_compatible_toolchains
from .util import (
TemporaryDirectory,
check_predictor,
os_compatible_toolchains,
to_categorical,
)

try:
import xgboost as xgb
@@ -25,63 +32,92 @@
pytest.skip("XGBoost not installed; skipping", allow_module_level=True)


def generate_data_for_squared_log_error(n_targets: int = 1):
"""Generate data containing outliers."""
n_rows = 4096
n_cols = 16

outlier_mean = 10000 # mean of generated outliers
n_outliers = 64

X = np.random.randn(n_rows, n_cols)
y = np.random.randn(n_rows, n_targets)
y += np.abs(np.min(y, axis=0))

# Create outliers
for _ in range(0, n_outliers):
ind = np.random.randint(0, len(y) - 1)
y[ind, :] += np.random.randint(0, outlier_mean)

# rmsle requires all labels to be greater than -1.
assert np.all(y > -1.0)

return X, y


@given(
toolchain=sampled_from(os_compatible_toolchains()),
objective=sampled_from(
[
"reg:linear",
"reg:squarederror",
"reg:squaredlogerror",
"reg:pseudohubererror",
]
),
model_format=sampled_from(["binary", "json"]),
num_parallel_tree=integers(min_value=1, max_value=10),
dataset=standard_regression_datasets(),
num_boost_round=integers(min_value=3, max_value=10),
num_parallel_tree=integers(min_value=1, max_value=3),
callback=hypothesis_callback(),
)
@settings(**standard_settings())
def test_xgb_regression(toolchain, objective, model_format, num_parallel_tree, dataset):
def test_xgb_regression(
toolchain, objective, num_boost_round, num_parallel_tree, callback
):
# pylint: disable=too-many-locals
"""Test a random regression dataset"""
X, y = dataset
if objective == "reg:squaredlogerror":
assume(np.all(y > -1))
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, shuffle=False
)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
X, y = generate_data_for_squared_log_error()
use_categorical = False
else:
X, y = callback.draw(standard_regression_datasets())
use_categorical = callback.draw(sampled_from([True, False]))
if use_categorical:
n_categorical = callback.draw(sampled_from([3, 5, X.shape[1]]))
assume(n_categorical <= X.shape[1])
df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1)
dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)
model_format = "json"
else:
dtrain = xgb.DMatrix(X, label=y)
X_pred = X.copy()
model_format = callback.draw(sampled_from(["json", "legacy_binary"]))
param = {
"max_depth": 8,
"eta": 1,
"eta": 0.1,
"verbosity": 0,
"objective": objective,
"num_parallel_tree": num_parallel_tree,
}
num_round = 10
bst = xgb.train(
param,
dtrain,
num_boost_round=num_round,
evals=[(dtrain, "train"), (dtest, "test")],
num_boost_round=num_boost_round,
evals=[(dtrain, "train")],
)
with TemporaryDirectory() as tmpdir:
if model_format == "json":
model_name = "regression.json"
model_path = os.path.join(tmpdir, model_name)
model_path = pathlib.Path(tmpdir) / "regression.json"
bst.save_model(model_path)
model = treelite.frontend.load_xgboost_model(filename=model_path)
else:
model_name = "regression.model"
model_path = os.path.join(tmpdir, model_name)
model_path = pathlib.Path(tmpdir) / "regression.model"
bst.save_model(model_path)
model = treelite.frontend.load_xgboost_model_legacy_binary(
filename=model_path
)

assert model.num_feature == dtrain.num_col()
assert model.num_tree == num_round * num_parallel_tree
libpath = os.path.join(tmpdir, "regression" + _libext())
assert model.num_tree == num_boost_round * num_parallel_tree
libpath = pathlib.Path(tmpdir) / ("regression" + _libext())
tl2cgen.export_lib(
model,
toolchain=toolchain,
@@ -94,9 +130,11 @@ def test_xgb_regression(toolchain, objective, model_format, num_parallel_tree, d
assert predictor.num_feature == dtrain.num_col()
assert predictor.num_target == 1
np.testing.assert_array_equal(predictor.num_class, [1])
dmat = tl2cgen.DMatrix(X_test, dtype="float32")
dmat = tl2cgen.DMatrix(X_pred, dtype="float32")
out_pred = predictor.predict(dmat)
expected_pred = bst.predict(dtest, strict_shape=True)[:, :, np.newaxis]
expected_pred = bst.predict(
xgb.DMatrix(X_pred), strict_shape=True, validate_features=False
)[:, :, np.newaxis]
np.testing.assert_almost_equal(out_pred, expected_pred, decimal=3)

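A minimal sketch (not part of this commit) of the hypothesis.strategies.data() pattern the test now relies on; hypothesis_callback above is just an alias for data, and it lets draws made inside the test body depend on earlier values, e.g. only drawing a dataset for objectives other than "reg:squaredlogerror":

    from hypothesis import given
    from hypothesis.strategies import data, integers, sampled_from

    @given(use_flag=sampled_from([True, False]), callback=data())
    def test_conditional_draw(use_flag, callback):
        # Draw lazily inside the test body; this draw only happens when
        # use_flag is True, so later draws can depend on earlier ones.
        if use_flag:
            n = callback.draw(integers(min_value=1, max_value=5))
            assert 1 <= n <= 5
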

85 changes: 84 additions & 1 deletion tests/python/util.py
@@ -1,10 +1,12 @@
# -*- coding: utf-8 -*-
"""Utility functions for tests"""
import ctypes
import os
import tempfile
from contextlib import contextmanager
from math import ceil
from sys import platform as _platform
from typing import Iterator, List
from typing import Any, Iterator, List, Optional, Tuple, Union

import numpy as np
import treelite
@@ -100,3 +102,84 @@ def TemporaryDirectory(*args, **kwargs) -> Iterator[str]:
except (PermissionError, NotADirectoryError):
if _platform != "win32":
raise


# pylint: disable=R0914
def to_categorical(
features: np.ndarray,
*,
n_categorical: int,
invalid_frac: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState]] = None,
) -> Tuple[Any, np.ndarray]:
"""Generate a Pandas DataFrame with a mix of numerical and categorical columns.
Also returns a NumPy array whose value is equal to the DataFrame, except for
a given fraction of elements (they will be replaced with invalid categorical values).
Credit: Levs Dolgovs (@levsnv)
https://github.com/rapidsai/cuml/blob/3bab4d1/python/cuml/tests/test_fil.py#L641
Returns
-------
generated_df:
Generated DataFrame
generated_array:
NumPy array whose value is equal to generated_df, except with a fraction of elements
replaced with an invalid categorical value.
Parameters
----------
features:
A 2D NumPy array, to be used to generate the DataFrame. Use
:py:meth:`~sklearn.datasets.make_regression` or
:py:meth:`~sklearn.datasets.make_classification` to randomly generate this array.
n_categorical:
Number of categorical columns in the generated DataFrame.
invalid_frac:
Fraction of elements in the categorical columns that will be replaced with invalid
categorical values. The replacement will occur only in generated_array, not generated_df.
random_state:
Random seed or RandomState object, to control the behavior of randomness
"""
if not has_pandas():
raise RuntimeError("Pandas is required for to_categorical()")
import pandas as pd

features = features.copy() # Avoid clobbering source matrix
seed = ctypes.c_size_t(hash(random_state)).value # Allow RandomState object
rng = np.random.default_rng(seed=seed)
n_features = features.shape[1]
# First n_categorical columns will be converted into categorical columns.
cat_cols = features[:, :n_categorical]
# Normalize the columns to fit range [0, 1]
cat_cols = cat_cols - cat_cols.min(axis=0, keepdims=True)
cat_cols /= cat_cols.max(axis=0, keepdims=True)
# Bin the columns into 100 bins to obtain categorical values
rough_n_categories = 100
cat_cols = (cat_cols * rough_n_categories).astype(int)

# Mix categorical and numerical columns in a random order
new_col_idx = rng.choice(n_features, n_features, replace=False, shuffle=True)
df_cols = {}
for icol in range(n_categorical):
col = cat_cols[:, icol]
df_cols[new_col_idx[icol]] = pd.Series(
pd.Categorical(col, categories=np.unique(col))
)
for icol in range(n_categorical, n_features):
df_cols[new_col_idx[icol]] = pd.Series(features[:, icol])
generated_df = pd.DataFrame(df_cols)
generated_df.sort_index(axis=1, inplace=True)

# Randomly inject invalid categories
invalid_idx = rng.choice(
a=cat_cols.size,
size=ceil(cat_cols.size * invalid_frac),
replace=False,
shuffle=False,
)
cat_cols.flat[invalid_idx] += 1000
generated_array = np.concatenate([cat_cols, features[:, n_categorical:]], axis=1)
generated_array[:, new_col_idx] = generated_array

return generated_df, generated_array
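
A minimal usage sketch for to_categorical(), not part of the commit; it assumes pandas and scikit-learn are installed, and the variable names are illustrative. Per the docstring, the returned array mirrors the DataFrame except for the injected invalid categories:

    import numpy as np
    from sklearn.datasets import make_regression

    X, _ = make_regression(n_samples=200, n_features=8, random_state=0)
    df, X_pred = to_categorical(X, n_categorical=3, invalid_frac=0.1, random_state=0)
    assert df.shape == X.shape               # DataFrame keeps the input layout
    assert isinstance(X_pred, np.ndarray)    # array matches df, plus invalid categories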
