Beef up test coverage
hcho3 committed Feb 22, 2024
1 parent 69a8169 · commit 5de5a9f
Showing 3 changed files with 148 additions and 27 deletions.
2 changes: 1 addition & 1 deletion python/.pylintrc
@@ -25,4 +25,4 @@ generated-members=np.float32,np.uintc,np.uintp,np.uint32

[MESSAGES CONTROL]
# globally disable pylint checks (comma separated)
disable=fixme,too-few-public-methods,too-many-instance-attributes,missing-raises-doc
disable=fixme,too-few-public-methods,too-many-instance-attributes,missing-raises-doc,import-outside-toplevel
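
The new import-outside-toplevel suppression presumably covers the lazy "import pandas as pd" inside to_categorical() in tests/python/util.py below. A minimal sketch of the pattern that pylint would otherwise flag (the function name here is illustrative, not part of the commit):

    # Sketch only: a lazy import keeps pandas an optional test dependency.
    # Without the pylintrc change, pylint would emit import-outside-toplevel here.
    def needs_pandas():
        import pandas as pd  # imported at call time, not at module load time
        return pd.DataFrame({"x": [1, 2, 3]})
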
88 changes: 63 additions & 25 deletions tests/python/test_xgboost_integration.py
@@ -2,11 +2,13 @@

# pylint: disable=R0201, R0915
import os
import pathlib

import numpy as np
import pytest
import treelite
from hypothesis import assume, given, settings
from hypothesis.strategies import data as hypothesis_callback
from hypothesis.strategies import integers, sampled_from
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
@@ -16,7 +18,12 @@

from .hypothesis_util import standard_regression_datasets, standard_settings
from .metadata import format_libpath_for_example_model, load_example_model
from .util import TemporaryDirectory, check_predictor, os_compatible_toolchains
from .util import (
TemporaryDirectory,
check_predictor,
os_compatible_toolchains,
to_categorical,
)

try:
import xgboost as xgb
@@ -25,63 +32,92 @@
pytest.skip("XGBoost not installed; skipping", allow_module_level=True)


def generate_data_for_squared_log_error(n_targets: int = 1):
"""Generate data containing outliers."""
n_rows = 4096
n_cols = 16

outlier_mean = 10000 # mean of generated outliers
n_outliers = 64

X = np.random.randn(n_rows, n_cols)
y = np.random.randn(n_rows, n_targets)
y += np.abs(np.min(y, axis=0))

# Create outliers
for _ in range(0, n_outliers):
ind = np.random.randint(0, len(y) - 1)
y[ind, :] += np.random.randint(0, outlier_mean)

# rmsle requires all labels to be greater than -1.
assert np.all(y > -1.0)

return X, y


@given(
toolchain=sampled_from(os_compatible_toolchains()),
objective=sampled_from(
[
"reg:linear",
"reg:squarederror",
"reg:squaredlogerror",
"reg:pseudohubererror",
]
),
model_format=sampled_from(["binary", "json"]),
num_parallel_tree=integers(min_value=1, max_value=10),
dataset=standard_regression_datasets(),
num_boost_round=integers(min_value=3, max_value=10),
num_parallel_tree=integers(min_value=1, max_value=3),
callback=hypothesis_callback(),
)
@settings(**standard_settings())
def test_xgb_regression(toolchain, objective, model_format, num_parallel_tree, dataset):
def test_xgb_regression(
toolchain, objective, num_boost_round, num_parallel_tree, callback
):
# pylint: disable=too-many-locals
"""Test a random regression dataset"""
X, y = dataset
if objective == "reg:squaredlogerror":
assume(np.all(y > -1))
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, shuffle=False
)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
X, y = generate_data_for_squared_log_error()
use_categorical = False
else:
X, y = callback.draw(standard_regression_datasets())
use_categorical = callback.draw(sampled_from([True, False]))
if use_categorical:
n_categorical = callback.draw(sampled_from([3, 5, X.shape[1]]))
assume(n_categorical <= X.shape[1])
df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1)
dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)
model_format = "json"
else:
dtrain = xgb.DMatrix(X, label=y)
X_pred = X.copy()
model_format = callback.draw(sampled_from(["json", "legacy_binary"]))
param = {
"max_depth": 8,
"eta": 1,
"eta": 0.1,
"verbosity": 0,
"objective": objective,
"num_parallel_tree": num_parallel_tree,
}
num_round = 10
bst = xgb.train(
param,
dtrain,
num_boost_round=num_round,
evals=[(dtrain, "train"), (dtest, "test")],
num_boost_round=num_boost_round,
evals=[(dtrain, "train")],
)
with TemporaryDirectory() as tmpdir:
if model_format == "json":
model_name = "regression.json"
model_path = os.path.join(tmpdir, model_name)
model_path = pathlib.Path(tmpdir) / "regression.json"
bst.save_model(model_path)
model = treelite.frontend.load_xgboost_model(filename=model_path)
else:
model_name = "regression.model"
model_path = os.path.join(tmpdir, model_name)
model_path = pathlib.Path(tmpdir) / "regression.model"
bst.save_model(model_path)
model = treelite.frontend.load_xgboost_model_legacy_binary(
filename=model_path
)

assert model.num_feature == dtrain.num_col()
assert model.num_tree == num_round * num_parallel_tree
libpath = os.path.join(tmpdir, "regression" + _libext())
assert model.num_tree == num_boost_round * num_parallel_tree
libpath = pathlib.Path(tmpdir) / ("regression" + _libext())
tl2cgen.export_lib(
model,
toolchain=toolchain,
@@ -94,9 +130,11 @@ def test_xgb_regression(toolchain, objective, model_format, num_parallel_tree, d
assert predictor.num_feature == dtrain.num_col()
assert predictor.num_target == 1
np.testing.assert_array_equal(predictor.num_class, [1])
dmat = tl2cgen.DMatrix(X_test, dtype="float32")
dmat = tl2cgen.DMatrix(X_pred, dtype="float32")
out_pred = predictor.predict(dmat)
expected_pred = bst.predict(dtest, strict_shape=True)[:, :, np.newaxis]
expected_pred = bst.predict(
xgb.DMatrix(X_pred), strict_shape=True, validate_features=False
)[:, :, np.newaxis]
np.testing.assert_almost_equal(out_pred, expected_pred, decimal=3)

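A minimal sketch (not part of this commit) of the hypothesis.strategies.data() pattern the test now relies on; hypothesis_callback above is just an alias for data, and it lets draws made inside the test body depend on earlier values, e.g. only drawing a dataset for objectives other than "reg:squaredlogerror":

    from hypothesis import given
    from hypothesis.strategies import data, integers, sampled_from

    @given(use_flag=sampled_from([True, False]), callback=data())
    def test_conditional_draw(use_flag, callback):
        # Draw lazily inside the test body; this draw only happens when
        # use_flag is True, so later draws can depend on earlier ones.
        if use_flag:
            n = callback.draw(integers(min_value=1, max_value=5))
            assert 1 <= n <= 5
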

85 changes: 84 additions & 1 deletion tests/python/util.py
@@ -1,10 +1,12 @@
# -*- coding: utf-8 -*-
"""Utility functions for tests"""
import ctypes
import os
import tempfile
from contextlib import contextmanager
from math import ceil
from sys import platform as _platform
from typing import Iterator, List
from typing import Any, Iterator, List, Optional, Tuple, Union

import numpy as np
import treelite
@@ -100,3 +102,84 @@ def TemporaryDirectory(*args, **kwargs) -> Iterator[str]:
except (PermissionError, NotADirectoryError):
if _platform != "win32":
raise


# pylint: disable=R0914
def to_categorical(
features: np.ndarray,
*,
n_categorical: int,
invalid_frac: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState]] = None,
) -> Tuple[Any, np.ndarray]:
"""Generate a Pandas DataFrame with a mix of numerical and categorical columns.
Also returns a NumPy array whose value is equal to the DataFrame, except for
a given fraction of elements (they will be replaced with invalid categorical values).
Credit: Levs Dolgovs (@levsnv)
https://github.com/rapidsai/cuml/blob/3bab4d1/python/cuml/tests/test_fil.py#L641
Returns
-------
generated_df:
Generated DataFrame
generated_array:
NumPy array whose value is equal to generated_df, except with a fraction of elements
replaced with an invalid categorical value.
Parameters
----------
features:
A 2D NumPy array, to be used to generate the DataFrame. Use
:py:meth:`~sklearn.datasets.make_regression` or
:py:meth:`~sklearn.datasets.make_classification` to randomly generate this array.
n_categorical:
Number of categorical columns in the generated DataFrame.
invalid_frac:
Fraction of elements in the categorical columns that will be replaced with invalid
categorical values. The replacement will occur only in generated_array, not generated_df.
random_state:
Random seed or RandomState object, to control the behavior of randomness
"""
if not has_pandas():
raise RuntimeError("Pandas is required for to_categorical()")
import pandas as pd

features = features.copy() # Avoid clobbering source matrix
seed = ctypes.c_size_t(hash(random_state)).value # Allow RandomState object
rng = np.random.default_rng(seed=seed)
n_features = features.shape[1]
# First n_categorical columns will be converted into categorical columns.
cat_cols = features[:, :n_categorical]
# Normalize the columns to fit range [0, 1]
cat_cols = cat_cols - cat_cols.min(axis=0, keepdims=True)
cat_cols /= cat_cols.max(axis=0, keepdims=True)
# Bin the columns into 100 bins to obtain categorical values
rough_n_categories = 100
cat_cols = (cat_cols * rough_n_categories).astype(int)

# Mix categorical and numerical columns in a random order
new_col_idx = rng.choice(n_features, n_features, replace=False, shuffle=True)
df_cols = {}
for icol in range(n_categorical):
col = cat_cols[:, icol]
df_cols[new_col_idx[icol]] = pd.Series(
pd.Categorical(col, categories=np.unique(col))
)
for icol in range(n_categorical, n_features):
df_cols[new_col_idx[icol]] = pd.Series(features[:, icol])
generated_df = pd.DataFrame(df_cols)
generated_df.sort_index(axis=1, inplace=True)

# Randomly inject invalid categories
invalid_idx = rng.choice(
a=cat_cols.size,
size=ceil(cat_cols.size * invalid_frac),
replace=False,
shuffle=False,
)
cat_cols.flat[invalid_idx] += 1000
generated_array = np.concatenate([cat_cols, features[:, n_categorical:]], axis=1)
generated_array[:, new_col_idx] = generated_array

return generated_df, generated_array
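
A minimal usage sketch for to_categorical(), not part of the commit; it assumes pandas and scikit-learn are installed, and the variable names are illustrative. Per the docstring, the returned array mirrors the DataFrame except for the injected invalid categories:

    import numpy as np
    from sklearn.datasets import make_regression

    X, _ = make_regression(n_samples=200, n_features=8, random_state=0)
    df, X_pred = to_categorical(X, n_categorical=3, invalid_frac=0.1, random_state=0)
    assert df.shape == X.shape               # DataFrame keeps the input layout
    assert isinstance(X_pred, np.ndarray)    # array matches df, plus invalid categories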
