Skip to content

Commit

Permalink
[WIP] Initial support for polars.
Browse files Browse the repository at this point in the history
pyarrow is required for polars input. Categorical data is not yet supported; that support
will wait until the re-coder is completed. This patch also helps set up the code for
accepting CPU-based arrow data.

- Add masked dataframe support
- Initial support for polars.
  • Loading branch information
trivialfis committed Dec 18, 2024
1 parent dc092ae commit bc48554
Show file tree
Hide file tree
Showing 10 changed files with 409 additions and 41 deletions.
1 change: 1 addition & 0 deletions ops/conda_env/python_lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies:
- numpy
- scipy
- pandas
- pyarrow
- scikit-learn
- dask
- distributed
Expand Down
17 changes: 15 additions & 2 deletions python-package/xgboost/_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ class _ArrayLikeArg(Protocol):
def __array_interface__(self) -> "ArrayInf": ...


class _TransformedDf(Protocol):
    """Structural type for a transformed dataframe-like object.

    Any object that provides a two-element ``shape`` property and an
    ``array_interface`` method returning ``bytes`` satisfies this protocol.
    """

    @property
    def shape(self) -> Tuple[int, int]:
        """Two-element shape of the data (presumably rows, columns -- confirm)."""
        ...

    def array_interface(self) -> bytes:
        """Return the array-interface description of the data as bytes."""
        ...


ArrayInf = TypedDict(
"ArrayInf",
{
Expand Down Expand Up @@ -92,7 +99,10 @@ def __cuda_array_interface__(self, interface: ArrayInf) -> None:


def make_array_interface(
ptr: CNumericPtr, shape: Tuple[int, ...], dtype: Type[np.number], is_cuda: bool
ptr: Union[CNumericPtr, int],
shape: Tuple[int, ...],
dtype: Type[np.number],
is_cuda: bool,
) -> ArrayInf:
"""Make an __(cuda)_array_interface__ from a pointer."""
# Use an empty array to handle typestr and descr
Expand All @@ -103,7 +113,10 @@ def make_array_interface(
empty = np.empty(shape=(0,), dtype=dtype)
array = empty.__array_interface__ # pylint: disable=no-member

addr = ctypes.cast(ptr, ctypes.c_void_p).value
if not isinstance(ptr, int):
addr = ctypes.cast(ptr, ctypes.c_void_p).value
else:
addr = ptr
length = int(np.prod(shape))
# Handle empty dataset.
assert addr is not None or length == 0
Expand Down
7 changes: 7 additions & 0 deletions python-package/xgboost/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,13 @@ def import_cupy() -> types.ModuleType:
return cupy


@functools.cache
def import_polars() -> types.ModuleType:
    """Import ``polars`` on first use and return the module object.

    The import happens at call time rather than at module load, and the
    returned module is memoized by ``functools.cache`` for later calls.
    """
    import polars

    return polars


try:
import scipy.sparse as scipy_sparse
from scipy.sparse import csr_matrix as scipy_csr
Expand Down
20 changes: 15 additions & 5 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import scipy.sparse

from ._data_utils import (
_TransformedDf,
array_interface,
cuda_array_interface,
from_array_interface,
Expand Down Expand Up @@ -64,7 +65,7 @@
TransformedData,
c_bst_ulong,
)
from .compat import PANDAS_INSTALLED, DataFrame, py_str
from .compat import PANDAS_INSTALLED, DataFrame, py_str, import_polars
from .libpath import find_lib_path


Expand Down Expand Up @@ -1431,7 +1432,7 @@ def _ref_data_from_array(self, data: np.ndarray) -> None:
"""Reference data from numpy array."""
_check_call(_LIB.XGProxyDMatrixSetDataDense(self.handle, array_interface(data)))

def _ref_data_from_pandas(self, data: DataType) -> None:
def _ref_data_from_pandas(self, data: _TransformedDf) -> None:
"""Reference data from a pandas DataFrame. The input is a PandasTransformed
instance.
Expand Down Expand Up @@ -2601,8 +2602,8 @@ def inplace_predict(
assert proxy is None or isinstance(proxy, _ProxyDMatrix)

from .data import (
ArrowTransformed,
PandasTransformed,
_arrow_transform,
_is_arrow,
_is_cudf_df,
_is_cudf_pandas,
Expand All @@ -2611,7 +2612,11 @@ def inplace_predict(
_is_np_array_like,
_is_pandas_df,
_is_pandas_series,
_is_polars,
_is_tuple,
_is_polars_series,
_transform_polars_df,
_transform_arrow_table,
_transform_pandas_df,
)

Expand All @@ -2620,7 +2625,12 @@ def inplace_predict(

enable_categorical = True
if _is_arrow(data):
data = _arrow_transform(data)
data, fns, _ = _transform_arrow_table(data, enable_categorical, None, None)
if _is_polars_series(data):
pl = import_polars()
data = pl.DataFrame({data.name: data})
if _is_polars(data):
data, fns, _ = _transform_polars_df(data, enable_categorical, None, None)
if _is_pandas_series(data):
import pandas as pd

Expand Down Expand Up @@ -2659,7 +2669,7 @@ def inplace_predict(
)
)
return _prediction_output(shape, dims, preds, False)
if isinstance(data, PandasTransformed):
if isinstance(data, (ArrowTransformed, PandasTransformed)):
_check_call(
_LIB.XGBoosterPredictFromColumnar(
self.handle,
Expand Down
Loading

0 comments on commit bc48554

Please sign in to comment.