diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 99a690f38993..1db55385af1b 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -188,6 +188,23 @@ def _get_sample_count(total_nrow: int, params: str) -> int:
     return sample_cnt.value
 
 
+def _np2d_to_np1d(mat: np.ndarray) -> Tuple[np.ndarray, int]:
+    if mat.dtype in (np.float32, np.float64):
+        dtype = mat.dtype
+    else:
+        dtype = np.float32
+    if mat.flags["F_CONTIGUOUS"]:
+        order = "F"
+        layout = _C_API_IS_COL_MAJOR
+    else:
+        order = "C"
+        layout = _C_API_IS_ROW_MAJOR
+    # ensure dtype and order, copies if either do not match
+    data = np.asarray(mat, dtype=dtype, order=order)
+    # flatten array without copying
+    return data.ravel(order=order), layout
+
+
 class _MissingType(Enum):
     NONE = "None"
     NAN = "NaN"
@@ -684,7 +701,8 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va
 _C_API_DTYPE_INT32 = 2
 _C_API_DTYPE_INT64 = 3
 
-"""Matrix is row major in Python"""
+"""Macro definition of data order in matrix"""
+_C_API_IS_COL_MAJOR = 0
 _C_API_IS_ROW_MAJOR = 1
 
 """Macro definition of prediction type in C API of LightGBM"""
@@ -2297,11 +2315,7 @@ def __init_from_np2d(
             raise ValueError("Input numpy.ndarray must be 2 dimensional")
 
         self._handle = ctypes.c_void_p()
-        if mat.dtype == np.float32 or mat.dtype == np.float64:
-            data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
-        else:  # change non-float data to float data, need to copy
-            data = np.asarray(mat.reshape(mat.size), dtype=np.float32)
-
+        data, layout = _np2d_to_np1d(mat)
         ptr_data, type_ptr_data, _ = _c_float_array(data)
         _safe_call(
             _LIB.LGBM_DatasetCreateFromMat(
@@ -2309,7 +2323,7 @@ def __init_from_np2d(
                 ctypes.c_int(type_ptr_data),
                 ctypes.c_int32(mat.shape[0]),
                 ctypes.c_int32(mat.shape[1]),
-                ctypes.c_int(_C_API_IS_ROW_MAJOR),
+                ctypes.c_int(layout),
                 _c_str(params_str),
                 ref_dataset,
                 ctypes.byref(self._handle),
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 0dfe3e47fa11..bdd4d3f58b80 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -947,3 +947,39 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c
         "in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
     )
     assert expected_warning in capsys.readouterr().out
+
+
+@pytest.mark.parametrize("order", ["C", "F"])
+@pytest.mark.parametrize("dtype", ["float32", "int64"])
+def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype):
+    X = rng.random(size=(100, 3))
+    X = np.require(X, dtype=dtype, requirements=order)
+    X1d, layout = lgb.basic._np2d_to_np1d(X)
+    if order == "F":
+        assert layout == lgb.basic._C_API_IS_COL_MAJOR
+    else:
+        assert layout == lgb.basic._C_API_IS_ROW_MAJOR
+    if dtype == "float32":
+        assert np.shares_memory(X, X1d)
+    else:
+        # makes a copy
+        assert not np.shares_memory(X, X1d)
+
+
+def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
+    # row-major dataset
+    X_row, y = make_blobs(n_samples=1_000, n_features=3, centers=2)
+    assert X_row.flags["C_CONTIGUOUS"] and not X_row.flags["F_CONTIGUOUS"]
+    ds_row = lgb.Dataset(X_row, y)
+    ds_row_path = tmp_path / "ds_row.txt"
+    ds_row._dump_text(ds_row_path)
+
+    # col-major dataset
+    X_col = np.asfortranarray(X_row)
+    assert X_col.flags["F_CONTIGUOUS"] and not X_col.flags["C_CONTIGUOUS"]
+    ds_col = lgb.Dataset(X_col, y)
+    ds_col_path = tmp_path / "ds_col.txt"
+    ds_col._dump_text(ds_col_path)
+
+    # check datasets are equal
+    assert filecmp.cmp(ds_row_path, ds_col_path)