From e3cc12054a33bd634b93ff4628b713827e85fccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 12 Nov 2024 16:53:58 -0600 Subject: [PATCH 1/8] do not copy column-major numpy arrays when creating Dataset --- python-package/lightgbm/basic.py | 20 +++++++++++++++----- tests/python_package_test/test_basic.py | 13 +++++++++++++ tests/python_package_test/test_engine.py | 24 ++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index cf3723aadc63..6d52f670c0fd 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -188,6 +188,14 @@ def _get_sample_count(total_nrow: int, params: str) -> int: return sample_cnt.value +def _np2d_to_np1d(mat: np.ndarray) -> np.ndarray: + data = mat.ravel(order="A") # keeps memory layout + if data.dtype not in (np.float32, np.float64): + # change non-float data to float data, need to copy + data = data.astype(np.float32) + return data + + class _MissingType(Enum): NONE = "None" NAN = "NaN" @@ -685,6 +693,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va _C_API_DTYPE_INT64 = 3 """Matrix is row major in Python""" +_C_API_IS_COL_MAJOR = 0 _C_API_IS_ROW_MAJOR = 1 """Macro definition of prediction type in C API of LightGBM""" @@ -2297,10 +2306,11 @@ def __init_from_np2d( raise ValueError("Input numpy.ndarray must be 2 dimensional") self._handle = ctypes.c_void_p() - if mat.dtype == np.float32 or mat.dtype == np.float64: - data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype) - else: # change non-float data to float data, need to copy - data = np.asarray(mat.reshape(mat.size), dtype=np.float32) + data = _np2d_to_np1d(mat) + if mat.flags["C_CONTIGUOUS"]: + layout = _C_API_IS_ROW_MAJOR + else: + layout = _C_API_IS_COL_MAJOR ptr_data, type_ptr_data, _ = _c_float_array(data) _safe_call( @@ -2309,7 +2319,7 @@ def __init_from_np2d( ctypes.c_int(type_ptr_data), ctypes.c_int32(mat.shape[0]), ctypes.c_int32(mat.shape[1]), - ctypes.c_int(_C_API_IS_ROW_MAJOR), + ctypes.c_int(layout), _c_str(params_str), ref_dataset, ctypes.byref(self._handle), diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 0dfe3e47fa11..72ef2866338a 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -947,3 +947,16 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c "in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity." ) assert expected_warning in capsys.readouterr().out + + +@pytest.mark.parametrize("order", ["C", "F"]) +@pytest.mark.parametrize("dtype", ["float32", "int64"]) +def test_no_copy_in_dataset_from_numpy_2d(order, dtype): + X = np.random.rand(100, 3) + X = np.require(X, dtype=dtype, requirements=order) + X1d = lgb.basic._np2d_to_np1d(X) + if dtype == "float32": + assert np.shares_memory(X, X1d) + else: + # makes a copy + assert not np.shares_memory(X, X1d) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 9ae471e7f4b9..07735daeec86 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -4611,3 +4611,27 @@ def test_bagging_by_query_in_lambdarank(): ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"] assert ndcg_score_bagging_by_query >= ndcg_score - 0.1 assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1 + + +def test_train_with_column_major_dataset(): + params = {"num_leaves": 16} + rounds = 2 + + X_row, y = make_synthetic_regression() + assert X_row.flags["C_CONTIGUOUS"] + ds_row = lgb.Dataset(X_row, y) + bst_row = lgb.train(params, ds_row, num_boost_round=rounds) + pred_row = bst_row.predict(X_row) + # check that we didn't get a trivial model + dumped_row = bst_row.dump_model() + assert len(dumped_row["tree_info"]) == rounds + for tree in dumped_row["tree_info"]: + assert tree["num_leaves"] > 1 + + X_col = np.asfortranarray(X_row) + assert X_col.flags["F_CONTIGUOUS"] + ds_col = lgb.Dataset(X_col, y) + bst_col = lgb.train(params, ds_col, num_boost_round=rounds) + dumped_col = bst_col.dump_model() + + assert dumped_row == dumped_col From 84607e39a564a5c4907af3a3f2532bcd28c2d8a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 12 Nov 2024 17:47:32 -0600 Subject: [PATCH 2/8] fix logic --- python-package/lightgbm/basic.py | 25 ++++++++++++++++-------- tests/python_package_test/test_engine.py | 1 - 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 6d52f670c0fd..36d5a92ede23 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -189,11 +189,18 @@ def _get_sample_count(total_nrow: int, params: str) -> int: def _np2d_to_np1d(mat: np.ndarray) -> np.ndarray: - data = mat.ravel(order="A") # keeps memory layout - if data.dtype not in (np.float32, np.float64): - # change non-float data to float data, need to copy - data = data.astype(np.float32) - return data + if mat.dtype in (np.float32, np.float64): + dtype = mat.dtype + else: + dtype = np.float32 + if mat.flags["F_CONTIGUOUS"]: + order = "F" + else: + order = "C" + # ensure dtype and order, copies if either do not match + data = np.asarray(mat, dtype=dtype, order=order) + # flatten array without copying + return data.ravel(order=order) class _MissingType(Enum): @@ -2307,10 +2314,12 @@ def __init_from_np2d( self._handle = ctypes.c_void_p() data = _np2d_to_np1d(mat) - if mat.flags["C_CONTIGUOUS"]: - layout = _C_API_IS_ROW_MAJOR - else: + if mat.flags["F_CONTIGUOUS"]: layout = _C_API_IS_COL_MAJOR + else: + # the array was row major or not contiguous (slice) + # in any case we flatten as row-major + layout = _C_API_IS_ROW_MAJOR ptr_data, type_ptr_data, _ = _c_float_array(data) _safe_call( diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 07735daeec86..fa50868a20e0 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -4621,7 +4621,6 @@ def test_train_with_column_major_dataset(): assert X_row.flags["C_CONTIGUOUS"] ds_row = lgb.Dataset(X_row, y) bst_row = lgb.train(params, ds_row, num_boost_round=rounds) - pred_row = bst_row.predict(X_row) # check that we didn't get a trivial model dumped_row = bst_row.dump_model() assert len(dumped_row["tree_info"]) == rounds From 0d612247505bdee5e18deef0a63f09707bcbd888 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 12 Nov 2024 18:25:46 -0600 Subject: [PATCH 3/8] lint --- tests/python_package_test/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 72ef2866338a..8ff86d2e44aa 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -952,7 +952,7 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c @pytest.mark.parametrize("order", ["C", "F"]) @pytest.mark.parametrize("dtype", ["float32", "int64"]) def test_no_copy_in_dataset_from_numpy_2d(order, dtype): - X = np.random.rand(100, 3) + X = np.random.default_rng().random(size=(100, 3)) X = np.require(X, dtype=dtype, requirements=order) X1d = lgb.basic._np2d_to_np1d(X) if dtype == "float32": From f7df7ad3f506c9599b56da7dbdd57f812c7b0c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Thu, 28 Nov 2024 17:43:13 -0600 Subject: [PATCH 4/8] code review --- python-package/lightgbm/basic.py | 17 ++++++----------- tests/python_package_test/test_basic.py | 4 ++-- tests/python_package_test/test_engine.py | 23 ++++++++++------------- 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 36d5a92ede23..80e6a50cb53d 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -188,19 +188,21 @@ def _get_sample_count(total_nrow: int, params: str) -> int: return sample_cnt.value -def _np2d_to_np1d(mat: np.ndarray) -> np.ndarray: +def _np2d_to_np1d(mat: np.ndarray) -> Tuple[np.ndarray, int]: if mat.dtype in (np.float32, np.float64): dtype = mat.dtype else: dtype = np.float32 if mat.flags["F_CONTIGUOUS"]: order = "F" + layout = _C_API_IS_COL_MAJOR else: order = "C" + layout = _C_API_IS_ROW_MAJOR # ensure dtype and order, copies if either do not match data = np.asarray(mat, dtype=dtype, order=order) # flatten array without copying - return data.ravel(order=order) + return data.ravel(order=order), layout class _MissingType(Enum): @@ -699,7 +701,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va _C_API_DTYPE_INT32 = 2 _C_API_DTYPE_INT64 = 3 -"""Matrix is row major in Python""" +"""Macro definition of data order in matrix""" _C_API_IS_COL_MAJOR = 0 _C_API_IS_ROW_MAJOR = 1 @@ -2313,14 +2315,7 @@ def __init_from_np2d( raise ValueError("Input numpy.ndarray must be 2 dimensional") self._handle = ctypes.c_void_p() - data = _np2d_to_np1d(mat) - if mat.flags["F_CONTIGUOUS"]: - layout = _C_API_IS_COL_MAJOR - else: - # the array was row major or not contiguous (slice) - # in any case we flatten as row-major - layout = _C_API_IS_ROW_MAJOR - + data, layout = _np2d_to_np1d(mat) ptr_data, type_ptr_data, _ = _c_float_array(data) _safe_call( _LIB.LGBM_DatasetCreateFromMat( diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 8ff86d2e44aa..178639653d72 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -951,8 +951,8 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c @pytest.mark.parametrize("order", ["C", "F"]) @pytest.mark.parametrize("dtype", ["float32", "int64"]) -def test_no_copy_in_dataset_from_numpy_2d(order, dtype): - X = np.random.default_rng().random(size=(100, 3)) +def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype): + X = rng.random(size=(100, 3)) X = np.require(X, dtype=dtype, requirements=order) X1d = lgb.basic._np2d_to_np1d(X) if dtype == "float32": diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index fa50868a20e0..fe7024dae63d 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1,5 +1,6 @@ # coding: utf-8 import copy +import filecmp import itertools import json import math @@ -4613,24 +4614,20 @@ def test_bagging_by_query_in_lambdarank(): assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1 -def test_train_with_column_major_dataset(): - params = {"num_leaves": 16} - rounds = 2 - +def test_equal_datasets_from_row_major_and_col_major_data(tmp_path): + # row-major dataset X_row, y = make_synthetic_regression() assert X_row.flags["C_CONTIGUOUS"] ds_row = lgb.Dataset(X_row, y) - bst_row = lgb.train(params, ds_row, num_boost_round=rounds) - # check that we didn't get a trivial model - dumped_row = bst_row.dump_model() - assert len(dumped_row["tree_info"]) == rounds - for tree in dumped_row["tree_info"]: - assert tree["num_leaves"] > 1 + ds_row_path = tmp_path / "ds_row.txt" + ds_row._dump_text(ds_row_path) + # col-major dataset X_col = np.asfortranarray(X_row) assert X_col.flags["F_CONTIGUOUS"] ds_col = lgb.Dataset(X_col, y) - bst_col = lgb.train(params, ds_col, num_boost_round=rounds) - dumped_col = bst_col.dump_model() + ds_col_path = tmp_path / "ds_col.txt" + ds_col._dump_text(ds_col_path) - assert dumped_row == dumped_col + # check datasets are equal + assert filecmp.cmp(ds_row_path, ds_col_path) From 50dda905bb7a631aad8a9d38cfaf932c33d68d52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Thu, 28 Nov 2024 17:53:24 -0600 Subject: [PATCH 5/8] update test --- tests/python_package_test/test_basic.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 178639653d72..d6ea4cdfdb25 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -954,7 +954,11 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype): X = rng.random(size=(100, 3)) X = np.require(X, dtype=dtype, requirements=order) - X1d = lgb.basic._np2d_to_np1d(X) + X1d, layout = lgb.basic._np2d_to_np1d(X) + if order == "F": + assert layout == lgb.basic._C_API_IS_COL_MAJOR + else: + assert layout == lgb.basic._C_API_IS_ROW_MAJOR if dtype == "float32": assert np.shares_memory(X, X1d) else: From 38c678620d73b27f4471c9d8cf7b8053969102a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Fri, 29 Nov 2024 09:20:01 -0600 Subject: [PATCH 6/8] move dataset test to basic --- tests/python_package_test/test_basic.py | 19 +++++++++++++++++++ tests/python_package_test/test_engine.py | 20 -------------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index d6ea4cdfdb25..b6b213751bd0 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -964,3 +964,22 @@ def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype): else: # makes a copy assert not np.shares_memory(X, X1d) + + +def test_equal_datasets_from_row_major_and_col_major_data(tmp_path): + # row-major dataset + X_row, y = make_blobs(n_samples=1_000, n_features=1, centers=2) + assert X_row.flags["C_CONTIGUOUS"] + ds_row = lgb.Dataset(X_row, y) + ds_row_path = tmp_path / "ds_row.txt" + ds_row._dump_text(ds_row_path) + + # col-major dataset + X_col = np.asfortranarray(X_row) + assert X_col.flags["F_CONTIGUOUS"] + ds_col = lgb.Dataset(X_col, y) + ds_col_path = tmp_path / "ds_col.txt" + ds_col._dump_text(ds_col_path) + + # check datasets are equal + assert filecmp.cmp(ds_row_path, ds_col_path) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index fe7024dae63d..9ae471e7f4b9 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1,6 +1,5 @@ # coding: utf-8 import copy -import filecmp import itertools import json import math @@ -4612,22 +4611,3 @@ def test_bagging_by_query_in_lambdarank(): ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"] assert ndcg_score_bagging_by_query >= ndcg_score - 0.1 assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1 - - -def test_equal_datasets_from_row_major_and_col_major_data(tmp_path): - # row-major dataset - X_row, y = make_synthetic_regression() - assert X_row.flags["C_CONTIGUOUS"] - ds_row = lgb.Dataset(X_row, y) - ds_row_path = tmp_path / "ds_row.txt" - ds_row._dump_text(ds_row_path) - - # col-major dataset - X_col = np.asfortranarray(X_row) - assert X_col.flags["F_CONTIGUOUS"] - ds_col = lgb.Dataset(X_col, y) - ds_col_path = tmp_path / "ds_col.txt" - ds_col._dump_text(ds_col_path) - - # check datasets are equal - assert filecmp.cmp(ds_row_path, ds_col_path) From 524f45e1bd0ac3f32535aee5f729f4a0b2a79e6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Fri, 29 Nov 2024 09:28:09 -0600 Subject: [PATCH 7/8] increase features --- tests/python_package_test/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index b6b213751bd0..90fabd5c3c5e 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -968,7 +968,7 @@ def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype): def test_equal_datasets_from_row_major_and_col_major_data(tmp_path): # row-major dataset - X_row, y = make_blobs(n_samples=1_000, n_features=1, centers=2) + X_row, y = make_blobs(n_samples=1_000, n_features=3, centers=2) assert X_row.flags["C_CONTIGUOUS"] ds_row = lgb.Dataset(X_row, y) ds_row_path = tmp_path / "ds_row.txt" From 95a21a4040350be26a71afcd58f8902e6965bb02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Fri, 29 Nov 2024 09:29:55 -0600 Subject: [PATCH 8/8] assert single layout --- tests/python_package_test/test_basic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 90fabd5c3c5e..bdd4d3f58b80 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -969,14 +969,14 @@ def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype): def test_equal_datasets_from_row_major_and_col_major_data(tmp_path): # row-major dataset X_row, y = make_blobs(n_samples=1_000, n_features=3, centers=2) - assert X_row.flags["C_CONTIGUOUS"] + assert X_row.flags["C_CONTIGUOUS"] and not X_row.flags["F_CONTIGUOUS"] ds_row = lgb.Dataset(X_row, y) ds_row_path = tmp_path / "ds_row.txt" ds_row._dump_text(ds_row_path) # col-major dataset X_col = np.asfortranarray(X_row) - assert X_col.flags["F_CONTIGUOUS"] + assert X_col.flags["F_CONTIGUOUS"] and not X_col.flags["C_CONTIGUOUS"] ds_col = lgb.Dataset(X_col, y) ds_col_path = tmp_path / "ds_col.txt" ds_col._dump_text(ds_col_path)