From e3cc12054a33bd634b93ff4628b713827e85fccf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Tue, 12 Nov 2024 16:53:58 -0600
Subject: [PATCH 1/8] do not copy column-major numpy arrays when creating
 Dataset

---
 python-package/lightgbm/basic.py         | 20 +++++++++++++++-----
 tests/python_package_test/test_basic.py  | 13 +++++++++++++
 tests/python_package_test/test_engine.py | 24 ++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index cf3723aadc63..6d52f670c0fd 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -188,6 +188,14 @@ def _get_sample_count(total_nrow: int, params: str) -> int:
     return sample_cnt.value
 
 
+def _np2d_to_np1d(mat: np.ndarray) -> np.ndarray:
+    data = mat.ravel(order="A")  # keeps memory layout
+    if data.dtype not in (np.float32, np.float64):
+        # change non-float data to float data, need to copy
+        data = data.astype(np.float32)
+    return data
+
+
 class _MissingType(Enum):
     NONE = "None"
     NAN = "NaN"
@@ -685,6 +693,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va
 _C_API_DTYPE_INT64 = 3
 
 """Matrix is row major in Python"""
+_C_API_IS_COL_MAJOR = 0
 _C_API_IS_ROW_MAJOR = 1
 
 """Macro definition of prediction type in C API of LightGBM"""
@@ -2297,10 +2306,11 @@ def __init_from_np2d(
             raise ValueError("Input numpy.ndarray must be 2 dimensional")
 
         self._handle = ctypes.c_void_p()
-        if mat.dtype == np.float32 or mat.dtype == np.float64:
-            data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
-        else:  # change non-float data to float data, need to copy
-            data = np.asarray(mat.reshape(mat.size), dtype=np.float32)
+        data = _np2d_to_np1d(mat)
+        if mat.flags["C_CONTIGUOUS"]:
+            layout = _C_API_IS_ROW_MAJOR
+        else:
+            layout = _C_API_IS_COL_MAJOR
 
         ptr_data, type_ptr_data, _ = _c_float_array(data)
         _safe_call(
@@ -2309,7 +2319,7 @@ def __init_from_np2d(
                 ctypes.c_int(type_ptr_data),
                 ctypes.c_int32(mat.shape[0]),
                 ctypes.c_int32(mat.shape[1]),
-                ctypes.c_int(_C_API_IS_ROW_MAJOR),
+                ctypes.c_int(layout),
                 _c_str(params_str),
                 ref_dataset,
                 ctypes.byref(self._handle),
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 0dfe3e47fa11..72ef2866338a 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -947,3 +947,16 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c
         "in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
     )
     assert expected_warning in capsys.readouterr().out
+
+
+@pytest.mark.parametrize("order", ["C", "F"])
+@pytest.mark.parametrize("dtype", ["float32", "int64"])
+def test_no_copy_in_dataset_from_numpy_2d(order, dtype):
+    X = np.random.rand(100, 3)
+    X = np.require(X, dtype=dtype, requirements=order)
+    X1d = lgb.basic._np2d_to_np1d(X)
+    if dtype == "float32":
+        assert np.shares_memory(X, X1d)
+    else:
+        # makes a copy
+        assert not np.shares_memory(X, X1d)
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 9ae471e7f4b9..07735daeec86 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -4611,3 +4611,27 @@ def test_bagging_by_query_in_lambdarank():
     ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"]
     assert ndcg_score_bagging_by_query >= ndcg_score - 0.1
     assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1
+
+
+def test_train_with_column_major_dataset():
+    params = {"num_leaves": 16}
+    rounds = 2
+
+    X_row, y = make_synthetic_regression()
+    assert X_row.flags["C_CONTIGUOUS"]
+    ds_row = lgb.Dataset(X_row, y)
+    bst_row = lgb.train(params, ds_row, num_boost_round=rounds)
+    pred_row = bst_row.predict(X_row)
+    # check that we didn't get a trivial model
+    dumped_row = bst_row.dump_model()
+    assert len(dumped_row["tree_info"]) == rounds
+    for tree in dumped_row["tree_info"]:
+        assert tree["num_leaves"] > 1
+
+    X_col = np.asfortranarray(X_row)
+    assert X_col.flags["F_CONTIGUOUS"]
+    ds_col = lgb.Dataset(X_col, y)
+    bst_col = lgb.train(params, ds_col, num_boost_round=rounds)
+    dumped_col = bst_col.dump_model()
+
+    assert dumped_row == dumped_col

From 84607e39a564a5c4907af3a3f2532bcd28c2d8a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Tue, 12 Nov 2024 17:47:32 -0600
Subject: [PATCH 2/8] fix logic

---
 python-package/lightgbm/basic.py         | 25 ++++++++++++++++--------
 tests/python_package_test/test_engine.py |  1 -
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 6d52f670c0fd..36d5a92ede23 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -189,11 +189,18 @@ def _get_sample_count(total_nrow: int, params: str) -> int:
 
 
 def _np2d_to_np1d(mat: np.ndarray) -> np.ndarray:
-    data = mat.ravel(order="A")  # keeps memory layout
-    if data.dtype not in (np.float32, np.float64):
-        # change non-float data to float data, need to copy
-        data = data.astype(np.float32)
-    return data
+    if mat.dtype in (np.float32, np.float64):
+        dtype = mat.dtype
+    else:
+        dtype = np.float32
+    if mat.flags["F_CONTIGUOUS"]:
+        order = "F"
+    else:
+        order = "C"
+    # ensure dtype and order, copies if either do not match
+    data = np.asarray(mat, dtype=dtype, order=order)
+    # flatten array without copying
+    return data.ravel(order=order)
 
 
 class _MissingType(Enum):
@@ -2307,10 +2314,12 @@ def __init_from_np2d(
 
         self._handle = ctypes.c_void_p()
         data = _np2d_to_np1d(mat)
-        if mat.flags["C_CONTIGUOUS"]:
-            layout = _C_API_IS_ROW_MAJOR
-        else:
+        if mat.flags["F_CONTIGUOUS"]:
             layout = _C_API_IS_COL_MAJOR
+        else:
+            # the array was row major or not contiguous (slice)
+            # in any case we flatten as row-major
+            layout = _C_API_IS_ROW_MAJOR
 
         ptr_data, type_ptr_data, _ = _c_float_array(data)
         _safe_call(
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 07735daeec86..fa50868a20e0 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -4621,7 +4621,6 @@ def test_train_with_column_major_dataset():
     assert X_row.flags["C_CONTIGUOUS"]
     ds_row = lgb.Dataset(X_row, y)
     bst_row = lgb.train(params, ds_row, num_boost_round=rounds)
-    pred_row = bst_row.predict(X_row)
     # check that we didn't get a trivial model
     dumped_row = bst_row.dump_model()
     assert len(dumped_row["tree_info"]) == rounds

From 0d612247505bdee5e18deef0a63f09707bcbd888 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Tue, 12 Nov 2024 18:25:46 -0600
Subject: [PATCH 3/8] lint

---
 tests/python_package_test/test_basic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 72ef2866338a..8ff86d2e44aa 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -952,7 +952,7 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c
 @pytest.mark.parametrize("order", ["C", "F"])
 @pytest.mark.parametrize("dtype", ["float32", "int64"])
 def test_no_copy_in_dataset_from_numpy_2d(order, dtype):
-    X = np.random.rand(100, 3)
+    X = np.random.default_rng().random(size=(100, 3))
     X = np.require(X, dtype=dtype, requirements=order)
     X1d = lgb.basic._np2d_to_np1d(X)
     if dtype == "float32":

From f7df7ad3f506c9599b56da7dbdd57f812c7b0c59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Thu, 28 Nov 2024 17:43:13 -0600
Subject: [PATCH 4/8] code review

---
 python-package/lightgbm/basic.py         | 17 ++++++-----------
 tests/python_package_test/test_basic.py  |  4 ++--
 tests/python_package_test/test_engine.py | 23 ++++++++++-------------
 3 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 36d5a92ede23..80e6a50cb53d 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -188,19 +188,21 @@ def _get_sample_count(total_nrow: int, params: str) -> int:
     return sample_cnt.value
 
 
-def _np2d_to_np1d(mat: np.ndarray) -> np.ndarray:
+def _np2d_to_np1d(mat: np.ndarray) -> Tuple[np.ndarray, int]:
     if mat.dtype in (np.float32, np.float64):
         dtype = mat.dtype
     else:
         dtype = np.float32
     if mat.flags["F_CONTIGUOUS"]:
         order = "F"
+        layout = _C_API_IS_COL_MAJOR
     else:
         order = "C"
+        layout = _C_API_IS_ROW_MAJOR
     # ensure dtype and order, copies if either do not match
     data = np.asarray(mat, dtype=dtype, order=order)
     # flatten array without copying
-    return data.ravel(order=order)
+    return data.ravel(order=order), layout
 
 
 class _MissingType(Enum):
@@ -699,7 +701,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va
 _C_API_DTYPE_INT32 = 2
 _C_API_DTYPE_INT64 = 3
 
-"""Matrix is row major in Python"""
+"""Macro definition of data order in matrix"""
 _C_API_IS_COL_MAJOR = 0
 _C_API_IS_ROW_MAJOR = 1
 
@@ -2313,14 +2315,7 @@ def __init_from_np2d(
             raise ValueError("Input numpy.ndarray must be 2 dimensional")
 
         self._handle = ctypes.c_void_p()
-        data = _np2d_to_np1d(mat)
-        if mat.flags["F_CONTIGUOUS"]:
-            layout = _C_API_IS_COL_MAJOR
-        else:
-            # the array was row major or not contiguous (slice)
-            # in any case we flatten as row-major
-            layout = _C_API_IS_ROW_MAJOR
-
+        data, layout = _np2d_to_np1d(mat)
         ptr_data, type_ptr_data, _ = _c_float_array(data)
         _safe_call(
             _LIB.LGBM_DatasetCreateFromMat(
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 8ff86d2e44aa..178639653d72 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -951,8 +951,8 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c
 
 @pytest.mark.parametrize("order", ["C", "F"])
 @pytest.mark.parametrize("dtype", ["float32", "int64"])
-def test_no_copy_in_dataset_from_numpy_2d(order, dtype):
-    X = np.random.default_rng().random(size=(100, 3))
+def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype):
+    X = rng.random(size=(100, 3))
     X = np.require(X, dtype=dtype, requirements=order)
     X1d = lgb.basic._np2d_to_np1d(X)
     if dtype == "float32":
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index fa50868a20e0..fe7024dae63d 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 import copy
+import filecmp
 import itertools
 import json
 import math
@@ -4613,24 +4614,20 @@ def test_bagging_by_query_in_lambdarank():
     assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1
 
 
-def test_train_with_column_major_dataset():
-    params = {"num_leaves": 16}
-    rounds = 2
-
+def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
+    # row-major dataset
     X_row, y = make_synthetic_regression()
     assert X_row.flags["C_CONTIGUOUS"]
     ds_row = lgb.Dataset(X_row, y)
-    bst_row = lgb.train(params, ds_row, num_boost_round=rounds)
-    # check that we didn't get a trivial model
-    dumped_row = bst_row.dump_model()
-    assert len(dumped_row["tree_info"]) == rounds
-    for tree in dumped_row["tree_info"]:
-        assert tree["num_leaves"] > 1
+    ds_row_path = tmp_path / "ds_row.txt"
+    ds_row._dump_text(ds_row_path)
 
+    # col-major dataset
     X_col = np.asfortranarray(X_row)
     assert X_col.flags["F_CONTIGUOUS"]
     ds_col = lgb.Dataset(X_col, y)
-    bst_col = lgb.train(params, ds_col, num_boost_round=rounds)
-    dumped_col = bst_col.dump_model()
+    ds_col_path = tmp_path / "ds_col.txt"
+    ds_col._dump_text(ds_col_path)
 
-    assert dumped_row == dumped_col
+    # check datasets are equal
+    assert filecmp.cmp(ds_row_path, ds_col_path)

From 50dda905bb7a631aad8a9d38cfaf932c33d68d52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Thu, 28 Nov 2024 17:53:24 -0600
Subject: [PATCH 5/8] update test

---
 tests/python_package_test/test_basic.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 178639653d72..d6ea4cdfdb25 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -954,7 +954,11 @@ def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(c
 def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype):
     X = rng.random(size=(100, 3))
     X = np.require(X, dtype=dtype, requirements=order)
-    X1d = lgb.basic._np2d_to_np1d(X)
+    X1d, layout = lgb.basic._np2d_to_np1d(X)
+    if order == "F":
+        assert layout == lgb.basic._C_API_IS_COL_MAJOR
+    else:
+        assert layout == lgb.basic._C_API_IS_ROW_MAJOR
     if dtype == "float32":
         assert np.shares_memory(X, X1d)
     else:

From 38c678620d73b27f4471c9d8cf7b8053969102a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Fri, 29 Nov 2024 09:20:01 -0600
Subject: [PATCH 6/8] move dataset test to basic

---
 tests/python_package_test/test_basic.py  | 19 +++++++++++++++++++
 tests/python_package_test/test_engine.py | 20 --------------------
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index d6ea4cdfdb25..b6b213751bd0 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -964,3 +964,22 @@ def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype):
     else:
         # makes a copy
         assert not np.shares_memory(X, X1d)
+
+
+def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
+    # row-major dataset
+    X_row, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    assert X_row.flags["C_CONTIGUOUS"]
+    ds_row = lgb.Dataset(X_row, y)
+    ds_row_path = tmp_path / "ds_row.txt"
+    ds_row._dump_text(ds_row_path)
+
+    # col-major dataset
+    X_col = np.asfortranarray(X_row)
+    assert X_col.flags["F_CONTIGUOUS"]
+    ds_col = lgb.Dataset(X_col, y)
+    ds_col_path = tmp_path / "ds_col.txt"
+    ds_col._dump_text(ds_col_path)
+
+    # check datasets are equal
+    assert filecmp.cmp(ds_row_path, ds_col_path)
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index fe7024dae63d..9ae471e7f4b9 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1,6 +1,5 @@
 # coding: utf-8
 import copy
-import filecmp
 import itertools
 import json
 import math
@@ -4612,22 +4611,3 @@ def test_bagging_by_query_in_lambdarank():
     ndcg_score_no_bagging_by_query = gbm_no_bagging_by_query.best_score["valid_0"]["ndcg@5"]
     assert ndcg_score_bagging_by_query >= ndcg_score - 0.1
     assert ndcg_score_no_bagging_by_query >= ndcg_score - 0.1
-
-
-def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
-    # row-major dataset
-    X_row, y = make_synthetic_regression()
-    assert X_row.flags["C_CONTIGUOUS"]
-    ds_row = lgb.Dataset(X_row, y)
-    ds_row_path = tmp_path / "ds_row.txt"
-    ds_row._dump_text(ds_row_path)
-
-    # col-major dataset
-    X_col = np.asfortranarray(X_row)
-    assert X_col.flags["F_CONTIGUOUS"]
-    ds_col = lgb.Dataset(X_col, y)
-    ds_col_path = tmp_path / "ds_col.txt"
-    ds_col._dump_text(ds_col_path)
-
-    # check datasets are equal
-    assert filecmp.cmp(ds_row_path, ds_col_path)

From 524f45e1bd0ac3f32535aee5f729f4a0b2a79e6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Fri, 29 Nov 2024 09:28:09 -0600
Subject: [PATCH 7/8] increase features

---
 tests/python_package_test/test_basic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index b6b213751bd0..90fabd5c3c5e 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -968,7 +968,7 @@ def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype):
 
 def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
     # row-major dataset
-    X_row, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    X_row, y = make_blobs(n_samples=1_000, n_features=3, centers=2)
     assert X_row.flags["C_CONTIGUOUS"]
     ds_row = lgb.Dataset(X_row, y)
     ds_row_path = tmp_path / "ds_row.txt"

From 95a21a4040350be26a71afcd58f8902e6965bb02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Fri, 29 Nov 2024 09:29:55 -0600
Subject: [PATCH 8/8] assert single layout

---
 tests/python_package_test/test_basic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 90fabd5c3c5e..bdd4d3f58b80 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -969,14 +969,14 @@ def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype):
 def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
     # row-major dataset
     X_row, y = make_blobs(n_samples=1_000, n_features=3, centers=2)
-    assert X_row.flags["C_CONTIGUOUS"]
+    assert X_row.flags["C_CONTIGUOUS"] and not X_row.flags["F_CONTIGUOUS"]
     ds_row = lgb.Dataset(X_row, y)
     ds_row_path = tmp_path / "ds_row.txt"
     ds_row._dump_text(ds_row_path)
 
     # col-major dataset
     X_col = np.asfortranarray(X_row)
-    assert X_col.flags["F_CONTIGUOUS"]
+    assert X_col.flags["F_CONTIGUOUS"] and not X_col.flags["C_CONTIGUOUS"]
     ds_col = lgb.Dataset(X_col, y)
     ds_col_path = tmp_path / "ds_col.txt"
     ds_col._dump_text(ds_col_path)