From e0799d383319b3949e1f181f036db5243e1571ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@users.noreply.github.com>
Date: Thu, 23 Jan 2025 19:02:16 +0100
Subject: [PATCH] Investigate issue 1129 (#1131)

* add test to investigate

Signed-off-by: xadupre <xadupre@microsoft.com>

* Add unit test from issue 1129

Signed-off-by: xadupre <xadupre@microsoft.com>

* Update CI with the latest versions of onnx (#1130)

Signed-off-by: xadupre <xadupre@microsoft.com>

* fix missing provider

Signed-off-by: xadupre <xadupre@microsoft.com>

---------

Signed-off-by: xadupre <xadupre@microsoft.com>
---
 tests/test_issues_2024.py                   | 56 +++++++++++++++++++++
 tests/test_sklearn_pipeline_concat_tfidf.py |  4 +-
 2 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/tests/test_issues_2024.py b/tests/test_issues_2024.py
index 15c01469d..1cf6fa94d 100644
--- a/tests/test_issues_2024.py
+++ b/tests/test_issues_2024.py
@@ -271,6 +271,62 @@ def Classifier(features: list[str]) -> base.BaseEstimator:
                 )
                 assert modelengine is not None
 
+    def test_issue_1129_lr(self):
+        # Unit test from issue 1129: sklearn and converted ONNX outputs must agree.
+        import numpy as np
+        from numpy.testing import assert_almost_equal
+        import pandas as pd
+        from sklearn.linear_model import LogisticRegression
+        from sklearn.tree import DecisionTreeClassifier
+        from sklearn.ensemble import RandomForestClassifier
+        import skl2onnx
+        from onnxruntime import InferenceSession
+
+        # Training frame: 10 rows, one float64 and one int64 column (seeded RNG)
+        np.random.seed(0)
+        data = {
+            "float_column": np.random.rand(10).astype(np.float64),
+            "int_column": np.random.randint(0, 100, size=10).astype(np.int64),
+        }
+        x_ = pd.DataFrame(data)
+        y = np.random.binomial(1, 0.5, size=10)  # random binary labels
+
+        # Test frame: 10 more rows with the same two columns
+        test_data = {
+            "float_column": np.random.rand(10).astype(np.float64),
+            "int_column": np.random.randint(0, 100, size=10).astype(np.int64),
+        }
+        x_test_ = pd.DataFrame(test_data)
+
+        for cls in [LogisticRegression, DecisionTreeClassifier, RandomForestClassifier]:
+            with self.subTest(cls=cls):
+                # Cast inputs per model; `decimal` is the probability tolerance:
+                if cls == LogisticRegression:
+                    x = x_.astype(np.float64)
+                    x_test = x_test_.astype(np.float64)
+                    decimal = 10  # float64 inputs: compare to 10 decimals
+                else:
+                    x = x_.astype(np.float32)
+                    x_test = x_test_.astype(np.float32)
+                    decimal = 4  # float32 inputs: compare to 4 decimals
+                model = cls()
+                model.fit(x, y)
+                # Reference labels and probabilities from scikit-learn
+                sklearn_preds = model.predict(x_test)
+                sklearn_probs = model.predict_proba(x_test)
+
+                # Convert to ONNX from the training array, with the zipmap option off
+                onnx_model = skl2onnx.to_onnx(
+                    model, x.values, options={"zipmap": False}
+                )
+                # Run on CPU with input "X"; output 0 is labels, output 1 probabilities
+                sess = InferenceSession(
+                    onnx_model.SerializeToString(), providers=["CPUExecutionProvider"]
+                )
+                onnx_prediction = sess.run(None, {"X": x_test.to_numpy()})
+                assert_almost_equal(sklearn_probs, onnx_prediction[1], decimal=decimal)
+                assert_almost_equal(sklearn_preds, onnx_prediction[0])
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/tests/test_sklearn_pipeline_concat_tfidf.py b/tests/test_sklearn_pipeline_concat_tfidf.py
index 03b88f11e..c9c7b96f5 100644
--- a/tests/test_sklearn_pipeline_concat_tfidf.py
+++ b/tests/test_sklearn_pipeline_concat_tfidf.py
@@ -379,8 +379,8 @@ def test_issue_712_svc_binary_empty(self):
             target_opset=TARGET_OPSET,
             options={CountVectorizer: {"keep_empty_string": True}},
         )
-        with open("debug.onnx", "wb") as f:
-            f.write(onx.SerializeToString())
+        # with open("debug.onnx", "wb") as f:
+        #     f.write(onx.SerializeToString())
         sess = InferenceSession(
             onx.SerializeToString(), providers=["CPUExecutionProvider"]
         )