From d34c7d7d08c34188c6486c3815ac39ce15575cdd Mon Sep 17 00:00:00 2001 From: Pierre Bartet Date: Wed, 22 Jan 2025 16:42:20 +0100 Subject: [PATCH 1/5] Update test_sklearn_feature_selection_converters.py Signed-off-by: Pierre Bartet --- ...st_sklearn_feature_selection_converters.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_sklearn_feature_selection_converters.py b/tests/test_sklearn_feature_selection_converters.py index 1feaa11e3..6d5d7c955 100644 --- a/tests/test_sklearn_feature_selection_converters.py +++ b/tests/test_sklearn_feature_selection_converters.py @@ -21,6 +21,9 @@ SelectPercentile, VarianceThreshold, ) +from sklearn.pipeline import make_pipeline +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler from sklearn.svm import SVR from skl2onnx import convert_sklearn from skl2onnx.common.data_types import Int64TensorType, FloatTensorType @@ -340,6 +343,23 @@ def test_select_k_best_float(self): self.assertTrue(model_onnx is not None) dump_data_and_model(X, model, model_onnx, basename="SklearnSelectKBest") + def test_select_k_best_scaler_logistic_regression_pipeline_float(self): + model = make_pipeline(SelectKBest(k=3), StandardScaler(), LogisticRegression()) + + X = np.array( + [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]], dtype=np.float32 + ) + y = np.array([0, 1, 0, 1]) + model.fit(X, y) + model_onnx = convert_sklearn( + model, + "select k best", + [("input", FloatTensorType([None, X.shape[1]]))], + target_opset=TARGET_OPSET, + ) + self.assertTrue(model_onnx is not None) + dump_data_and_model(X, model, model_onnx, basename="SklearnSelectKBest") + def test_select_percentile_float(self): model = SelectPercentile() X = np.array( From 62f82065b31da4ae2f99976b418c46b3d9a3a243 Mon Sep 17 00:00:00 2001 From: Pierre Bartet Date: Fri, 24 Jan 2025 14:54:10 +0100 Subject: [PATCH 2/5] Fix the problem? Signed-off-by: Pierre Bartet --- skl2onnx/shape_calculators/array_feature_extractor.py | 9 +++++++++ skl2onnx/shape_calculators/concat.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/skl2onnx/shape_calculators/array_feature_extractor.py b/skl2onnx/shape_calculators/array_feature_extractor.py index e1ac00b09..fe75045bb 100644 --- a/skl2onnx/shape_calculators/array_feature_extractor.py +++ b/skl2onnx/shape_calculators/array_feature_extractor.py @@ -16,3 +16,12 @@ def calculate_sklearn_array_feature_extractor(operator): register_shape_calculator( "SklearnArrayFeatureExtractor", calculate_sklearn_array_feature_extractor ) + +def calculate_sklearn_select_k_best(operator): + check_input_and_output_numbers(operator, output_count_range=1) + i = operator.inputs[0] + N = i.get_first_dimension() + C = operator.raw_operator._get_support_mask().sum() + operator.outputs[0].type = i.type.__class__([N, C]) + +register_shape_calculator("SklearnSelectKBest", calculate_sklearn_select_k_best) diff --git a/skl2onnx/shape_calculators/concat.py b/skl2onnx/shape_calculators/concat.py index afdb6ab9e..f6049a9de 100644 --- a/skl2onnx/shape_calculators/concat.py +++ b/skl2onnx/shape_calculators/concat.py @@ -100,6 +100,6 @@ def more_generic(t1, t2): register_shape_calculator("SklearnSelectFpr", calculate_sklearn_concat) register_shape_calculator("SklearnSelectFromModel", calculate_sklearn_concat) register_shape_calculator("SklearnSelectFwe", calculate_sklearn_concat) -register_shape_calculator("SklearnSelectKBest", calculate_sklearn_concat) +# register_shape_calculator("SklearnSelectKBest", calculate_sklearn_concat) register_shape_calculator("SklearnSelectPercentile", calculate_sklearn_concat) register_shape_calculator("SklearnVarianceThreshold", calculate_sklearn_concat) From f92052f3ed15b2504b349904a933752bef49f618 Mon Sep 17 00:00:00 2001 From: Pierre Bartet Date: Fri, 24 Jan 2025 15:16:37 +0100 Subject: [PATCH 3/5] Fix formatting Signed-off-by: Pierre Bartet --- skl2onnx/shape_calculators/array_feature_extractor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skl2onnx/shape_calculators/array_feature_extractor.py b/skl2onnx/shape_calculators/array_feature_extractor.py index fe75045bb..1e8959f26 100644 --- a/skl2onnx/shape_calculators/array_feature_extractor.py +++ b/skl2onnx/shape_calculators/array_feature_extractor.py @@ -17,6 +17,7 @@ def calculate_sklearn_array_feature_extractor(operator): "SklearnArrayFeatureExtractor", calculate_sklearn_array_feature_extractor ) + def calculate_sklearn_select_k_best(operator): check_input_and_output_numbers(operator, output_count_range=1) i = operator.inputs[0] @@ -24,4 +25,5 @@ def calculate_sklearn_select_k_best(operator): C = operator.raw_operator._get_support_mask().sum() operator.outputs[0].type = i.type.__class__([N, C]) + register_shape_calculator("SklearnSelectKBest", calculate_sklearn_select_k_best) From 00e904548b3f9d076f663d966cdca18bb9097665 Mon Sep 17 00:00:00 2001 From: Pierre Bartet Date: Fri, 24 Jan 2025 15:46:29 +0100 Subject: [PATCH 4/5] Fix all selectors Signed-off-by: Pierre Bartet --- skl2onnx/shape_calculators/__init__.py | 2 ++ .../array_feature_extractor.py | 11 -------- skl2onnx/shape_calculators/concat.py | 10 -------- .../shape_calculators/feature_selection.py | 25 +++++++++++++++++++ ...st_sklearn_feature_selection_converters.py | 2 ++ 5 files changed, 29 insertions(+), 21 deletions(-) create mode 100644 skl2onnx/shape_calculators/feature_selection.py diff --git a/skl2onnx/shape_calculators/__init__.py b/skl2onnx/shape_calculators/__init__.py index ab5556b1e..71f9042cb 100644 --- a/skl2onnx/shape_calculators/__init__.py +++ b/skl2onnx/shape_calculators/__init__.py @@ -10,6 +10,7 @@ from . import cross_decomposition from . import dict_vectorizer from . import ensemble_shapes +from . import feature_selection from . import feature_hasher from . import flatten from . import function_transformer @@ -63,6 +64,7 @@ dict_vectorizer, ensemble_shapes, feature_hasher, + feature_selection, flatten, function_transformer, gaussian_process, diff --git a/skl2onnx/shape_calculators/array_feature_extractor.py b/skl2onnx/shape_calculators/array_feature_extractor.py index 1e8959f26..e1ac00b09 100644 --- a/skl2onnx/shape_calculators/array_feature_extractor.py +++ b/skl2onnx/shape_calculators/array_feature_extractor.py @@ -16,14 +16,3 @@ def calculate_sklearn_array_feature_extractor(operator): register_shape_calculator( "SklearnArrayFeatureExtractor", calculate_sklearn_array_feature_extractor ) - - -def calculate_sklearn_select_k_best(operator): - check_input_and_output_numbers(operator, output_count_range=1) - i = operator.inputs[0] - N = i.get_first_dimension() - C = operator.raw_operator._get_support_mask().sum() - operator.outputs[0].type = i.type.__class__([N, C]) - - -register_shape_calculator("SklearnSelectKBest", calculate_sklearn_select_k_best) diff --git a/skl2onnx/shape_calculators/concat.py b/skl2onnx/shape_calculators/concat.py index f6049a9de..9561c03af 100644 --- a/skl2onnx/shape_calculators/concat.py +++ b/skl2onnx/shape_calculators/concat.py @@ -93,13 +93,3 @@ def more_generic(t1, t2): register_shape_calculator("SklearnConcat", calculate_sklearn_concat) -register_shape_calculator("SklearnGenericUnivariateSelect", calculate_sklearn_concat) -register_shape_calculator("SklearnRFE", calculate_sklearn_concat) -register_shape_calculator("SklearnRFECV", calculate_sklearn_concat) -register_shape_calculator("SklearnSelectFdr", calculate_sklearn_concat) -register_shape_calculator("SklearnSelectFpr", calculate_sklearn_concat) -register_shape_calculator("SklearnSelectFromModel", calculate_sklearn_concat) -register_shape_calculator("SklearnSelectFwe", calculate_sklearn_concat) -# register_shape_calculator("SklearnSelectKBest", calculate_sklearn_concat) -register_shape_calculator("SklearnSelectPercentile", calculate_sklearn_concat) -register_shape_calculator("SklearnVarianceThreshold", calculate_sklearn_concat) diff --git a/skl2onnx/shape_calculators/feature_selection.py b/skl2onnx/shape_calculators/feature_selection.py new file mode 100644 index 000000000..3f935ea3c --- /dev/null +++ b/skl2onnx/shape_calculators/feature_selection.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 + + +from ..common._registration import register_shape_calculator +from ..common.utils import check_input_and_output_numbers + + +def calculate_sklearn_select(operator): + check_input_and_output_numbers(operator, output_count_range=1) + i = operator.inputs[0] + N = i.get_first_dimension() + C = operator.raw_operator._get_support_mask().sum() + operator.outputs[0].type = i.type.__class__([N, C]) + + +register_shape_calculator("SklearnGenericUnivariateSelect", calculate_sklearn_select) +register_shape_calculator("SklearnRFE", calculate_sklearn_select) +register_shape_calculator("SklearnRFECV", calculate_sklearn_select) +register_shape_calculator("SklearnSelectFdr", calculate_sklearn_select) +register_shape_calculator("SklearnSelectFpr", calculate_sklearn_select) +register_shape_calculator("SklearnSelectFromModel", calculate_sklearn_select) +register_shape_calculator("SklearnSelectFwe", calculate_sklearn_select) +register_shape_calculator("SklearnSelectKBest", calculate_sklearn_select) +register_shape_calculator("SklearnSelectPercentile", calculate_sklearn_select) +register_shape_calculator("SklearnVarianceThreshold", calculate_sklearn_select) diff --git a/tests/test_sklearn_feature_selection_converters.py b/tests/test_sklearn_feature_selection_converters.py index 6d5d7c955..3a6c48fbe 100644 --- a/tests/test_sklearn_feature_selection_converters.py +++ b/tests/test_sklearn_feature_selection_converters.py @@ -33,11 +33,13 @@ class TestSklearnFeatureSelectionConverters(unittest.TestCase): def test_generic_univariate_select_int(self): model = GenericUnivariateSelect() + X = np.array( [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]], dtype=np.int64 ) y = np.array([0, 1, 0, 1]) model.fit(X, y) + model_onnx = convert_sklearn( model, "generic univariate select", From 1dad533a991985be1b0994c2e1650607fc2ff907 Mon Sep 17 00:00:00 2001 From: Pierre Bartet Date: Fri, 24 Jan 2025 15:54:59 +0100 Subject: [PATCH 5/5] Rely on a public sklearn attribute instead of a private one Signed-off-by: Pierre Bartet --- .../shape_calculators/feature_selection.py | 2 +- ...st_sklearn_feature_selection_converters.py | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/skl2onnx/shape_calculators/feature_selection.py b/skl2onnx/shape_calculators/feature_selection.py index 3f935ea3c..8c1164817 100644 --- a/skl2onnx/shape_calculators/feature_selection.py +++ b/skl2onnx/shape_calculators/feature_selection.py @@ -9,7 +9,7 @@ def calculate_sklearn_select(operator): check_input_and_output_numbers(operator, output_count_range=1) i = operator.inputs[0] N = i.get_first_dimension() - C = operator.raw_operator._get_support_mask().sum() + C = operator.raw_operator.get_support().sum() operator.outputs[0].type = i.type.__class__([N, C]) diff --git a/tests/test_sklearn_feature_selection_converters.py b/tests/test_sklearn_feature_selection_converters.py index 3a6c48fbe..7ede71464 100644 --- a/tests/test_sklearn_feature_selection_converters.py +++ b/tests/test_sklearn_feature_selection_converters.py @@ -51,6 +51,26 @@ def test_generic_univariate_select_int(self): X, model, model_onnx, basename="SklearnGenericUnivariateSelect" ) + def test_generic_univariate_select_kbest_int(self): + model = GenericUnivariateSelect(mode="k_best", param=2) + + X = np.array( + [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]], dtype=np.int64 + ) + y = np.array([0, 1, 0, 1]) + model.fit(X, y) + + model_onnx = convert_sklearn( + model, + "generic univariate select", + [("input", Int64TensorType([None, X.shape[1]]))], + target_opset=TARGET_OPSET, + ) + self.assertTrue(model_onnx is not None) + dump_data_and_model( + X, model, model_onnx, basename="SklearnGenericUnivariateSelect" + ) + def test_rfe_int(self): model = RFE(estimator=SVR(kernel="linear")) X = np.array(