From 325033ab26abb3918597dfd161eb172b825eae06 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 22 Mar 2024 02:07:09 -0700 Subject: [PATCH 001/130] kmeans oop init commit --- onedal/cluster/kmeans.cpp | 7 +- onedal/cluster/kmeans.py | 5 +- setup_sklearnex.py | 1 - sklearnex/cluster/k_means.py | 378 +++++++++++++++++++++++-- sklearnex/cluster/tests/test_kmeans.py | 26 +- sklearnex/dispatcher.py | 13 +- sklearnex/preview/__init__.py | 2 +- sklearnex/preview/cluster/__init__.py | 19 -- sklearnex/preview/cluster/_common.py | 84 ------ sklearnex/preview/cluster/k_means.py | 371 ------------------------ 10 files changed, 397 insertions(+), 509 deletions(-) mode change 100755 => 100644 sklearnex/cluster/k_means.py delete mode 100644 sklearnex/preview/cluster/__init__.py delete mode 100644 sklearnex/preview/cluster/_common.py delete mode 100644 sklearnex/preview/cluster/k_means.py diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp index e4561450d3..b63319ef00 100644 --- a/onedal/cluster/kmeans.cpp +++ b/onedal/cluster/kmeans.cpp @@ -68,7 +68,12 @@ struct params2desc { desc.set_cluster_count( params["cluster_count"].cast() ); desc.set_accuracy_threshold( params["accuracy_threshold"].cast() ); desc.set_max_iteration_count( params["max_iteration_count"].cast() ); - +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200 + auto result_options = params["result_options"].cast(); + if (result_options == "computeAssignments"){ + desc.set_result_options(result_options::compute_assignments); + } +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200 return desc; } }; diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index c6d51e9b11..81e1172251 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -142,7 +142,7 @@ def _check_params_vs_input( self._n_init = 1 assert self.algorithm == "lloyd" - def _get_onedal_params(self, dtype=np.float32): + def _get_onedal_params(self, dtype=np.float32, result_options = None): thr = self._tol if hasattr(self, "_tol") else self.tol return { "fptype": "float" if dtype == np.float32 else "double", @@ -151,6 +151,7 @@ def _get_onedal_params(self, dtype=np.float32): "max_iteration_count": self.max_iter, "cluster_count": self.n_clusters, "accuracy_threshold": thr, + "result_options": "" if result_options is None else result_options, } def _get_params_and_input(self, X, policy): @@ -340,7 +341,7 @@ def _set_cluster_centers(self, cluster_centers): cluster_centers_ = property(_get_cluster_centers, _set_cluster_centers) def _predict_raw(self, X_table, module, policy, dtype=np.float32): - params = self._get_onedal_params(dtype) + params = self._get_onedal_params(dtype, result_options="computeAssignments") result = module.infer(policy, params, self.model_, X_table) diff --git a/setup_sklearnex.py b/setup_sklearnex.py index 1746de32b4..f2ceed3cfb 100755 --- a/setup_sklearnex.py +++ b/setup_sklearnex.py @@ -81,7 +81,6 @@ "sklearnex.neighbors", "sklearnex.preview", "sklearnex.preview.covariance", - "sklearnex.preview.cluster", "sklearnex.svm", "sklearnex.utils", ] diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py old mode 100755 new mode 100644 index 41171730b6..45018cf303 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -1,17 +1,361 @@ -# =============================================================================== -# Copyright 2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -from daal4py.sklearn.cluster import KMeans +# ============================================================================== +# Copyright 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import logging + +from daal4py.sklearn._utils import daal_check_version + +if daal_check_version((2023, "P", 200)): + from abc import ABC + import numpy as np + from scipy.sparse import issparse + from sklearn.cluster import KMeans as sklearn_KMeans + from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + from sklearn.utils.validation import ( + _deprecate_positional_args, + _num_samples, + check_is_fitted, + ) + + from daal4py.sklearn._n_jobs_support import control_n_jobs + from daal4py.sklearn._utils import sklearn_check_version + from onedal.cluster import KMeans as onedal_KMeans + + from .._device_offload import dispatch, wrap_output_data + from .._utils import PatchingConditionsChain + + def get_cluster_centers(self): + return self._cluster_centers_ + + def set_cluster_centers(self, value): + self._cluster_centers_ = value + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.cluster_centers_ = value + + def get_labels(self): + return self._labels_ + + def set_labels(self, value): + self._labels_ = value + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.labels_ = value + + def get_inertia(self): + return self._inertia_ + + def set_inertia(self, value): + self._inertia_ = value + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.inertia_ = value + + def get_n_iter(self): + return self._n_iter_ + + def set_n_iter(self, value): + self._n_iter_ = value + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.n_iter_ = value + + class BaseKMeans(ABC): + def _save_attributes(self): + assert hasattr(self, "_onedal_estimator") + self.n_features_in_ = self._onedal_estimator.n_features_in_ + self.fit_status_ = 0 + self._tol = self._onedal_estimator._tol + self._n_init = self._onedal_estimator._n_init + self._n_iter_ = self._onedal_estimator.n_iter_ + self._labels_ = self._onedal_estimator.labels_ + self._inertia_ = self._onedal_estimator.inertia_ + self._algorithm = self._onedal_estimator.algorithm + self._cluster_centers_ = self._onedal_estimator.cluster_centers_ + self._sparse = False + + self.n_iter_ = property(get_n_iter, set_n_iter) + self.labels_ = property(get_labels, set_labels) + self.inertia_ = property(get_labels, set_inertia) + self.cluster_centers_ = property(get_cluster_centers, set_cluster_centers) + + self._is_in_fit = True + self.n_iter_ = self._n_iter_ + self.labels_ = self._labels_ + self.inertia_ = self._inertia_ + self.cluster_centers_ = self._cluster_centers_ + self._is_in_fit = False + + + @control_n_jobs(decorated_methods=["fit", "predict"]) + class KMeans(sklearn_KMeans, BaseKMeans): + __doc__ = sklearn_KMeans.__doc__ + n_iter_, inertia_ = None, None + labels_, cluster_centers_ = None, None + + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**sklearn_KMeans._parameter_constraints} + + @_deprecate_positional_args + def __init__( + self, + n_clusters=8, + *, + init="k-means++", + n_init="auto" if sklearn_check_version("1.4") else "warn", + max_iter=300, + tol=1e-4, + verbose=0, + random_state=None, + copy_x=True, + algorithm="lloyd", + ): + super().__init__( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + tol=tol, + n_init=n_init, + verbose=verbose, + random_state=random_state, + copy_x=copy_x, + algorithm=algorithm, + ) + + elif sklearn_check_version("1.0"): + + @_deprecate_positional_args + def __init__( + self, + n_clusters=8, + *, + init="k-means++", + n_init=10, + max_iter=300, + tol=1e-4, + verbose=0, + random_state=None, + copy_x=True, + algorithm="auto", + ): + super().__init__( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + tol=tol, + n_init=n_init, + verbose=verbose, + random_state=random_state, + copy_x=copy_x, + algorithm=algorithm, + ) + + else: + + @_deprecate_positional_args + def __init__( + self, + n_clusters=8, + *, + init="k-means++", + n_init=10, + max_iter=300, + tol=1e-4, + precompute_distances="deprecated", + verbose=0, + random_state=None, + copy_x=True, + n_jobs="deprecated", + algorithm="auto", + ): + super().__init__( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + tol=tol, + precompute_distances=precompute_distances, + n_init=n_init, + verbose=verbose, + random_state=random_state, + copy_x=copy_x, + n_jobs=n_jobs, + algorithm=algorithm, + ) + + def _initialize_onedal_estimator(self): + onedal_params = { + "n_clusters": self.n_clusters, + "init": self.init, + "max_iter": self.max_iter, + "tol": self.tol, + "n_init": self.n_init, + "verbose": self.verbose, + "random_state": self.random_state, + } + + self._onedal_estimator = onedal_KMeans(**onedal_params) + + def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): + assert method_name == "fit" + + class_name = self.__class__.__name__ + patching_status = PatchingConditionsChain(f"sklearn.cluster.{class_name}.fit") + + sample_count = _num_samples(X) + self._algorithm = self.algorithm + supported_algs = ["auto", "full", "lloyd"] + correct_count = self.n_clusters < sample_count + + patching_status.and_conditions( + [ + ( + self.algorithm in supported_algs, + "Only lloyd algorithm is supported.", + ), + (not issparse(self.init), "Sparse init values are not supported"), + (correct_count, "n_clusters is smaller than number of samples"), + (sample_weight is None, "Sample weight is not None."), + (not issparse(X), "Sparse input is not supported."), + ] + ) + + return patching_status + + def fit(self, X, y=None, sample_weight=None): + if sklearn_check_version("1.0"): + self._check_feature_names(X, reset=True) + if sklearn_check_version("1.2"): + self._validate_params() + + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_KMeans.fit, + }, + X, + y, + sample_weight, + ) + + return self + + def _onedal_fit(self, X, _, sample_weight, queue=None): + assert sample_weight is None + + X = self._validate_data( + X, + accept_sparse=False, + dtype=[np.float64, np.float32], + ) + + if sklearn_check_version("1.2"): + self._check_params_vs_input(X) + else: + self._check_params(X) + + self._n_features_out = self.n_clusters + self._n_threads = _openmp_effective_n_threads() + + self._initialize_onedal_estimator() + self._onedal_estimator.fit(X, queue=queue) + + self._save_attributes() + + def _onedal_predict_supported(self, method_name, X): + assert method_name == "predict" + + class_name = self.__class__.__name__ + patching_status = PatchingConditionsChain( + f"sklearn.cluster.{class_name}.predict" + ) + + supported_algs = ["auto", "full", "lloyd"] + dense_centers = not issparse(self.cluster_centers_) + + patching_status.and_conditions( + [ + ( + self.algorithm in supported_algs, + "Only lloyd algorithm is supported.", + ), + (dense_centers, "Sparse clusters is not supported."), + (not issparse(X), "Sparse input is not supported."), + ] + ) + + return patching_status + + @wrap_output_data + def predict(self, X): + if sklearn_check_version("1.0"): + self._check_feature_names(X, reset=True) + if sklearn_check_version("1.2"): + self._validate_params() + + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_KMeans.predict, + }, + X, + ) + + def _onedal_predict(self, X, queue=None): + X = self._validate_data( + X, accept_sparse=False, reset=False, dtype=[np.float64, np.float32] + ) + if not hasattr(self, "_onedal_estimator"): + self._initialize_onedal_estimator() + self._onedal_estimator.cluster_centers_ = self.cluster_centers_ + + return self._onedal_estimator.predict(X, queue=queue) + + def _onedal_supported(self, method_name, *data): + if method_name == "fit": + return self._onedal_fit_supported(method_name, *data) + if method_name == "predict": + return self._onedal_predict_supported(method_name, *data) + raise RuntimeError( + f"Unknown method {method_name} in {self.__class__.__name__}" + ) + + def _onedal_gpu_supported(self, method_name, *data): + return self._onedal_supported(method_name, *data) + + def _onedal_cpu_supported(self, method_name, *data): + return self._onedal_supported(method_name, *data) + + @wrap_output_data + def fit_transform(self, X, y=None, sample_weight=None): + return self.fit(X, sample_weight=sample_weight)._transform(X) + + @wrap_output_data + def transform(self, X): + check_is_fitted(self) + + X = self._check_test_data(X) + return self._transform(X) + + fit.__doc__ = sklearn_KMeans.fit.__doc__ + predict.__doc__ = sklearn_KMeans.predict.__doc__ + fit_transform.__doc__ = sklearn_KMeans.fit_transform.__doc__ + transform.__doc__ = sklearn_KMeans.transform.__doc__ +else: + from daal4py.sklearn.cluster import KMeans + + logging.warning( + "Sklearnex KMeans requires oneDAL version >= 2023.2 " "but it was not found" + ) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 0424ee9e82..4d13577390 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -15,16 +15,32 @@ # =============================================================================== import numpy as np +import pytest from numpy.testing import assert_allclose +from daal4py.sklearn._utils import daal_check_version +from onedal.tests.utils._dataframes_support import ( + _as_numpy, + _convert_to_dataframe, + get_dataframes_and_queues, +) -def test_sklearnex_import(): + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import(dataframe, queue): from sklearnex.cluster import KMeans X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) + y = np.array([[0, 0], [12, 3]]) + expected_cluster_labels = np.array([1, 0], dtype=np.int32) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) + kmeans = KMeans(n_clusters=2, random_state=0).fit(X) - assert "daal4py" in kmeans.__module__ + if daal_check_version((2024, "P", 200)): + assert "sklearnex" in kmeans.__module__ + else: + assert "daal4py" in kmeans.__module__ - result = kmeans.predict([[0, 0], [12, 3]]) - expected = np.array([1, 0], dtype=np.int32) - assert_allclose(expected, result) + result_cluster_labels = kmeans.predict(y) + assert_allclose(expected_cluster_labels, result_cluster_labels) diff --git a/sklearnex/dispatcher.py b/sklearnex/dispatcher.py index 04bf07fe1f..cd52485944 100644 --- a/sklearnex/dispatcher.py +++ b/sklearnex/dispatcher.py @@ -46,7 +46,6 @@ def get_patch_map_core(preview=False): import sklearn.covariance as covariance_module # Preview classes for patching - from .preview.cluster import KMeans as KMeans_sklearnex from .preview.covariance import ( EmpiricalCovariance as EmpiricalCovariance_sklearnex, ) @@ -56,13 +55,6 @@ def get_patch_map_core(preview=False): # when preview is used, setting the mapping element[1] to None # should NOT be done. This may lose track of the unpatched # sklearn estimator or function. - # KMeans - cluster_module, _, _ = mapping["kmeans"][0][0] - sklearn_obj = mapping["kmeans"][0][1] - mapping.pop("kmeans") - mapping["kmeans"] = [ - [(cluster_module, "kmeans", KMeans_sklearnex), sklearn_obj] - ] # Covariance mapping["empiricalcovariance"] = [ @@ -114,6 +106,7 @@ def get_patch_map_core(preview=False): from .utils.parallel import _FuncWrapperOld as _FuncWrapper_sklearnex from .cluster import DBSCAN as DBSCAN_sklearnex + from .cluster import KMeans as KMeans_sklearnex from .decomposition import PCA as PCA_sklearnex from .ensemble import ExtraTreesClassifier as ExtraTreesClassifier_sklearnex from .ensemble import ExtraTreesRegressor as ExtraTreesRegressor_sklearnex @@ -134,6 +127,10 @@ def get_patch_map_core(preview=False): mapping.pop("dbscan") mapping["dbscan"] = [[(cluster_module, "DBSCAN", DBSCAN_sklearnex), None]] + # DBSCAN + mapping.pop("kmeans") + mapping["kmeans"] = [[(cluster_module, "KMeans", KMeans_sklearnex), None]] + # PCA mapping.pop("pca") mapping["pca"] = [[(decomposition_module, "PCA", PCA_sklearnex), None]] diff --git a/sklearnex/preview/__init__.py b/sklearnex/preview/__init__.py index 235ac0a2df..dd6b856ba4 100644 --- a/sklearnex/preview/__init__.py +++ b/sklearnex/preview/__init__.py @@ -14,4 +14,4 @@ # limitations under the License. # ============================================================================== -__all__ = ["cluster", "covariance"] +__all__ = ["covariance"] diff --git a/sklearnex/preview/cluster/__init__.py b/sklearnex/preview/cluster/__init__.py deleted file mode 100644 index d8c187f895..0000000000 --- a/sklearnex/preview/cluster/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# ============================================================================== -# Copyright 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from .k_means import KMeans - -__all__ = ["KMeans"] diff --git a/sklearnex/preview/cluster/_common.py b/sklearnex/preview/cluster/_common.py deleted file mode 100644 index 1722bc08e6..0000000000 --- a/sklearnex/preview/cluster/_common.py +++ /dev/null @@ -1,84 +0,0 @@ -# ============================================================================== -# Copyright 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from abc import ABC - - -def get_cluster_centers(self): - return self._cluster_centers_ - - -def set_cluster_centers(self, value): - self._cluster_centers_ = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.cluster_centers_ = value - - -def get_labels(self): - return self._labels_ - - -def set_labels(self, value): - self._labels_ = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.labels_ = value - - -def get_inertia(self): - return self._inertia_ - - -def set_inertia(self, value): - self._inertia_ = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.inertia_ = value - - -def get_n_iter(self): - return self._n_iter_ - - -def set_n_iter(self, value): - self._n_iter_ = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.n_iter_ = value - - -class BaseKMeans(ABC): - def _save_attributes(self): - assert hasattr(self, "_onedal_estimator") - self.n_features_in_ = self._onedal_estimator.n_features_in_ - self.fit_status_ = 0 - self._tol = self._onedal_estimator._tol - self._n_init = self._onedal_estimator._n_init - self._n_iter_ = self._onedal_estimator.n_iter_ - self._labels_ = self._onedal_estimator.labels_ - self._inertia_ = self._onedal_estimator.inertia_ - self._algorithm = self._onedal_estimator.algorithm - self._cluster_centers_ = self._onedal_estimator.cluster_centers_ - self._sparse = False - - self.n_iter_ = property(get_n_iter, set_n_iter) - self.labels_ = property(get_labels, set_labels) - self.inertia_ = property(get_labels, set_inertia) - self.cluster_centers_ = property(get_cluster_centers, set_cluster_centers) - - self._is_in_fit = True - self.n_iter_ = self._n_iter_ - self.labels_ = self._labels_ - self.inertia_ = self._inertia_ - self.cluster_centers_ = self._cluster_centers_ - self._is_in_fit = False diff --git a/sklearnex/preview/cluster/k_means.py b/sklearnex/preview/cluster/k_means.py deleted file mode 100644 index 420df2e343..0000000000 --- a/sklearnex/preview/cluster/k_means.py +++ /dev/null @@ -1,371 +0,0 @@ -# ============================================================================== -# Copyright 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import logging - -from daal4py.sklearn._utils import daal_check_version - -if daal_check_version((2023, "P", 200)): - import numpy as np - from scipy.sparse import issparse - from sklearn.cluster import KMeans as sklearn_KMeans - from sklearn.utils._openmp_helpers import _openmp_effective_n_threads - from sklearn.utils.validation import ( - _deprecate_positional_args, - _num_samples, - check_is_fitted, - ) - - from daal4py.sklearn._n_jobs_support import control_n_jobs - from daal4py.sklearn._utils import sklearn_check_version - from onedal.cluster import KMeans as onedal_KMeans - - from ..._device_offload import dispatch, wrap_output_data - from ..._utils import PatchingConditionsChain - from ._common import BaseKMeans - - @control_n_jobs(decorated_methods=["fit", "predict"]) - class KMeans(sklearn_KMeans, BaseKMeans): - __doc__ = sklearn_KMeans.__doc__ - n_iter_, inertia_ = None, None - labels_, cluster_centers_ = None, None - - if sklearn_check_version("1.2"): - _parameter_constraints: dict = {**sklearn_KMeans._parameter_constraints} - - @_deprecate_positional_args - def __init__( - self, - n_clusters=8, - *, - init="k-means++", - n_init="auto" if sklearn_check_version("1.4") else "warn", - max_iter=300, - tol=1e-4, - verbose=0, - random_state=None, - copy_x=True, - algorithm="lloyd", - ): - super().__init__( - n_clusters=n_clusters, - init=init, - max_iter=max_iter, - tol=tol, - n_init=n_init, - verbose=verbose, - random_state=random_state, - copy_x=copy_x, - algorithm=algorithm, - ) - - elif sklearn_check_version("1.0"): - - @_deprecate_positional_args - def __init__( - self, - n_clusters=8, - *, - init="k-means++", - n_init=10, - max_iter=300, - tol=1e-4, - verbose=0, - random_state=None, - copy_x=True, - algorithm="auto", - ): - super().__init__( - n_clusters=n_clusters, - init=init, - max_iter=max_iter, - tol=tol, - n_init=n_init, - verbose=verbose, - random_state=random_state, - copy_x=copy_x, - algorithm=algorithm, - ) - - else: - - @_deprecate_positional_args - def __init__( - self, - n_clusters=8, - *, - init="k-means++", - n_init=10, - max_iter=300, - tol=1e-4, - precompute_distances="deprecated", - verbose=0, - random_state=None, - copy_x=True, - n_jobs="deprecated", - algorithm="auto", - ): - super().__init__( - n_clusters=n_clusters, - init=init, - max_iter=max_iter, - tol=tol, - precompute_distances=precompute_distances, - n_init=n_init, - verbose=verbose, - random_state=random_state, - copy_x=copy_x, - n_jobs=n_jobs, - algorithm=algorithm, - ) - - def _initialize_onedal_estimator(self): - onedal_params = { - "n_clusters": self.n_clusters, - "init": self.init, - "max_iter": self.max_iter, - "tol": self.tol, - "n_init": self.n_init, - "verbose": self.verbose, - "random_state": self.random_state, - } - - self._onedal_estimator = onedal_KMeans(**onedal_params) - - def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): - assert method_name == "fit" - - class_name = self.__class__.__name__ - patching_status = PatchingConditionsChain(f"sklearn.cluster.{class_name}.fit") - - sample_count = _num_samples(X) - self._algorithm = self.algorithm - supported_algs = ["auto", "full", "lloyd"] - correct_count = self.n_clusters < sample_count - - patching_status.and_conditions( - [ - ( - self.algorithm in supported_algs, - "Only lloyd algorithm is supported.", - ), - (not issparse(self.init), "Sparse init values are not supported"), - (correct_count, "n_clusters is smaller than number of samples"), - (sample_weight is None, "Sample weight is not None."), - (not issparse(X), "Sparse input is not supported."), - ] - ) - - return patching_status - - def fit(self, X, y=None, sample_weight=None): - """Compute k-means clustering. - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - Training instances to cluster. It must be noted that the data - will be converted to C ordering, which will cause a memory - copy if the given data is not C-contiguous. - - y : Ignored - not used, present here for API consistency by convention. - - sample_weight : array-like, shape (n_samples,), optional - The weights for each observation in X. If None, all observations - are assigned equal weight (default: None) - - """ - - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=True) - if sklearn_check_version("1.2"): - self._validate_params() - - dispatch( - self, - "fit", - { - "onedal": self.__class__._onedal_fit, - "sklearn": sklearn_KMeans.fit, - }, - X, - y, - sample_weight, - ) - - return self - - def _onedal_fit(self, X, _, sample_weight, queue=None): - assert sample_weight is None - - X = self._validate_data( - X, - accept_sparse=False, - dtype=[np.float64, np.float32], - ) - - if sklearn_check_version("1.2"): - self._check_params_vs_input(X) - else: - self._check_params(X) - - self._n_features_out = self.n_clusters - self._n_threads = _openmp_effective_n_threads() - - self._initialize_onedal_estimator() - self._onedal_estimator.fit(X, queue=queue) - - self._save_attributes() - - def _onedal_predict_supported(self, method_name, X): - assert method_name == "predict" - - class_name = self.__class__.__name__ - patching_status = PatchingConditionsChain( - f"sklearn.cluster.{class_name}.predict" - ) - - supported_algs = ["auto", "full", "lloyd"] - dense_centers = not issparse(self.cluster_centers_) - - patching_status.and_conditions( - [ - ( - self.algorithm in supported_algs, - "Only lloyd algorithm is supported.", - ), - (dense_centers, "Sparse clusters is not supported."), - (not issparse(X), "Sparse input is not supported."), - ] - ) - - return patching_status - - @wrap_output_data - def predict(self, X): - """Compute k-means clustering. - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - Training instances to cluster. It must be noted that the data - will be converted to C ordering, which will cause a memory - copy if the given data is not C-contiguous. - - y : Ignored - not used, present here for API consistency by convention. - - sample_weight : array-like, shape (n_samples,), optional - The weights for each observation in X. If None, all observations - are assigned equal weight (default: None) - - """ - - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=True) - if sklearn_check_version("1.2"): - self._validate_params() - - return dispatch( - self, - "predict", - { - "onedal": self.__class__._onedal_predict, - "sklearn": sklearn_KMeans.predict, - }, - X, - ) - - def _onedal_predict(self, X, queue=None): - X = self._validate_data( - X, accept_sparse=False, reset=False, dtype=[np.float64, np.float32] - ) - if not hasattr(self, "_onedal_estimator"): - self._initialize_onedal_estimator() - self._onedal_estimator.cluster_centers_ = self.cluster_centers_ - - return self._onedal_estimator.predict(X, queue=queue) - - def _onedal_supported(self, method_name, *data): - if method_name == "fit": - return self._onedal_fit_supported(method_name, *data) - if method_name == "predict": - return self._onedal_predict_supported(method_name, *data) - raise RuntimeError( - f"Unknown method {method_name} in {self.__class__.__name__}" - ) - - def _onedal_gpu_supported(self, method_name, *data): - return self._onedal_supported(method_name, *data) - - def _onedal_cpu_supported(self, method_name, *data): - return self._onedal_supported(method_name, *data) - - @wrap_output_data - def fit_transform(self, X, y=None, sample_weight=None): - """Compute clustering and transform X to cluster-distance space. - - Equivalent to fit(X).transform(X), but more efficiently implemented. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - New data to transform. - - y : Ignored - Not used, present here for API consistency by convention. - - sample_weight : array-like of shape (n_samples,), default=None - The weights for each observation in X. If None, all observations - are assigned equal weight. - - Returns - ------- - X_new : ndarray of shape (n_samples, n_clusters) - X transformed in the new space. - """ - return self.fit(X, sample_weight=sample_weight)._transform(X) - - @wrap_output_data - def transform(self, X): - """Transform X to a cluster-distance space. - - In the new space, each dimension is the distance to the cluster - centers. Note that even if X is sparse, the array returned by - `transform` will typically be dense. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - New data to transform. - - Returns - ------- - X_new : ndarray of shape (n_samples, n_clusters) - X transformed in the new space. - """ - check_is_fitted(self) - - X = self._check_test_data(X) - return self._transform(X) - -else: - from daal4py.sklearn.cluster import KMeans - - logging.warning( - "Preview KMeans requires oneDAL version >= 2023.2 " "but it was not found" - ) From 4fbc3120ce5ee050b31a32c23d5c21dee1ec239e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 22 Mar 2024 02:14:11 -0700 Subject: [PATCH 002/130] reformat --- onedal/cluster/kmeans.py | 2 +- sklearnex/cluster/k_means.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 81e1172251..84e9bc79cc 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -142,7 +142,7 @@ def _check_params_vs_input( self._n_init = 1 assert self.algorithm == "lloyd" - def _get_onedal_params(self, dtype=np.float32, result_options = None): + def _get_onedal_params(self, dtype=np.float32, result_options=None): thr = self._tol if hasattr(self, "_tol") else self.tol return { "fptype": "float" if dtype == np.float32 else "double", diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 45018cf303..d14dec0f2b 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -20,6 +20,7 @@ if daal_check_version((2023, "P", 200)): from abc import ABC + import numpy as np from scipy.sparse import issparse from sklearn.cluster import KMeans as sklearn_KMeans @@ -95,7 +96,6 @@ def _save_attributes(self): self.cluster_centers_ = self._cluster_centers_ self._is_in_fit = False - @control_n_jobs(decorated_methods=["fit", "predict"]) class KMeans(sklearn_KMeans, BaseKMeans): __doc__ = sklearn_KMeans.__doc__ @@ -352,7 +352,8 @@ def transform(self, X): fit.__doc__ = sklearn_KMeans.fit.__doc__ predict.__doc__ = sklearn_KMeans.predict.__doc__ fit_transform.__doc__ = sklearn_KMeans.fit_transform.__doc__ - transform.__doc__ = sklearn_KMeans.transform.__doc__ + transform.__doc__ = sklearn_KMeans.transform.__doc__ + else: from daal4py.sklearn.cluster import KMeans From 06248fe881da8371a65bbb5d28ba731d66f0de70 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 22 Mar 2024 02:18:03 -0700 Subject: [PATCH 003/130] reformat --- sklearnex/dispatcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearnex/dispatcher.py b/sklearnex/dispatcher.py index cd52485944..9095f49fa6 100644 --- a/sklearnex/dispatcher.py +++ b/sklearnex/dispatcher.py @@ -55,7 +55,6 @@ def get_patch_map_core(preview=False): # when preview is used, setting the mapping element[1] to None # should NOT be done. This may lose track of the unpatched # sklearn estimator or function. - # Covariance mapping["empiricalcovariance"] = [ [ From a14ddbe9f196830cbc58e639b0d6653ec20d581a Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 22 Mar 2024 04:15:32 -0700 Subject: [PATCH 004/130] experimental --- sklearnex/cluster/k_means.py | 2 +- sklearnex/cluster/tests/test_kmeans.py | 2 +- sklearnex/conftest.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index d14dec0f2b..161f56adbb 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -297,7 +297,7 @@ def _onedal_predict_supported(self, method_name, X): return patching_status @wrap_output_data - def predict(self, X): + def predict(self, X, sample_weight=None): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 4d13577390..14d6a00ac8 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -37,7 +37,7 @@ def test_sklearnex_import(dataframe, queue): y = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) kmeans = KMeans(n_clusters=2, random_state=0).fit(X) - if daal_check_version((2024, "P", 200)): + if daal_check_version((2023, "P", 200)): assert "sklearnex" in kmeans.__module__ else: assert "daal4py" in kmeans.__module__ diff --git a/sklearnex/conftest.py b/sklearnex/conftest.py index 20d1ace0ee..baffb644e9 100644 --- a/sklearnex/conftest.py +++ b/sklearnex/conftest.py @@ -39,8 +39,8 @@ def pytest_runtest_call(item): sklearnex_stderr_handler = sklearnex_logger.handlers sklearnex_logger.handlers = [] sklearnex_logger.addHandler(log_handler) - sklearnex_logger.setLevel(logging.INFO) - log_handler.setLevel(logging.INFO) + sklearnex_logger.setLevel(logging.DEBUG) + log_handler.setLevel(logging.DEBUG) yield From 03f85a92d0ebffc0147a36e3fb837d35c2d3ce95 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 22 Mar 2024 06:40:06 -0700 Subject: [PATCH 005/130] address ci failures --- sklearnex/cluster/k_means.py | 11 ++++------- sklearnex/cluster/tests/test_kmeans.py | 2 +- sklearnex/conftest.py | 4 ++-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 161f56adbb..4ddfed70b7 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -221,10 +221,8 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): self.algorithm in supported_algs, "Only lloyd algorithm is supported.", ), - (not issparse(self.init), "Sparse init values are not supported"), (correct_count, "n_clusters is smaller than number of samples"), (sample_weight is None, "Sample weight is not None."), - (not issparse(X), "Sparse input is not supported."), ] ) @@ -255,7 +253,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): X = self._validate_data( X, - accept_sparse=False, + accept_sparse="csr", dtype=[np.float64, np.float32], ) @@ -281,7 +279,7 @@ def _onedal_predict_supported(self, method_name, X): ) supported_algs = ["auto", "full", "lloyd"] - dense_centers = not issparse(self.cluster_centers_) + # dense_centers = not issparse(self.cluster_centers_) patching_status.and_conditions( [ @@ -289,8 +287,7 @@ def _onedal_predict_supported(self, method_name, X): self.algorithm in supported_algs, "Only lloyd algorithm is supported.", ), - (dense_centers, "Sparse clusters is not supported."), - (not issparse(X), "Sparse input is not supported."), + # (dense_centers, "Sparse clusters is not supported."), ] ) @@ -315,7 +312,7 @@ def predict(self, X, sample_weight=None): def _onedal_predict(self, X, queue=None): X = self._validate_data( - X, accept_sparse=False, reset=False, dtype=[np.float64, np.float32] + X, accept_sparse="csr", reset=False, dtype=[np.float64, np.float32], accept_large_sparse=False, ) if not hasattr(self, "_onedal_estimator"): self._initialize_onedal_estimator() diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 14d6a00ac8..8a2fd0cdca 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -43,4 +43,4 @@ def test_sklearnex_import(dataframe, queue): assert "daal4py" in kmeans.__module__ result_cluster_labels = kmeans.predict(y) - assert_allclose(expected_cluster_labels, result_cluster_labels) + assert_allclose(expected_cluster_labels, _as_numpy(result_cluster_labels)) diff --git a/sklearnex/conftest.py b/sklearnex/conftest.py index baffb644e9..20d1ace0ee 100644 --- a/sklearnex/conftest.py +++ b/sklearnex/conftest.py @@ -39,8 +39,8 @@ def pytest_runtest_call(item): sklearnex_stderr_handler = sklearnex_logger.handlers sklearnex_logger.handlers = [] sklearnex_logger.addHandler(log_handler) - sklearnex_logger.setLevel(logging.DEBUG) - log_handler.setLevel(logging.DEBUG) + sklearnex_logger.setLevel(logging.INFO) + log_handler.setLevel(logging.INFO) yield From bf8c75f4f02ad5ef30c7f4f311d7d07f7f06b186 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 22 Mar 2024 08:24:56 -0700 Subject: [PATCH 006/130] deselected tests --- deselected_tests.yaml | 12 ++++++------ sklearnex/cluster/k_means.py | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index e0735f86bb..dbd0f346c4 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -182,6 +182,12 @@ deselected_tests: - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 + # Tests have been ported from preview. Fail due to different combination of init methods, investigation required. + - cluster/tests/test_k_means.py::test_kmeans_elkan_results + - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 + - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 + - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 + # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 - linear_model/tests/test_logistic.py::test_dtype_match @@ -1181,9 +1187,3 @@ gpu: - tests/test_common.py::test_check_n_features_in_after_fitting[SVC()] # originated with pca dpctl/dpnp fit, to be re-assesed with pca out-of-preview - decomposition/tests/test_pca.py::test_pca_n_components_mostly_explained_variance_ratio - -preview: - - cluster/tests/test_k_means.py::test_kmeans_elkan_results - - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 - - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 - - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 4ddfed70b7..4c9abd97b3 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -312,7 +312,11 @@ def predict(self, X, sample_weight=None): def _onedal_predict(self, X, queue=None): X = self._validate_data( - X, accept_sparse="csr", reset=False, dtype=[np.float64, np.float32], accept_large_sparse=False, + X, + accept_sparse="csr", + reset=False, + dtype=[np.float64, np.float32], + accept_large_sparse=False, ) if not hasattr(self, "_onedal_estimator"): self._initialize_onedal_estimator() From c20c1f4d68fe6b7be1e12a961f13f7fa63793d1f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 26 Mar 2024 05:39:09 -0700 Subject: [PATCH 007/130] will be reverted --- sklearnex/cluster/k_means.py | 12 +++++++----- sklearnex/dispatcher.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 4c9abd97b3..6d446b99bd 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -221,8 +221,10 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): self.algorithm in supported_algs, "Only lloyd algorithm is supported.", ), + (not issparse(self.init), "Sparse init values are not supported"), (correct_count, "n_clusters is smaller than number of samples"), (sample_weight is None, "Sample weight is not None."), + (not issparse(X), "Sparse input is not supported."), ] ) @@ -253,7 +255,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): X = self._validate_data( X, - accept_sparse="csr", + accept_sparse=False, dtype=[np.float64, np.float32], ) @@ -279,7 +281,7 @@ def _onedal_predict_supported(self, method_name, X): ) supported_algs = ["auto", "full", "lloyd"] - # dense_centers = not issparse(self.cluster_centers_) + dense_centers = not issparse(self.cluster_centers_) patching_status.and_conditions( [ @@ -287,7 +289,8 @@ def _onedal_predict_supported(self, method_name, X): self.algorithm in supported_algs, "Only lloyd algorithm is supported.", ), - # (dense_centers, "Sparse clusters is not supported."), + (dense_centers, "Sparse clusters is not supported."), + (not issparse(X), "Sparse input is not supported."), ] ) @@ -313,10 +316,9 @@ def predict(self, X, sample_weight=None): def _onedal_predict(self, X, queue=None): X = self._validate_data( X, - accept_sparse="csr", + accept_sparse=False, reset=False, dtype=[np.float64, np.float32], - accept_large_sparse=False, ) if not hasattr(self, "_onedal_estimator"): self._initialize_onedal_estimator() diff --git a/sklearnex/dispatcher.py b/sklearnex/dispatcher.py index 351d59f11c..26695dad33 100644 --- a/sklearnex/dispatcher.py +++ b/sklearnex/dispatcher.py @@ -127,7 +127,7 @@ def get_patch_map_core(preview=False): mapping.pop("dbscan") mapping["dbscan"] = [[(cluster_module, "DBSCAN", DBSCAN_sklearnex), None]] - # DBSCAN + # KMeans mapping.pop("kmeans") mapping["kmeans"] = [[(cluster_module, "KMeans", KMeans_sklearnex), None]] From ad99db4ed70e8b8a6827a6f9eb51b691d189a15b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 26 Mar 2024 07:07:44 -0700 Subject: [PATCH 008/130] enable deslected tests --- deselected_tests.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 01f1e90b4f..8568427101 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -179,10 +179,10 @@ deselected_tests: - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 # Tests have been ported from preview. Fail due to different combination of init methods, investigation required. - - cluster/tests/test_k_means.py::test_kmeans_elkan_results - - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 - - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 - - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 + #- cluster/tests/test_k_means.py::test_kmeans_elkan_results + #- cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 + #- cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 + #- cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 From 8b46e065f76cda23a4fb4fea7bb4ed8b49e6a282 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 27 Mar 2024 05:12:52 -0700 Subject: [PATCH 009/130] include elkan --- sklearnex/cluster/k_means.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 6d446b99bd..28a7207db4 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -280,14 +280,14 @@ def _onedal_predict_supported(self, method_name, X): f"sklearn.cluster.{class_name}.predict" ) - supported_algs = ["auto", "full", "lloyd"] + supported_algs = ["auto", "full", "lloyd", "elkan"] dense_centers = not issparse(self.cluster_centers_) patching_status.and_conditions( [ ( self.algorithm in supported_algs, - "Only lloyd algorithm is supported.", + "Only lloyd algorithm is supported, elkan is computed using lloyd", ), (dense_centers, "Sparse clusters is not supported."), (not issparse(X), "Sparse input is not supported."), From be476239c42f9bb238621a22c88c5a5bcc45eb14 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 27 Mar 2024 05:45:34 -0700 Subject: [PATCH 010/130] address CI failure --- sklearnex/tests/test_run_to_run_stability_tests.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearnex/tests/test_run_to_run_stability_tests.py b/sklearnex/tests/test_run_to_run_stability_tests.py index 33f39bea79..f45688710d 100755 --- a/sklearnex/tests/test_run_to_run_stability_tests.py +++ b/sklearnex/tests/test_run_to_run_stability_tests.py @@ -146,6 +146,8 @@ def _run_test(model, methods, dataset): res, _ = func(X, y, model, methods) for a, b, n in zip(res, baseline, name): + if model == "KMeans" and n == "tol": + continue np.testing.assert_allclose( a, b, rtol=0.0, atol=0.0, err_msg=str(n + " is incorrect") ) From 0083124629652a9941a18d8b1e9885d0edf351ef Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 27 Mar 2024 06:35:00 -0700 Subject: [PATCH 011/130] address ci failures --- deselected_tests.yaml | 6 ------ sklearnex/cluster/k_means.py | 11 ++++++++--- sklearnex/tests/test_run_to_run_stability_tests.py | 5 +++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 8568427101..d8556a0979 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -178,12 +178,6 @@ deselected_tests: - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 - # Tests have been ported from preview. Fail due to different combination of init methods, investigation required. - #- cluster/tests/test_k_means.py::test_kmeans_elkan_results - #- cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 - #- cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 - #- cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 - # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 - linear_model/tests/test_logistic.py::test_dtype_match diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 28a7207db4..f2570a647b 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -212,14 +212,14 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): sample_count = _num_samples(X) self._algorithm = self.algorithm - supported_algs = ["auto", "full", "lloyd"] + supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count patching_status.and_conditions( [ ( self.algorithm in supported_algs, - "Only lloyd algorithm is supported.", + "Only lloyd algorithm is supported, elkan is computed using lloyd", ), (not issparse(self.init), "Sparse init values are not supported"), (correct_count, "n_clusters is smaller than number of samples"), @@ -302,7 +302,6 @@ def predict(self, X, sample_weight=None): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - return dispatch( self, "predict", @@ -320,6 +319,12 @@ def _onedal_predict(self, X, queue=None): reset=False, dtype=[np.float64, np.float32], ) + if sklearn_check_version("1.3") and sample_weight is not None: + warnings.warn( + "'sample_weight' was deprecated in version 1.3 and " + "will be removed in 1.5.", + FutureWarning, + ) if not hasattr(self, "_onedal_estimator"): self._initialize_onedal_estimator() self._onedal_estimator.cluster_centers_ = self.cluster_centers_ diff --git a/sklearnex/tests/test_run_to_run_stability_tests.py b/sklearnex/tests/test_run_to_run_stability_tests.py index f45688710d..66b5b37765 100755 --- a/sklearnex/tests/test_run_to_run_stability_tests.py +++ b/sklearnex/tests/test_run_to_run_stability_tests.py @@ -146,8 +146,6 @@ def _run_test(model, methods, dataset): res, _ = func(X, y, model, methods) for a, b, n in zip(res, baseline, name): - if model == "KMeans" and n == "tol": - continue np.testing.assert_allclose( a, b, rtol=0.0, atol=0.0, err_msg=str(n + " is incorrect") ) @@ -359,6 +357,9 @@ def _run_test(model, methods, dataset): "LogisticRegressionCV", # Absolute diff is 1e-10, will be fixed for next release "RandomForestRegressor", # Absolute diff is 1e-14 in OOB score, # will be fixed for next release + "KMeans", # sparsity support required, + # '_tol' attribute shows numerical instability (diff is 1e-14) coming from basic_statistics + # variance calculation. ] From 2e113fc1f877f251e36711865cc06b18204f3606 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 27 Mar 2024 11:04:36 -0700 Subject: [PATCH 012/130] enable all deselected tests --- deselected_tests.yaml | 63 ++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index d8556a0979..08757ab3e5 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -167,16 +167,19 @@ deselected_tests: # test_non_uniform_strategies fails due to differences in handling of vacuous clusters after update # See https://github.com/IntelPython/daal4py/issues/69 - - cluster/tests/test_k_means.py::test_relocated_clusters >=0.23,<0.24 - - cluster/tests/test_k_means.py::test_kmeans_relocated_clusters >=0.24 + # - cluster/tests/test_k_means.py::test_relocated_clusters >=0.23,<0.24 + # - cluster/tests/test_k_means.py::test_kmeans_relocated_clusters >=0.24 # In scikit-learn, these algorithms are not included in this test. However, scikit-learn-intelex # does and throws an error. This is due to the different structure of the transformer.__module__.split("."). - tests/test_common.py::test_transformers_get_feature_names_out[KMeans()] >=1.0 # oneAPI Data Analytics Library (oneDAL) does not check convergence for tol == 0.0 for ease of benchmarking - - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 - - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 + # - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 + # - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 + + # Sparse Support required + - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 @@ -240,7 +243,7 @@ deselected_tests: - inspection/tests/test_permutation_importance.py::test_permutation_importance_sample_weight >=0.24 # Patched and unpatched kmeans set same values to different clusters. Need to investigate. - - preprocessing/tests/test_discretization.py::test_nonuniform_strategies[kmeans-expected_2bins1-expected_3bins1-expected_5bins1] >=0.24 + # - preprocessing/tests/test_discretization.py::test_nonuniform_strategies[kmeans-expected_2bins1-expected_3bins1-expected_5bins1] >=0.24 # OOB scores in scikit-learn and oneDAL are different because of different random number generators - ensemble/tests/test_forest.py::test_forest_classifier_oob[X1-y1-0.65-array-ExtraTreesClassifier] @@ -346,12 +349,12 @@ deselected_tests: - tests/test_multioutput.py::test_classifier_chain_fit_and_predict_with_sparse_data >=1.4 # New failing sklearn1.4.1 tests for kmeans associated with incorrect n_iter_ values in daal4py - - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-dense] >=1.4 - - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_matrix] >=1.4 - - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_array] >=1.4 - - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-dense] >=1.4 - - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_matrix] >=1.4 - - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_array] >=1.4 + # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-dense] >=1.4 + # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_matrix] >=1.4 + # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_array] >=1.4 + # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-dense] >=1.4 + # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_matrix] >=1.4 + # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_array] >=1.4 # -------------------------------------------------------- @@ -434,8 +437,8 @@ gpu: # Fails - cluster/tests/test_dbscan.py::test_weighted_dbscan - - cluster/tests/test_k_means.py::test_k_means_fit_predict - - cluster/tests/test_k_means.py::test_predict + # - cluster/tests/test_k_means.py::test_k_means_fit_predict + # - cluster/tests/test_k_means.py::test_predict - ensemble/tests/test_bagging.py::test_gridsearch - ensemble/tests/test_bagging.py::test_estimators_samples @@ -567,8 +570,8 @@ gpu: - tests/test_common.py::test_estimators[GaussianMixture()-check_fit_idempotent] - tests/test_common.py::test_estimators[GaussianMixture()-check_n_features_in] - tests/test_common.py::test_estimators[GaussianMixture()-check_fit2d_predict1d] - - tests/test_common.py::test_estimators[KMeans()-check_clustering] - - tests/test_common.py::test_estimators[KMeans()-check_clustering(readonly_memmap=True)] + # - tests/test_common.py::test_estimators[KMeans()-check_clustering] + # - tests/test_common.py::test_estimators[KMeans()-check_clustering(readonly_memmap=True)] - tests/test_common.py::test_estimators[RandomForestClassifier()-check_class_weight_classifiers] - tests/test_common.py::test_estimators[SVC()-check_sample_weights_pandas_series] - tests/test_common.py::test_estimators[SVC()-check_sample_weights_not_an_array] @@ -617,21 +620,21 @@ gpu: - manifold/tests/test_t_sne.py::test_n_iter_without_progress # KMeans based (unsupported for GPU) - - cluster/tests/test_k_means.py - - tests/test_common.py::test_estimators[KMeans() - - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_fit_check_is_fitted] - - tests/test_common.py::test_estimators[GaussianMixture()-check_fit_check_is_fitted] - - tests/test_common.py::test_check_n_features_in_after_fitting[BayesianGaussianMixture()] - - tests/test_common.py::test_check_n_features_in_after_fitting[GaussianMixture()] - - tests/test_common.py::test_check_n_features_in_after_fitting[KMeans()] - - tests/test_common.py::test_set_output_transform[KMeans()] - - tests/test_common.py::test_set_output_transform_pandas[KMeans()] - - tests/test_common.py::test_global_output_transform_pandas[KMeans()] - - mixture/tests/test_gaussian_mixture.py - - model_selection/tests/test_validation.py::test_cross_val_predict - - metrics/tests/test_score_objects.py::test_supervised_cluster_scorers - - tests/test_pipeline.py::test_fit_predict_on_pipeline - - tests/test_discriminant_analysis.py::test_lda_predict + # - cluster/tests/test_k_means.py + # - tests/test_common.py::test_estimators[KMeans() + # - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_fit_check_is_fitted] + # - tests/test_common.py::test_estimators[GaussianMixture()-check_fit_check_is_fitted] + # - tests/test_common.py::test_check_n_features_in_after_fitting[BayesianGaussianMixture()] + # - tests/test_common.py::test_check_n_features_in_after_fitting[GaussianMixture()] + # - tests/test_common.py::test_check_n_features_in_after_fitting[KMeans()] + # - tests/test_common.py::test_set_output_transform[KMeans()] + # - tests/test_common.py::test_set_output_transform_pandas[KMeans()] + # - tests/test_common.py::test_global_output_transform_pandas[KMeans()] + # - mixture/tests/test_gaussian_mixture.py + # - model_selection/tests/test_validation.py::test_cross_val_predict + # - metrics/tests/test_score_objects.py::test_supervised_cluster_scorers + # - tests/test_pipeline.py::test_fit_predict_on_pipeline + # - tests/test_discriminant_analysis.py::test_lda_predict # Other device issues - tests/test_metaestimators.py::test_meta_estimators_delegate_data_validation[StackingClassifier] - tests/test_multiclass.py::test_ovr_always_present From 72f77a1efc87a9024b96c65f3fd615223e3389aa Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 28 Mar 2024 04:51:56 -0700 Subject: [PATCH 013/130] deselected tests --- deselected_tests.yaml | 7 ++++--- sklearnex/cluster/k_means.py | 17 ++++++++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 08757ab3e5..d0ac9fd46a 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -168,18 +168,19 @@ deselected_tests: # test_non_uniform_strategies fails due to differences in handling of vacuous clusters after update # See https://github.com/IntelPython/daal4py/issues/69 # - cluster/tests/test_k_means.py::test_relocated_clusters >=0.23,<0.24 - # - cluster/tests/test_k_means.py::test_kmeans_relocated_clusters >=0.24 + - cluster/tests/test_k_means.py::test_kmeans_relocated_clusters >=0.24 # In scikit-learn, these algorithms are not included in this test. However, scikit-learn-intelex # does and throws an error. This is due to the different structure of the transformer.__module__.split("."). - tests/test_common.py::test_transformers_get_feature_names_out[KMeans()] >=1.0 # oneAPI Data Analytics Library (oneDAL) does not check convergence for tol == 0.0 for ease of benchmarking - # - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 - # - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 + - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 + - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 # Sparse Support required - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 + - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index f2570a647b..4c651304be 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -272,7 +272,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._save_attributes() - def _onedal_predict_supported(self, method_name, X): + def _onedal_predict_supported(self, method_name, X, sample_weight): assert method_name == "predict" class_name = self.__class__.__name__ @@ -297,7 +297,9 @@ def _onedal_predict_supported(self, method_name, X): return patching_status @wrap_output_data - def predict(self, X, sample_weight=None): + def predict( + self, X, sample_weight="deprecated" if sklearn_check_version("1.3") else None + ): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): @@ -310,21 +312,30 @@ def predict(self, X, sample_weight=None): "sklearn": sklearn_KMeans.predict, }, X, + sample_weight, ) - def _onedal_predict(self, X, queue=None): + def _onedal_predict(self, X, sample_weight=None, queue=None): X = self._validate_data( X, accept_sparse=False, reset=False, dtype=[np.float64, np.float32], ) + if ( + sklearn_check_version("1.3") + and isinstance(sample_weight, str) + and sample_weight == "deprecated" + ): + sample_weight = None + if sklearn_check_version("1.3") and sample_weight is not None: warnings.warn( "'sample_weight' was deprecated in version 1.3 and " "will be removed in 1.5.", FutureWarning, ) + if not hasattr(self, "_onedal_estimator"): self._initialize_onedal_estimator() self._onedal_estimator.cluster_centers_ = self.cluster_centers_ From 2c14d8c6163075218de311280dafbf1936023f81 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 28 Mar 2024 05:57:51 -0700 Subject: [PATCH 014/130] compiler update --- .ci/pipeline/build-and-test-lnx.yml | 2 +- .ci/scripts/install_dpcpp.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml index d00f2cb072..88aa1383a2 100644 --- a/.ci/pipeline/build-and-test-lnx.yml +++ b/.ci/pipeline/build-and-test-lnx.yml @@ -24,7 +24,7 @@ steps: displayName: "System info" - script: | conda update -y -q conda - conda create -q -y -n CB -c conda-forge -c intel python=$(PYTHON_VERSION) intel::dal-devel mpich pyyaml "dpcpp-cpp-rt=2024.0.2" + conda create -q -y -n CB -c conda-forge -c intel python=$(PYTHON_VERSION) intel::dal-devel mpich pyyaml "dpcpp-cpp-rt=2024.1.0" displayName: "Conda create" - script: | . /usr/share/miniconda/etc/profile.d/conda.sh diff --git a/.ci/scripts/install_dpcpp.sh b/.ci/scripts/install_dpcpp.sh index 86432e17ca..1f45d9770d 100755 --- a/.ci/scripts/install_dpcpp.sh +++ b/.ci/scripts/install_dpcpp.sh @@ -21,5 +21,5 @@ rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo add-apt-repository -y "deb https://apt.repos.intel.com/oneapi all main" sudo apt-get update -sudo apt-get install -y intel-dpcpp-cpp-compiler-2024.0 +sudo apt-get install -y intel-dpcpp-cpp-compiler-2024.1 sudo bash -c 'echo libintelocl.so > /etc/OpenCL/vendors/intel-cpu.icd' From 305dc0859e25b76d33d53f9bb50931bdddb4c940 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 28 Mar 2024 07:04:31 -0700 Subject: [PATCH 015/130] init signature --- sklearnex/cluster/k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 4c651304be..db00747656 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -145,7 +145,7 @@ def __init__( verbose=0, random_state=None, copy_x=True, - algorithm="auto", + algorithm="lloyd" if sklearn_check_version("1.1") else "auto", ): super().__init__( n_clusters=n_clusters, From 8b3571f010c56ae28df251eb33dca4e93b231be7 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 29 Mar 2024 04:09:03 -0700 Subject: [PATCH 016/130] deselected tests --- deselected_tests.yaml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index d0ac9fd46a..b4d378699e 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -181,6 +181,7 @@ deselected_tests: # Sparse Support required - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 + - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 @@ -571,8 +572,6 @@ gpu: - tests/test_common.py::test_estimators[GaussianMixture()-check_fit_idempotent] - tests/test_common.py::test_estimators[GaussianMixture()-check_n_features_in] - tests/test_common.py::test_estimators[GaussianMixture()-check_fit2d_predict1d] - # - tests/test_common.py::test_estimators[KMeans()-check_clustering] - # - tests/test_common.py::test_estimators[KMeans()-check_clustering(readonly_memmap=True)] - tests/test_common.py::test_estimators[RandomForestClassifier()-check_class_weight_classifiers] - tests/test_common.py::test_estimators[SVC()-check_sample_weights_pandas_series] - tests/test_common.py::test_estimators[SVC()-check_sample_weights_not_an_array] @@ -607,7 +606,6 @@ gpu: - tests/test_multiclass.py::test_ovr_coef_ - tests/test_multiclass.py::test_ovr_deprecated_coef_intercept - tests/test_multiclass.py::test_pairwise_cross_val_score - - tests/test_multioutput.py::test_multiclass_multioutput_estimator_predict_proba - tests/test_multioutput.py::test_classifier_chain_fit_and_predict_with_sparse_data @@ -621,16 +619,10 @@ gpu: - manifold/tests/test_t_sne.py::test_n_iter_without_progress # KMeans based (unsupported for GPU) - # - cluster/tests/test_k_means.py - # - tests/test_common.py::test_estimators[KMeans() # - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_fit_check_is_fitted] # - tests/test_common.py::test_estimators[GaussianMixture()-check_fit_check_is_fitted] # - tests/test_common.py::test_check_n_features_in_after_fitting[BayesianGaussianMixture()] # - tests/test_common.py::test_check_n_features_in_after_fitting[GaussianMixture()] - # - tests/test_common.py::test_check_n_features_in_after_fitting[KMeans()] - # - tests/test_common.py::test_set_output_transform[KMeans()] - # - tests/test_common.py::test_set_output_transform_pandas[KMeans()] - # - tests/test_common.py::test_global_output_transform_pandas[KMeans()] # - mixture/tests/test_gaussian_mixture.py # - model_selection/tests/test_validation.py::test_cross_val_predict # - metrics/tests/test_score_objects.py::test_supervised_cluster_scorers From 64e63153f58d8a1617ffdf0856205735293a1642 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 3 Apr 2024 05:33:09 -0700 Subject: [PATCH 017/130] format --- onedal/cluster/kmeans.cpp | 2 +- onedal/cluster/kmeans.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp index b63319ef00..b1a3d0d277 100644 --- a/onedal/cluster/kmeans.cpp +++ b/onedal/cluster/kmeans.cpp @@ -70,7 +70,7 @@ struct params2desc { desc.set_max_iteration_count( params["max_iteration_count"].cast() ); #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200 auto result_options = params["result_options"].cast(); - if (result_options == "computeAssignments"){ + if (result_options == "compute_assignments"){ desc.set_result_options(result_options::compute_assignments); } #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200 diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 84e9bc79cc..bd9041b8e7 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -341,7 +341,7 @@ def _set_cluster_centers(self, cluster_centers): cluster_centers_ = property(_get_cluster_centers, _set_cluster_centers) def _predict_raw(self, X_table, module, policy, dtype=np.float32): - params = self._get_onedal_params(dtype, result_options="computeAssignments") + params = self._get_onedal_params(dtype, result_options="compute_assignments") result = module.infer(policy, params, self.model_, X_table) From 764b9d8cdca4abdf524ea5014d831b8c2a4d82d1 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 22 Apr 2024 07:10:18 -0700 Subject: [PATCH 018/130] add sparsity support --- onedal/cluster/kmeans.py | 54 ++++++++++++++++++++++++++++++++++-- sklearnex/cluster/k_means.py | 11 ++------ 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index bd9041b8e7..cee76583c2 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -18,8 +18,11 @@ from abc import ABC import numpy as np +from scipy import sparse as sp -from daal4py.sklearn._utils import daal_check_version, get_dtype +from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype +from daal4py import engines_mt19937 +from daal4py import kmeans_init as daal4py_kmeans_init from onedal import _backend from ..datatypes import _convert_to_supported, from_table, to_table @@ -155,7 +158,7 @@ def _get_onedal_params(self, dtype=np.float32, result_options=None): } def _get_params_and_input(self, X, policy): - X_loc = _check_array(X, dtype=[np.float64, np.float32], force_all_finite=False) + X_loc = _check_array(X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False) X_loc = _convert_to_supported(policy, X_loc) @@ -194,6 +197,44 @@ def _init_centroids_custom( return centers_table + #TODO: remove when oneDAL KMeansInit has sparsity support + def _init_centroids_sparse( + self, X, init, random_seed, policy, dtype=np.float32, n_centroids=None + ): + n_clusters = self.n_clusters if n_centroids is None else n_centroids + X_fptype = parse_dtype(dtype) + daal_engine = engines_mt19937( + fptype=X_fptype, method="defaultDense", seed=random_seed + ) + if isinstance(init, str) and init == "k-means++": + _n_local_trials = 2 + int(np.log(nClusters)) + kmeans_init_res = daal4py_kmeans_init( + n_clusters, + fptype=X_fptype, + nTrials=_n_local_trials, + method="plusPlusCSR", + engine=daal_engine, + ).compute(X) + centers_table = to_table(kmeans_init_res.centroids) + elif isinstance(init, str) and init == "random": + kmeans_init_res = daal4py_kmeans_init( + n_clusters, + fptype=X_fptype, + method="randomCSR", + engine=daal_engine, + ).compute(X) + centers_table = to_table(kmeans_init_res.centroids) + elif _is_arraylike_not_scalar(init): + centers = np.asarray(init) + # assert centers.shape[0] == n_clusters + # assert centers.shape[1] == X.column_count + centers = _convert_to_supported(policy, init) + centers_table = to_table(centers) + else: + raise TypeError("Unsupported type of the `init` value") + + return centers_table + def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float32): n_samples = X.shape[0] @@ -266,7 +307,9 @@ def is_better_iteration(inertia, labels): init = check_array(init, dtype=dtype, copy=True, order="C") self._validate_center_shape(X, init) - use_custom_init = daal_check_version((2023, "P", 200)) and not callable(self.init) + is_sparse = sp.issparse(X) + use_custom_init = daal_check_version((2023, "P", 200)) and not callable(self.init) and not is_sparse + use_sparse_init = is_sparse for _ in range(self._n_init): if use_custom_init: @@ -275,6 +318,11 @@ def is_better_iteration(inertia, labels): centroids_table = self._init_centroids_custom( X_table, init, random_seed, policy, dtype=dtype ) + elif use_sparse_init: + random_seed = random_state.randint(np.iinfo("i").max) + centroids_table = self._init_centroids_sparse( + X, init, random_seed, policy, dtype=dtype + ) else: centroids_table = self._init_centroids_generic( X, init, random_state, policy, dtype=dtype diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index db00747656..b809b68768 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -221,10 +221,8 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): self.algorithm in supported_algs, "Only lloyd algorithm is supported, elkan is computed using lloyd", ), - (not issparse(self.init), "Sparse init values are not supported"), (correct_count, "n_clusters is smaller than number of samples"), (sample_weight is None, "Sample weight is not None."), - (not issparse(X), "Sparse input is not supported."), ] ) @@ -255,7 +253,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): X = self._validate_data( X, - accept_sparse=False, + accept_sparse="csr", dtype=[np.float64, np.float32], ) @@ -281,7 +279,6 @@ def _onedal_predict_supported(self, method_name, X, sample_weight): ) supported_algs = ["auto", "full", "lloyd", "elkan"] - dense_centers = not issparse(self.cluster_centers_) patching_status.and_conditions( [ @@ -289,8 +286,6 @@ def _onedal_predict_supported(self, method_name, X, sample_weight): self.algorithm in supported_algs, "Only lloyd algorithm is supported, elkan is computed using lloyd", ), - (dense_centers, "Sparse clusters is not supported."), - (not issparse(X), "Sparse input is not supported."), ] ) @@ -318,7 +313,7 @@ def predict( def _onedal_predict(self, X, sample_weight=None, queue=None): X = self._validate_data( X, - accept_sparse=False, + accept_sparse="csr", reset=False, dtype=[np.float64, np.float32], ) @@ -370,8 +365,8 @@ def transform(self, X): fit.__doc__ = sklearn_KMeans.fit.__doc__ predict.__doc__ = sklearn_KMeans.predict.__doc__ - fit_transform.__doc__ = sklearn_KMeans.fit_transform.__doc__ transform.__doc__ = sklearn_KMeans.transform.__doc__ + fit_transform.__doc__ = sklearn_KMeans.fit_transform.__doc__ else: from daal4py.sklearn.cluster import KMeans From b2b2964251db38045e2afe63ccabefb5423ab56c Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 22 Apr 2024 07:11:36 -0700 Subject: [PATCH 019/130] lint --- onedal/cluster/kmeans.py | 82 +++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index cee76583c2..60aaef047d 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -20,9 +20,9 @@ import numpy as np from scipy import sparse as sp -from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype from daal4py import engines_mt19937 from daal4py import kmeans_init as daal4py_kmeans_init +from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype from onedal import _backend from ..datatypes import _convert_to_supported, from_table, to_table @@ -158,7 +158,9 @@ def _get_onedal_params(self, dtype=np.float32, result_options=None): } def _get_params_and_input(self, X, policy): - X_loc = _check_array(X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False) + X_loc = _check_array( + X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False + ) X_loc = _convert_to_supported(policy, X_loc) @@ -197,43 +199,43 @@ def _init_centroids_custom( return centers_table - #TODO: remove when oneDAL KMeansInit has sparsity support + # TODO: remove when oneDAL KMeansInit has sparsity support def _init_centroids_sparse( - self, X, init, random_seed, policy, dtype=np.float32, n_centroids=None - ): - n_clusters = self.n_clusters if n_centroids is None else n_centroids - X_fptype = parse_dtype(dtype) - daal_engine = engines_mt19937( - fptype=X_fptype, method="defaultDense", seed=random_seed - ) - if isinstance(init, str) and init == "k-means++": - _n_local_trials = 2 + int(np.log(nClusters)) - kmeans_init_res = daal4py_kmeans_init( - n_clusters, - fptype=X_fptype, - nTrials=_n_local_trials, - method="plusPlusCSR", - engine=daal_engine, - ).compute(X) - centers_table = to_table(kmeans_init_res.centroids) - elif isinstance(init, str) and init == "random": - kmeans_init_res = daal4py_kmeans_init( - n_clusters, - fptype=X_fptype, - method="randomCSR", - engine=daal_engine, - ).compute(X) - centers_table = to_table(kmeans_init_res.centroids) - elif _is_arraylike_not_scalar(init): - centers = np.asarray(init) - # assert centers.shape[0] == n_clusters - # assert centers.shape[1] == X.column_count - centers = _convert_to_supported(policy, init) - centers_table = to_table(centers) - else: - raise TypeError("Unsupported type of the `init` value") + self, X, init, random_seed, policy, dtype=np.float32, n_centroids=None + ): + n_clusters = self.n_clusters if n_centroids is None else n_centroids + X_fptype = parse_dtype(dtype) + daal_engine = engines_mt19937( + fptype=X_fptype, method="defaultDense", seed=random_seed + ) + if isinstance(init, str) and init == "k-means++": + _n_local_trials = 2 + int(np.log(nClusters)) + kmeans_init_res = daal4py_kmeans_init( + n_clusters, + fptype=X_fptype, + nTrials=_n_local_trials, + method="plusPlusCSR", + engine=daal_engine, + ).compute(X) + centers_table = to_table(kmeans_init_res.centroids) + elif isinstance(init, str) and init == "random": + kmeans_init_res = daal4py_kmeans_init( + n_clusters, + fptype=X_fptype, + method="randomCSR", + engine=daal_engine, + ).compute(X) + centers_table = to_table(kmeans_init_res.centroids) + elif _is_arraylike_not_scalar(init): + centers = np.asarray(init) + # assert centers.shape[0] == n_clusters + # assert centers.shape[1] == X.column_count + centers = _convert_to_supported(policy, init) + centers_table = to_table(centers) + else: + raise TypeError("Unsupported type of the `init` value") - return centers_table + return centers_table def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float32): n_samples = X.shape[0] @@ -308,7 +310,11 @@ def is_better_iteration(inertia, labels): self._validate_center_shape(X, init) is_sparse = sp.issparse(X) - use_custom_init = daal_check_version((2023, "P", 200)) and not callable(self.init) and not is_sparse + use_custom_init = ( + daal_check_version((2023, "P", 200)) + and not callable(self.init) + and not is_sparse + ) use_sparse_init = is_sparse for _ in range(self._n_init): From 44b055bbe11bf22b1be7a84a0bdea8a758723841 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 23 Apr 2024 01:26:39 -0700 Subject: [PATCH 020/130] minor fix --- onedal/cluster/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 60aaef047d..119fd774c1 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -209,7 +209,7 @@ def _init_centroids_sparse( fptype=X_fptype, method="defaultDense", seed=random_seed ) if isinstance(init, str) and init == "k-means++": - _n_local_trials = 2 + int(np.log(nClusters)) + _n_local_trials = 2 + int(np.log(n_clusters)) kmeans_init_res = daal4py_kmeans_init( n_clusters, fptype=X_fptype, From c689503836ebd8a3ea99576502471ab06da6a7b2 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 23 Apr 2024 02:09:28 -0700 Subject: [PATCH 021/130] callable init --- onedal/cluster/kmeans.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 119fd774c1..a1eaf9ca96 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -173,7 +173,7 @@ def _get_params_and_input(self, X, policy): return (params, X_table, dtype) - def _init_centroids_custom( + def _init_centroids_custom_dense( self, X_table, init, random_seed, policy, dtype=np.float32, n_centroids=None ): n_clusters = self.n_clusters if n_centroids is None else n_centroids @@ -200,7 +200,7 @@ def _init_centroids_custom( return centers_table # TODO: remove when oneDAL KMeansInit has sparsity support - def _init_centroids_sparse( + def _init_centroids_custom_sparse( self, X, init, random_seed, policy, dtype=np.float32, n_centroids=None ): n_clusters = self.n_clusters if n_centroids is None else n_centroids @@ -310,23 +310,27 @@ def is_better_iteration(inertia, labels): self._validate_center_shape(X, init) is_sparse = sp.issparse(X) - use_custom_init = ( + use_custom_dense_init = ( daal_check_version((2023, "P", 200)) and not callable(self.init) and not is_sparse ) - use_sparse_init = is_sparse + use_custom_sparse_init = ( + daal_check_version((2023, "P", 200)) + and not callable(self.init) + and is_sparse + ) for _ in range(self._n_init): - if use_custom_init: + if use_custom_dense_init: # random_seed = random_state.tomaxint() random_seed = random_state.randint(np.iinfo("i").max) - centroids_table = self._init_centroids_custom( + centroids_table = self._init_centroids_custom_dense( X_table, init, random_seed, policy, dtype=dtype ) - elif use_sparse_init: + elif use_custom_sparse_init: random_seed = random_state.randint(np.iinfo("i").max) - centroids_table = self._init_centroids_sparse( + centroids_table = self._init_centroids_custom_sparse( X, init, random_seed, policy, dtype=dtype ) else: From 99336d4a626e16b57f8a9657a190cdb9c24be15b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 23 Apr 2024 02:26:16 -0700 Subject: [PATCH 022/130] lint --- onedal/cluster/kmeans.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index a1eaf9ca96..6eb358ba90 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -316,9 +316,7 @@ def is_better_iteration(inertia, labels): and not is_sparse ) use_custom_sparse_init = ( - daal_check_version((2023, "P", 200)) - and not callable(self.init) - and is_sparse + daal_check_version((2023, "P", 200)) and not callable(self.init) and is_sparse ) for _ in range(self._n_init): From bdd9e952c50ff8da52f25311eb1b779cba8698fc Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 23 Apr 2024 06:39:03 -0700 Subject: [PATCH 023/130] table fix --- onedal/cluster/kmeans.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 6eb358ba90..6068c818c4 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -217,7 +217,8 @@ def _init_centroids_custom_sparse( method="plusPlusCSR", engine=daal_engine, ).compute(X) - centers_table = to_table(kmeans_init_res.centroids) + centers = _convert_to_supported(policy, kmeans_init_res.centroids) + centers_table = to_table(centers) elif isinstance(init, str) and init == "random": kmeans_init_res = daal4py_kmeans_init( n_clusters, @@ -225,11 +226,12 @@ def _init_centroids_custom_sparse( method="randomCSR", engine=daal_engine, ).compute(X) - centers_table = to_table(kmeans_init_res.centroids) + centers = _convert_to_supported(policy, kmeans_init_res.centroids) + centers_table = to_table(centers) elif _is_arraylike_not_scalar(init): centers = np.asarray(init) - # assert centers.shape[0] == n_clusters - # assert centers.shape[1] == X.column_count + assert centers.shape[0] == n_clusters + assert centers.shape[1] == X.column_count centers = _convert_to_supported(policy, init) centers_table = to_table(centers) else: From 53ac098f9075129a5e7655b9dcd2f454eea0d96e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 23 Apr 2024 06:45:59 -0700 Subject: [PATCH 024/130] minor --- sklearnex/cluster/k_means.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index b809b68768..4c4fb10a8b 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -284,7 +284,11 @@ def _onedal_predict_supported(self, method_name, X, sample_weight): [ ( self.algorithm in supported_algs, - "Only lloyd algorithm is supported, elkan is computed using lloyd", + "Only lloyd algorithm is supported, elkan is computed using lloyd.", + ), + ( + hasattr(self, "_onedal_estimator"), + "oneDAL model was not fit.", ), ] ) From 66a02dde2768a42b48a549067d91b03b359a28d8 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 23 Apr 2024 07:46:07 -0700 Subject: [PATCH 025/130] minor --- onedal/cluster/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 6068c818c4..b8c3a2d960 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -231,7 +231,7 @@ def _init_centroids_custom_sparse( elif _is_arraylike_not_scalar(init): centers = np.asarray(init) assert centers.shape[0] == n_clusters - assert centers.shape[1] == X.column_count + assert centers.shape[1] == X.shape[1] centers = _convert_to_supported(policy, init) centers_table = to_table(centers) else: From 9c5580a4073d345f68cf8fcdaaa295d29e174d67 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 23 Apr 2024 11:19:45 -0700 Subject: [PATCH 026/130] rename attribute --- onedal/cluster/kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index b8c3a2d960..fea84fd124 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -90,9 +90,9 @@ def _tolerance(self, rtol, X_table, policy, dtype=np.float32): if rtol == 0.0: return rtol # TODO: Support CSR in Basic Statistics - dummy = to_table(None) + dummy_weights_table = to_table(None) bs = self._get_basic_statistics_backend("variance") - res = bs.compute_raw(X_table, dummy, policy, dtype) + res = bs.compute_raw(X_table, dummy_weights_table, policy, dtype) mean_var = from_table(res["variance"]).mean() return mean_var * rtol From 6aee2f7d36ac8c9ec8865b9ba4347de45356857b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 23 Apr 2024 12:03:05 -0700 Subject: [PATCH 027/130] test, revert later --- onedal/cluster/kmeans.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index fea84fd124..8b5467f84e 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -158,12 +158,13 @@ def _get_onedal_params(self, dtype=np.float32, result_options=None): } def _get_params_and_input(self, X, policy): + print("is sparse X:", sp.issparse(X)) X_loc = _check_array( X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False ) - + print("is sparse X_loc:", sp.issparse(X_loc)) X_loc = _convert_to_supported(policy, X_loc) - + print("is sparse X_loc 2:", sp.issparse(X_loc)) dtype = get_dtype(X_loc) X_table = to_table(X_loc) From e6a01c63469f042e46ee8697feace9225a7b0909 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 24 Apr 2024 04:12:26 -0700 Subject: [PATCH 028/130] minor --- onedal/basic_statistics/basic_statistics.cpp | 1 + onedal/cluster/kmeans.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.cpp b/onedal/basic_statistics/basic_statistics.cpp index 6801f84296..e72d9c7f50 100644 --- a/onedal/basic_statistics/basic_statistics.cpp +++ b/onedal/basic_statistics/basic_statistics.cpp @@ -41,6 +41,7 @@ struct method2t { const auto method = params["method"].cast(); ONEDAL_PARAM_DISPATCH_VALUE(method, "dense", ops, Float, method::dense); + ONEDAL_PARAM_DISPATCH_VALUE(method, "sparse", ops, Float, method::sparse); ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 8b5467f84e..32255b7622 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -158,13 +158,10 @@ def _get_onedal_params(self, dtype=np.float32, result_options=None): } def _get_params_and_input(self, X, policy): - print("is sparse X:", sp.issparse(X)) X_loc = _check_array( X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False ) - print("is sparse X_loc:", sp.issparse(X_loc)) X_loc = _convert_to_supported(policy, X_loc) - print("is sparse X_loc 2:", sp.issparse(X_loc)) dtype = get_dtype(X_loc) X_table = to_table(X_loc) From ef2b6a1e9bfd24e3b98df17a2368559494ee078c Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 24 Apr 2024 06:09:39 -0700 Subject: [PATCH 029/130] add sparsity --- onedal/basic_statistics/basic_statistics.cpp | 20 ++++++++++++++++---- onedal/basic_statistics/basic_statistics.py | 16 +++++++++------- onedal/cluster/kmeans.cpp | 7 ++++--- onedal/cluster/kmeans.py | 6 +++--- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.cpp b/onedal/basic_statistics/basic_statistics.cpp index e72d9c7f50..5bd0e7a942 100644 --- a/onedal/basic_statistics/basic_statistics.cpp +++ b/onedal/basic_statistics/basic_statistics.cpp @@ -111,8 +111,20 @@ struct params2desc { template auto operator()(const py::dict& params) { auto desc = dal::basic_statistics::descriptor() - .set_result_options(get_onedal_result_options(params)); + Method, + dal::basic_statistics::task::compute>() + .set_result_options(get_onedal_result_options(params)); + return desc; + } +}; + +struct params2desc_partial { + template + auto operator()(const py::dict& params) { + auto desc = dal::basic_statistics::descriptor() + .set_result_options(get_onedal_result_options(params)); return desc; } }; @@ -149,7 +161,7 @@ void init_partial_compute_ops(py::module& m) { const table& weights) { using namespace dal::basic_statistics; using input_t = partial_compute_input; - partial_compute_ops ops(policy, input_t{ prev, data, weights }, params2desc{}); + partial_compute_ops ops(policy, input_t{ prev, data, weights }, params2desc_partial{}); return fptype2t{ method2t{ Task{}, ops } }(params); } ); @@ -160,7 +172,7 @@ void init_finalize_compute_ops(pybind11::module_& m) { using namespace dal::basic_statistics; using input_t = partial_compute_result; m.def("finalize_compute", [](const Policy& policy, const pybind11::dict& params, const input_t& data) { - finalize_compute_ops ops(policy, data, params2desc{}); + finalize_compute_ops ops(policy, data, params2desc_partial{}); return fptype2t{ method2t{ Task{}, ops } }(params); }); } diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 852c71dd20..1b30d8cfe4 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -18,11 +18,13 @@ from numbers import Number import numpy as np +from scipy import sparse as sp from onedal import _backend from ..common._base import BaseEstimator from ..datatypes import _convert_to_supported, from_table, to_table +from ..utils import _check_array class BaseBasicStatistics(metaclass=ABCMeta): @@ -54,16 +56,16 @@ def _get_result_options(self, options): assert isinstance(options, str) return options - def _get_onedal_params(self, dtype=np.float32): + def _get_onedal_params(self, data_table, dtype=np.float32): options = self._get_result_options(self.options) return { "fptype": "float" if dtype == np.float32 else "double", - "method": self.algorithm, + "method": "sparse" if sp.issparse(data_table) else self.algorithm, "result_option": options, } def _compute_raw(self, data_table, weights_table, module, policy, dtype=np.float32): - params = self._get_onedal_params(dtype) + params = self._get_onedal_params(data_table, dtype) result = module.train(policy, params, data_table, weights_table) @@ -75,14 +77,14 @@ def _compute_raw(self, data_table, weights_table, module, policy, dtype=np.float def _compute(self, data, weights, module, queue): policy = self._get_policy(queue, data, weights) - if not (data is None): - data = np.asarray(data) + data_loc = _check_array(data, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False) + if not (weights is None): weights = np.asarray(weights) - data, weights = _convert_to_supported(policy, data, weights) + data_loc, weights = _convert_to_supported(policy, data_loc, weights) - data_table, weights_table = to_table(data, weights) + data_table, weights_table = to_table(data_loc, weights) dtype = data.dtype res = self._compute_raw(data_table, weights_table, module, policy, dtype) diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp index b1a3d0d277..6528243659 100644 --- a/onedal/cluster/kmeans.cpp +++ b/onedal/cluster/kmeans.cpp @@ -38,6 +38,7 @@ struct method2t { const auto method = params["method"].cast(); ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_dense", ops, Float, method::lloyd_dense); + ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_csr", ops, Float, method::lloyd_csr); ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } @@ -47,13 +48,13 @@ struct method2t { template struct descriptor_creator {}; -template +template struct descriptor_creator { static auto get() { return dal::kmeans::descriptor{}; } }; diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 32255b7622..c345388a2a 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -145,11 +145,11 @@ def _check_params_vs_input( self._n_init = 1 assert self.algorithm == "lloyd" - def _get_onedal_params(self, dtype=np.float32, result_options=None): + def _get_onedal_params(self, X_table, dtype=np.float32, result_options=None): thr = self._tol if hasattr(self, "_tol") else self.tol return { "fptype": "float" if dtype == np.float32 else "double", - "method": "by_default", + "method": "lloyd_csr" if sp.issparse(X_table) else "by_default", "seed": -1, "max_iteration_count": self.max_iter, "cluster_count": self.n_clusters, @@ -167,7 +167,7 @@ def _get_params_and_input(self, X, policy): self._check_params_vs_input(X_table, policy, dtype=dtype) - params = self._get_onedal_params(dtype) + params = self._get_onedal_params(X_table, dtype) return (params, X_table, dtype) From e2c7c311bb62c2e9b1cce30aacb147899971b42a Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 24 Apr 2024 06:11:41 -0700 Subject: [PATCH 030/130] lint --- onedal/basic_statistics/basic_statistics.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 1b30d8cfe4..a71fb83ab1 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -77,7 +77,12 @@ def _compute_raw(self, data_table, weights_table, module, policy, dtype=np.float def _compute(self, data, weights, module, queue): policy = self._get_policy(queue, data, weights) - data_loc = _check_array(data, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False) + data_loc = _check_array( + data, + dtype=[np.float64, np.float32], + accept_sparse="csr", + force_all_finite=False, + ) if not (weights is None): weights = np.asarray(weights) From 52d159bebe62882dbee6a02a92e1222907a4a0ee Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 25 Apr 2024 23:59:06 -0700 Subject: [PATCH 031/130] replace basic stat with numpy --- onedal/basic_statistics/basic_statistics.cpp | 6 +-- onedal/basic_statistics/basic_statistics.py | 3 +- onedal/cluster/kmeans.py | 48 ++++++++++++-------- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.cpp b/onedal/basic_statistics/basic_statistics.cpp index 5bd0e7a942..21ae47eafc 100644 --- a/onedal/basic_statistics/basic_statistics.cpp +++ b/onedal/basic_statistics/basic_statistics.cpp @@ -118,7 +118,7 @@ struct params2desc { } }; -struct params2desc_partial { +struct params2desc_incremental { template auto operator()(const py::dict& params) { auto desc = dal::basic_statistics::descriptor; - partial_compute_ops ops(policy, input_t{ prev, data, weights }, params2desc_partial{}); + partial_compute_ops ops(policy, input_t{ prev, data, weights }, params2desc_incremental{}); return fptype2t{ method2t{ Task{}, ops } }(params); } ); @@ -172,7 +172,7 @@ void init_finalize_compute_ops(pybind11::module_& m) { using namespace dal::basic_statistics; using input_t = partial_compute_result; m.def("finalize_compute", [](const Policy& policy, const pybind11::dict& params, const input_t& data) { - finalize_compute_ops ops(policy, data, params2desc_partial{}); + finalize_compute_ops ops(policy, data, params2desc_incremental{}); return fptype2t{ method2t{ Task{}, ops } }(params); }); } diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index a71fb83ab1..772b3a77e1 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -89,7 +89,8 @@ def _compute(self, data, weights, module, queue): data_loc, weights = _convert_to_supported(policy, data_loc, weights) - data_table, weights_table = to_table(data_loc, weights) + data_table = to_table(data_loc) + weights_table = to_table(weights) dtype = data.dtype res = self._compute_raw(data_table, weights_table, module, policy, dtype) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index c345388a2a..fe8ed33353 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -36,13 +36,14 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.metrics.pairwise import euclidean_distances from sklearn.utils import check_array, check_random_state +from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.utils.validation import check_is_fitted -from onedal.basic_statistics import BasicStatistics - from ..common._base import BaseEstimator as onedal_BaseEstimator from ..utils import _check_array, _is_arraylike_not_scalar +# from onedal.basic_statistics import BasicStatistics + class _BaseKMeans(onedal_BaseEstimator, TransformerMixin, ClusterMixin, ABC): def __init__( @@ -82,31 +83,40 @@ def _validate_center_shape(self, X, centers): def _get_kmeans_init(self, cluster_count, seed, algorithm): return KMeansInit(cluster_count=cluster_count, seed=seed, algorithm=algorithm) - def _get_basic_statistics_backend(self, result_options): - return BasicStatistics(result_options) - - def _tolerance(self, rtol, X_table, policy, dtype=np.float32): + # def _get_basic_statistics_backend(self, result_options): + # return BasicStatistics(result_options) + + # def _tolerance(self, rtol, X_table, policy, dtype=np.float32): + # """Compute absolute tolerance from the relative tolerance""" + # if rtol == 0.0: + # return rtol + # # TODO: Support CSR in Basic Statistics + # dummy_weights_table = to_table(None) + # bs = self._get_basic_statistics_backend("variance") + # res = bs.compute_raw(X_table, dummy_weights_table, policy, dtype) + # mean_var = from_table(res["variance"]).mean() + # return mean_var * rtol + + def _tolerance(self, X, rtol): """Compute absolute tolerance from the relative tolerance""" if rtol == 0.0: return rtol - # TODO: Support CSR in Basic Statistics - dummy_weights_table = to_table(None) - bs = self._get_basic_statistics_backend("variance") - res = bs.compute_raw(X_table, dummy_weights_table, policy, dtype) - mean_var = from_table(res["variance"]).mean() + if sp.issparse(X): + variances = mean_variance_axis(X, axis=0)[1] + mean_var = np.mean(variances) + else: + mean_var = np.var(X, axis=0).mean() return mean_var * rtol - def _check_params_vs_input( - self, X_table, policy, default_n_init=10, dtype=np.float32 - ): + def _check_params_vs_input(self, X_loc, policy, default_n_init=10, dtype=np.float32): # n_clusters - if X_table.shape[0] < self.n_clusters: + if X_loc.shape[0] < self.n_clusters: raise ValueError( - f"n_samples={X_table.shape[0]} should be >= n_clusters={self.n_clusters}." + f"n_samples={X_loc.shape[0]} should be >= n_clusters={self.n_clusters}." ) # tol - self._tol = self._tolerance(self.tol, X_table, policy, dtype) + self._tol = self._tolerance(X_loc, self.tol) # n-init # TODO(1.4): Remove @@ -165,7 +175,7 @@ def _get_params_and_input(self, X, policy): dtype = get_dtype(X_loc) X_table = to_table(X_loc) - self._check_params_vs_input(X_table, policy, dtype=dtype) + self._check_params_vs_input(X_loc, policy, dtype=dtype) params = self._get_onedal_params(X_table, dtype) @@ -227,7 +237,7 @@ def _init_centroids_custom_sparse( centers = _convert_to_supported(policy, kmeans_init_res.centroids) centers_table = to_table(centers) elif _is_arraylike_not_scalar(init): - centers = np.asarray(init) + centers = np.asarray(init, dtype=dtype) assert centers.shape[0] == n_clusters assert centers.shape[1] == X.shape[1] centers = _convert_to_supported(policy, init) From 845b8c6e9bea29dd28f147f758b74a15b248e7b2 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 26 Apr 2024 01:04:28 -0700 Subject: [PATCH 032/130] remove skip --- sklearnex/tests/test_patching.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearnex/tests/test_patching.py b/sklearnex/tests/test_patching.py index 07c599d6b1..0a013f7f49 100755 --- a/sklearnex/tests/test_patching.py +++ b/sklearnex/tests/test_patching.py @@ -126,7 +126,6 @@ def test_standard_estimator_patching(caplog, dataframe, queue, dtype, estimator, elif dtype == np.float64 and not queue.sycl_device.has_aspect_fp64: pytest.skip("Hardware does not support fp64 SYCL testing") elif queue.sycl_device.is_gpu and estimator in [ - "KMeans", "ElasticNet", "Lasso", "Ridge", From 1044fad8da46006443b1be24c94a9b6be4500a38 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 26 Apr 2024 03:49:45 -0700 Subject: [PATCH 033/130] CI fixes --- onedal/basic_statistics/basic_statistics.py | 3 +- onedal/cluster/kmeans.py | 51 +++++++++------------ sklearnex/cluster/k_means.py | 3 ++ 3 files changed, 25 insertions(+), 32 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 772b3a77e1..a71fb83ab1 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -89,8 +89,7 @@ def _compute(self, data, weights, module, queue): data_loc, weights = _convert_to_supported(policy, data_loc, weights) - data_table = to_table(data_loc) - weights_table = to_table(weights) + data_table, weights_table = to_table(data_loc, weights) dtype = data.dtype res = self._compute_raw(data_table, weights_table, module, policy, dtype) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index fe8ed33353..caa6538056 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -36,14 +36,13 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.metrics.pairwise import euclidean_distances from sklearn.utils import check_array, check_random_state -from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.utils.validation import check_is_fitted +from onedal.basic_statistics import BasicStatistics + from ..common._base import BaseEstimator as onedal_BaseEstimator from ..utils import _check_array, _is_arraylike_not_scalar -# from onedal.basic_statistics import BasicStatistics - class _BaseKMeans(onedal_BaseEstimator, TransformerMixin, ClusterMixin, ABC): def __init__( @@ -83,40 +82,30 @@ def _validate_center_shape(self, X, centers): def _get_kmeans_init(self, cluster_count, seed, algorithm): return KMeansInit(cluster_count=cluster_count, seed=seed, algorithm=algorithm) - # def _get_basic_statistics_backend(self, result_options): - # return BasicStatistics(result_options) - - # def _tolerance(self, rtol, X_table, policy, dtype=np.float32): - # """Compute absolute tolerance from the relative tolerance""" - # if rtol == 0.0: - # return rtol - # # TODO: Support CSR in Basic Statistics - # dummy_weights_table = to_table(None) - # bs = self._get_basic_statistics_backend("variance") - # res = bs.compute_raw(X_table, dummy_weights_table, policy, dtype) - # mean_var = from_table(res["variance"]).mean() - # return mean_var * rtol - - def _tolerance(self, X, rtol): + def _get_basic_statistics_backend(self, result_options): + return BasicStatistics(result_options) + + def _tolerance(self, rtol, X_table, policy, dtype=np.float32): """Compute absolute tolerance from the relative tolerance""" if rtol == 0.0: return rtol - if sp.issparse(X): - variances = mean_variance_axis(X, axis=0)[1] - mean_var = np.mean(variances) - else: - mean_var = np.var(X, axis=0).mean() + dummy_weights_table = to_table(None) + bs = self._get_basic_statistics_backend("variance") + res = bs.compute_raw(X_table, dummy_weights_table, policy, dtype) + mean_var = from_table(res["variance"]).mean() return mean_var * rtol - def _check_params_vs_input(self, X_loc, policy, default_n_init=10, dtype=np.float32): + def _check_params_vs_input( + self, X_table, policy, default_n_init=10, dtype=np.float32 + ): # n_clusters - if X_loc.shape[0] < self.n_clusters: + if X_table.shape[0] < self.n_clusters: raise ValueError( - f"n_samples={X_loc.shape[0]} should be >= n_clusters={self.n_clusters}." + f"n_samples={X_table.shape[0]} should be >= n_clusters={self.n_clusters}." ) # tol - self._tol = self._tolerance(X_loc, self.tol) + self._tol = self._tolerance(self.tol, X_table, policy, dtype) # n-init # TODO(1.4): Remove @@ -175,7 +164,7 @@ def _get_params_and_input(self, X, policy): dtype = get_dtype(X_loc) X_table = to_table(X_loc) - self._check_params_vs_input(X_loc, policy, dtype=dtype) + self._check_params_vs_input(X_table, policy, dtype=dtype) params = self._get_onedal_params(X_table, dtype) @@ -276,7 +265,7 @@ def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float3 return to_table(centers) def _fit_backend(self, X_table, centroids_table, module, policy, dtype=np.float32): - params = self._get_onedal_params(dtype) + params = self._get_onedal_params(X_table, dtype) # TODO: check all features for having correct type meta = _backend.get_table_metadata(X_table) @@ -407,7 +396,9 @@ def _set_cluster_centers(self, cluster_centers): cluster_centers_ = property(_get_cluster_centers, _set_cluster_centers) def _predict_raw(self, X_table, module, policy, dtype=np.float32): - params = self._get_onedal_params(dtype, result_options="compute_assignments") + params = self._get_onedal_params( + X_table, dtype, result_options="compute_assignments" + ) result = module.infer(policy, params, self.model_, X_table) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 4c4fb10a8b..3e42ff9600 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -31,6 +31,7 @@ check_is_fitted, ) + from daal4py.sklearn._device_offload import support_usm_ndarray from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version from onedal.cluster import KMeans as onedal_KMeans @@ -367,6 +368,8 @@ def transform(self, X): X = self._check_test_data(X) return self._transform(X) + score = support_usm_ndarray()(sklearn_KMeans.score) + fit.__doc__ = sklearn_KMeans.fit.__doc__ predict.__doc__ = sklearn_KMeans.predict.__doc__ transform.__doc__ = sklearn_KMeans.transform.__doc__ From 55a2df888d336a0f08663b4ab9e45eaa0254a4b9 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 26 Apr 2024 05:57:55 -0700 Subject: [PATCH 034/130] CI fixes --- onedal/cluster/kmeans.py | 9 +++++---- sklearnex/cluster/k_means.py | 9 ++++++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index caa6538056..a94b432b9e 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -226,9 +226,8 @@ def _init_centroids_custom_sparse( centers = _convert_to_supported(policy, kmeans_init_res.centroids) centers_table = to_table(centers) elif _is_arraylike_not_scalar(init): - centers = np.asarray(init, dtype=dtype) - assert centers.shape[0] == n_clusters - assert centers.shape[1] == X.shape[1] + assert init.shape[0] == n_clusters + assert init.shape[1] == X.shape[1] centers = _convert_to_supported(policy, init) centers_table = to_table(centers) else: @@ -305,7 +304,9 @@ def is_better_iteration(inertia, labels): init = self.init init_is_array_like = _is_arraylike_not_scalar(init) if init_is_array_like: - init = check_array(init, dtype=dtype, copy=True, order="C") + init = check_array( + init, dtype=dtype, accept_sparse="csr", copy=True, order="C" + ) self._validate_center_shape(X, init) is_sparse = sp.issparse(X) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 3e42ff9600..98154d4646 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -215,6 +215,13 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): self._algorithm = self.algorithm supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count + if sample_weight is not None: + if sample_weight.shape == (X.shape[0],) and (np.allclose(sample_weight, np.ones_like(sample_weight))): + is_sample_weight_valid = True + else: + is_sample_weight_valid = False + else: + is_sample_weight_valid = True patching_status.and_conditions( [ @@ -223,7 +230,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): "Only lloyd algorithm is supported, elkan is computed using lloyd", ), (correct_count, "n_clusters is smaller than number of samples"), - (sample_weight is None, "Sample weight is not None."), + (is_sample_weight_valid, "Sample weight must be None or array of ones of length n_samples."), ] ) From 64f4d3014e35fbf6047ac874bd6a543193eed43c Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 26 Apr 2024 08:52:52 -0700 Subject: [PATCH 035/130] lint --- sklearnex/cluster/k_means.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 98154d4646..0d734f5903 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -216,7 +216,9 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count if sample_weight is not None: - if sample_weight.shape == (X.shape[0],) and (np.allclose(sample_weight, np.ones_like(sample_weight))): + if sample_weight.shape == (X.shape[0],) and ( + np.allclose(sample_weight, np.ones_like(sample_weight)) + ): is_sample_weight_valid = True else: is_sample_weight_valid = False @@ -230,7 +232,10 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): "Only lloyd algorithm is supported, elkan is computed using lloyd", ), (correct_count, "n_clusters is smaller than number of samples"), - (is_sample_weight_valid, "Sample weight must be None or array of ones of length n_samples."), + ( + is_sample_weight_valid, + "Sample weight must be None or array of ones of length n_samples.", + ), ] ) From b95f784b863bb31e039aa08c8c47eec637d6331c Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 26 Apr 2024 09:44:16 -0700 Subject: [PATCH 036/130] minor --- sklearnex/cluster/k_means.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 0d734f5903..89ec3e9364 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -216,8 +216,8 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count if sample_weight is not None: - if sample_weight.shape == (X.shape[0],) and ( - np.allclose(sample_weight, np.ones_like(sample_weight)) + if len(sample_weight) == (X.shape[0],) and ( + np.allclose(np.asarray(sample_weight), np.ones_like(sample_weight)) ): is_sample_weight_valid = True else: From fa1f7047b2d0e9ea6662f06310b4018d69db783f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 26 Apr 2024 14:15:55 -0700 Subject: [PATCH 037/130] fix sample_weight --- sklearnex/cluster/k_means.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 89ec3e9364..f8951970f6 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -26,6 +26,7 @@ from sklearn.cluster import KMeans as sklearn_KMeans from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.validation import ( + _check_sample_weight, _deprecate_positional_args, _num_samples, check_is_fitted, @@ -215,15 +216,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): self._algorithm = self.algorithm supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count - if sample_weight is not None: - if len(sample_weight) == (X.shape[0],) and ( - np.allclose(np.asarray(sample_weight), np.ones_like(sample_weight)) - ): - is_sample_weight_valid = True - else: - is_sample_weight_valid = False - else: - is_sample_weight_valid = True + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) patching_status.and_conditions( [ @@ -233,8 +226,8 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): ), (correct_count, "n_clusters is smaller than number of samples"), ( - is_sample_weight_valid, - "Sample weight must be None or array of ones of length n_samples.", + np.allclose(sample_weight, np.ones_like(sample_weight)), + "Sample weights are not ones.", ), ] ) From 919d5a0b645e4c015841403625fa21bcb87116d6 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Sat, 27 Apr 2024 06:08:50 -0700 Subject: [PATCH 038/130] pandas dtype --- sklearnex/cluster/k_means.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index f8951970f6..4c71f6d229 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -216,7 +216,8 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): self._algorithm = self.algorithm supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None) patching_status.and_conditions( [ From ca18b840f4f1927ae4398bf5ed967f475f791073 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Sat, 27 Apr 2024 06:11:09 -0700 Subject: [PATCH 039/130] lint --- sklearnex/cluster/k_means.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 4c71f6d229..fa1dbcca4f 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -217,7 +217,9 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None) + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None + ) patching_status.and_conditions( [ From 6c12e380e923de5c8ca09e962f028720b69c898b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Sat, 27 Apr 2024 09:27:17 -0700 Subject: [PATCH 040/130] remove deselected tests --- deselected_tests.yaml | 31 ++++--------------------------- sklearnex/cluster/k_means.py | 2 -- 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 6e194b9189..0d69fdb6d2 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -178,10 +178,10 @@ deselected_tests: - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 - # Sparse Support required - - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 - - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 - - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 + # # Sparse Support required + # - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 + # - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 + # - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 @@ -244,9 +244,6 @@ deselected_tests: # Different results scikit-learn-intelex and scikit-learn linear regression with weights. Need to investigate. - inspection/tests/test_permutation_importance.py::test_permutation_importance_sample_weight >=0.24 - # Patched and unpatched kmeans set same values to different clusters. Need to investigate. - # - preprocessing/tests/test_discretization.py::test_nonuniform_strategies[kmeans-expected_2bins1-expected_3bins1-expected_5bins1] >=0.24 - # OOB scores in scikit-learn and oneDAL are different because of different random number generators - ensemble/tests/test_forest.py::test_forest_classifier_oob[X1-y1-0.65-array-ExtraTreesClassifier] - ensemble/tests/test_forest.py::test_forest_classifier_oob[True-X1-y1-0.65-array-ExtraTreesClassifier] >=1.3 @@ -350,14 +347,6 @@ deselected_tests: - tests/test_common.py::test_estimators[LogisticRegression()-check_sample_weights_invariance(kind=zeros)] >=1.4 - tests/test_multioutput.py::test_classifier_chain_fit_and_predict_with_sparse_data >=1.4 - # New failing sklearn1.4.1 tests for kmeans associated with incorrect n_iter_ values in daal4py - # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-dense] >=1.4 - # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_matrix] >=1.4 - # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_array] >=1.4 - # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-dense] >=1.4 - # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_matrix] >=1.4 - # - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_array] >=1.4 - # Deselected tests for incremental algorithms # Need to rework getting policy to correctly obtain it for method without data (finalize_fit) # and avoid keeping it in class attribute, also need to investigate how to implement @@ -449,8 +438,6 @@ gpu: # Fails - cluster/tests/test_dbscan.py::test_weighted_dbscan - # - cluster/tests/test_k_means.py::test_k_means_fit_predict - # - cluster/tests/test_k_means.py::test_predict - ensemble/tests/test_bagging.py::test_gridsearch - ensemble/tests/test_bagging.py::test_estimators_samples @@ -628,16 +615,6 @@ gpu: - tests/test_common.py::test_search_cv - manifold/tests/test_t_sne.py::test_n_iter_without_progress - # KMeans based (unsupported for GPU) - # - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_fit_check_is_fitted] - # - tests/test_common.py::test_estimators[GaussianMixture()-check_fit_check_is_fitted] - # - tests/test_common.py::test_check_n_features_in_after_fitting[BayesianGaussianMixture()] - # - tests/test_common.py::test_check_n_features_in_after_fitting[GaussianMixture()] - # - mixture/tests/test_gaussian_mixture.py - # - model_selection/tests/test_validation.py::test_cross_val_predict - # - metrics/tests/test_score_objects.py::test_supervised_cluster_scorers - # - tests/test_pipeline.py::test_fit_predict_on_pipeline - # - tests/test_discriminant_analysis.py::test_lda_predict # Other device issues - tests/test_metaestimators.py::test_meta_estimators_delegate_data_validation[StackingClassifier] - tests/test_multiclass.py::test_ovr_always_present diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index fa1dbcca4f..5555bc8952 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -258,8 +258,6 @@ def fit(self, X, y=None, sample_weight=None): return self def _onedal_fit(self, X, _, sample_weight, queue=None): - assert sample_weight is None - X = self._validate_data( X, accept_sparse="csr", From 91288f641d46742982df7c996c30c74f55636ab5 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 3 May 2024 04:32:42 -0700 Subject: [PATCH 041/130] use numpy variance --- deselected_tests.yaml | 6 --- onedal/cluster/kmeans.py | 38 ++++++++++++------- .../tests/test_run_to_run_stability_tests.py | 3 -- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 0d69fdb6d2..deabd581ed 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -167,7 +167,6 @@ deselected_tests: # test_non_uniform_strategies fails due to differences in handling of vacuous clusters after update # See https://github.com/IntelPython/daal4py/issues/69 - # - cluster/tests/test_k_means.py::test_relocated_clusters >=0.23,<0.24 - cluster/tests/test_k_means.py::test_kmeans_relocated_clusters >=0.24 # In scikit-learn, these algorithms are not included in this test. However, scikit-learn-intelex @@ -178,11 +177,6 @@ deselected_tests: - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 - # # Sparse Support required - # - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 - # - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 - # - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 - # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 - linear_model/tests/test_logistic.py::test_dtype_match diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index a94b432b9e..f2236dad99 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -36,13 +36,14 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.metrics.pairwise import euclidean_distances from sklearn.utils import check_array, check_random_state +from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.utils.validation import check_is_fitted -from onedal.basic_statistics import BasicStatistics - from ..common._base import BaseEstimator as onedal_BaseEstimator from ..utils import _check_array, _is_arraylike_not_scalar +# from onedal.basic_statistics import BasicStatistics + class _BaseKMeans(onedal_BaseEstimator, TransformerMixin, ClusterMixin, ABC): def __init__( @@ -82,17 +83,28 @@ def _validate_center_shape(self, X, centers): def _get_kmeans_init(self, cluster_count, seed, algorithm): return KMeansInit(cluster_count=cluster_count, seed=seed, algorithm=algorithm) - def _get_basic_statistics_backend(self, result_options): - return BasicStatistics(result_options) + # def _get_basic_statistics_backend(self, result_options): + # return BasicStatistics(result_options) - def _tolerance(self, rtol, X_table, policy, dtype=np.float32): + # def _tolerance(self, rtol, X_table, policy, dtype=np.float32): + # """Compute absolute tolerance from the relative tolerance""" + # if rtol == 0.0: + # return rtol + # dummy_weights_table = to_table(None) + # bs = self._get_basic_statistics_backend("variance") + # res = bs.compute_raw(X_table, dummy_weights_table, policy, dtype) + # mean_var = from_table(res["variance"]).mean() + # return mean_var * rtol + + def _tolerance(self, X, rtol): """Compute absolute tolerance from the relative tolerance""" if rtol == 0.0: return rtol - dummy_weights_table = to_table(None) - bs = self._get_basic_statistics_backend("variance") - res = bs.compute_raw(X_table, dummy_weights_table, policy, dtype) - mean_var = from_table(res["variance"]).mean() + if sp.issparse(X): + variances = mean_variance_axis(X, axis=0)[1] + mean_var = np.mean(variances) + else: + mean_var = np.var(X, axis=0).mean() return mean_var * rtol def _check_params_vs_input( @@ -105,7 +117,7 @@ def _check_params_vs_input( ) # tol - self._tol = self._tolerance(self.tol, X_table, policy, dtype) + self._tol = self._tolerance(X_table, self.tol) # n-init # TODO(1.4): Remove @@ -164,7 +176,7 @@ def _get_params_and_input(self, X, policy): dtype = get_dtype(X_loc) X_table = to_table(X_loc) - self._check_params_vs_input(X_table, policy, dtype=dtype) + self._check_params_vs_input(X_loc, policy, dtype=dtype) params = self._get_onedal_params(X_table, dtype) @@ -337,14 +349,14 @@ def is_better_iteration(inertia, labels): ) if self.verbose: - print("Initialization complete") + print("Initialization complete.") labels, inertia, model, n_iter = self._fit_backend( X_table, centroids_table, module, policy, dtype ) if self.verbose: - print("KMeans iteration completed with " "inertia {}.".format(inertia)) + print("KMeans iteration completed with inertia {}.".format(inertia)) if is_better_iteration(inertia, labels): best_model, best_n_iter = model, n_iter diff --git a/sklearnex/tests/test_run_to_run_stability_tests.py b/sklearnex/tests/test_run_to_run_stability_tests.py index 66b5b37765..33f39bea79 100755 --- a/sklearnex/tests/test_run_to_run_stability_tests.py +++ b/sklearnex/tests/test_run_to_run_stability_tests.py @@ -357,9 +357,6 @@ def _run_test(model, methods, dataset): "LogisticRegressionCV", # Absolute diff is 1e-10, will be fixed for next release "RandomForestRegressor", # Absolute diff is 1e-14 in OOB score, # will be fixed for next release - "KMeans", # sparsity support required, - # '_tol' attribute shows numerical instability (diff is 1e-14) coming from basic_statistics - # variance calculation. ] From b51e6bd0d9f8efd4cf8d6ec55e904d5f7aee2b56 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 17 May 2024 13:59:04 -0700 Subject: [PATCH 042/130] test sparse offset --- onedal/datatypes/data_conversion.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/onedal/datatypes/data_conversion.cpp b/onedal/datatypes/data_conversion.cpp index 0d7ceea6a2..f391e39c14 100644 --- a/onedal/datatypes/data_conversion.cpp +++ b/onedal/datatypes/data_conversion.cpp @@ -113,6 +113,16 @@ inline csr_table_t convert_to_csr_impl(PyObject* py_data, for (std::int64_t i = 0; i < row_indices_count; ++i) row_indices_one_based_data[i] = row_indices_zero_based[i] + 1; + auto row_indices_one_based_offset = dal::array::empty(row_count + 1); + auto row_indices_one_based_offset_data = row_indices_one_based_offset.get_mutable_data(); + row_indices_one_based_offset_data[0] = 1; + std::int64_t running_elem_count = 0; + for (std::int64_t i = 1; i < row_count + 1; ++i){ + for (std::int64_t j = running_elem_count; row_indices_one_based_data[j] == i; ++j) + running_elem_count++; + row_indices_one_based_offset_data[i] = running_elem_count + 1; + } + const std::int64_t *column_indices_zero_based = static_cast(array_data(np_column_indices)); const std::int64_t column_indices_count = @@ -133,7 +143,7 @@ inline csr_table_t convert_to_csr_impl(PyObject* py_data, Py_DECREF(np_data); }), column_indices_one_based, - row_indices_one_based, + row_indices_one_based_offset, #if ONEDAL_VERSION <= 20230100 // row_count parameter is present in csr_table's constructor only in older versions of oneDAL row_count, From ffbf7aa11733d23cf8d2050de06586efd9813aa0 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 22 May 2024 05:14:55 -0700 Subject: [PATCH 043/130] revert b51e6bd0d9 --- onedal/datatypes/data_conversion.cpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/onedal/datatypes/data_conversion.cpp b/onedal/datatypes/data_conversion.cpp index f391e39c14..0d7ceea6a2 100644 --- a/onedal/datatypes/data_conversion.cpp +++ b/onedal/datatypes/data_conversion.cpp @@ -113,16 +113,6 @@ inline csr_table_t convert_to_csr_impl(PyObject* py_data, for (std::int64_t i = 0; i < row_indices_count; ++i) row_indices_one_based_data[i] = row_indices_zero_based[i] + 1; - auto row_indices_one_based_offset = dal::array::empty(row_count + 1); - auto row_indices_one_based_offset_data = row_indices_one_based_offset.get_mutable_data(); - row_indices_one_based_offset_data[0] = 1; - std::int64_t running_elem_count = 0; - for (std::int64_t i = 1; i < row_count + 1; ++i){ - for (std::int64_t j = running_elem_count; row_indices_one_based_data[j] == i; ++j) - running_elem_count++; - row_indices_one_based_offset_data[i] = running_elem_count + 1; - } - const std::int64_t *column_indices_zero_based = static_cast(array_data(np_column_indices)); const std::int64_t column_indices_count = @@ -143,7 +133,7 @@ inline csr_table_t convert_to_csr_impl(PyObject* py_data, Py_DECREF(np_data); }), column_indices_one_based, - row_indices_one_based_offset, + row_indices_one_based, #if ONEDAL_VERSION <= 20230100 // row_count parameter is present in csr_table's constructor only in older versions of oneDAL row_count, From da7e6125e386427563e60b2616afa34894714f63 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 22 May 2024 05:24:37 -0700 Subject: [PATCH 044/130] remove basic_statistics changes --- onedal/basic_statistics/basic_statistics.cpp | 21 ++++---------------- onedal/basic_statistics/basic_statistics.py | 21 +++++++------------- 2 files changed, 11 insertions(+), 31 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.cpp b/onedal/basic_statistics/basic_statistics.cpp index 21ae47eafc..6801f84296 100644 --- a/onedal/basic_statistics/basic_statistics.cpp +++ b/onedal/basic_statistics/basic_statistics.cpp @@ -41,7 +41,6 @@ struct method2t { const auto method = params["method"].cast(); ONEDAL_PARAM_DISPATCH_VALUE(method, "dense", ops, Float, method::dense); - ONEDAL_PARAM_DISPATCH_VALUE(method, "sparse", ops, Float, method::sparse); ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } @@ -111,20 +110,8 @@ struct params2desc { template auto operator()(const py::dict& params) { auto desc = dal::basic_statistics::descriptor() - .set_result_options(get_onedal_result_options(params)); - return desc; - } -}; - -struct params2desc_incremental { - template - auto operator()(const py::dict& params) { - auto desc = dal::basic_statistics::descriptor() - .set_result_options(get_onedal_result_options(params)); + dal::basic_statistics::method::dense, dal::basic_statistics::task::compute>() + .set_result_options(get_onedal_result_options(params)); return desc; } }; @@ -161,7 +148,7 @@ void init_partial_compute_ops(py::module& m) { const table& weights) { using namespace dal::basic_statistics; using input_t = partial_compute_input; - partial_compute_ops ops(policy, input_t{ prev, data, weights }, params2desc_incremental{}); + partial_compute_ops ops(policy, input_t{ prev, data, weights }, params2desc{}); return fptype2t{ method2t{ Task{}, ops } }(params); } ); @@ -172,7 +159,7 @@ void init_finalize_compute_ops(pybind11::module_& m) { using namespace dal::basic_statistics; using input_t = partial_compute_result; m.def("finalize_compute", [](const Policy& policy, const pybind11::dict& params, const input_t& data) { - finalize_compute_ops ops(policy, data, params2desc_incremental{}); + finalize_compute_ops ops(policy, data, params2desc{}); return fptype2t{ method2t{ Task{}, ops } }(params); }); } diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index a71fb83ab1..852c71dd20 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -18,13 +18,11 @@ from numbers import Number import numpy as np -from scipy import sparse as sp from onedal import _backend from ..common._base import BaseEstimator from ..datatypes import _convert_to_supported, from_table, to_table -from ..utils import _check_array class BaseBasicStatistics(metaclass=ABCMeta): @@ -56,16 +54,16 @@ def _get_result_options(self, options): assert isinstance(options, str) return options - def _get_onedal_params(self, data_table, dtype=np.float32): + def _get_onedal_params(self, dtype=np.float32): options = self._get_result_options(self.options) return { "fptype": "float" if dtype == np.float32 else "double", - "method": "sparse" if sp.issparse(data_table) else self.algorithm, + "method": self.algorithm, "result_option": options, } def _compute_raw(self, data_table, weights_table, module, policy, dtype=np.float32): - params = self._get_onedal_params(data_table, dtype) + params = self._get_onedal_params(dtype) result = module.train(policy, params, data_table, weights_table) @@ -77,19 +75,14 @@ def _compute_raw(self, data_table, weights_table, module, policy, dtype=np.float def _compute(self, data, weights, module, queue): policy = self._get_policy(queue, data, weights) - data_loc = _check_array( - data, - dtype=[np.float64, np.float32], - accept_sparse="csr", - force_all_finite=False, - ) - + if not (data is None): + data = np.asarray(data) if not (weights is None): weights = np.asarray(weights) - data_loc, weights = _convert_to_supported(policy, data_loc, weights) + data, weights = _convert_to_supported(policy, data, weights) - data_table, weights_table = to_table(data_loc, weights) + data_table, weights_table = to_table(data, weights) dtype = data.dtype res = self._compute_raw(data_table, weights_table, module, policy, dtype) From 9ea8b2b2308ccd8e4d0268b8d985ae3ae8f4b99f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 22 May 2024 05:28:26 -0700 Subject: [PATCH 045/130] remove comments --- onedal/cluster/kmeans.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index f2236dad99..b7d93f853e 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -42,8 +42,6 @@ from ..common._base import BaseEstimator as onedal_BaseEstimator from ..utils import _check_array, _is_arraylike_not_scalar -# from onedal.basic_statistics import BasicStatistics - class _BaseKMeans(onedal_BaseEstimator, TransformerMixin, ClusterMixin, ABC): def __init__( @@ -83,19 +81,6 @@ def _validate_center_shape(self, X, centers): def _get_kmeans_init(self, cluster_count, seed, algorithm): return KMeansInit(cluster_count=cluster_count, seed=seed, algorithm=algorithm) - # def _get_basic_statistics_backend(self, result_options): - # return BasicStatistics(result_options) - - # def _tolerance(self, rtol, X_table, policy, dtype=np.float32): - # """Compute absolute tolerance from the relative tolerance""" - # if rtol == 0.0: - # return rtol - # dummy_weights_table = to_table(None) - # bs = self._get_basic_statistics_backend("variance") - # res = bs.compute_raw(X_table, dummy_weights_table, policy, dtype) - # mean_var = from_table(res["variance"]).mean() - # return mean_var * rtol - def _tolerance(self, X, rtol): """Compute absolute tolerance from the relative tolerance""" if rtol == 0.0: From 9c7c3d0cc6abed4325c11ddf830b2b9f88e37e2a Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 28 May 2024 09:17:36 -0700 Subject: [PATCH 046/130] minor --- sklearnex/cluster/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/cluster/__init__.py b/sklearnex/cluster/__init__.py index 28ed0afd2c..97806348f0 100755 --- a/sklearnex/cluster/__init__.py +++ b/sklearnex/cluster/__init__.py @@ -17,4 +17,4 @@ from .dbscan import DBSCAN from .k_means import KMeans -__all__ = ["KMeans", "DBSCAN"] +__all__ = ["DBSCAN", "KMeans"] From f0748d144b84a08ba37055a1e5af1525026dec48 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 7 Jun 2024 01:58:53 -0700 Subject: [PATCH 047/130] update --- onedal/cluster/kmeans.py | 83 +++++++++++----------------------------- 1 file changed, 22 insertions(+), 61 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index b7d93f853e..6f1f3529cb 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -20,8 +20,6 @@ import numpy as np from scipy import sparse as sp -from daal4py import engines_mt19937 -from daal4py import kmeans_init as daal4py_kmeans_init from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype from onedal import _backend @@ -167,20 +165,30 @@ def _get_params_and_input(self, X, policy): return (params, X_table, dtype) - def _init_centroids_custom_dense( - self, X_table, init, random_seed, policy, dtype=np.float32, n_centroids=None + def _init_centroids_custom( + self, X_table, init, random_seed, policy, is_sparse, dtype=np.float32, n_centroids = None ): n_clusters = self.n_clusters if n_centroids is None else n_centroids if isinstance(init, str) and init == "k-means++": - alg = self._get_kmeans_init( - cluster_count=n_clusters, seed=random_seed, algorithm="plus_plus_dense" - ) + if not is_sparse: + alg = self._get_kmeans_init( + cluster_count=n_clusters, seed=random_seed, algorithm="plus_plus_dense" + ) + else: + alg = self._get_kmeans_init( + cluster_count=n_clusters, seed=random_seed, algorithm="plus_plus_csr" + ) centers_table = alg.compute_raw(X_table, policy, dtype) elif isinstance(init, str) and init == "random": - alg = self._get_kmeans_init( - cluster_count=n_clusters, seed=random_seed, algorithm="random_dense" - ) + if not is_sparse: + alg = self._get_kmeans_init( + cluster_count=n_clusters, seed=random_seed, algorithm="random_dense" + ) + else: + alg = self._get_kmeans_init( + cluster_count=n_clusters, seed=random_seed, algorithm="random_csr" + ) centers_table = alg.compute_raw(X_table, policy, dtype) elif _is_arraylike_not_scalar(init): centers = np.asarray(init) @@ -193,44 +201,6 @@ def _init_centroids_custom_dense( return centers_table - # TODO: remove when oneDAL KMeansInit has sparsity support - def _init_centroids_custom_sparse( - self, X, init, random_seed, policy, dtype=np.float32, n_centroids=None - ): - n_clusters = self.n_clusters if n_centroids is None else n_centroids - X_fptype = parse_dtype(dtype) - daal_engine = engines_mt19937( - fptype=X_fptype, method="defaultDense", seed=random_seed - ) - if isinstance(init, str) and init == "k-means++": - _n_local_trials = 2 + int(np.log(n_clusters)) - kmeans_init_res = daal4py_kmeans_init( - n_clusters, - fptype=X_fptype, - nTrials=_n_local_trials, - method="plusPlusCSR", - engine=daal_engine, - ).compute(X) - centers = _convert_to_supported(policy, kmeans_init_res.centroids) - centers_table = to_table(centers) - elif isinstance(init, str) and init == "random": - kmeans_init_res = daal4py_kmeans_init( - n_clusters, - fptype=X_fptype, - method="randomCSR", - engine=daal_engine, - ).compute(X) - centers = _convert_to_supported(policy, kmeans_init_res.centroids) - centers_table = to_table(centers) - elif _is_arraylike_not_scalar(init): - assert init.shape[0] == n_clusters - assert init.shape[1] == X.shape[1] - centers = _convert_to_supported(policy, init) - centers_table = to_table(centers) - else: - raise TypeError("Unsupported type of the `init` value") - - return centers_table def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float32): n_samples = X.shape[0] @@ -307,26 +277,17 @@ def is_better_iteration(inertia, labels): self._validate_center_shape(X, init) is_sparse = sp.issparse(X) - use_custom_dense_init = ( + use_custom_init = ( daal_check_version((2023, "P", 200)) and not callable(self.init) - and not is_sparse - ) - use_custom_sparse_init = ( - daal_check_version((2023, "P", 200)) and not callable(self.init) and is_sparse ) for _ in range(self._n_init): - if use_custom_dense_init: + if use_custom_init: # random_seed = random_state.tomaxint() random_seed = random_state.randint(np.iinfo("i").max) - centroids_table = self._init_centroids_custom_dense( - X_table, init, random_seed, policy, dtype=dtype - ) - elif use_custom_sparse_init: - random_seed = random_state.randint(np.iinfo("i").max) - centroids_table = self._init_centroids_custom_sparse( - X, init, random_seed, policy, dtype=dtype + centroids_table = self._init_centroids_custom( + X_table, init, random_seed, policy, is_sparse, dtype=dtype ) else: centroids_table = self._init_centroids_generic( From 8a716ebee6048b059f5742aeb351a26330b29a42 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 7 Jun 2024 03:51:59 -0700 Subject: [PATCH 048/130] update --- onedal/cluster/kmeans.cpp | 24 +++++++-------- onedal/cluster/kmeans.py | 30 ++++++++++++------- onedal/cluster/kmeans_init.cpp | 54 ++++++++++++++++++++++------------ 3 files changed, 65 insertions(+), 43 deletions(-) diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp index 6528243659..794378edfc 100644 --- a/onedal/cluster/kmeans.cpp +++ b/onedal/cluster/kmeans.cpp @@ -49,13 +49,9 @@ template struct descriptor_creator {}; template -struct descriptor_creator { +struct descriptor_creator { static auto get() { - return dal::kmeans::descriptor{}; + return dal::kmeans::descriptor{}; } }; @@ -66,12 +62,12 @@ struct params2desc { auto desc = descriptor_creator::get(); - desc.set_cluster_count( params["cluster_count"].cast() ); - desc.set_accuracy_threshold( params["accuracy_threshold"].cast() ); - desc.set_max_iteration_count( params["max_iteration_count"].cast() ); + desc.set_cluster_count(params["cluster_count"].cast()); + desc.set_accuracy_threshold(params["accuracy_threshold"].cast()); + desc.set_max_iteration_count(params["max_iteration_count"].cast()); #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200 auto result_options = params["result_options"].cast(); - if (result_options == "compute_assignments"){ + if (result_options == "compute_assignments") { desc.set_result_options(result_options::compute_assignments); } #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200 @@ -179,10 +175,10 @@ ONEDAL_PY_INIT_MODULE(kmeans) { auto sub = m.def_submodule("kmeans"); #ifdef ONEDAL_DATA_PARALLEL_SPMD - #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200 - ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list); - ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_spmd, task_list); - #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200 +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200 + ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list); + ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_spmd, task_list); +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200 #else // ONEDAL_DATA_PARALLEL_SPMD ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_list, task_list); ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_list, task_list); diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 6f1f3529cb..57577362b5 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -139,11 +139,11 @@ def _check_params_vs_input( self._n_init = 1 assert self.algorithm == "lloyd" - def _get_onedal_params(self, X_table, dtype=np.float32, result_options=None): + def _get_onedal_params(self, X_loc, dtype=np.float32, result_options=None): thr = self._tol if hasattr(self, "_tol") else self.tol return { "fptype": "float" if dtype == np.float32 else "double", - "method": "lloyd_csr" if sp.issparse(X_table) else "by_default", + "method": "lloyd_csr" if sp.issparse(X_loc) else "by_default", "seed": -1, "max_iteration_count": self.max_iter, "cluster_count": self.n_clusters, @@ -166,14 +166,23 @@ def _get_params_and_input(self, X, policy): return (params, X_table, dtype) def _init_centroids_custom( - self, X_table, init, random_seed, policy, is_sparse, dtype=np.float32, n_centroids = None + self, + X_table, + init, + random_seed, + policy, + is_sparse, + dtype=np.float32, + n_centroids=None, ): n_clusters = self.n_clusters if n_centroids is None else n_centroids if isinstance(init, str) and init == "k-means++": if not is_sparse: alg = self._get_kmeans_init( - cluster_count=n_clusters, seed=random_seed, algorithm="plus_plus_dense" + cluster_count=n_clusters, + seed=random_seed, + algorithm="plus_plus_dense", ) else: alg = self._get_kmeans_init( @@ -191,7 +200,10 @@ def _init_centroids_custom( ) centers_table = alg.compute_raw(X_table, policy, dtype) elif _is_arraylike_not_scalar(init): - centers = np.asarray(init) + if sp.issparse(init): + centers = init.toarray() + else: + centers = np.asarray(init) assert centers.shape[0] == n_clusters assert centers.shape[1] == X_table.column_count centers = _convert_to_supported(policy, init) @@ -201,7 +213,6 @@ def _init_centroids_custom( return centers_table - def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float32): n_samples = X.shape[0] @@ -276,12 +287,9 @@ def is_better_iteration(inertia, labels): ) self._validate_center_shape(X, init) - is_sparse = sp.issparse(X) - use_custom_init = ( - daal_check_version((2023, "P", 200)) - and not callable(self.init) - ) + use_custom_init = daal_check_version((2023, "P", 200)) and not callable(self.init) + is_sparse = sp.issparse(X) for _ in range(self._n_init): if use_custom_init: # random_seed = random_state.tomaxint() diff --git a/onedal/cluster/kmeans_init.cpp b/onedal/cluster/kmeans_init.cpp index 16d7e10c62..41e6689658 100644 --- a/onedal/cluster/kmeans_init.cpp +++ b/onedal/cluster/kmeans_init.cpp @@ -43,6 +43,8 @@ struct method2t { ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_VALUE(method, "random_dense", ops, Float, method::random_dense); ONEDAL_PARAM_DISPATCH_VALUE(method, "plus_plus_dense", ops, Float, method::plus_plus_dense); + ONEDAL_PARAM_DISPATCH_VALUE(method, "random_csr", ops, Float, method::random_csr); + ONEDAL_PARAM_DISPATCH_VALUE(method, "plus_plus_csr", ops, Float, method::plus_plus_csr); ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } @@ -53,13 +55,10 @@ template struct descriptor_creator; template -struct descriptor_creator { +struct descriptor_creator { static auto get() { - return dal::kmeans_init::descriptor{}; + return dal::kmeans_init:: + descriptor{}; } }; @@ -74,6 +73,16 @@ struct descriptor_creator +struct descriptor_creator { + static auto get() { + return dal::kmeans_init:: + descriptor{}; + } +}; + template struct descriptor_creator +struct descriptor_creator { + static auto get() { + return dal::kmeans_init::descriptor{}; + } +}; + struct params2desc { template auto operator()(const py::dict& params) { @@ -93,14 +113,15 @@ struct params2desc { const auto cluster_count = params["cluster_count"].cast(); auto desc = descriptor_creator::get() // - .set_cluster_count(cluster_count); + .set_cluster_count(cluster_count); if constexpr (!std::is_same_v) { const auto seed = params["seed"].cast(); desc.set_seed(seed); } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v || + std::is_same_v) { const auto local_trials_count = params["local_trials_count"].cast(); desc.set_local_trials_count(local_trials_count); } @@ -116,16 +137,13 @@ template struct init_compute_ops_dispatcher { void operator()(py::module_& m) { using Task = dal::kmeans_init::task::init; - m.def("compute", - [](const Policy& policy, - const py::dict& params, - const table& data) { - using namespace dal::kmeans_init; - using input_t = compute_input; - - compute_ops ops(policy, input_t{ data }, params2desc{}); - return fptype2t{ method2t{ Task{}, ops } }(params); - }); + m.def("compute", [](const Policy& policy, const py::dict& params, const table& data) { + using namespace dal::kmeans_init; + using input_t = compute_input; + + compute_ops ops(policy, input_t{ data }, params2desc{}); + return fptype2t{ method2t{ Task{}, ops } }(params); + }); } }; From cf0535dda0ff5aca2f6362120ae77b99043c08db Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 11 Jun 2024 03:46:37 -0700 Subject: [PATCH 049/130] add result option --- onedal/cluster/kmeans.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp index 794378edfc..c1aa349548 100644 --- a/onedal/cluster/kmeans.cpp +++ b/onedal/cluster/kmeans.cpp @@ -70,6 +70,9 @@ struct params2desc { if (result_options == "compute_assignments") { desc.set_result_options(result_options::compute_assignments); } + if (result_options == "compute_exact_objective_function") { + desc.set_result_options(result_options::compute_exact_objective_function); + } #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200 return desc; } From 5e4defe496d93a4993661d3a0a01a292aab260c4 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 11 Jun 2024 08:48:47 -0700 Subject: [PATCH 050/130] refactor for csr --- onedal/cluster/kmeans.py | 10 +++++----- onedal/cluster/kmeans_init.py | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 57577362b5..a41cefe921 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -152,14 +152,14 @@ def _get_onedal_params(self, X_loc, dtype=np.float32, result_options=None): } def _get_params_and_input(self, X, policy): - X_loc = _check_array( + X = _check_array( X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False ) - X_loc = _convert_to_supported(policy, X_loc) - dtype = get_dtype(X_loc) - X_table = to_table(X_loc) + X = _convert_to_supported(policy, X) + dtype = get_dtype(X) + X_table = to_table(X) - self._check_params_vs_input(X_loc, policy, dtype=dtype) + self._check_params_vs_input(X, policy, dtype=dtype) params = self._get_onedal_params(X_table, dtype) diff --git a/onedal/cluster/kmeans_init.py b/onedal/cluster/kmeans_init.py index 1e7aa8ec83..58f8f61676 100755 --- a/onedal/cluster/kmeans_init.py +++ b/onedal/cluster/kmeans_init.py @@ -21,6 +21,7 @@ from ..common._base import BaseEstimator as onedal_BaseEstimator from ..datatypes import _convert_to_supported, from_table, to_table +from ..utils import _check_array if daal_check_version((2023, "P", 200)): @@ -56,16 +57,15 @@ def _get_onedal_params(self, dtype=np.float32): } def _get_params_and_input(self, X, policy): - X_loc = np.asarray(X) - types = [np.float32, np.float64] - if get_dtype(X_loc) not in types: - X_loc = X_loc.astype(np.float64) + X = _check_array( + X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False + ) - X_loc = _convert_to_supported(policy, X_loc) + X = _convert_to_supported(policy, X) - dtype = get_dtype(X_loc) + dtype = get_dtype(X) params = self._get_onedal_params(dtype) - return (params, to_table(X_loc), dtype) + return (params, to_table(X), dtype) def _compute_raw(self, X_table, module, policy, dtype=np.float32): params = self._get_onedal_params(dtype) From b377cde55e8e3d0d4cff8619146bbc8b49985532 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 11 Jun 2024 09:06:47 -0700 Subject: [PATCH 051/130] lint --- onedal/cluster/kmeans_init.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/onedal/cluster/kmeans_init.py b/onedal/cluster/kmeans_init.py index 58f8f61676..0516784634 100755 --- a/onedal/cluster/kmeans_init.py +++ b/onedal/cluster/kmeans_init.py @@ -58,7 +58,10 @@ def _get_onedal_params(self, dtype=np.float32): def _get_params_and_input(self, X, policy): X = _check_array( - X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False + X, + dtype=[np.float64, np.float32], + accept_sparse="csr", + force_all_finite=False, ) X = _convert_to_supported(policy, X) From 5a9b13bf75fa70db7796aeb5170758360aa211f4 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 11 Jun 2024 11:19:36 -0700 Subject: [PATCH 052/130] refactor and ci --- onedal/cluster/kmeans_init.cpp | 4 +- sklearnex/cluster/k_means.py | 163 +++++++++++++-------------------- 2 files changed, 65 insertions(+), 102 deletions(-) diff --git a/onedal/cluster/kmeans_init.cpp b/onedal/cluster/kmeans_init.cpp index 41e6689658..1cf0eef486 100644 --- a/onedal/cluster/kmeans_init.cpp +++ b/onedal/cluster/kmeans_init.cpp @@ -120,8 +120,8 @@ struct params2desc { desc.set_seed(seed); } - if constexpr (std::is_same_v || - std::is_same_v) { + if constexpr ((std::is_same_v || + std::is_same_v)) { const auto local_trials_count = params["local_trials_count"].cast(); desc.set_local_trials_count(local_trials_count); } diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 5555bc8952..c04a955659 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -107,91 +107,34 @@ class KMeans(sklearn_KMeans, BaseKMeans): if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_KMeans._parameter_constraints} - @_deprecate_positional_args - def __init__( - self, - n_clusters=8, - *, - init="k-means++", - n_init="auto" if sklearn_check_version("1.4") else "warn", - max_iter=300, - tol=1e-4, - verbose=0, - random_state=None, - copy_x=True, - algorithm="lloyd", - ): - super().__init__( - n_clusters=n_clusters, - init=init, - max_iter=max_iter, - tol=tol, - n_init=n_init, - verbose=verbose, - random_state=random_state, - copy_x=copy_x, - algorithm=algorithm, - ) - - elif sklearn_check_version("1.0"): - - @_deprecate_positional_args - def __init__( - self, - n_clusters=8, - *, - init="k-means++", - n_init=10, - max_iter=300, - tol=1e-4, - verbose=0, - random_state=None, - copy_x=True, - algorithm="lloyd" if sklearn_check_version("1.1") else "auto", - ): - super().__init__( - n_clusters=n_clusters, - init=init, - max_iter=max_iter, - tol=tol, - n_init=n_init, - verbose=verbose, - random_state=random_state, - copy_x=copy_x, - algorithm=algorithm, - ) - - else: - - @_deprecate_positional_args - def __init__( - self, - n_clusters=8, - *, - init="k-means++", - n_init=10, - max_iter=300, - tol=1e-4, - precompute_distances="deprecated", - verbose=0, - random_state=None, - copy_x=True, - n_jobs="deprecated", - algorithm="auto", - ): - super().__init__( - n_clusters=n_clusters, - init=init, - max_iter=max_iter, - tol=tol, - precompute_distances=precompute_distances, - n_init=n_init, - verbose=verbose, - random_state=random_state, - copy_x=copy_x, - n_jobs=n_jobs, - algorithm=algorithm, - ) + def __init__( + self, + n_clusters=8, + *, + init="k-means++", + n_init=( + "auto" + if sklearn_check_version("1.4") + else "warn" if sklearn_check_version("1.2") else 10 + ), + max_iter=300, + tol=1e-4, + verbose=0, + random_state=None, + copy_x=True, + algorithm="lloyd" if sklearn_check_version("1.1") else "auto", + ): + super().__init__( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + tol=tol, + n_init=n_init, + verbose=verbose, + random_state=random_state, + copy_x=copy_x, + algorithm=algorithm, + ) def _initialize_onedal_estimator(self): onedal_params = { @@ -302,24 +245,44 @@ def _onedal_predict_supported(self, method_name, X, sample_weight): return patching_status - @wrap_output_data - def predict( - self, X, sample_weight="deprecated" if sklearn_check_version("1.3") else None - ): - if sklearn_check_version("1.0"): + if sklearn_check_version("1.5"): + + @wrap_output_data + def predict(self, X): self._check_feature_names(X, reset=True) - if sklearn_check_version("1.2"): self._validate_params() - return dispatch( + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_KMeans.predict, + }, + X, + ) + + else: + + @wrap_output_data + def predict( self, - "predict", - { - "onedal": self.__class__._onedal_predict, - "sklearn": sklearn_KMeans.predict, - }, X, - sample_weight, - ) + sample_weight="deprecated" if sklearn_check_version("1.3") else None, + ): + if sklearn_check_version("1.0"): + self._check_feature_names(X, reset=True) + if sklearn_check_version("1.2"): + self._validate_params() + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_KMeans.predict, + }, + X, + sample_weight, + ) def _onedal_predict(self, X, sample_weight=None, queue=None): X = self._validate_data( @@ -385,5 +348,5 @@ def transform(self, X): from daal4py.sklearn.cluster import KMeans logging.warning( - "Sklearnex KMeans requires oneDAL version >= 2023.2 " "but it was not found" + "Sklearnex KMeans requires oneDAL version >= 2023.2, falling back to daal4py." ) From 860663dda7d9fd7d842fb4a5577abf0bf3b9d58e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 11 Jun 2024 11:53:22 -0700 Subject: [PATCH 053/130] add version check for oneDAL --- onedal/cluster/kmeans_init.cpp | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/onedal/cluster/kmeans_init.cpp b/onedal/cluster/kmeans_init.cpp index 1cf0eef486..df5ed2c820 100644 --- a/onedal/cluster/kmeans_init.cpp +++ b/onedal/cluster/kmeans_init.cpp @@ -43,8 +43,10 @@ struct method2t { ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_VALUE(method, "random_dense", ops, Float, method::random_dense); ONEDAL_PARAM_DISPATCH_VALUE(method, "plus_plus_dense", ops, Float, method::plus_plus_dense); +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240500 ONEDAL_PARAM_DISPATCH_VALUE(method, "random_csr", ops, Float, method::random_csr); ONEDAL_PARAM_DISPATCH_VALUE(method, "plus_plus_csr", ops, Float, method::plus_plus_csr); +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240500 ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } @@ -75,22 +77,23 @@ struct descriptor_creator struct descriptor_creator { static auto get() { - return dal::kmeans_init:: - descriptor{}; + return dal::kmeans_init::descriptor{}; } }; +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240500 template struct descriptor_creator { static auto get() { - return dal::kmeans_init::descriptor{}; + return dal::kmeans_init:: + descriptor{}; } }; @@ -104,6 +107,7 @@ struct descriptor_creator{}; } }; +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240500 struct params2desc { template @@ -120,12 +124,16 @@ struct params2desc { desc.set_seed(seed); } - if constexpr ((std::is_same_v || - std::is_same_v)) { + if constexpr (std::is_same_v) { const auto local_trials_count = params["local_trials_count"].cast(); desc.set_local_trials_count(local_trials_count); } - +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240500 + if constexpr (std::is_same_v) { + const auto local_trials_count = params["local_trials_count"].cast(); + desc.set_local_trials_count(local_trials_count); + } +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240500 return desc; } }; From 8fb53d48c79abc7488737280f301175fa64afd90 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 18 Jun 2024 10:38:54 -0700 Subject: [PATCH 054/130] update --- onedal/cluster/kmeans.py | 9 ++++++--- sklearnex/cluster/k_means.py | 11 ++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index a41cefe921..ca3e6f5956 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -176,6 +176,8 @@ def _init_centroids_custom( n_centroids=None, ): n_clusters = self.n_clusters if n_centroids is None else n_centroids + # Use host policy for KMeans init, only for sparse data + init_policy = self._get_policy(None, None) if is_sparse else policy if isinstance(init, str) and init == "k-means++": if not is_sparse: @@ -188,7 +190,7 @@ def _init_centroids_custom( alg = self._get_kmeans_init( cluster_count=n_clusters, seed=random_seed, algorithm="plus_plus_csr" ) - centers_table = alg.compute_raw(X_table, policy, dtype) + centers_table = alg.compute_raw(X_table, init_policy, dtype) elif isinstance(init, str) and init == "random": if not is_sparse: alg = self._get_kmeans_init( @@ -198,14 +200,16 @@ def _init_centroids_custom( alg = self._get_kmeans_init( cluster_count=n_clusters, seed=random_seed, algorithm="random_csr" ) - centers_table = alg.compute_raw(X_table, policy, dtype) + centers_table = alg.compute_raw(X_table, init_policy, dtype) elif _is_arraylike_not_scalar(init): if sp.issparse(init): + # oneDAL KMeans doesn't support sparse centroids centers = init.toarray() else: centers = np.asarray(init) assert centers.shape[0] == n_clusters assert centers.shape[1] == X_table.column_count + # Use original policy for KMeans init when arraylike init is provided centers = _convert_to_supported(policy, init) centers_table = to_table(centers) else: @@ -292,7 +296,6 @@ def is_better_iteration(inertia, labels): is_sparse = sp.issparse(X) for _ in range(self._n_init): if use_custom_init: - # random_seed = random_state.tomaxint() random_seed = random_state.randint(np.iinfo("i").max) centroids_table = self._init_centroids_custom( X_table, init, random_seed, policy, is_sparse, dtype=dtype diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index c04a955659..226c5b3f98 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -159,7 +159,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): self._algorithm = self.algorithm supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count - + is_sparse_supported = not issparse(X) or daal_check_version((2024, "P", 600)) sample_weight = _check_sample_weight( sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None ) @@ -175,6 +175,10 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): np.allclose(sample_weight, np.ones_like(sample_weight)), "Sample weights are not ones.", ), + ( + is_sparse_supported, + "Sparse data is not supported for oneDAL KMeans version < 2024.6.0.", + ), ] ) @@ -224,6 +228,7 @@ def _onedal_predict_supported(self, method_name, X, sample_weight): assert method_name == "predict" class_name = self.__class__.__name__ + is_sparse_supported = not issparse(X) or daal_check_version((2024, "P", 600)) patching_status = PatchingConditionsChain( f"sklearn.cluster.{class_name}.predict" ) @@ -236,6 +241,10 @@ def _onedal_predict_supported(self, method_name, X, sample_weight): self.algorithm in supported_algs, "Only lloyd algorithm is supported, elkan is computed using lloyd.", ), + ( + is_sparse_supported, + "Sparse data is not supported for oneDAL KMeans version < 2024.6.0.", + ), ( hasattr(self, "_onedal_estimator"), "oneDAL model was not fit.", From ba7aa6b9bebbb331a429ebe727ec20e02c7b589e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 18 Jun 2024 12:48:29 -0700 Subject: [PATCH 055/130] fix for CI --- sklearnex/tests/test_run_to_run_stability.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearnex/tests/test_run_to_run_stability.py b/sklearnex/tests/test_run_to_run_stability.py index 9e4a670fdf..64b913f990 100755 --- a/sklearnex/tests/test_run_to_run_stability.py +++ b/sklearnex/tests/test_run_to_run_stability.py @@ -118,8 +118,10 @@ def _run_test(estimator, method, datasets): str(i): i for i in [ SVC(), - KMeans(), - KMeans(init="random"), + # KMeans sparse instances will be enabled when daal 2024.6 is released + # KMeans(), + # KMeans(init="random"), + # KMeans(init="k-means++"), ] } ) From 77f91c5780ad45eaa97ee1ac22d450405fbc6cad Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 18 Jun 2024 13:51:34 -0700 Subject: [PATCH 056/130] ci fix --- onedal/cluster/kmeans_init.py | 4 ++++ sklearnex/cluster/k_means.py | 28 +++++++++++++++------------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/onedal/cluster/kmeans_init.py b/onedal/cluster/kmeans_init.py index 0516784634..711839b4d9 100755 --- a/onedal/cluster/kmeans_init.py +++ b/onedal/cluster/kmeans_init.py @@ -15,6 +15,7 @@ # ============================================================================== import numpy as np +from scipy.sparse import issparse from sklearn.utils import check_random_state from daal4py.sklearn._utils import daal_check_version, get_dtype @@ -79,6 +80,9 @@ def _compute_raw(self, X_table, module, policy, dtype=np.float32): def _compute(self, X, module, queue): policy = self._get_policy(queue, X) + # oneDAL KMeans Init for sparse data does not have GPU support + if issparse(X): + policy = self._get_policy(None, None) _, X_table, dtype = self._get_params_and_input(X, policy) centroids = self._compute_raw(X_table, module, policy, dtype) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 226c5b3f98..28d764aeee 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -224,7 +224,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._save_attributes() - def _onedal_predict_supported(self, method_name, X, sample_weight): + def _onedal_predict_supported(self, method_name, X, sample_weight=None): assert method_name == "predict" class_name = self.__class__.__name__ @@ -300,19 +300,21 @@ def _onedal_predict(self, X, sample_weight=None, queue=None): reset=False, dtype=[np.float64, np.float32], ) - if ( - sklearn_check_version("1.3") - and isinstance(sample_weight, str) - and sample_weight == "deprecated" - ): - sample_weight = None - if sklearn_check_version("1.3") and sample_weight is not None: - warnings.warn( - "'sample_weight' was deprecated in version 1.3 and " - "will be removed in 1.5.", - FutureWarning, - ) + if not sklearn_check_version("1.5"): + if ( + sklearn_check_version("1.3") + and isinstance(sample_weight, str) + and sample_weight == "deprecated" + ): + sample_weight = None + + if sklearn_check_version("1.3") and sample_weight is not None: + warnings.warn( + "'sample_weight' was deprecated in version 1.3 and " + "will be removed in 1.5.", + FutureWarning, + ) if not hasattr(self, "_onedal_estimator"): self._initialize_onedal_estimator() From 55ff15c079c8331eeaf7156668fa92840fbf54fc Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 19 Jun 2024 16:13:43 -0700 Subject: [PATCH 057/130] minor --- sklearnex/cluster/k_means.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 28d764aeee..d6de0d4515 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -258,7 +258,7 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): @wrap_output_data def predict(self, X): - self._check_feature_names(X, reset=True) + self._check_feature_names(X, reset=False) self._validate_params() return dispatch( self, @@ -279,7 +279,7 @@ def predict( sample_weight="deprecated" if sklearn_check_version("1.3") else None, ): if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=True) + self._check_feature_names(X, reset=False) if sklearn_check_version("1.2"): self._validate_params() return dispatch( From 0e5e52b2ef14ab140781a5c218859c854ce0f96c Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 19 Jun 2024 23:15:09 -0700 Subject: [PATCH 058/130] some fixes --- onedal/cluster/kmeans.py | 65 ++++++++++++++++++------------------ sklearnex/cluster/k_means.py | 17 ++++++---- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 29276ee949..50c87e7590 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -18,7 +18,6 @@ from abc import ABC import numpy as np -from scipy import sparse as sp from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype from onedal import _backend @@ -38,7 +37,7 @@ from ..common._base import BaseEstimator as onedal_BaseEstimator from ..common._mixin import ClusterMixin, TransformerMixin -from ..utils import _check_array, _is_arraylike_not_scalar +from ..utils import _check_array, _is_arraylike_not_scalar, _is_csr class _BaseKMeans(onedal_BaseEstimator, TransformerMixin, ClusterMixin, ABC): @@ -83,24 +82,22 @@ def _tolerance(self, X, rtol): """Compute absolute tolerance from the relative tolerance""" if rtol == 0.0: return rtol - if sp.issparse(X): + if _is_csr(X): variances = mean_variance_axis(X, axis=0)[1] mean_var = np.mean(variances) else: mean_var = np.var(X, axis=0).mean() return mean_var * rtol - def _check_params_vs_input( - self, X_table, policy, default_n_init=10, dtype=np.float32 - ): + def _check_params_vs_input(self, X, policy, default_n_init=10, dtype=np.float32): # n_clusters - if X_table.shape[0] < self.n_clusters: + if X.shape[0] < self.n_clusters: raise ValueError( - f"n_samples={X_table.shape[0]} should be >= n_clusters={self.n_clusters}." + f"n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}." ) # tol - self._tol = self._tolerance(X_table, self.tol) + self._tol = self._tolerance(X, self.tol) # n-init # TODO(1.4): Remove @@ -139,11 +136,11 @@ def _check_params_vs_input( self._n_init = 1 assert self.algorithm == "lloyd" - def _get_onedal_params(self, X_loc, dtype=np.float32, result_options=None): + def _get_onedal_params(self, is_csr=False, dtype=np.float32, result_options=None): thr = self._tol if hasattr(self, "_tol") else self.tol return { "fptype": "float" if dtype == np.float32 else "double", - "method": "lloyd_csr" if sp.issparse(X_loc) else "by_default", + "method": "lloyd_csr" if is_csr else "by_default", "seed": -1, "max_iteration_count": self.max_iter, "cluster_count": self.n_clusters, @@ -161,26 +158,26 @@ def _get_params_and_input(self, X, policy): self._check_params_vs_input(X, policy, dtype=dtype) - params = self._get_onedal_params(X_table, dtype) + params = self._get_onedal_params(dtype) return (params, X_table, dtype) - def _init_centroids_custom( + def _init_centroids_onedal( self, X_table, init, random_seed, policy, - is_sparse, + is_csr, dtype=np.float32, n_centroids=None, ): n_clusters = self.n_clusters if n_centroids is None else n_centroids - # Use host policy for KMeans init, only for sparse data - init_policy = self._get_policy(None, None) if is_sparse else policy + # Use host policy for KMeans init, only for csr data + init_policy = self._get_policy(None, None) # if is_csr else policy if isinstance(init, str) and init == "k-means++": - if not is_sparse: + if not is_csr: alg = self._get_kmeans_init( cluster_count=n_clusters, seed=random_seed, @@ -192,7 +189,7 @@ def _init_centroids_custom( ) centers_table = alg.compute_raw(X_table, init_policy, dtype) elif isinstance(init, str) and init == "random": - if not is_sparse: + if not is_csr: alg = self._get_kmeans_init( cluster_count=n_clusters, seed=random_seed, algorithm="random_dense" ) @@ -210,14 +207,15 @@ def _init_centroids_custom( assert centers.shape[0] == n_clusters assert centers.shape[1] == X_table.column_count # Use original policy for KMeans init when arraylike init is provided - centers = _convert_to_supported(policy, init) + centers = _convert_to_supported(policy, centers) centers_table = to_table(centers) else: raise TypeError("Unsupported type of the `init` value") return centers_table - def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float32): + def _init_centroids_sklearn(self, X, init, random_state, policy, dtype=np.float32): + # For oneDAL versions < 2023.2, using the scikit-learn implementation n_samples = X.shape[0] if isinstance(init, str) and init == "k-means++": @@ -245,8 +243,10 @@ def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float3 centers = _convert_to_supported(policy, centers) return to_table(centers) - def _fit_backend(self, X_table, centroids_table, module, policy, dtype=np.float32): - params = self._get_onedal_params(X_table, dtype) + def _fit_backend( + self, X_table, centroids_table, module, policy, dtype=np.float32, is_csr=False + ): + params = self._get_onedal_params(is_csr, dtype) # TODO: check all features for having correct type meta = _backend.get_table_metadata(X_table) @@ -291,17 +291,17 @@ def is_better_iteration(inertia, labels): ) self._validate_center_shape(X, init) - use_custom_init = daal_check_version((2023, "P", 200)) and not callable(self.init) + use_onedal_init = daal_check_version((2023, "P", 200)) and not callable(self.init) - is_sparse = sp.issparse(X) + is_csr = _is_csr(X) for _ in range(self._n_init): - if use_custom_init: + if use_onedal_init: random_seed = random_state.randint(np.iinfo("i").max) - centroids_table = self._init_centroids_custom( - X_table, init, random_seed, policy, is_sparse, dtype=dtype + centroids_table = self._init_centroids_onedal( + X_table, init, random_seed, policy, is_csr, dtype=dtype ) else: - centroids_table = self._init_centroids_generic( + centroids_table = self._init_centroids_sklearn( X, init, random_state, policy, dtype=dtype ) @@ -309,7 +309,7 @@ def is_better_iteration(inertia, labels): print("Initialization complete.") labels, inertia, model, n_iter = self._fit_backend( - X_table, centroids_table, module, policy, dtype + X_table, centroids_table, module, policy, dtype, is_csr ) if self.verbose: @@ -365,9 +365,9 @@ def _set_cluster_centers(self, cluster_centers): cluster_centers_ = property(_get_cluster_centers, _set_cluster_centers) - def _predict_raw(self, X_table, module, policy, dtype=np.float32): + def _predict_raw(self, X_table, module, policy, dtype=np.float32, is_csr=False): params = self._get_onedal_params( - X_table, dtype, result_options="compute_assignments" + is_csr, dtype, result_options="compute_assignments" ) result = module.infer(policy, params, self.model_, X_table) @@ -376,12 +376,13 @@ def _predict_raw(self, X_table, module, policy, dtype=np.float32): def _predict(self, X, module, queue=None): check_is_fitted(self) + is_csr = _is_csr(X) policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) X_table, dtype = to_table(X), X.dtype - return self._predict_raw(X_table, module, policy, dtype) + return self._predict_raw(X_table, module, policy, dtype, is_csr) def _transform(self, X): return euclidean_distances(X, self.cluster_centers_) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index d6de0d4515..d611f0d0d7 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -36,6 +36,7 @@ from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version from onedal.cluster import KMeans as onedal_KMeans + from onedal.utils import _is_csr from .._device_offload import dispatch, wrap_output_data from .._utils import PatchingConditionsChain @@ -159,7 +160,9 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): self._algorithm = self.algorithm supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count - is_sparse_supported = not issparse(X) or daal_check_version((2024, "P", 600)) + is_data_supported = ( + _is_csr(X) and daal_check_version((2024, "P", 600)) + ) or not issparse(X) sample_weight = _check_sample_weight( sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None ) @@ -176,8 +179,8 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): "Sample weights are not ones.", ), ( - is_sparse_supported, - "Sparse data is not supported for oneDAL KMeans version < 2024.6.0.", + is_data_supported, + "Supported data formats: Dense, CSR (oneDAL version >= 2024.6.0).", ), ] ) @@ -228,7 +231,9 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): assert method_name == "predict" class_name = self.__class__.__name__ - is_sparse_supported = not issparse(X) or daal_check_version((2024, "P", 600)) + is_data_supported = ( + _is_csr(X) and daal_check_version((2024, "P", 600)) + ) or not issparse(X) patching_status = PatchingConditionsChain( f"sklearn.cluster.{class_name}.predict" ) @@ -242,8 +247,8 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): "Only lloyd algorithm is supported, elkan is computed using lloyd.", ), ( - is_sparse_supported, - "Sparse data is not supported for oneDAL KMeans version < 2024.6.0.", + is_data_supported, + "Supported data formats: Dense, CSR (oneDAL version >= 2024.6.0).", ), ( hasattr(self, "_onedal_estimator"), From e0a3b6ee02da16061a4d02af189424aacf382d02 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 20 Jun 2024 00:15:13 -0700 Subject: [PATCH 059/130] ci fixes --- onedal/cluster/kmeans.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 50c87e7590..6d3b01f1dc 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -174,7 +174,7 @@ def _init_centroids_onedal( ): n_clusters = self.n_clusters if n_centroids is None else n_centroids # Use host policy for KMeans init, only for csr data - init_policy = self._get_policy(None, None) # if is_csr else policy + init_policy = self._get_policy(None, None)# if is_csr else policy if isinstance(init, str) and init == "k-means++": if not is_csr: @@ -199,7 +199,7 @@ def _init_centroids_onedal( ) centers_table = alg.compute_raw(X_table, init_policy, dtype) elif _is_arraylike_not_scalar(init): - if sp.issparse(init): + if _is_csr(init): # oneDAL KMeans doesn't support sparse centroids centers = init.toarray() else: @@ -248,7 +248,6 @@ def _fit_backend( ): params = self._get_onedal_params(is_csr, dtype) - # TODO: check all features for having correct type meta = _backend.get_table_metadata(X_table) assert meta.get_npy_dtype(0) == dtype From cc1a9dfcdee7e28da1be72788093e9ddebb09ec4 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 20 Jun 2024 00:19:17 -0700 Subject: [PATCH 060/130] lint --- onedal/cluster/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 6d3b01f1dc..9ad9b530c3 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -174,7 +174,7 @@ def _init_centroids_onedal( ): n_clusters = self.n_clusters if n_centroids is None else n_centroids # Use host policy for KMeans init, only for csr data - init_policy = self._get_policy(None, None)# if is_csr else policy + init_policy = self._get_policy(None, None) # if is_csr else policy if isinstance(init, str) and init == "k-means++": if not is_csr: From 48e869e6c2d8ea3ea7be13805ba2db0bd9087a9c Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 20 Jun 2024 07:51:46 -0700 Subject: [PATCH 061/130] add version checks --- onedal/cluster/kmeans.cpp | 2 ++ onedal/cluster/kmeans_init.cpp | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp index c1aa349548..b88612bd9c 100644 --- a/onedal/cluster/kmeans.cpp +++ b/onedal/cluster/kmeans.cpp @@ -38,7 +38,9 @@ struct method2t { const auto method = params["method"].cast(); ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_dense", ops, Float, method::lloyd_dense); +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_csr", ops, Float, method::lloyd_csr); +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } diff --git a/onedal/cluster/kmeans_init.cpp b/onedal/cluster/kmeans_init.cpp index df5ed2c820..d973f177ad 100644 --- a/onedal/cluster/kmeans_init.cpp +++ b/onedal/cluster/kmeans_init.cpp @@ -43,10 +43,10 @@ struct method2t { ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_VALUE(method, "random_dense", ops, Float, method::random_dense); ONEDAL_PARAM_DISPATCH_VALUE(method, "plus_plus_dense", ops, Float, method::plus_plus_dense); -#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240500 +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 ONEDAL_PARAM_DISPATCH_VALUE(method, "random_csr", ops, Float, method::random_csr); ONEDAL_PARAM_DISPATCH_VALUE(method, "plus_plus_csr", ops, Float, method::plus_plus_csr); -#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240500 +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240600 ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } @@ -86,7 +86,7 @@ struct descriptor_creator= 20240500 +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 template struct descriptor_creator{}; } }; -#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240500 +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240600 struct params2desc { template @@ -128,12 +128,12 @@ struct params2desc { const auto local_trials_count = params["local_trials_count"].cast(); desc.set_local_trials_count(local_trials_count); } -#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240500 +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 if constexpr (std::is_same_v) { const auto local_trials_count = params["local_trials_count"].cast(); desc.set_local_trials_count(local_trials_count); } -#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240500 +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240600 return desc; } }; From 51e24200f4185ea2cfc31cee7bc8f62a5ee65c2f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Sun, 23 Jun 2024 20:45:21 -0700 Subject: [PATCH 062/130] csr condition for policy --- onedal/cluster/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 9ad9b530c3..b4a1eb7f62 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -174,7 +174,7 @@ def _init_centroids_onedal( ): n_clusters = self.n_clusters if n_centroids is None else n_centroids # Use host policy for KMeans init, only for csr data - init_policy = self._get_policy(None, None) # if is_csr else policy + init_policy = self._get_policy(None, None) if is_csr else policy if isinstance(init, str) and init == "k-means++": if not is_csr: From f7c8a4f4a183847e069c11ac8dca3c0133bc6fb5 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Sun, 23 Jun 2024 21:07:16 -0700 Subject: [PATCH 063/130] version check for stability check --- sklearnex/tests/test_run_to_run_stability.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearnex/tests/test_run_to_run_stability.py b/sklearnex/tests/test_run_to_run_stability.py index 64b913f990..99583a5da5 100755 --- a/sklearnex/tests/test_run_to_run_stability.py +++ b/sklearnex/tests/test_run_to_run_stability.py @@ -39,6 +39,7 @@ ) import daal4py as d4p +from daal4py.sklearn._utils import daal_check_version from onedal.tests.utils._dataframes_support import _as_numpy, get_dataframes_and_queues from sklearnex.cluster import DBSCAN, KMeans from sklearnex.decomposition import PCA @@ -118,10 +119,15 @@ def _run_test(estimator, method, datasets): str(i): i for i in [ SVC(), - # KMeans sparse instances will be enabled when daal 2024.6 is released - # KMeans(), - # KMeans(init="random"), - # KMeans(init="k-means++"), + *( + [] + if not daal_check_version((2024, "P", 600)) + else [ + KMeans(), + KMeans(init="random"), + KMeans(init="k-means++"), + ] + ), ] } ) From b5b2a943a865a373b593c9ae62b761d274201b6b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Sun, 23 Jun 2024 21:09:35 -0700 Subject: [PATCH 064/130] update test --- sklearnex/cluster/tests/test_kmeans.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 8a2fd0cdca..faf313fa90 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -30,14 +30,14 @@ def test_sklearnex_import(dataframe, queue): from sklearnex.cluster import KMeans - X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) - y = np.array([[0, 0], [12, 3]]) + X_train = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) + X_test = np.array([[0, 0], [12, 3]]) expected_cluster_labels = np.array([1, 0], dtype=np.int32) - X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) - y = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) + X_train = _convert_to_dataframe(X_train, sycl_queue=queue, target_df=dataframe) + X_test = _convert_to_dataframe(X_test, sycl_queue=queue, target_df=dataframe) kmeans = KMeans(n_clusters=2, random_state=0).fit(X) - if daal_check_version((2023, "P", 200)): + if daal_check_version((2024, "P", 600)): assert "sklearnex" in kmeans.__module__ else: assert "daal4py" in kmeans.__module__ From e561c153b0c9a03a689a53d6e80806f30eb13d8e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Sun, 23 Jun 2024 21:17:38 -0700 Subject: [PATCH 065/130] floating methods --- sklearnex/cluster/k_means.py | 58 ++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index d611f0d0d7..985a4196ff 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -41,39 +41,39 @@ from .._device_offload import dispatch, wrap_output_data from .._utils import PatchingConditionsChain - def get_cluster_centers(self): - return self._cluster_centers_ + class BaseKMeans(ABC): + def _get_cluster_centers(self): + return self._cluster_centers_ - def set_cluster_centers(self, value): - self._cluster_centers_ = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.cluster_centers_ = value + def _set_cluster_centers(self, value): + self._cluster_centers_ = value + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.cluster_centers_ = value - def get_labels(self): - return self._labels_ + def _get_labels(self): + return self._labels_ - def set_labels(self, value): - self._labels_ = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.labels_ = value + def _set_labels(self, value): + self._labels_ = value + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.labels_ = value - def get_inertia(self): - return self._inertia_ + def _get_inertia(self): + return self._inertia_ - def set_inertia(self, value): - self._inertia_ = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.inertia_ = value + def _set_inertia(self, value): + self._inertia_ = value + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.inertia_ = value - def get_n_iter(self): - return self._n_iter_ + def _get_n_iter(self): + return self._n_iter_ - def set_n_iter(self, value): - self._n_iter_ = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.n_iter_ = value + def _set_n_iter(self, value): + self._n_iter_ = value + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.n_iter_ = value - class BaseKMeans(ABC): def _save_attributes(self): assert hasattr(self, "_onedal_estimator") self.n_features_in_ = self._onedal_estimator.n_features_in_ @@ -87,10 +87,10 @@ def _save_attributes(self): self._cluster_centers_ = self._onedal_estimator.cluster_centers_ self._sparse = False - self.n_iter_ = property(get_n_iter, set_n_iter) - self.labels_ = property(get_labels, set_labels) - self.inertia_ = property(get_labels, set_inertia) - self.cluster_centers_ = property(get_cluster_centers, set_cluster_centers) + self.n_iter_ = property(_get_n_iter, _set_n_iter) + self.labels_ = property(_get_labels, _set_labels) + self.inertia_ = property(_get_labels, _set_inertia) + self.cluster_centers_ = property(_get_cluster_centers, _set_cluster_centers) self._is_in_fit = True self.n_iter_ = self._n_iter_ From 4dec2731067aeb8f09b1008195bb055bbc88c2b1 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Sun, 23 Jun 2024 21:48:57 -0700 Subject: [PATCH 066/130] minor --- sklearnex/cluster/k_means.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 985a4196ff..66d1e59131 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -24,7 +24,6 @@ import numpy as np from scipy.sparse import issparse from sklearn.cluster import KMeans as sklearn_KMeans - from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.validation import ( _check_sample_weight, _deprecate_positional_args, @@ -87,10 +86,12 @@ def _save_attributes(self): self._cluster_centers_ = self._onedal_estimator.cluster_centers_ self._sparse = False - self.n_iter_ = property(_get_n_iter, _set_n_iter) - self.labels_ = property(_get_labels, _set_labels) - self.inertia_ = property(_get_labels, _set_inertia) - self.cluster_centers_ = property(_get_cluster_centers, _set_cluster_centers) + self.n_iter_ = property(self._get_n_iter, self._set_n_iter) + self.labels_ = property(self._get_labels, self._set_labels) + self.inertia_ = property(self._get_labels, self._set_inertia) + self.cluster_centers_ = property( + self._get_cluster_centers, self._set_cluster_centers + ) self._is_in_fit = True self.n_iter_ = self._n_iter_ @@ -99,7 +100,7 @@ def _save_attributes(self): self.cluster_centers_ = self._cluster_centers_ self._is_in_fit = False - @control_n_jobs(decorated_methods=["fit", "predict"]) + @control_n_jobs(decorated_methods=["fit", "predict", "transform", "fit_transform"]) class KMeans(sklearn_KMeans, BaseKMeans): __doc__ = sklearn_KMeans.__doc__ n_iter_, inertia_ = None, None @@ -220,7 +221,6 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._check_params(X) self._n_features_out = self.n_clusters - self._n_threads = _openmp_effective_n_threads() self._initialize_onedal_estimator() self._onedal_estimator.fit(X, queue=queue) From d19491347299e7289171a96e3447485b3f04a23c Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 24 Jun 2024 05:46:30 -0700 Subject: [PATCH 067/130] ci fixes --- sklearnex/cluster/k_means.py | 2 ++ sklearnex/cluster/tests/test_kmeans.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 66d1e59131..61a220bf9c 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -24,6 +24,7 @@ import numpy as np from scipy.sparse import issparse from sklearn.cluster import KMeans as sklearn_KMeans + from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.validation import ( _check_sample_weight, _deprecate_positional_args, @@ -221,6 +222,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._check_params(X) self._n_features_out = self.n_clusters + self._n_threads = _openmp_effective_n_threads() self._initialize_onedal_estimator() self._onedal_estimator.fit(X, queue=queue) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index faf313fa90..f45ce6254e 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -36,8 +36,8 @@ def test_sklearnex_import(dataframe, queue): X_train = _convert_to_dataframe(X_train, sycl_queue=queue, target_df=dataframe) X_test = _convert_to_dataframe(X_test, sycl_queue=queue, target_df=dataframe) - kmeans = KMeans(n_clusters=2, random_state=0).fit(X) - if daal_check_version((2024, "P", 600)): + kmeans = KMeans(n_clusters=2, random_state=0).fit(X_train) + if daal_check_version((2023, "P", 200)): assert "sklearnex" in kmeans.__module__ else: assert "daal4py" in kmeans.__module__ From 763699d0de4921a8d93833ae3b89eadbc2d33434 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 24 Jun 2024 07:03:02 -0700 Subject: [PATCH 068/130] minor --- sklearnex/cluster/tests/test_kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index f45ce6254e..ebe5b30f7f 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -42,5 +42,5 @@ def test_sklearnex_import(dataframe, queue): else: assert "daal4py" in kmeans.__module__ - result_cluster_labels = kmeans.predict(y) + result_cluster_labels = kmeans.predict(X_test) assert_allclose(expected_cluster_labels, _as_numpy(result_cluster_labels)) From c46a707c28650d17a52acbdf5dba97f2da6f467b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 24 Jun 2024 23:15:18 -0700 Subject: [PATCH 069/130] address review --- deselected_tests.yaml | 3 +++ onedal/cluster/kmeans.py | 43 ++++++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 647a117986..478c73233f 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -451,6 +451,9 @@ gpu: # Fails - cluster/tests/test_dbscan.py::test_weighted_dbscan + # Different number of iterations for tol = 1e-100 + - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse] + - ensemble/tests/test_bagging.py::test_gridsearch - ensemble/tests/test_bagging.py::test_estimators_samples - ensemble/tests/test_common.py::test_ensemble_heterogeneous_estimators_behavior diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index b4a1eb7f62..a90912e75f 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -21,14 +21,14 @@ from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype from onedal import _backend +from onedal.basic_statistics import BasicStatistics from ..datatypes import _convert_to_supported, from_table, to_table if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit -else: - from sklearn.cluster import _kmeans_plusplus +from sklearn.cluster._kmeans import _kmeans_plusplus from sklearn.exceptions import ConvergenceWarning from sklearn.metrics.pairwise import euclidean_distances from sklearn.utils import check_random_state @@ -78,26 +78,28 @@ def _validate_center_shape(self, X, centers): def _get_kmeans_init(self, cluster_count, seed, algorithm): return KMeansInit(cluster_count=cluster_count, seed=seed, algorithm=algorithm) - def _tolerance(self, X, rtol): + def _tolerance(self, X_table, rtol, is_csr, policy, dtype): """Compute absolute tolerance from the relative tolerance""" if rtol == 0.0: return rtol - if _is_csr(X): - variances = mean_variance_axis(X, axis=0)[1] - mean_var = np.mean(variances) - else: - mean_var = np.var(X, axis=0).mean() + dummy = to_table(None) + bs = BasicStatistics("variance") + + res = bs.compute_raw(X_table, dummy, policy, dtype, is_csr) + mean_var = from_table(res["variance"]).mean() return mean_var * rtol - def _check_params_vs_input(self, X, policy, default_n_init=10, dtype=np.float32): + def _check_params_vs_input( + self, X_table, is_csr, policy, default_n_init=10, dtype=np.float32 + ): # n_clusters - if X.shape[0] < self.n_clusters: + if X_table.shape[0] < self.n_clusters: raise ValueError( - f"n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}." + f"n_samples={X_table.shape[0]} should be >= n_clusters={self.n_clusters}." ) # tol - self._tol = self._tolerance(X, self.tol) + self._tol = self._tolerance(X_table, self.tol, is_csr, policy, dtype) # n-init # TODO(1.4): Remove @@ -148,7 +150,7 @@ def _get_onedal_params(self, is_csr=False, dtype=np.float32, result_options=None "result_options": "" if result_options is None else result_options, } - def _get_params_and_input(self, X, policy): + def _get_params_and_input(self, X, is_csr, policy): X = _check_array( X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False ) @@ -156,9 +158,9 @@ def _get_params_and_input(self, X, policy): dtype = get_dtype(X) X_table = to_table(X) - self._check_params_vs_input(X, policy, dtype=dtype) + self._check_params_vs_input(X_table, is_csr, policy, dtype=dtype) - params = self._get_onedal_params(dtype) + params = self._get_onedal_params(is_csr, dtype) return (params, X_table, dtype) @@ -215,7 +217,8 @@ def _init_centroids_onedal( return centers_table def _init_centroids_sklearn(self, X, init, random_state, policy, dtype=np.float32): - # For oneDAL versions < 2023.2, using the scikit-learn implementation + # For oneDAL versions < 2023.2 or callable init, + # using the scikit-learn implementation n_samples = X.shape[0] if isinstance(init, str) and init == "k-means++": @@ -262,7 +265,8 @@ def _fit_backend( def _fit(self, X, module, queue=None): policy = self._get_policy(queue, X) - _, X_table, dtype = self._get_params_and_input(X, policy) + is_csr = _is_csr(X) + _, X_table, dtype = self._get_params_and_input(X, is_csr, policy) self.n_features_in_ = X_table.column_count @@ -292,7 +296,6 @@ def is_better_iteration(inertia, labels): use_onedal_init = daal_check_version((2023, "P", 200)) and not callable(self.init) - is_csr = _is_csr(X) for _ in range(self._n_init): if use_onedal_init: random_seed = random_state.randint(np.iinfo("i").max) @@ -365,9 +368,7 @@ def _set_cluster_centers(self, cluster_centers): cluster_centers_ = property(_get_cluster_centers, _set_cluster_centers) def _predict_raw(self, X_table, module, policy, dtype=np.float32, is_csr=False): - params = self._get_onedal_params( - is_csr, dtype, result_options="compute_assignments" - ) + params = self._get_onedal_params(is_csr, dtype) result = module.infer(policy, params, self.model_, X_table) From b085804dee34366a42de774738592b5be6bfbf77 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 25 Jun 2024 09:32:13 -0700 Subject: [PATCH 070/130] address review --- onedal/cluster/kmeans.py | 3 - sklearnex/cluster/k_means.py | 110 +++++++++++++++++------------------ 2 files changed, 52 insertions(+), 61 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index a90912e75f..1d659a1302 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -33,7 +33,6 @@ from sklearn.metrics.pairwise import euclidean_distances from sklearn.utils import check_random_state from sklearn.utils.sparsefuncs import mean_variance_axis -from sklearn.utils.validation import check_is_fitted from ..common._base import BaseEstimator as onedal_BaseEstimator from ..common._mixin import ClusterMixin, TransformerMixin @@ -375,7 +374,6 @@ def _predict_raw(self, X_table, module, policy, dtype=np.float32, is_csr=False): return from_table(result.responses).reshape(-1) def _predict(self, X, module, queue=None): - check_is_fitted(self) is_csr = _is_csr(X) policy = self._get_policy(queue, X) @@ -496,7 +494,6 @@ def transform(self, X): X_new : ndarray of shape (n_samples, n_clusters) X transformed in the new space. """ - check_is_fitted(self) return self._transform(X) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 61a220bf9c..864a16b81b 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -42,35 +42,43 @@ from .._utils import PatchingConditionsChain class BaseKMeans(ABC): - def _get_cluster_centers(self): - return self._cluster_centers_ + @property + def _cluster_centers_(self): + return self.__cluster_centers_ - def _set_cluster_centers(self, value): - self._cluster_centers_ = value + @_cluster_centers_.setter + def _cluster_centers_(self, value): + self.__cluster_centers_ = value if hasattr(self, "_onedal_estimator"): self._onedal_estimator.cluster_centers_ = value - def _get_labels(self): - return self._labels_ + @property + def labels_(self): + return self.__labels - def _set_labels(self, value): - self._labels_ = value + @labels_.setter + def labels_(self, value): + self.__labels = value if hasattr(self, "_onedal_estimator"): self._onedal_estimator.labels_ = value - def _get_inertia(self): - return self._inertia_ + @property + def inertia_(self): + return self.__inertia - def _set_inertia(self, value): - self._inertia_ = value + @inertia_.setter + def inertia_(self, value): + self.__inertia = value if hasattr(self, "_onedal_estimator"): self._onedal_estimator.inertia_ = value - def _get_n_iter(self): - return self._n_iter_ + @property + def n_iter_(self): + return self.__n_iter - def _set_n_iter(self, value): - self._n_iter_ = value + @n_iter_.setter + def n_iter_(self, value): + self.__n_iter = value if hasattr(self, "_onedal_estimator"): self._onedal_estimator.n_iter_ = value @@ -85,14 +93,6 @@ def _save_attributes(self): self._inertia_ = self._onedal_estimator.inertia_ self._algorithm = self._onedal_estimator.algorithm self._cluster_centers_ = self._onedal_estimator.cluster_centers_ - self._sparse = False - - self.n_iter_ = property(self._get_n_iter, self._set_n_iter) - self.labels_ = property(self._get_labels, self._set_labels) - self.inertia_ = property(self._get_labels, self._set_inertia) - self.cluster_centers_ = property( - self._get_cluster_centers, self._set_cluster_centers - ) self._is_in_fit = True self.n_iter_ = self._n_iter_ @@ -162,12 +162,19 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): self._algorithm = self.algorithm supported_algs = ["auto", "full", "lloyd", "elkan"] correct_count = self.n_clusters < sample_count + is_data_supported = ( _is_csr(X) and daal_check_version((2024, "P", 600)) ) or not issparse(X) - sample_weight = _check_sample_weight( - sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None - ) + + _acceptable_sample_weights = True + if sample_weight: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None + ) + _acceptable_sample_weights = np.allclose( + sample_weight, np.ones_like(sample_weight) + ) patching_status.and_conditions( [ @@ -177,8 +184,8 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): ), (correct_count, "n_clusters is smaller than number of samples"), ( - np.allclose(sample_weight, np.ones_like(sample_weight)), - "Sample weights are not ones.", + _acceptable_sample_weights, + "oneDAL doesn't support sample_weight, either None or ones are acceptable", ), ( is_data_supported, @@ -190,10 +197,6 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): return patching_status def fit(self, X, y=None, sample_weight=None): - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=True) - if sklearn_check_version("1.2"): - self._validate_params() dispatch( self, @@ -204,23 +207,23 @@ def fit(self, X, y=None, sample_weight=None): }, X, y, - sample_weight, + sample_weight=sample_weight, ) return self def _onedal_fit(self, X, _, sample_weight, queue=None): + if sklearn_check_version("1.2"): + self._validate_params() + else: + self._check_params(X) + X = self._validate_data( X, accept_sparse="csr", dtype=[np.float64, np.float32], ) - if sklearn_check_version("1.2"): - self._check_params_vs_input(X) - else: - self._check_params(X) - self._n_features_out = self.n_clusters self._n_threads = _openmp_effective_n_threads() @@ -265,8 +268,7 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): @wrap_output_data def predict(self, X): - self._check_feature_names(X, reset=False) - self._validate_params() + return dispatch( self, "predict", @@ -285,10 +287,7 @@ def predict( X, sample_weight="deprecated" if sklearn_check_version("1.3") else None, ): - if sklearn_check_version("1.0"): - self._check_feature_names(X, reset=False) - if sklearn_check_version("1.2"): - self._validate_params() + return dispatch( self, "predict", @@ -297,10 +296,12 @@ def predict( "sklearn": sklearn_KMeans.predict, }, X, - sample_weight, + sample_weight=sample_weight, ) def _onedal_predict(self, X, sample_weight=None, queue=None): + check_is_fitted(self) + self._validate_params() X = self._validate_data( X, accept_sparse="csr", @@ -308,15 +309,11 @@ def _onedal_predict(self, X, sample_weight=None, queue=None): dtype=[np.float64, np.float32], ) - if not sklearn_check_version("1.5"): - if ( - sklearn_check_version("1.3") - and isinstance(sample_weight, str) - and sample_weight == "deprecated" - ): + if not sklearn_check_version("1.5") and sklearn_check_version("1.3"): + if isinstance(sample_weight, str) and sample_weight == "deprecated": sample_weight = None - if sklearn_check_version("1.3") and sample_weight is not None: + if sample_weight: warnings.warn( "'sample_weight' was deprecated in version 1.3 and " "will be removed in 1.5.", @@ -338,11 +335,8 @@ def _onedal_supported(self, method_name, *data): f"Unknown method {method_name} in {self.__class__.__name__}" ) - def _onedal_gpu_supported(self, method_name, *data): - return self._onedal_supported(method_name, *data) - - def _onedal_cpu_supported(self, method_name, *data): - return self._onedal_supported(method_name, *data) + _onedal_gpu_supported = _onedal_supported + _onedal_cpu_supported = _onedal_supported @wrap_output_data def fit_transform(self, X, y=None, sample_weight=None): From ae66a9e33c117a76e4daeadc2b7dfebab82dc4bc Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 25 Jun 2024 11:04:58 -0700 Subject: [PATCH 071/130] minor --- sklearnex/cluster/k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 864a16b81b..0d427c350b 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -301,7 +301,7 @@ def predict( def _onedal_predict(self, X, sample_weight=None, queue=None): check_is_fitted(self) - self._validate_params() + X = self._validate_data( X, accept_sparse="csr", From 0020d1b63afcd0b80cffc3ab4e3ebbf9c05905f0 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 25 Jun 2024 11:12:14 -0700 Subject: [PATCH 072/130] update comments --- onedal/cluster/kmeans.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 1d659a1302..082bc30a3d 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -175,6 +175,7 @@ def _init_centroids_onedal( ): n_clusters = self.n_clusters if n_centroids is None else n_centroids # Use host policy for KMeans init, only for csr data + # as oneDAL KMeansInit for CSR data is not implemented on GPU init_policy = self._get_policy(None, None) if is_csr else policy if isinstance(init, str) and init == "k-means++": @@ -201,13 +202,14 @@ def _init_centroids_onedal( centers_table = alg.compute_raw(X_table, init_policy, dtype) elif _is_arraylike_not_scalar(init): if _is_csr(init): - # oneDAL KMeans doesn't support sparse centroids + # oneDAL KMeans only supports Dense Centroids centers = init.toarray() else: centers = np.asarray(init) assert centers.shape[0] == n_clusters assert centers.shape[1] == X_table.column_count - # Use original policy for KMeans init when arraylike init is provided + # KMeans is implemented on both CPU and GPU for Dense and CSR data + # The original policy can be used here centers = _convert_to_supported(policy, centers) centers_table = to_table(centers) else: From ae7738830c0f53472203ef24a829650932782afc Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 25 Jun 2024 13:56:40 -0700 Subject: [PATCH 073/130] refactor --- deselected_tests.yaml | 2 + onedal/cluster/kmeans.py | 4 +- sklearnex/cluster/k_means.py | 77 ++++++------------------------------ 3 files changed, 17 insertions(+), 66 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 478c73233f..18635672ef 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -184,6 +184,8 @@ deselected_tests: # oneAPI Data Analytics Library (oneDAL) does not check convergence for tol == 0.0 for ease of benchmarking - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 + # oneDAL uses lloyd algorithm for elkan, so doesn't make sense to raise the warning + - cluster/tests/test_k_means.py::test_warning_elkan_1_cluster # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 082bc30a3d..2ec8ca8f90 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -309,14 +309,14 @@ def is_better_iteration(inertia, labels): ) if self.verbose: - print("Initialization complete.") + print("Initialization complete") labels, inertia, model, n_iter = self._fit_backend( X_table, centroids_table, module, policy, dtype, is_csr ) if self.verbose: - print("KMeans iteration completed with inertia {}.".format(inertia)) + print("Iteration {}, inertia {}.".format(n_iter, inertia)) if is_better_iteration(inertia, labels): best_model, best_n_iter = model, n_iter diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 0d427c350b..ab4143f47b 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -19,7 +19,6 @@ from daal4py.sklearn._utils import daal_check_version if daal_check_version((2023, "P", 200)): - from abc import ABC import numpy as np from scipy.sparse import issparse @@ -41,68 +40,8 @@ from .._device_offload import dispatch, wrap_output_data from .._utils import PatchingConditionsChain - class BaseKMeans(ABC): - @property - def _cluster_centers_(self): - return self.__cluster_centers_ - - @_cluster_centers_.setter - def _cluster_centers_(self, value): - self.__cluster_centers_ = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.cluster_centers_ = value - - @property - def labels_(self): - return self.__labels - - @labels_.setter - def labels_(self, value): - self.__labels = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.labels_ = value - - @property - def inertia_(self): - return self.__inertia - - @inertia_.setter - def inertia_(self, value): - self.__inertia = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.inertia_ = value - - @property - def n_iter_(self): - return self.__n_iter - - @n_iter_.setter - def n_iter_(self, value): - self.__n_iter = value - if hasattr(self, "_onedal_estimator"): - self._onedal_estimator.n_iter_ = value - - def _save_attributes(self): - assert hasattr(self, "_onedal_estimator") - self.n_features_in_ = self._onedal_estimator.n_features_in_ - self.fit_status_ = 0 - self._tol = self._onedal_estimator._tol - self._n_init = self._onedal_estimator._n_init - self._n_iter_ = self._onedal_estimator.n_iter_ - self._labels_ = self._onedal_estimator.labels_ - self._inertia_ = self._onedal_estimator.inertia_ - self._algorithm = self._onedal_estimator.algorithm - self._cluster_centers_ = self._onedal_estimator.cluster_centers_ - - self._is_in_fit = True - self.n_iter_ = self._n_iter_ - self.labels_ = self._labels_ - self.inertia_ = self._inertia_ - self.cluster_centers_ = self._cluster_centers_ - self._is_in_fit = False - @control_n_jobs(decorated_methods=["fit", "predict", "transform", "fit_transform"]) - class KMeans(sklearn_KMeans, BaseKMeans): + class KMeans(sklearn_KMeans): __doc__ = sklearn_KMeans.__doc__ n_iter_, inertia_ = None, None labels_, cluster_centers_ = None, None @@ -168,7 +107,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): ) or not issparse(X) _acceptable_sample_weights = True - if sample_weight: + if sample_weight is not None: sample_weight = _check_sample_weight( sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None ) @@ -207,7 +146,7 @@ def fit(self, X, y=None, sample_weight=None): }, X, y, - sample_weight=sample_weight, + sample_weight, ) return self @@ -351,6 +290,16 @@ def transform(self, X): score = support_usm_ndarray()(sklearn_KMeans.score) + def _save_attributes(self): + assert hasattr(self, "_onedal_estimator") + self.cluster_centers_ = self._onedal_estimator.cluster_centers_ + self.labels_ = self._onedal_estimator.labels_ + self.inertia_ = self._onedal_estimator.inertia_ + self.n_iter_ = self._onedal_estimator.n_iter_ + self.n_features_in_ = self._onedal_estimator.n_features_in_ + + self._n_init = self._onedal_estimator._n_init + fit.__doc__ = sklearn_KMeans.fit.__doc__ predict.__doc__ = sklearn_KMeans.predict.__doc__ transform.__doc__ = sklearn_KMeans.transform.__doc__ From 698adacea68a3434ad0ef6d5a221646ace4f862f Mon Sep 17 00:00:00 2001 From: "md.shafiul.alam" Date: Tue, 25 Jun 2024 16:48:35 -0700 Subject: [PATCH 074/130] ci --- deselected_tests.yaml | 2 -- sklearnex/cluster/k_means.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 18635672ef..478c73233f 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -184,8 +184,6 @@ deselected_tests: # oneAPI Data Analytics Library (oneDAL) does not check convergence for tol == 0.0 for ease of benchmarking - cluster/tests/test_k_means.py::test_kmeans_convergence >=0.23 - cluster/tests/test_k_means.py::test_kmeans_verbose >=0.23 - # oneDAL uses lloyd algorithm for elkan, so doesn't make sense to raise the warning - - cluster/tests/test_k_means.py::test_warning_elkan_1_cluster # The Newton-CG solver solution computed in float32 disagrees with that of float64 by a small # margin above the test threshold, see https://github.com/scikit-learn/scikit-learn/pull/13645 diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index ab4143f47b..cfca09064b 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -154,6 +154,7 @@ def fit(self, X, y=None, sample_weight=None): def _onedal_fit(self, X, _, sample_weight, queue=None): if sklearn_check_version("1.2"): self._validate_params() + self._check_params_vs_input(X) else: self._check_params(X) From 7f1114c6dbebf58e5c3ef2f4ec818e559b0b7a85 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 26 Jun 2024 06:17:32 -0700 Subject: [PATCH 075/130] address ci --- deselected_tests.yaml | 3 ++- sklearnex/cluster/k_means.py | 16 ++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 478c73233f..eb59f14e3f 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -452,7 +452,8 @@ gpu: - cluster/tests/test_dbscan.py::test_weighted_dbscan # Different number of iterations for tol = 1e-100 - - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse] + - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] + - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] - ensemble/tests/test_bagging.py::test_gridsearch - ensemble/tests/test_bagging.py::test_estimators_samples diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index cfca09064b..8e20c48e75 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -136,6 +136,8 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): return patching_status def fit(self, X, y=None, sample_weight=None): + if sklearn_check_version("1.2"): + self._validate_params() dispatch( self, @@ -152,18 +154,17 @@ def fit(self, X, y=None, sample_weight=None): return self def _onedal_fit(self, X, _, sample_weight, queue=None): - if sklearn_check_version("1.2"): - self._validate_params() - self._check_params_vs_input(X) - else: - self._check_params(X) - X = self._validate_data( X, accept_sparse="csr", dtype=[np.float64, np.float32], ) + if sklearn_check_version("1.2"): + self._check_params_vs_input(X) + else: + self._check_params(X) + self._n_features_out = self.n_clusters self._n_threads = _openmp_effective_n_threads() @@ -208,6 +209,7 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): @wrap_output_data def predict(self, X): + self._validate_params() return dispatch( self, @@ -227,6 +229,8 @@ def predict( X, sample_weight="deprecated" if sklearn_check_version("1.3") else None, ): + if sklearn_check_version("1.2"): + self._validate_params() return dispatch( self, From 2850a8517a5c6c042a47f40b438bfc85af5dafa5 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 9 Jul 2024 05:41:36 -0700 Subject: [PATCH 076/130] update test --- sklearnex/cluster/tests/test_kmeans.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 271b3ea908..7a1ba79449 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -33,7 +33,6 @@ def test_sklearnex_import(dataframe, queue): X_train = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) X_test = np.array([[0, 0], [12, 3]]) - expected_cluster_labels = np.array([1, 0], dtype=np.int32) X_train = _convert_to_dataframe(X_train, sycl_queue=queue, target_df=dataframe) X_test = _convert_to_dataframe(X_test, sycl_queue=queue, target_df=dataframe) @@ -44,4 +43,9 @@ def test_sklearnex_import(dataframe, queue): assert "daal4py" in kmeans.__module__ result_cluster_labels = kmeans.predict(X_test) + if queue and queue.sycl_device.is_gpu: + # KMeans Init Dense GPU implementation is different from CPU + expected_cluster_labels = np.array([0, 1], dtype=np.int32) + else: + expected_cluster_labels = np.array([1, 0], dtype=np.int32) assert_allclose(expected_cluster_labels, _as_numpy(result_cluster_labels)) From db40680d279416ae2859f704a643e922f9077e17 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 9 Jul 2024 05:44:09 -0700 Subject: [PATCH 077/130] version check --- sklearnex/cluster/k_means.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 8e20c48e75..cf5f9a48fe 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -209,7 +209,8 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): @wrap_output_data def predict(self, X): - self._validate_params() + if sklearn_check_version("1.2"): + self._validate_params() return dispatch( self, From ad38abdcd8791494281e3b1e94776cbe6ae9b84b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 9 Jul 2024 08:07:16 -0700 Subject: [PATCH 078/130] lint --- sklearnex/cluster/tests/test_kmeans.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 7a1ba79449..f92361f1b9 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -16,7 +16,6 @@ import numpy as np import pytest -import pytest from numpy.testing import assert_allclose from daal4py.sklearn._utils import daal_check_version From eea103b79b7e8716722df1f556ddef5fd93631d0 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 11 Jul 2024 22:06:27 -0700 Subject: [PATCH 079/130] minor fix --- sklearnex/cluster/k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index cf5f9a48fe..abaf744166 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -31,7 +31,7 @@ check_is_fitted, ) - from daal4py.sklearn._device_offload import support_usm_ndarray + from onedal._device_offload import support_usm_ndarray from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version from onedal.cluster import KMeans as onedal_KMeans From d84d1c84a14e7980902111675dddc772a58cd8dd Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 11 Jul 2024 22:08:45 -0700 Subject: [PATCH 080/130] lint --- sklearnex/cluster/k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index abaf744166..3d59c3a88a 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -31,9 +31,9 @@ check_is_fitted, ) - from onedal._device_offload import support_usm_ndarray from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version + from onedal._device_offload import support_usm_ndarray from onedal.cluster import KMeans as onedal_KMeans from onedal.utils import _is_csr From 235aa13b92a94f9044a643212670c52cb84a2b9b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 11 Jul 2024 22:50:28 -0700 Subject: [PATCH 081/130] basic stat fix --- onedal/cluster/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 2ec8ca8f90..061ae2b87d 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -84,7 +84,7 @@ def _tolerance(self, X_table, rtol, is_csr, policy, dtype): dummy = to_table(None) bs = BasicStatistics("variance") - res = bs.compute_raw(X_table, dummy, policy, dtype, is_csr) + res = bs._compute_raw(X_table, dummy, policy, dtype, is_csr) mean_var = from_table(res["variance"]).mean() return mean_var * rtol From a32389496ef2389b0884f6d9a0bc60ef4365cccd Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 15 Jul 2024 06:58:10 -0700 Subject: [PATCH 082/130] score --- onedal/cluster/kmeans.cpp | 3 -- onedal/cluster/kmeans.py | 40 +++++++++++++++++++------ sklearnex/cluster/k_means.py | 58 +++++++++++++++++++++++++++++++++--- 3 files changed, 85 insertions(+), 16 deletions(-) diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp index b88612bd9c..d4bdb4700c 100644 --- a/onedal/cluster/kmeans.cpp +++ b/onedal/cluster/kmeans.cpp @@ -69,9 +69,6 @@ struct params2desc { desc.set_max_iteration_count(params["max_iteration_count"].cast()); #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200 auto result_options = params["result_options"].cast(); - if (result_options == "compute_assignments") { - desc.set_result_options(result_options::compute_assignments); - } if (result_options == "compute_exact_objective_function") { desc.set_result_options(result_options::compute_exact_objective_function); } diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 061ae2b87d..05a4ae256c 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -368,21 +368,28 @@ def _set_cluster_centers(self, cluster_centers): cluster_centers_ = property(_get_cluster_centers, _set_cluster_centers) - def _predict_raw(self, X_table, module, policy, dtype=np.float32, is_csr=False): - params = self._get_onedal_params(is_csr, dtype) - - result = module.infer(policy, params, self.model_, X_table) - - return from_table(result.responses).reshape(-1) - - def _predict(self, X, module, queue=None): + def _predict(self, X, module, queue=None, result_options=None): is_csr = _is_csr(X) policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) X_table, dtype = to_table(X), X.dtype + params = self._get_onedal_params(is_csr, dtype, result_options) - return self._predict_raw(X_table, module, policy, dtype, is_csr) + result = module.infer(policy, params, self.model_, X_table) + + if result_options: + # Only set for score function + return result.objective_function_value * -1 + else: + return result.responses.ravel() + + def _score(self, X, module, queue=None): + result_options = "compute_exact_objective_function" + + return self._predict( + X, self._get_backend("kmeans", "clustering", None), queue, result_options + ) def _transform(self, X): return euclidean_distances(X, self.cluster_centers_) @@ -499,6 +506,21 @@ def transform(self, X): return self._transform(X) + def score(self, X, queue=None): + """Opposite of the value of X on the K-means objective. + + Parameters + ---------- + X: {array-like, sparse matrix} of shape (n_samples, n_features) + New data. + + Returns + ------- + score: float + Opposite of the value of X on the K-means objective. + """ + return super()._score(X, self._get_backend("kmeans", "clustering", None), queue) + def k_means( X, diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 3d59c3a88a..6ed364a12c 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -174,8 +174,6 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._save_attributes() def _onedal_predict_supported(self, method_name, X, sample_weight=None): - assert method_name == "predict" - class_name = self.__class__.__name__ is_data_supported = ( _is_csr(X) and daal_check_version((2024, "P", 600)) @@ -186,6 +184,15 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): supported_algs = ["auto", "full", "lloyd", "elkan"] + _acceptable_sample_weights = True + if sample_weight is not None: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None + ) + _acceptable_sample_weights = np.allclose( + sample_weight, np.ones_like(sample_weight) + ) + patching_status.and_conditions( [ ( @@ -200,6 +207,10 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): hasattr(self, "_onedal_estimator"), "oneDAL model was not fit.", ), + ( + _acceptable_sample_weights, + "oneDAL doesn't support sample_weight, either None or ones are acceptable", + ), ] ) @@ -274,7 +285,7 @@ def _onedal_predict(self, X, sample_weight=None, queue=None): def _onedal_supported(self, method_name, *data): if method_name == "fit": return self._onedal_fit_supported(method_name, *data) - if method_name == "predict": + if method_name in ["predict", "score"]: return self._onedal_predict_supported(method_name, *data) raise RuntimeError( f"Unknown method {method_name} in {self.__class__.__name__}" @@ -294,7 +305,46 @@ def transform(self, X): X = self._check_test_data(X) return self._transform(X) - score = support_usm_ndarray()(sklearn_KMeans.score) + @wrap_output_data + def score(self, X, y=None, sample_weight=None): + return dispatch( + self, + "score", + { + "onedal": self.__class__._onedal_score, + "sklearn": sklearn_KMeans.score, + }, + X, + y, + sample_weight=sample_weight, + ) + + def _onedal_score(self, X, y, sample_weight=None, queue=None): + check_is_fitted(self) + + X = self._validate_data( + X, + accept_sparse="csr", + reset=False, + dtype=[np.float64, np.float32], + ) + + if not sklearn_check_version("1.5") and sklearn_check_version("1.3"): + if isinstance(sample_weight, str) and sample_weight == "deprecated": + sample_weight = None + + if sample_weight: + warnings.warn( + "'sample_weight' was deprecated in version 1.3 and " + "will be removed in 1.5.", + FutureWarning, + ) + + if not hasattr(self, "_onedal_estimator"): + self._initialize_onedal_estimator() + self._onedal_estimator.cluster_centers_ = self.cluster_centers_ + + return self._onedal_estimator.score(X, queue=queue) def _save_attributes(self): assert hasattr(self, "_onedal_estimator") From 62639cd68a2a1d3e9a3e1f7886754ca0d2a5770f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 15 Jul 2024 07:23:16 -0700 Subject: [PATCH 083/130] minor --- onedal/cluster/kmeans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 05a4ae256c..9a7ca82417 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -380,9 +380,9 @@ def _predict(self, X, module, queue=None, result_options=None): if result_options: # Only set for score function - return result.objective_function_value * -1 + return from_table(result.objective_function_value) * -1 else: - return result.responses.ravel() + return from_table(result.responses).ravel() def _score(self, X, module, queue=None): result_options = "compute_exact_objective_function" From 979ced6e34e036c0fa5d816b347ceabd0abb64c9 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 15 Jul 2024 08:31:39 -0700 Subject: [PATCH 084/130] ci fix + refactor --- onedal/cluster/kmeans.cpp | 3 ++- onedal/cluster/kmeans.py | 2 +- sklearnex/cluster/k_means.py | 4 +--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp index d4bdb4700c..6fdefebd4b 100644 --- a/onedal/cluster/kmeans.cpp +++ b/onedal/cluster/kmeans.cpp @@ -157,7 +157,8 @@ void init_infer_result(py::module_& m) { auto cls = py::class_(m, "infer_result") .def(py::init()) - .DEF_ONEDAL_PY_PROPERTY(responses, result_t); + .DEF_ONEDAL_PY_PROPERTY(responses, result_t) + .DEF_ONEDAL_PY_PROPERTY(objective_function_value, result_t); } ONEDAL_PY_DECLARE_INSTANTIATOR(init_model); diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 9a7ca82417..5697c64199 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -380,7 +380,7 @@ def _predict(self, X, module, queue=None, result_options=None): if result_options: # Only set for score function - return from_table(result.objective_function_value) * -1 + return from_table(result.objective_function_value)[0] * -1 else: return from_table(result.responses).ravel() diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 6ed364a12c..2a74764907 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -23,7 +23,6 @@ import numpy as np from scipy.sparse import issparse from sklearn.cluster import KMeans as sklearn_KMeans - from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.validation import ( _check_sample_weight, _deprecate_positional_args, @@ -33,7 +32,6 @@ from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version - from onedal._device_offload import support_usm_ndarray from onedal.cluster import KMeans as onedal_KMeans from onedal.utils import _is_csr @@ -166,7 +164,6 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._check_params(X) self._n_features_out = self.n_clusters - self._n_threads = _openmp_effective_n_threads() self._initialize_onedal_estimator() self._onedal_estimator.fit(X, queue=queue) @@ -360,6 +357,7 @@ def _save_attributes(self): predict.__doc__ = sklearn_KMeans.predict.__doc__ transform.__doc__ = sklearn_KMeans.transform.__doc__ fit_transform.__doc__ = sklearn_KMeans.fit_transform.__doc__ + score.__doc__ = sklearn_KMeans.score.__doc__ else: from daal4py.sklearn.cluster import KMeans From eb72712f805171a965a221a7c0b14f233bbb0c2e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 15 Jul 2024 09:41:28 -0700 Subject: [PATCH 085/130] more fixes --- sklearnex/cluster/k_means.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 2a74764907..88c3529719 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -23,6 +23,7 @@ import numpy as np from scipy.sparse import issparse from sklearn.cluster import KMeans as sklearn_KMeans + from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.validation import ( _check_sample_weight, _deprecate_positional_args, @@ -166,6 +167,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._n_features_out = self.n_clusters self._initialize_onedal_estimator() + self._n_threads = _openmp_effective_n_threads() self._onedal_estimator.fit(X, queue=queue) self._save_attributes() @@ -200,13 +202,9 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): is_data_supported, "Supported data formats: Dense, CSR (oneDAL version >= 2024.6.0).", ), - ( - hasattr(self, "_onedal_estimator"), - "oneDAL model was not fit.", - ), ( _acceptable_sample_weights, - "oneDAL doesn't support sample_weight, either None or ones are acceptable", + "oneDAL doesn't support sample_weight, None or ones are acceptable", ), ] ) From dd552ff5121b4f6b2e93760932f19f8487676c4d Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 15 Jul 2024 14:10:28 -0700 Subject: [PATCH 086/130] not a table --- onedal/cluster/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 5697c64199..26c582e5e4 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -380,7 +380,7 @@ def _predict(self, X, module, queue=None, result_options=None): if result_options: # Only set for score function - return from_table(result.objective_function_value)[0] * -1 + return result.objective_function_value * -1 else: return from_table(result.responses).ravel() From 83f28ca59c7a5661a6a1e5e62d8a9238639e8a21 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 15 Jul 2024 14:18:49 -0700 Subject: [PATCH 087/130] minor --- onedal/cluster/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 26c582e5e4..bd40e1e3a8 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -380,7 +380,7 @@ def _predict(self, X, module, queue=None, result_options=None): if result_options: # Only set for score function - return result.objective_function_value * -1 + return result.objective_function_value * (-1) else: return from_table(result.responses).ravel() From 47693a4204a8a23c75f5affe480874c4af240251 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 15 Jul 2024 15:01:17 -0700 Subject: [PATCH 088/130] sample weight --- sklearnex/cluster/k_means.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 88c3529719..1aad0833ee 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -264,7 +264,7 @@ def _onedal_predict(self, X, sample_weight=None, queue=None): if isinstance(sample_weight, str) and sample_weight == "deprecated": sample_weight = None - if sample_weight: + if sample_weight is not None: warnings.warn( "'sample_weight' was deprecated in version 1.3 and " "will be removed in 1.5.", @@ -328,7 +328,7 @@ def _onedal_score(self, X, y, sample_weight=None, queue=None): if isinstance(sample_weight, str) and sample_weight == "deprecated": sample_weight = None - if sample_weight: + if sample_weight is not None: warnings.warn( "'sample_weight' was deprecated in version 1.3 and " "will be removed in 1.5.", From 945c93dbe2d714833997c818c15e387b79898075 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 15 Jul 2024 15:59:38 -0700 Subject: [PATCH 089/130] import --- sklearnex/cluster/k_means.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 1aad0833ee..9b40da1e58 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -20,6 +20,8 @@ if daal_check_version((2023, "P", 200)): + import warnings + import numpy as np from scipy.sparse import issparse from sklearn.cluster import KMeans as sklearn_KMeans From c457e502e35bdf74740e99d869a12ca31533e06a Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 5 Aug 2024 09:42:51 -0700 Subject: [PATCH 090/130] preview remove --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 1c5a815598..d89df18628 100644 --- a/setup.py +++ b/setup.py @@ -569,7 +569,6 @@ def run(self): "sklearnex.neighbors", "sklearnex.preview", "sklearnex.preview.covariance", - "sklearnex.preview.cluster", "sklearnex.preview.decomposition", "sklearnex.preview.linear_model", "sklearnex.svm", From d231333ba6cafcfb05b150dcfc4b5d577d3b1dcc Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 6 Aug 2024 12:35:38 -0700 Subject: [PATCH 091/130] SPMD fix --- onedal/cluster/kmeans.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index bd40e1e3a8..bf37cb16f5 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -22,6 +22,7 @@ from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype from onedal import _backend from onedal.basic_statistics import BasicStatistics +from onedal.spmd.basic_statistics import BasicStatistics as BasicStatistics_SPMD from ..datatypes import _convert_to_supported, from_table, to_table @@ -32,7 +33,6 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.metrics.pairwise import euclidean_distances from sklearn.utils import check_random_state -from sklearn.utils.sparsefuncs import mean_variance_axis from ..common._base import BaseEstimator as onedal_BaseEstimator from ..common._mixin import ClusterMixin, TransformerMixin @@ -82,10 +82,15 @@ def _tolerance(self, X_table, rtol, is_csr, policy, dtype): if rtol == 0.0: return rtol dummy = to_table(None) - bs = BasicStatistics("variance") + + if not isinstance(policy, _SPMDDataParallelInteropPolicy): + bs = BasicStatistics("variance") + else: + bs = BasicStatistics_SPMD("variance") res = bs._compute_raw(X_table, dummy, policy, dtype, is_csr) mean_var = from_table(res["variance"]).mean() + return mean_var * rtol def _check_params_vs_input( From 354446b0a6becaffccef62bf8a6fef61df4d8578 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 6 Aug 2024 13:27:59 -0700 Subject: [PATCH 092/130] SPMD fix --- deselected_tests.yaml | 1 - onedal/cluster/kmeans.py | 17 ++++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index aa5493caf2..08e31fea1c 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -460,7 +460,6 @@ gpu: # Fails - cluster/tests/test_dbscan.py::test_weighted_dbscan - # Different number of iterations for tol = 1e-100 - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index bf37cb16f5..871bcd1d03 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -22,7 +22,11 @@ from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype from onedal import _backend from onedal.basic_statistics import BasicStatistics -from onedal.spmd.basic_statistics import BasicStatistics as BasicStatistics_SPMD + +try: + from onedal.spmd.basic_statistics import BasicStatistics as BasicStatistics_SPMD +except ImportError: + BasicStatistics_SPMD = None from ..datatypes import _convert_to_supported, from_table, to_table @@ -36,6 +40,7 @@ from ..common._base import BaseEstimator as onedal_BaseEstimator from ..common._mixin import ClusterMixin, TransformerMixin +from ..common._spmd_policy import _SPMDDataParallelInteropPolicy as spmd_policy from ..utils import _check_array, _is_arraylike_not_scalar, _is_csr @@ -83,10 +88,14 @@ def _tolerance(self, X_table, rtol, is_csr, policy, dtype): return rtol dummy = to_table(None) - if not isinstance(policy, _SPMDDataParallelInteropPolicy): + if not isinstance(policy, spmd_policy): bs = BasicStatistics("variance") - else: + elif BasicStatistics_SPMD is not None: bs = BasicStatistics_SPMD("variance") + else: + raise ImportError( + "Failed to import BasicStatistics from onedal.spmd, check if SPMD backend was built properly" + ) res = bs._compute_raw(X_table, dummy, policy, dtype, is_csr) mean_var = from_table(res["variance"]).mean() @@ -105,8 +114,6 @@ def _check_params_vs_input( # tol self._tol = self._tolerance(X_table, self.tol, is_csr, policy, dtype) - # n-init - # TODO(1.4): Remove self._n_init = self.n_init if self._n_init == "warn": warnings.warn( From 02e49f56c0249eb65018166ef5d5bec7c2bb72a5 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 6 Aug 2024 14:31:04 -0700 Subject: [PATCH 093/130] SPMD fix --- onedal/cluster/kmeans.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 871bcd1d03..fac5c09636 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -40,7 +40,7 @@ from ..common._base import BaseEstimator as onedal_BaseEstimator from ..common._mixin import ClusterMixin, TransformerMixin -from ..common._spmd_policy import _SPMDDataParallelInteropPolicy as spmd_policy +from ..common._policy import _DataParallelInteropPolicy, _HostInteropPolicy from ..utils import _check_array, _is_arraylike_not_scalar, _is_csr @@ -88,14 +88,14 @@ def _tolerance(self, X_table, rtol, is_csr, policy, dtype): return rtol dummy = to_table(None) - if not isinstance(policy, spmd_policy): + if isinstance(policy, _HostInteropPolicy) or isinstance( + policy, _DataParallelInteropPolicy + ): bs = BasicStatistics("variance") elif BasicStatistics_SPMD is not None: bs = BasicStatistics_SPMD("variance") else: - raise ImportError( - "Failed to import BasicStatistics from onedal.spmd, check if SPMD backend was built properly" - ) + raise ImportError("Failed to import BasicStatistics from onedal.spmd") res = bs._compute_raw(X_table, dummy, policy, dtype, is_csr) mean_var = from_table(res["variance"]).mean() From 7e099c44504574d19bc44092d9e63bc819f28a5f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 6 Aug 2024 14:53:18 -0700 Subject: [PATCH 094/130] refactor --- onedal/cluster/kmeans.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index fac5c09636..09a8a330b5 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -28,7 +28,10 @@ except ImportError: BasicStatistics_SPMD = None -from ..datatypes import _convert_to_supported, from_table, to_table +try: + from ..common._policy import _DataParallelInteropPolicy as dp_policy +except ImportError: + dp_policy = None if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit @@ -40,7 +43,8 @@ from ..common._base import BaseEstimator as onedal_BaseEstimator from ..common._mixin import ClusterMixin, TransformerMixin -from ..common._policy import _DataParallelInteropPolicy, _HostInteropPolicy +from ..common._policy import _HostInteropPolicy as host_policy +from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import _check_array, _is_arraylike_not_scalar, _is_csr @@ -88,9 +92,9 @@ def _tolerance(self, X_table, rtol, is_csr, policy, dtype): return rtol dummy = to_table(None) - if isinstance(policy, _HostInteropPolicy) or isinstance( - policy, _DataParallelInteropPolicy - ): + _is_host_policy = isinstance(policy, host_policy) + _is_dp_policy = dp_policy is not None and isinstance(policy, dp_policy) + if _is_host_policy or _is_dp_policy: bs = BasicStatistics("variance") elif BasicStatistics_SPMD is not None: bs = BasicStatistics_SPMD("variance") From e820c0fe3849d14dcf944096fbbd64fdbd3b38db Mon Sep 17 00:00:00 2001 From: "md.shafiul.alam" Date: Tue, 6 Aug 2024 15:55:42 -0700 Subject: [PATCH 095/130] deselect --- deselected_tests.yaml | 4 ++-- onedal/cluster/kmeans.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 08e31fea1c..db6ff1f4bb 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -460,8 +460,8 @@ gpu: # Fails - cluster/tests/test_dbscan.py::test_weighted_dbscan - - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] - - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] + - cluster/tests/test_kmeans.py::test_kmeans_elkan_results + - cluster/tests/test_kmeans.py::test_unsupervised_grid_search - ensemble/tests/test_bagging.py::test_gridsearch - ensemble/tests/test_bagging.py::test_estimators_samples diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 09a8a330b5..acbefcb2cb 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -394,8 +394,7 @@ def _predict(self, X, module, queue=None, result_options=None): result = module.infer(policy, params, self.model_, X_table) - if result_options: - # Only set for score function + if result_options: # This is only set for score function return result.objective_function_value * (-1) else: return from_table(result.responses).ravel() From c0cab69a7636065a03c8e7e6cb912d36a9d1d785 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 6 Aug 2024 23:20:42 -0700 Subject: [PATCH 096/130] deselect refactor --- deselected_tests.yaml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index db6ff1f4bb..ceab401d35 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -377,8 +377,6 @@ deselected_tests: - model_selection/tests/test_classification_threshold.py::test_fit_and_score_over_thresholds_sample_weight >=1.5 - model_selection/tests/test_classification_threshold.py::test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence >=1.5 - # -------------------------------------------------------- - # No need to test daal4py patching reduced_tests: - cluster/tests/test_affinity_propagation.py - cluster/tests/test_bicluster.py @@ -450,16 +448,11 @@ public: # Fails from numpy 2.0 and sklearn 1.4+ - neighbors/tests/test_neighbors.py::test_KNeighborsClassifier_raise_on_all_zero_weights - # -------------------------------------------------------- - # The following tests currently fail with GPU offload gpu: - # Segfaults - ensemble/tests/test_weight_boosting.py - # Fails - cluster/tests/test_dbscan.py::test_weighted_dbscan - - cluster/tests/test_kmeans.py::test_kmeans_elkan_results - cluster/tests/test_kmeans.py::test_unsupervised_grid_search @@ -1121,3 +1114,6 @@ gpu: - tests/test_common.py::test_estimators[DBSCAN()-check_fit2d_predict1d] - tests/test_common.py::test_check_n_features_in_after_fitting[DBSCAN()] - tests/test_common.py::test_check_n_features_in_after_fitting[SVC()] + +preview: + # The following preview tests are deselected. From e764442315dd95da38701152ae2cea26dcdbbe1f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 7 Aug 2024 00:35:07 -0700 Subject: [PATCH 097/130] deselect update --- deselected_tests.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index ceab401d35..6f5d62b3fa 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -1114,6 +1114,3 @@ gpu: - tests/test_common.py::test_estimators[DBSCAN()-check_fit2d_predict1d] - tests/test_common.py::test_check_n_features_in_after_fitting[DBSCAN()] - tests/test_common.py::test_check_n_features_in_after_fitting[SVC()] - -preview: - # The following preview tests are deselected. From 1fd3c63df0e0ec440684dd4353f18c8f3769f6cd Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 7 Aug 2024 00:50:48 -0700 Subject: [PATCH 098/130] deselect update --- deselected_tests.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 6f5d62b3fa..39d1e456a6 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -453,8 +453,9 @@ gpu: - ensemble/tests/test_weight_boosting.py # Fails - cluster/tests/test_dbscan.py::test_weighted_dbscan - - cluster/tests/test_kmeans.py::test_kmeans_elkan_results - - cluster/tests/test_kmeans.py::test_unsupervised_grid_search + - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] + - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] + - model_selection/tests/test_search.py::test_unsupervised_grid_search - ensemble/tests/test_bagging.py::test_gridsearch - ensemble/tests/test_bagging.py::test_estimators_samples From 2a7f88bc6ee22fe8d9ee7a96a9e466d005332c9f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 7 Aug 2024 00:56:48 -0700 Subject: [PATCH 099/130] deselect update --- deselected_tests.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 39d1e456a6..b9685960fc 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -453,8 +453,7 @@ gpu: - ensemble/tests/test_weight_boosting.py # Fails - cluster/tests/test_dbscan.py::test_weighted_dbscan - - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] - - cluster/tests/test_kmeans.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] + - cluster/tests/test_kmeans.py::test_kmeans_elkan_results - model_selection/tests/test_search.py::test_unsupervised_grid_search - ensemble/tests/test_bagging.py::test_gridsearch From 83e3a083f28d7932ab751c8790994155caf33547 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Wed, 7 Aug 2024 07:18:06 -0700 Subject: [PATCH 100/130] deselect --- deselected_tests.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index b9685960fc..4569bbe6ad 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -453,7 +453,8 @@ gpu: - ensemble/tests/test_weight_boosting.py # Fails - cluster/tests/test_dbscan.py::test_weighted_dbscan - - cluster/tests/test_kmeans.py::test_kmeans_elkan_results + - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] + - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] - model_selection/tests/test_search.py::test_unsupervised_grid_search - ensemble/tests/test_bagging.py::test_gridsearch From 772c9046d77ad65d9dcd5a0b969e9228928354c6 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 29 Aug 2024 07:55:51 -0700 Subject: [PATCH 101/130] reverting to previous --- .github/workflows/oneDAL.yml | 86 ++++ .github/workflows/renovate-validation.yml | 2 +- README.md | 2 +- conda-recipe/meta.yaml | 1 - daal4py/__init__.py | 2 - daal4py/oneapi/__init__.py | 66 --- daal4py/sklearn/_utils.py | 32 +- dependencies-dev | 7 +- deselected_tests.yaml | 408 +---------------- doc/daal4py/sklearn.rst | 4 +- doc/sources/algorithms.rst | 190 +++++++- doc/sources/distributed-mode.rst | 28 +- doc/sources/quick-start.rst | 8 - generator/wrapper_gen.py | 31 -- onedal/basic_statistics/basic_statistics.cpp | 1 + onedal/cluster/dbscan.cpp | 1 + onedal/cluster/kmeans_common.cpp | 2 - onedal/datatypes/data_conversion.cpp | 2 +- onedal/ensemble/forest.cpp | 1 - onedal/linear_model/__init__.py | 3 +- .../linear_model/incremental_linear_model.py | 110 +++++ onedal/linear_model/linear_model.cpp | 1 + onedal/linear_model/logistic_regression.cpp | 11 +- onedal/linear_model/logistic_regression.py | 22 +- .../test_incremental_ridge_regression.py | 107 +++++ .../tests/test_logistic_regression.py | 28 ++ onedal/neighbors/neighbors.cpp | 1 + onedal/primitives/optimizers.hpp | 4 - onedal/primitives/pairwise_distances.hpp | 2 - onedal/primitives/tree_visitor.cpp | 6 +- scripts/build_backend.py | 80 ---- scripts/version.py | 2 +- setup.py | 57 --- sklearnex/dispatcher.py | 14 + sklearnex/linear_model/__init__.py | 2 + sklearnex/linear_model/incremental_ridge.py | 418 ++++++++++++++++++ sklearnex/linear_model/logistic_regression.py | 65 ++- .../tests/test_incremental_ridge.py | 153 +++++++ sklearnex/linear_model/tests/test_logreg.py | 45 +- sklearnex/tests/test_memory_usage.py | 1 + src/daal4py.cpp | 5 - src/oneapi/oneapi.h | 90 ---- src/oneapi/oneapi.pyx | 176 -------- src/oneapi/oneapi_backend.cpp | 224 ---------- src/oneapi/oneapi_backend.h | 55 --- tests/daal4py/sycl/bf_knn_classification.py | 141 ------ tests/daal4py/sycl/covariance.py | 111 ----- tests/daal4py/sycl/covariance_streaming.py | 142 ------ tests/daal4py/sycl/dbscan.py | 117 ----- .../sycl/decision_forest_classification.py | 169 ------- .../decision_forest_classification_hist.py | 170 ------- .../sycl/decision_forest_regression.py | 152 ------- .../sycl/decision_forest_regression_hist.py | 153 ------- .../sycl/gradient_boosted_regression.py | 138 ------ tests/daal4py/sycl/kmeans.py | 123 ------ tests/daal4py/sycl/linear_regression.py | 146 ------ tests/daal4py/sycl/log_reg_binary_dense.py | 135 ------ tests/daal4py/sycl/log_reg_dense.py | 162 ------- tests/daal4py/sycl/low_order_moms_dense.py | 145 ------ .../daal4py/sycl/low_order_moms_streaming.py | 162 ------- tests/daal4py/sycl/pca.py | 122 ----- tests/daal4py/sycl/pca_transform.py | 107 ----- tests/daal4py/sycl/sklearn_sycl.py | 191 -------- tests/daal4py/sycl/svm.py | 157 ------- tests/run_examples.py | 44 +- 65 files changed, 1281 insertions(+), 4062 deletions(-) create mode 100644 .github/workflows/oneDAL.yml delete mode 100644 daal4py/oneapi/__init__.py create mode 100644 onedal/linear_model/tests/test_incremental_ridge_regression.py create mode 100644 sklearnex/linear_model/incremental_ridge.py create mode 100644 sklearnex/linear_model/tests/test_incremental_ridge.py delete mode 100755 src/oneapi/oneapi.h delete mode 100644 src/oneapi/oneapi.pyx delete mode 100644 src/oneapi/oneapi_backend.cpp delete mode 100644 src/oneapi/oneapi_backend.h delete mode 100644 tests/daal4py/sycl/bf_knn_classification.py delete mode 100644 tests/daal4py/sycl/covariance.py delete mode 100644 tests/daal4py/sycl/covariance_streaming.py delete mode 100644 tests/daal4py/sycl/dbscan.py delete mode 100644 tests/daal4py/sycl/decision_forest_classification.py delete mode 100755 tests/daal4py/sycl/decision_forest_classification_hist.py delete mode 100644 tests/daal4py/sycl/decision_forest_regression.py delete mode 100755 tests/daal4py/sycl/decision_forest_regression_hist.py delete mode 100644 tests/daal4py/sycl/gradient_boosted_regression.py delete mode 100644 tests/daal4py/sycl/kmeans.py delete mode 100644 tests/daal4py/sycl/linear_regression.py delete mode 100644 tests/daal4py/sycl/log_reg_binary_dense.py delete mode 100644 tests/daal4py/sycl/log_reg_dense.py delete mode 100644 tests/daal4py/sycl/low_order_moms_dense.py delete mode 100644 tests/daal4py/sycl/low_order_moms_streaming.py delete mode 100644 tests/daal4py/sycl/pca.py delete mode 100644 tests/daal4py/sycl/pca_transform.py delete mode 100644 tests/daal4py/sycl/sklearn_sycl.py delete mode 100755 tests/daal4py/sycl/svm.py diff --git a/.github/workflows/oneDAL.yml b/.github/workflows/oneDAL.yml new file mode 100644 index 0000000000..fd2111202c --- /dev/null +++ b/.github/workflows/oneDAL.yml @@ -0,0 +1,86 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +name: oneDAL-nightly + +on: + schedule: + - cron: '5 21 * * *' + workflow_dispatch: + +env: + OTHER_REPO: "oneapi-src/oneDAL" + WF_NAME: "Nightly-build" + +permissions: + contents: read + +jobs: + collect_artifacts: + name: Collect Artifacts + if: github.repository == 'intel/scikit-learn-intelex' + runs-on: ubuntu-latest + timeout-minutes: 120 + + steps: + - name: Get run ID of "Nightly-build" workflow + id: get-run-id + run: | + RUN_ID=`gh run --repo ${OTHER_REPO} list --workflow "${WF_NAME}" --json databaseId --jq .[0].databaseId` + echo "Detected latest run id of ${RUN_ID} for workflow ${WF_NAME}" + echo "run-id=${RUN_ID}" >> "$GITHUB_OUTPUT" + env: + GH_TOKEN: ${{ secrets.INTEL_DAAL_PAT }} + - name: Qualify "Nightly-build" workflow + run: | + STATUS=`gh run --repo ${OTHER_REPO} view ${{ steps.get-run-id.outputs.run-id }} --json status --exit-status --jq .status` + echo "Status of run: ${STATUS}" + # if latest nightly build is currently running, wait for it to complete and reacquire status + if [[ $STATUS == "queued" ]] || [[ $STATUS == "in_progress" ]]; then + gh run --repo ${OTHER_REPO} watch ${{ steps.get-run-id.outputs.run-id }} -i 300 + STATUS=`gh run --repo ${OTHER_REPO} view ${{ steps.get-run-id.outputs.run-id }} --json status --exit-status --jq .status` + fi + T_R=`gh run --repo ${OTHER_REPO} view ${{ steps.get-run-id.outputs.run-id }} --json startedAt --exit-status --jq .startedAt` + # if the previous run is successful but older than 25 hours set an exit code + if [[ $STATUS == "completed" ]]; then exit $((($(date '+%s') - $(date -d ${T_R} '+%s'))/90000)); fi + env: + GH_TOKEN: ${{ secrets.INTEL_DAAL_PAT }} + - name: Download Artifacts + run: | + gh run --repo ${OTHER_REPO} download ${{ steps.get-run-id.outputs.run-id }} + ls -la + env: + GH_TOKEN: ${{ secrets.INTEL_DAAL_PAT }} + - name: Archive Linux build + uses: actions/upload-artifact@v4 + with: + name: __release_lnx + path: ./__release_lnx + - name: Archive Windows build + uses: actions/upload-artifact@v4 + with: + name: __release_win + path: ./__release_win + - name: Archive DPC++ + uses: actions/upload-artifact@v4 + with: + name: icx_compiler + path: ./icx_compiler/icx.zip + - name: Archive Intel OpenCL CPU runtime + uses: actions/upload-artifact@v4 + with: + name: opencl_rt_installer + path: ./opencl_rt_installer/opencl_rt.msi diff --git a/.github/workflows/renovate-validation.yml b/.github/workflows/renovate-validation.yml index ae90dd7578..d48ad99827 100644 --- a/.github/workflows/renovate-validation.yml +++ b/.github/workflows/renovate-validation.yml @@ -25,6 +25,6 @@ jobs: - name: Checkout uses: actions/checkout@v4 - name: Validate - uses: suzuki-shunsuke/github-action-renovate-config-validator@v1.0.1 + uses: suzuki-shunsuke/github-action-renovate-config-validator@v1.1.0 with: config_file_path: .github/renovate.json diff --git a/README.md b/README.md index 2bbd451c77..e6ac933e58 100755 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ [![PyPI Version](https://img.shields.io/pypi/v/scikit-learn-intelex)](https://pypi.org/project/scikit-learn-intelex/) [![Conda Version](https://img.shields.io/conda/vn/conda-forge/scikit-learn-intelex)](https://anaconda.org/conda-forge/scikit-learn-intelex) [![python version](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue) -[![scikit-learn supported versions](https://img.shields.io/badge/sklearn-1.0%20%7C%201.2%20%7C%201.3%20%7C%201.4-blue)](https://img.shields.io/badge/sklearn-01.0%20%7C%201.2%20%7C%201.3%20%7C%201.4-blue) +[![scikit-learn supported versions](https://img.shields.io/badge/sklearn-1.0%20%7C%201.2%20%7C%201.3%20%7C%201.4%20%7C%201.5-blue)](https://img.shields.io/badge/sklearn-1.0%20%7C%201.2%20%7C%201.3%20%7C%201.4%20%7C%201.5-blue) --- diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 39fde9e4d1..dc45023622 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -81,7 +81,6 @@ test: - python -m unittest discover -v -s tests -p test*.py - pytest --pyargs daal4py/sklearn/ - python tests/run_examples.py - - python -m daal4py tests/daal4py/sycl/sklearn_sycl.py about: about: diff --git a/daal4py/__init__.py b/daal4py/__init__.py index f116dfa105..d656d22756 100644 --- a/daal4py/__init__.py +++ b/daal4py/__init__.py @@ -29,7 +29,6 @@ current_path = os.path.dirname(__file__) path_to_env = site.getsitepackages()[0] path_to_libs = os.path.join(path_to_env, "Library", "bin") - path_to_oneapi_backend = os.path.join(current_path, "oneapi") if sys.version_info.minor >= 8: if "DALROOT" in os.environ: dal_root_redist = os.path.join(os.environ["DALROOT"], "redist", arch_dir) @@ -37,7 +36,6 @@ os.add_dll_directory(dal_root_redist) os.environ["PATH"] = dal_root_redist + os.pathsep + os.environ["PATH"] os.add_dll_directory(path_to_libs) - os.add_dll_directory(path_to_oneapi_backend) os.environ["PATH"] = path_to_libs + os.pathsep + os.environ["PATH"] try: diff --git a/daal4py/oneapi/__init__.py b/daal4py/oneapi/__init__.py deleted file mode 100644 index 9aac700cc5..0000000000 --- a/daal4py/oneapi/__init__.py +++ /dev/null @@ -1,66 +0,0 @@ -# ============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import platform - -if "Windows" in platform.system(): - import os - import shutil - import sys - import sysconfig - - current_path = os.path.dirname(__file__) - - sitepackages_path = sysconfig.get_paths()["purelib"] - installed_package_path = os.path.join(sitepackages_path, "daal4py", "oneapi") - if sys.version_info.minor >= 8: - dpc_path = shutil.which("icpx") - if dpc_path is not None: - dpc_bin_dir = os.path.dirname(dpc_path) - dpc_compiler_dir = os.path.join(dpc_bin_dir, "compiler") - if os.path.exists(dpc_bin_dir): - os.add_dll_directory(dpc_bin_dir) - if os.path.exists(dpc_compiler_dir): - os.add_dll_directory(dpc_compiler_dir) - os.add_dll_directory(current_path) - if os.path.exists(installed_package_path): - os.add_dll_directory(installed_package_path) - os.environ["PATH"] = current_path + os.pathsep + os.environ["PATH"] - os.environ["PATH"] = installed_package_path + os.pathsep + os.environ["PATH"] - -try: - from daal4py._oneapi import * - from daal4py._oneapi import ( - _get_device_name_sycl_ctxt, - _get_in_sycl_ctxt, - _get_sycl_ctxt, - _get_sycl_ctxt_params, - ) -except ModuleNotFoundError: - raise -except ImportError: - import daal4py - - version = daal4py._get__version__()[1:-1].split(", ") - major_version, minor_version = version[0], version[1] - raise ImportError( - f"dpcpp_cpp_rt >= {major_version}.{minor_version} " - "has to be installed or upgraded to use this module.\n" - "You can download or upgrade it using the following commands:\n" - f"`pip install --upgrade dpcpp_cpp_rt>={major_version}.{minor_version}.*` " - "or " - f"`conda install -c intel dpcpp_cpp_rt>={major_version}.{minor_version}.*`" - ) diff --git a/daal4py/sklearn/_utils.py b/daal4py/sklearn/_utils.py index dd19db5c79..7b21c7b405 100644 --- a/daal4py/sklearn/_utils.py +++ b/daal4py/sklearn/_utils.py @@ -40,17 +40,6 @@ except (ImportError, ModuleNotFoundError): pandas_is_imported = False -try: - from daal4py.oneapi import is_in_sycl_ctxt as is_in_ctx - - ctx_imported = True -except (ImportError, ModuleNotFoundError): - ctx_imported = False - -oneapi_is_available = "daal4py.oneapi" in sys.modules -if oneapi_is_available: - from daal4py.oneapi import _get_device_name_sycl_ctxt - def set_idp_sklearn_verbose(): logLevel = os.environ.get("IDP_SKLEARN_VERBOSE") @@ -142,19 +131,7 @@ def make2d(X): def get_patch_message(s): if s == "daal": - message = "running accelerated version on " - if oneapi_is_available: - dev = _get_device_name_sycl_ctxt() - if dev == "cpu" or dev is None: - message += "CPU" - elif dev == "gpu": - message += "GPU" - else: - raise ValueError( - f"Unexpected device name {dev}." " Supported types are cpu and gpu" - ) - else: - message += "CPU" + message = "running accelerated version on CPU" elif s == "sklearn": message = "fallback to original Scikit-learn" @@ -168,13 +145,6 @@ def get_patch_message(s): return message -def is_in_sycl_ctxt(): - if ctx_imported: - return is_in_ctx() - else: - return False - - def is_DataFrame(X): if pandas_is_imported: return isinstance(X, DataFrame) diff --git a/dependencies-dev b/dependencies-dev index 9d5c455a33..cdf7423232 100644 --- a/dependencies-dev +++ b/dependencies-dev @@ -1,6 +1,7 @@ Cython==3.0.11 Jinja2==3.1.4 -numpy==2.0.1 -pybind11==2.13.1 +numpy==2.0.1 ; python_version <= '3.9' +numpy==2.1.0 ; python_version > '3.9' +pybind11==2.13.5 cmake==3.30.2 -setuptools==72.1.0 +setuptools==73.0.1 diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 4569bbe6ad..f986580656 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -368,8 +368,11 @@ deselected_tests: - tests/test_common.py::test_estimators[IncrementalLinearRegression()-check_estimators_pickle(readonly_memmap=True)] - tests/test_common.py::test_estimators[IncrementalPCA()-check_estimators_pickle] - tests/test_common.py::test_estimators[IncrementalPCA()-check_estimators_pickle(readonly_memmap=True)] + - tests/test_common.py::test_estimators[IncrementalRidge()-check_estimators_pickle] + - tests/test_common.py::test_estimators[IncrementalRidge()-check_estimators_pickle(readonly_memmap=True)] # There are not enough data to run onedal backend - tests/test_common.py::test_estimators[IncrementalLinearRegression()-check_fit2d_1sample] + - tests/test_common.py::test_estimators[IncrementalRidge()-check_fit2d_1sample] # Deselection of LogisticRegression tests over accuracy comparisons with sample_weights # and without. Because scikit-learn-intelex does not support sample_weights, it's doing @@ -448,6 +451,8 @@ public: # Fails from numpy 2.0 and sklearn 1.4+ - neighbors/tests/test_neighbors.py::test_KNeighborsClassifier_raise_on_all_zero_weights + # -------------------------------------------------------- + # The following tests currently fail with GPU offloading gpu: # Segfaults - ensemble/tests/test_weight_boosting.py @@ -456,6 +461,9 @@ gpu: - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] - model_selection/tests/test_search.py::test_unsupervised_grid_search + - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] + - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] + - model_selection/tests/test_search.py::test_unsupervised_grid_search - ensemble/tests/test_bagging.py::test_gridsearch - ensemble/tests/test_bagging.py::test_estimators_samples @@ -715,403 +723,5 @@ gpu: - tests/test_common.py::test_estimators[ExtraTreesRegressor()-check_sample_weights_invariance(kind=ones)] # RuntimeError: Device support is not implemented, failing as result of fallback to cpu false - # NearestNeighbors - - cluster/tests/test_dbscan.py - - cluster/tests/test_spectral - - manifold/tests/test_t_sne.py::test_binary_search_neighbors - - manifold/tests/test_t_sne.py::test_binary_perplexity_stability - - manifold/tests/test_t_sne.py::test_gradient_bh_multithread_match_sequential - - neighbors/tests/test_kde.py::test_kernel_density_sampling - - tests/test_common.py::test_check_n_features_in_after_fitting[NearestNeighbors()] - - tests/test_common.py::test_estimators[NearestNeighbors()] - - model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv0] - - model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv1] - - manifold/tests/test_t_sne.py::test_barnes_hut_angle - # KNeighborsRegressor - - ensemble/tests/test_bagging.py::test_regression - - ensemble/tests/test_bagging.py::test_single_estimator - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-chebyshev-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-chebyshev-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-cityblock-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-euclidean-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-l1-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-l2-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-manhattan-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-manhattan-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-minkowski-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-chebyshev-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-chebyshev-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-cityblock-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-cityblock-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-euclidean-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-euclidean-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-l1-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-l1-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-l2-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-manhattan-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-manhattan-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-minkowski-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-minkowski-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-chebyshev-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-chebyshev-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-cityblock-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-cityblock-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-euclidean-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-euclidean-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-l1-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-l1-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-l2-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-manhattan-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-manhattan-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-minkowski-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-minkowski-1000-5-100] - - tests/test_common.py::test_check_n_features_in_after_fitting[KNeighborsRegressor()] - - tests/test_common.py::test_f_contiguous_array_estimator[KNeighborsRegressor] - - tests/test_common.py::test_estimators[KNeighborsRegressor()- - # KNeighborsClassifier - - ensemble/tests/test_bagging.py::test_oob_score_consistency - - ensemble/tests/test_bagging.py::test_max_samples_consistency - - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_predict_proba[MLPClassifier] - - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_predict_proba[RandomForestClassifier] - - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_decision_function - - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_auto_predict[False-auto] - - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_auto_predict[False-predict] - - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_auto_predict[True-auto] - - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_auto_predict[True-predict] - - metrics/tests/test_score_objects.py::test_multimetric_scorer_calls_method_once_classifier_no_decision - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-chebyshev-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-chebyshev-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-cityblock-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-euclidean-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-l1-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-l2-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-manhattan-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-manhattan-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-minkowski-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-chebyshev-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-chebyshev-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-cityblock-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-cityblock-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-euclidean-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-euclidean-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-l1-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-l1-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-l2-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-manhattan-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-manhattan-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-minkowski-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-minkowski-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-chebyshev-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-chebyshev-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-cityblock-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-cityblock-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-euclidean-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-euclidean-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-l1-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-l1-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-l2-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-manhattan-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-manhattan-1000-5-100] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-minkowski-100-100-10] - - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-minkowski-1000-5-100] - - tests/test_common.py::test_check_n_features_in_after_fitting[KNeighborsClassifier()] - - tests/test_common.py::test_f_contiguous_array_estimator[KNeighborsClassifier] - - tests/test_common.py::test_estimators[KNeighborsClassifier()- - - model_selection/tests/test_search.py::test_search_cv_pairwise_property_equivalence_of_precomputed - - model_selection/tests/test_validation.py::test_cross_val_score_multilabel - - neighbors/tests/test_neighbors.py::test_precomputed_cross_validation - # SVR - - ensemble/tests/test_bagging.py::test_sparse_regression - - tests/test_common.py::test_check_n_features_in_after_fitting[NuSVR()] - - tests/test_common.py::test_check_n_features_in_after_fitting[SVR()] - - tests/test_multiclass.py::test_ovr_single_label_predict_proba - - utils/tests/test_validation.py::test_check_is_fitted - - tests/test_common.py::test_estimators[NuSVR()- - - tests/test_common.py::test_estimators[SVR()- - # SVC - - ensemble/tests/test_bagging.py::test_oob_score_classification - - ensemble/tests/test_bagging.py::test_deprecated_base_estimator_has_decision_function - - ensemble/tests/test_stacking.py::test_stacking_classifier_error[y1-params1-ValueError-does - - feature_selection/tests/test_rfe - - metrics/tests/test_classification.py::test_classification_report_dictionary_output - - metrics/tests/test_classification.py::test_multilabel_confusion_matrix_multiclass - - metrics/tests/test_classification.py::test_precision_recall_f1_score_multiclass - - metrics/tests/test_classification.py::test_confusion_matrix_multiclass_subset_labels - - metrics/tests/test_classification.py::test_confusion_matrix_error[empty - - metrics/tests/test_classification.py::test_confusion_matrix_error[unknown - - metrics/tests/test_classification.py::test_classification_report_multiclass - - metrics/tests/test_classification.py::test_classification_report_multiclass_with_label_detection - - metrics/tests/test_classification.py::test_classification_report_multiclass_with_digits - - metrics/tests/test_classification.py::test_classification_report_multiclass_with_string_label - - metrics/tests/test_classification.py::test_classification_report_multiclass_with_unicode_label - - metrics/tests/test_classification.py::test_classification_report_multiclass_with_long_string_label - - model_selection/tests/test_validation.py::test_permutation_score - - svm/tests/test_sparse.py::test_unsorted_indices - - svm/tests/test_sparse.py::test_sparse_decision_function - - svm/tests/test_sparse.py::test_weight - - svm/tests/test_sparse.py::test_sparse_svc_clone_with_callable_kernel - - svm/tests/test_sparse.py::test_timeout - - tests/test_common.py::test_check_n_features_in_after_fitting[NuSVC()] - - tests/test_multiclass.py::test_pairwise_indices - - tests/test_multiclass.py::test_pairwise_n_features_in - - tests/test_pipeline.py::test_pipeline_memory - - tests/test_common.py::test_estimators[NuSVC()- - - tests/test_common.py::test_estimators[SVC()- - - model_selection/tests/test_search.py::test_grid_search_precomputed_kernel - - model_selection/tests/test_search.py::test_search_cv_results_rank_tie_breaking - - model_selection/tests/test_split.py::test_kfold_can_detect_dependent_samples_on_digits - - model_selection/tests/test_validation.py::test_cross_val_score_mask - - model_selection/tests/test_validation.py::test_cross_val_score_precomputed - - model_selection/tests/test_validation.py::test_cross_val_score_with_score_func_classification - svm/tests/test_svm.py::test_unfitted - # part SVC, part KNeighborsClassifier - - semi_supervised/tests/test_self_training - # unsorted NearestNeighbors/KNClassifier/KNRegressor - - neighbors/tests/test_neighbors.py::test_unsupervised_inputs[float64-KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_unsupervised_inputs[float64-KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_unsupervised_inputs[float64-NearestNeighbors] - - neighbors/tests/test_neighbors.py::test_precomputed_dense - - neighbors/tests/test_neighbors.py::test_precomputed_sparse_knn[csr] - - neighbors/tests/test_neighbors.py::test_precomputed_sparse_knn[lil] - - neighbors/tests/test_neighbors.py::test_precomputed_sparse_radius[csr] - - neighbors/tests/test_neighbors.py::test_precomputed_sparse_radius[lil] - - neighbors/tests/test_neighbors.py::test_precomputed_sparse_invalid - - neighbors/tests/test_neighbors.py::test_unsupervised_radius_neighbors[float64] - - neighbors/tests/test_neighbors.py::test_neighbors_regressors_zero_distance - - neighbors/tests/test_neighbors.py::test_radius_neighbors_boundary_handling - - neighbors/tests/test_neighbors.py::test_radius_neighbors_returns_array_of_objects - - neighbors/tests/test_neighbors.py::test_query_equidistant_kth_nn[kd_tree] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_sort_results[kd_tree-euclidean] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_sort_results[brute-precomputed] - - neighbors/tests/test_neighbors.py::test_kneighbors_regressor - - neighbors/tests/test_neighbors.py::test_KNeighborsRegressor_multioutput_uniform_weight - - neighbors/tests/test_neighbors.py::test_kneighbors_regressor_multioutput - - neighbors/tests/test_neighbors.py::test_kneighbors_regressor_sparse - - neighbors/tests/test_neighbors.py::test_neighbors_validate_parameters[KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_neighbors_validate_parameters[KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[auto-2-KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[auto-2-KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[auto-100-KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[auto-100-KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[brute-2-KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[brute-2-KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[brute-100-KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[brute-100-KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[kd_tree-2-KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[kd_tree-2-KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[kd_tree-100-KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[kd_tree-100-KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[ball_tree-2-KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[ball_tree-2-KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[ball_tree-100-KNeighborsClassifier] - - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[ball_tree-100-KNeighborsRegressor] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-braycurtis] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-canberra] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-correlation] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-dice] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-hamming] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-haversine] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-jaccard] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-mahalanobis] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-matching] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-nan_euclidean] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-rogerstanimoto] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-russellrao] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-seuclidean] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-sokalmichener] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-sokalsneath] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-sqeuclidean] - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-yule] - - neighbors/tests/test_neighbors.py::test_callable_metric - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-braycurtis] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-canberra] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-chebyshev] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-cityblock] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-correlation] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-cosine] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-dice] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-euclidean] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-hamming] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-haversine] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-jaccard] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-kulsinski] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-l1] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-l2] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-mahalanobis] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-manhattan] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-matching] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-minkowski] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-nan_euclidean] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-precomputed] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-rogerstanimoto] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-russellrao] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-seuclidean] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-sokalmichener] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-sokalsneath] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-sqeuclidean] - - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-yule] - - neighbors/tests/test_neighbors.py::test_predict_sparse_ball_kd_tree - - neighbors/tests/test_neighbors.py::test_k_and_radius_neighbors_train_is_not_query - - neighbors/tests/test_neighbors.py::test_k_and_radius_neighbors_X_None[kd_tree] - - neighbors/tests/test_neighbors.py::test_k_and_radius_neighbors_duplicates[kd_tree] - - neighbors/tests/test_neighbors.py::test_same_knn_parallel[ball_tree] - - neighbors/tests/test_neighbors.py::test_same_knn_parallel[kd_tree] - - neighbors/tests/test_neighbors.py::test_same_knn_parallel[auto] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-threading] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-sequential] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-multiprocessing] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-loky] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-testing] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-threading] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-sequential] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-multiprocessing] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-loky] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-testing] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-threading] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-sequential] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-multiprocessing] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-loky] - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-testing] - - neighbors/tests/test_neighbors.py::test_dtype_convert - - neighbors/tests/test_neighbors.py::test_sparse_metric_callable - - neighbors/tests/test_neighbors.py::test_pairwise_boolean_distance - - neighbors/tests/test_neighbors.py::test_pipeline_with_nearest_neighbors_transformer - - neighbors/tests/test_neighbors.py::test_auto_algorithm[X0-precomputed-None-brute] - - neighbors/tests/test_neighbors.py::test_auto_algorithm[X3-euclidean-None-kd_tree] - - neighbors/tests/test_neighbors.py::test_auto_algorithm[X4-seuclidean-metric_params4-ball_tree] - - neighbors/tests/test_neighbors.py::test_auto_algorithm[X5-correlation-None-brute] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[braycurtis] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[canberra] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[correlation] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[dice] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[hamming] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[haversine] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[jaccard] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[mahalanobis] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[matching] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[nan_euclidean] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[rogerstanimoto] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[russellrao] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[seuclidean] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[sokalmichener] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[sokalsneath] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[sqeuclidean] - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[yule] - - neighbors/tests/test_neighbors.py::test_regressor_predict_on_arraylikes - # `precomputed` metric is not implemented for DBSCAN - - neighbors/tests/test_neighbors_pipeline.py::test_dbscan - - neighbors/tests/test_neighbors_pipeline.py::test_kneighbors_regressor - # unsorted svm - - svm/tests/test_svm.py::test_libsvm_iris - - svm/tests/test_svm.py::test_svr - - svm/tests/test_svm.py::test_linearsvr - - svm/tests/test_svm.py::test_svr_errors - - svm/tests/test_svm.py::test_probability - - svm/tests/test_svm.py::test_decision_function - - svm/tests/test_svm.py::test_decision_function_shape[SVC] - - svm/tests/test_svm.py::test_decision_function_shape[NuSVC] - - svm/tests/test_svm.py::test_svr_predict - - svm/tests/test_svm.py::test_weight - - svm/tests/test_svm.py::test_svm_classifier_sided_sample_weight[estimator1] - - svm/tests/test_svm.py::test_svm_regressor_sided_sample_weight[estimator0] - - svm/tests/test_svm.py::test_svm_regressor_sided_sample_weight[estimator1] - - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-zero-NuSVC] - - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-zero-SVR] - - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-zero-NuSVR] - - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-negative-NuSVC] - - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-negative-SVR] - - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-negative-NuSVR] - - svm/tests/test_svm.py::test_negative_weights_svc_leave_just_one_label[mask-label-1-NuSVC] - - svm/tests/test_svm.py::test_negative_weights_svc_leave_just_one_label[mask-label-2-NuSVC] - - svm/tests/test_svm.py::test_negative_weights_svc_leave_two_labels[partial-mask-label-1-NuSVC] - - svm/tests/test_svm.py::test_negative_weights_svc_leave_two_labels[partial-mask-label-2-NuSVC] - - svm/tests/test_svm.py::test_negative_weight_equal_coeffs[partial-mask-label-1-NuSVC] - - svm/tests/test_svm.py::test_negative_weight_equal_coeffs[partial-mask-label-1-NuSVR] - - svm/tests/test_svm.py::test_negative_weight_equal_coeffs[partial-mask-label-2-NuSVC] - - svm/tests/test_svm.py::test_negative_weight_equal_coeffs[partial-mask-label-2-NuSVR] - - svm/tests/test_svm.py::test_auto_weight - - svm/tests/test_svm.py::test_bad_input - - svm/tests/test_svm.py::test_sparse_precomputed - - svm/tests/test_svm.py::test_sparse_fit_support_vectors_empty - - svm/tests/test_svm.py::test_immutable_coef_property - - svm/tests/test_svm.py::test_svc_bad_kernel - - svm/tests/test_svm.py::test_libsvm_convergence_warnings - - svm/tests/test_svm.py::test_svr_coef_sign - - svm/tests/test_svm.py::test_hasattr_predict_proba - - svm/tests/test_svm.py::test_decision_function_shape_two_class - - svm/tests/test_svm.py::test_ovr_decision_function - - svm/tests/test_svm.py::test_svc_invalid_break_ties_param[SVC] - - svm/tests/test_svm.py::test_svc_invalid_break_ties_param[NuSVC] - - svm/tests/test_svm.py::test_n_support[SVR] - - svm/tests/test_svm.py::test_n_support[NuSVR] - - svm/tests/test_svm.py::test_custom_kernel_not_array_input[SVC] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset0-NuSVC-ndarray] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset0-SVR-int] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset0-NuSVR-int] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset1-SVC-ndarray] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset1-NuSVC-ndarray] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset1-SVR-int] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset1-NuSVR-int] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset2-SVC-ndarray] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset2-NuSVC-ndarray] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset2-SVR-int] - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset2-NuSVR-int] - - svm/tests/test_svm.py::test_svm_class_weights_deprecation[SVR] - - svm/tests/test_svm.py::test_svm_class_weights_deprecation[NuSVR] - # Sporadic failures on Max series with 2024.0 toolchain update that require deeper investigation - - tests/test_multiclass.py::test_ovo_consistent_binary_classification - # Python 3.8 failures on Max series with 2024.0 toolchain update - - neighbors/tests/test_neighbors.py::test_query_equidistant_kth_nn - - neighbors/tests/test_neighbors.py::test_radius_neighbors_sort_results - - neighbors/tests/test_neighbors.py::test_neighbors_digits - - neighbors/tests/test_neighbors.py::test_nearest_neighbors_validate_params - - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend - - neighbors/tests/test_neighbors.py::test_metric_params_interface - - neighbors/tests/test_neighbors.py::test_non_euclidean_kneighbors - - neighbors/tests/test_neighbors.py::test_k_and_radius_neighbors_X_None - - neighbors/tests/test_neighbors.py::test_k_and_radius_neighbors_duplicates - - neighbors/tests/test_neighbors.py::test_same_knn_parallel - - neighbors/tests/test_neighbors.py::test_knn_forcing_backend - - neighbors/tests/test_neighbors.py::test_auto_algorithm - - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend - - svm/tests/test_sparse.py::test_consistent_proba - - svm/tests/test_svm.py::test_consistent_proba - - svm/tests/test_svm.py::test_libsvm_parameters - - svm/tests/test_svm.py::test_negative_weight_equal_coeffs - - svm/tests/test_svm.py::test_unicode_kernel - - svm/tests/test_svm.py::test_gamma_scale - - svm/tests/test_svm.py::test_svc_raises_error_internal_representation - - svm/tests/test_svm.py::test_n_iter_libsvm[dataset0-SVC-ndarray] - - tests/test_common.py::test_estimators[DBSCAN()-check_estimators_dtypes] - - tests/test_common.py::test_estimators[DBSCAN()-check_fit_score_takes_y] - - tests/test_common.py::test_estimators[DBSCAN()-check_sample_weights_pandas_series] - - tests/test_common.py::test_estimators[DBSCAN()-check_sample_weights_not_an_array] - - tests/test_common.py::test_estimators[DBSCAN()-check_sample_weights_list] - - tests/test_common.py::test_estimators[DBSCAN()-check_sample_weights_shape] - - tests/test_common.py::test_estimators[DBSCAN()-check_sample_weights_not_overwritten] - - tests/test_common.py::test_estimators[DBSCAN()-check_sample_weights_invariance(kind=ones)] - - tests/test_common.py::test_estimators[DBSCAN()-check_sample_weights_invariance(kind=zeros)] - - tests/test_common.py::test_estimators[DBSCAN()-check_estimators_fit_returns_self] - - tests/test_common.py::test_estimators[DBSCAN()-check_complex_data] - - tests/test_common.py::test_estimators[DBSCAN()-check_dtype_object] - - tests/test_common.py::test_estimators[DBSCAN()-check_estimators_empty_data_messages] - - tests/test_common.py::test_estimators[DBSCAN()-check_pipeline_consistency] - - tests/test_common.py::test_estimators[DBSCAN()-check_estimators_nan_inf] - - tests/test_common.py::test_estimators[DBSCAN()-check_estimators_overwrite_params] - - tests/test_common.py::test_estimators[DBSCAN()-check_estimators_pickle] - - tests/test_common.py::test_estimators[DBSCAN()-check_estimators_fit_returns_self(readonly_memmap=True)] - - tests/test_common.py::test_estimators[DBSCAN()-check_clustering] - - tests/test_common.py::test_estimators[DBSCAN()-check_clustering(readonly_memmap=True)] - - tests/test_common.py::test_estimators[DBSCAN()-check_methods_sample_order_invariance] - - tests/test_common.py::test_estimators[DBSCAN()-check_methods_subset_invariance] - - tests/test_common.py::test_estimators[DBSCAN()-check_fit2d_1sample] - - tests/test_common.py::test_estimators[DBSCAN()-check_fit2d_1feature] - - tests/test_common.py::test_estimators[DBSCAN()-check_dict_unchanged] - - tests/test_common.py::test_estimators[DBSCAN()-check_dont_overwrite_parameters] - - tests/test_common.py::test_estimators[DBSCAN()-check_fit_idempotent] - - tests/test_common.py::test_estimators[DBSCAN()-check_fit_check_is_fitted] - - tests/test_common.py::test_estimators[DBSCAN()-check_n_features_in] - - tests/test_common.py::test_estimators[DBSCAN()-check_fit1d] - - tests/test_common.py::test_estimators[DBSCAN()-check_fit2d_predict1d] - - tests/test_common.py::test_check_n_features_in_after_fitting[DBSCAN()] - - tests/test_common.py::test_check_n_features_in_after_fitting[SVC()] + - tests/test_common.py::test_estimators[SVC()-check_estimators_unfitted] diff --git a/doc/daal4py/sklearn.rst b/doc/daal4py/sklearn.rst index a6815b305c..2693889b84 100755 --- a/doc/daal4py/sklearn.rst +++ b/doc/daal4py/sklearn.rst @@ -89,7 +89,7 @@ algorithms: - No limitations. * - Classification - RandomForestClassifier - - All parameters except ``warm_start`` = True, ``cpp_alpha`` != 0, ``criterion`` != 'gini', ``oob_score`` = True. + - All parameters except ``warm_start`` = True, ``ccp_alpha`` != 0, ``criterion`` != 'gini', ``oob_score`` = True. - Multi-output, sparse data and out-of-bag score are not supported. * - Classification - KNeighborsClassifier @@ -101,7 +101,7 @@ algorithms: - Only dense data is supported. * - Regression - RandomForestRegressor - - All parameters except ``warm_start`` = True, ``cpp_alpha`` != 0, ``criterion`` != 'mse', ``oob_score`` = True. + - All parameters except ``warm_start`` = True, ``ccp_alpha`` != 0, ``criterion`` != 'mse', ``oob_score`` = True. - Multi-output, sparse data and out-of-bag score are not supported. * - Regression - KNeighborsRegressor diff --git a/doc/sources/algorithms.rst b/doc/sources/algorithms.rst index 49c93f378f..6a73ee2b96 100755 --- a/doc/sources/algorithms.rst +++ b/doc/sources/algorithms.rst @@ -46,7 +46,7 @@ Classification - All parameters are supported except: - ``warm_start`` = `True` - - ``cpp_alpha`` != `0` + - ``ccp_alpha`` != `0` - ``criterion`` != `'gini'` - Multi-output and sparse data are not supported * - `KNeighborsClassifier` @@ -87,7 +87,7 @@ Regression - All parameters are supported except: - ``warm_start`` = `True` - - ``cpp_alpha`` != `0` + - ``ccp_alpha`` != `0` - ``criterion`` != `'mse'` - Multi-output and sparse data are not supported * - `KNeighborsRegressor` @@ -143,7 +143,7 @@ Clustering - ``algorithm`` not in [`'brute'`, `'auto'`] - Only dense data is supported -Dimensionality reduction +Dimensionality Reduction ************************ .. list-table:: @@ -188,7 +188,7 @@ Nearest Neighbors all parameters except ``metric`` not in [`'euclidean'`, `'manhattan'`, `'minkowski'`, `'chebyshev'`, `'cosine'`] - Sparse data is not supported -Other tasks +Other Tasks *********** .. list-table:: @@ -199,6 +199,9 @@ Other tasks * - Algorithm - Parameters - Data formats + * - `EmpiricalCovariance` + - All parameters are supported + - Only dense data is supported * - `train_test_split` - All parameters are supported - Only dense data is supported @@ -245,7 +248,7 @@ Classification - All parameters are supported except: - ``warm_start`` = `True` - - ``cpp_alpha`` != `0` + - ``ccp_alpha`` != `0` - ``criterion`` != `'gini'` - ``oob_score`` = `True` - ``sample_weight`` != `None` @@ -281,7 +284,7 @@ Regression - All parameters are supported except: - ``warm_start`` = `True` - - ``cpp_alpha`` != `0` + - ``ccp_alpha`` != `0` - ``criterion`` != `'mse'` - ``oob_score`` = `True` - ``sample_weight`` != `None` @@ -316,8 +319,7 @@ Clustering - ``precompute_distances`` - ``sample_weight`` != `None` - - ``Init`` = `'k-means++'` fallbacks to CPU. + - ``Init`` = `'k-means++'` fallbacks to CPU. - Sparse data is not supported * - `DBSCAN` - All parameters are supported except: @@ -326,7 +328,7 @@ Clustering - ``algorithm`` not in [`'brute'`, `'auto'`] - Only dense data is supported -Dimensionality reduction +Dimensionality Reduction ************************ .. list-table:: @@ -362,7 +364,175 @@ Nearest Neighbors - ``metric`` not in [`'euclidean'`, `'manhattan'`, `'minkowski'`, `'chebyshev'`, `'cosine'`] - Only dense data is supported -Scikit-learn tests +Other Tasks +*********** + +.. list-table:: + :widths: 10 30 20 + :header-rows: 1 + :align: left + + * - Algorithm + - Parameters + - Data formats + * - `EmpiricalCovariance` + - All parameters are supported + - Only dense data is supported + +SPMD Support +------------ + +.. seealso:: :ref:`distributed` + +Classification +************** + +.. list-table:: + :widths: 10 30 20 + :header-rows: 1 + :align: left + + * - Algorithm + - Parameters & Methods + - Data formats + * - `RandomForestClassifier` + - All parameters are supported except: + + - ``warm_start`` = `True` + - ``ccp_alpha`` != `0` + - ``criterion`` != `'gini'` + - ``oob_score`` = `True` + - ``sample_weight`` != `None` + - Multi-output and sparse data are not supported + * - `KNeighborsClassifier` + - All parameters are supported except: + + - ``algorithm`` != `'brute'` + - ``weights`` = `'callable'` + - ``metric`` not in [`'euclidean'`, `'manhattan'`, `'minkowski'`, `'chebyshev'`, `'cosine'`] + - ``predict_proba`` method not supported + - Only dense data is supported + * - `LogisticRegression` + - All parameters are supported except: + + - ``solver`` != `'newton-cg'` + - ``class_weight`` != `None` + - ``sample_weight`` != `None` + - ``penalty`` != `'l2'` + - Only dense data is supported + +Regression +********** + +.. list-table:: + :widths: 10 30 20 + :header-rows: 1 + :align: left + + * - Algorithm + - Parameters & Methods + - Data formats + * - `RandomForestRegressor` + - All parameters are supported except: + + - ``warm_start`` = `True` + - ``ccp_alpha`` != `0` + - ``criterion`` != `'mse'` + - ``oob_score`` = `True` + - ``sample_weight`` != `None` + - Multi-output and sparse data are not supported + * - `KNeighborsRegressor` + - All parameters are supported except: + + - ``algorithm`` != `'brute'` + - ``weights`` = `'callable'` + - ``metric`` != `'euclidean'` or `'minkowski'` with ``p`` != `2` + - Only dense data is supported + * - `LinearRegression` + - All parameters are supported except: + + - ``normalize`` != `False` + - ``sample_weight`` != `None` + - Only dense data is supported, `#observations` should be >= `#features`. + +Clustering +********** + +.. list-table:: + :widths: 10 30 20 + :header-rows: 1 + :align: left + + * - Algorithm + - Parameters & Methods + - Data formats + * - `KMeans` + - All parameters are supported except: + + - ``precompute_distances`` + - ``sample_weight`` != `None` + - ``Init`` = `'k-means++'` fallbacks to CPU. + - Sparse data is not supported + * - `DBSCAN` + - All parameters are supported except: + + - ``metric`` != `'euclidean'` + - ``algorithm`` not in [`'brute'`, `'auto'`] + - Only dense data is supported + +Dimensionality Reduction +************************ + +.. list-table:: + :widths: 10 30 20 + :header-rows: 1 + :align: left + + * - Algorithm + - Parameters & Methods + - Data formats + * - `PCA` + - All parameters are supported except: + + - ``svd_solver`` not in [`'full'`, `'covariance_eigh'`] + - ``fit`` is the only method supported + - Sparse data is not supported + +Nearest Neighbors +***************** + +.. list-table:: + :widths: 10 30 20 + :header-rows: 1 + :align: left + + * - Algorithm + - Parameters + - Data formats + * - `NearestNeighbors` + - All parameters are supported except: + + - ``algorithm`` != `'brute'` + - ``weights`` = `'callable'` + - ``metric`` not in [`'euclidean'`, `'manhattan'`, `'minkowski'`, `'chebyshev'`, `'cosine'`] + - Only dense data is supported + +Other Tasks +*********** + +.. list-table:: + :widths: 10 30 20 + :header-rows: 1 + :align: left + + * - Algorithm + - Parameters + - Data formats + * - `EmpiricalCovariance` + - All parameters are supported + - Only dense data is supported + +Scikit-learn Tests ------------------ Monkey-patched scikit-learn classes and functions passes scikit-learn's own test diff --git a/doc/sources/distributed-mode.rst b/doc/sources/distributed-mode.rst index 3b152b68a0..d3a6d9d13e 100644 --- a/doc/sources/distributed-mode.rst +++ b/doc/sources/distributed-mode.rst @@ -19,12 +19,26 @@ Distributed Mode ================ -.. note:: +|intelex| offers Single Program, Multiple Data (SPMD) supported interfaces for distributed computing. +Several `GPU-supported algorithms `_ +also provide distributed, multi-GPU computing capabilities via integration with ``mpi4py``. The prerequisites +match those of GPU computing, along with an MPI backend of your choice (`Intel MPI recommended +`_, available +via ``impi-devel`` python package) and the ``mpi4py`` python package. If using |intelex| +`installed from sources `_, +ensure that the spmd_backend is built. - |intelex| contains scikit-learn patching functionality that was originally available in - `daal4py `_ package. - We recommend you to use scikit-learn-intelex package instead of daal4py. - You can learn more about daal4py in `daal4py documentation `_. +Estimators can be imported from the ``sklearnex.spmd`` module. Data should be distributed across multiple nodes as +desired, and should be transfered to a dpctl or dpnp array before being passed to the estimator. View a full +example of this process in the |intelex| repository, where many examples of our SPMD-supported estimators are +available: https://github.com/intel/scikit-learn-intelex/blob/main/examples/sklearnex/. To run: -While daal4py is available in `distribued mode `_, -|intelex| does not currently offer this functionality. +:: + + mpirun -n 4 python linear_regression_spmd.py + +Note that additional mpirun arguments can be added as desired. SPMD-supported estimators are listed in the +`algorithms support documentation `_. + +Additionally, daal4py offers some distributed functionality, see +`documentation `_ for further details. diff --git a/doc/sources/quick-start.rst b/doc/sources/quick-start.rst index 07fa8b4fe6..bc6ac8798a 100644 --- a/doc/sources/quick-start.rst +++ b/doc/sources/quick-start.rst @@ -206,13 +206,11 @@ To install |intelex|, run: - [CPU, GPU] - [CPU, GPU] - [CPU, GPU] - - [CPU, GPU] * - Windows* OS - [CPU, GPU] - [CPU, GPU] - [CPU, GPU] - [CPU, GPU] - - [CPU, GPU] @@ -245,13 +243,11 @@ To prevent version conflicts, we recommend installing `scikit-learn-intelex` int - [CPU] - [CPU] - [CPU] - - [CPU] * - Windows* OS - [CPU] - [CPU] - [CPU] - [CPU] - - [CPU] .. tab:: Intel channel @@ -276,13 +272,11 @@ To prevent version conflicts, we recommend installing `scikit-learn-intelex` int - [CPU, GPU] - [CPU, GPU] - [CPU, GPU] - - [CPU, GPU] * - Windows* OS - [CPU, GPU] - [CPU, GPU] - [CPU, GPU] - [CPU, GPU] - - [CPU, GPU] @@ -306,13 +300,11 @@ To prevent version conflicts, we recommend installing `scikit-learn-intelex` int - [CPU] - [CPU] - [CPU] - - [CPU] * - Windows* OS - [CPU] - [CPU] - [CPU] - [CPU] - - [CPU] diff --git a/generator/wrapper_gen.py b/generator/wrapper_gen.py index eeef76e7d2..638b120c9b 100755 --- a/generator/wrapper_gen.py +++ b/generator/wrapper_gen.py @@ -274,36 +274,6 @@ def daal_tsne_gradient_descent(init, p, size_iter, params, results, dtype=0): data_or_file(size_iter), data_or_file(params), data_or_file(results), dtype) - - -def _execute_with_context(func): - def exec_func(*args, **keyArgs): - if 'daal4py.oneapi' in sys.modules: - import daal4py.oneapi as d4p_oneapi - devname = d4p_oneapi._get_device_name_sycl_ctxt() - ctxparams = d4p_oneapi._get_sycl_ctxt_params() - - if devname == 'gpu' and ctxparams.get('host_offload_on_fail', False): - import logging - classname = func.__qualname__.split('.')[0] - try: - res = func(*args, **keyArgs) - logging.info(f"{classname} successfully run on gpu") - return res - except RuntimeError as e: - logging.info(f"{classname} failed to run on gpu. Fallback to host") - gpu_ctx = d4p_oneapi._get_sycl_ctxt() - host_ctx = d4p_oneapi.sycl_execution_context('host') - try: - host_ctx.apply() - res = func(*args, **keyArgs) - finally: - del host_ctx - gpu_ctx.apply() - return res - - return func(*args, **keyArgs) - return exec_func """ ############################################################################### @@ -1057,7 +1027,6 @@ def __cinit__(self, {% set cytype = result_map.class_type.replace('Ptr', '')|d2cy(False)|lower %} # compute simply forwards to the C++ de-templatized manager__iface__::compute - @_execute_with_context def _compute(self, {{input_args|fmt('{}', 'decl_dflt_cy', sep=',\n')|indent(17)}}, setup=False): diff --git a/onedal/basic_statistics/basic_statistics.cpp b/onedal/basic_statistics/basic_statistics.cpp index 80a35dc17e..3f10fd0893 100644 --- a/onedal/basic_statistics/basic_statistics.cpp +++ b/onedal/basic_statistics/basic_statistics.cpp @@ -101,6 +101,7 @@ auto get_onedal_result_options(const py::dict& params) { } } catch (std::regex_error& e) { + (void)e; ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(result_option); } diff --git a/onedal/cluster/dbscan.cpp b/onedal/cluster/dbscan.cpp index 8c6e1213f7..92a6f0aecc 100644 --- a/onedal/cluster/dbscan.cpp +++ b/onedal/cluster/dbscan.cpp @@ -76,6 +76,7 @@ auto get_onedal_result_options(const py::dict& params) { } } catch (std::regex_error& e) { + (void)e; ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(result_options); } diff --git a/onedal/cluster/kmeans_common.cpp b/onedal/cluster/kmeans_common.cpp index 569106e1af..3d3e52b29f 100644 --- a/onedal/cluster/kmeans_common.cpp +++ b/onedal/cluster/kmeans_common.cpp @@ -49,8 +49,6 @@ bool is_same_clustering(const dal::table& left, auto map = dal::array::full( // n_clusters, minus_one); - const auto* const l_ptr = l_arr.get_data(); - const auto* const r_ptr = r_arr.get_data(); auto* const m_ptr = map.get_mutable_data(); const auto l_count = l_arr.get_count(); diff --git a/onedal/datatypes/data_conversion.cpp b/onedal/datatypes/data_conversion.cpp index ad866d832b..5e46810248 100644 --- a/onedal/datatypes/data_conversion.cpp +++ b/onedal/datatypes/data_conversion.cpp @@ -237,7 +237,7 @@ template static PyObject *convert_to_numpy_impl(const dal::array &array, std::int64_t row_count, std::int64_t column_count = 0) { - const std::int64_t size_dims = column_count == 0 ? 1 : 2; + const int size_dims = column_count == 0 ? 1 : 2; npy_intp dims[2] = { static_cast(row_count), static_cast(column_count) }; auto host_array = transfer_to_host(array); diff --git a/onedal/ensemble/forest.cpp b/onedal/ensemble/forest.cpp index 77399ff7b0..6855ded19b 100644 --- a/onedal/ensemble/forest.cpp +++ b/onedal/ensemble/forest.cpp @@ -164,7 +164,6 @@ struct params2desc { using namespace decision_forest; constexpr bool is_cls = std::is_same_v; - constexpr bool is_reg = std::is_same_v; auto desc = descriptor{} .set_observations_per_tree_fraction( diff --git a/onedal/linear_model/__init__.py b/onedal/linear_model/__init__.py index 998e4a62d7..bdb0d0d6b3 100755 --- a/onedal/linear_model/__init__.py +++ b/onedal/linear_model/__init__.py @@ -14,12 +14,13 @@ # limitations under the License. # =============================================================================== -from .incremental_linear_model import IncrementalLinearRegression +from .incremental_linear_model import IncrementalLinearRegression, IncrementalRidge from .linear_model import LinearRegression, Ridge from .logistic_regression import LogisticRegression __all__ = [ "IncrementalLinearRegression", + "IncrementalRidge", "LinearRegression", "LogisticRegression", "Ridge", diff --git a/onedal/linear_model/incremental_linear_model.py b/onedal/linear_model/incremental_linear_model.py index b8b754e18f..43f9db4159 100644 --- a/onedal/linear_model/incremental_linear_model.py +++ b/onedal/linear_model/incremental_linear_model.py @@ -144,3 +144,113 @@ def finalize_fit(self, queue=None): self.intercept_ = self.intercept_[0] return self + + +class IncrementalRidge(BaseLinearRegression): + """ + Incremental Ridge Regression oneDAL implementation. + + Parameters + ---------- + alpha : float, default=1.0 + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + algorithm : string, default="norm_eq" + Algorithm used for computation on oneDAL side + """ + + def __init__(self, alpha=1.0, fit_intercept=True, copy_X=False, algorithm="norm_eq"): + module = self._get_backend("linear_model", "regression") + super().__init__( + fit_intercept=fit_intercept, alpha=alpha, copy_X=copy_X, algorithm=algorithm + ) + self._partial_result = module.partial_train_result() + + def _reset(self): + module = self._get_backend("linear_model", "regression") + self._partial_result = module.partial_train_result() + + def partial_fit(self, X, y, queue=None): + """ + Computes partial data for ridge regression + from data batch X and saves it to `_partial_result`. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data batch, where `n_samples` is the number of samples + in the batch, and `n_features` is the number of features. + + y: array-like of shape (n_samples,) or (n_samples, n_targets) in + case of multiple targets + Responses for training data. + + queue : dpctl.SyclQueue + If not None, use this queue for computations. + Returns + ------- + self : object + Returns the instance itself. + """ + module = self._get_backend("linear_model", "regression") + + if not hasattr(self, "_queue"): + self._queue = queue + policy = self._get_policy(queue, X) + + X, y = _convert_to_supported(policy, X, y) + + if not hasattr(self, "_dtype"): + self._dtype = get_dtype(X) + self._params = self._get_onedal_params(self._dtype) + + y = np.asarray(y).astype(dtype=self._dtype) + + X, y = _check_X_y(X, y, dtype=[np.float64, np.float32], accept_2d_y=True) + + self.n_features_in_ = _num_features(X, fallback_1d=True) + X_table, y_table = to_table(X, y) + self._partial_result = module.partial_train( + policy, self._params, self._partial_result, X_table, y_table + ) + + def finalize_fit(self, queue=None): + """ + Finalizes ridge regression computation and obtains coefficients + from the current `_partial_result`. + + Parameters + ---------- + queue : dpctl.SyclQueue + If available, uses provided queue for computations. + + Returns + ------- + self : object + Returns the instance itself. + """ + module = self._get_backend("linear_model", "regression") + if queue is not None: + policy = self._get_policy(queue) + else: + policy = self._get_policy(self._queue) + result = module.finalize_train(policy, self._params, self._partial_result) + + self._onedal_model = result.model + + packed_coefficients = from_table(result.model.packed_coefficients) + self.coef_, self.intercept_ = ( + packed_coefficients[:, 1:].squeeze(), + packed_coefficients[:, 0].squeeze(), + ) + + return self diff --git a/onedal/linear_model/linear_model.cpp b/onedal/linear_model/linear_model.cpp index b51dd69a8c..ca310030e2 100644 --- a/onedal/linear_model/linear_model.cpp +++ b/onedal/linear_model/linear_model.cpp @@ -72,6 +72,7 @@ auto get_onedal_result_options(const py::dict& params) { } } catch (std::regex_error& e) { + (void)e; ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(result_option); } diff --git a/onedal/linear_model/logistic_regression.cpp b/onedal/linear_model/logistic_regression.cpp index 3847ed7a7e..e426d3fec9 100644 --- a/onedal/linear_model/logistic_regression.cpp +++ b/onedal/linear_model/logistic_regression.cpp @@ -41,7 +41,10 @@ struct method2t { const auto method = params["method"].cast(); ONEDAL_PARAM_DISPATCH_VALUE(method, "dense_batch", ops, Float, method::dense_batch); - ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::dense_batch); +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 + ONEDAL_PARAM_DISPATCH_VALUE(method, "sparse", ops, Float, method::sparse); +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >=20240700 + ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } @@ -115,14 +118,14 @@ auto get_onedal_result_options(const py::dict& params) { template struct descriptor_creator; -template +template struct descriptor_creator { static auto get(bool intercept, double C) { return dal::logistic_regression::descriptor(intercept, C); } }; diff --git a/onedal/linear_model/logistic_regression.py b/onedal/linear_model/logistic_regression.py index 40d59bf9f9..518ba03d15 100644 --- a/onedal/linear_model/logistic_regression.py +++ b/onedal/linear_model/logistic_regression.py @@ -29,6 +29,7 @@ _check_array, _check_n_features, _check_X_y, + _is_csr, _num_features, _type_of_target, ) @@ -44,11 +45,11 @@ def __init__(self, tol, C, fit_intercept, solver, max_iter, algorithm): self.max_iter = max_iter self.algorithm = algorithm - def _get_onedal_params(self, dtype=np.float32): + def _get_onedal_params(self, is_csr, dtype=np.float32): intercept = "intercept|" if self.fit_intercept else "" return { "fptype": "float" if dtype == np.float32 else "double", - "method": self.algorithm, + "method": "sparse" if is_csr else self.algorithm, "intercept": self.fit_intercept, "tol": self.tol, "max_iter": self.max_iter, @@ -62,14 +63,16 @@ def _get_onedal_params(self, dtype=np.float32): } def _fit(self, X, y, module, queue): + sparsity_enabled = daal_check_version((2024, "P", 700)) X, y = _check_X_y( X, y, - accept_sparse=False, + accept_sparse=sparsity_enabled, force_all_finite=True, accept_2d_y=False, dtype=[np.float64, np.float32], ) + is_csr = _is_csr(X) self.n_features_in_ = _num_features(X, fallback_1d=True) @@ -81,7 +84,7 @@ def _fit(self, X, y, module, queue): policy = self._get_policy(queue, X, y) X, y = _convert_to_supported(policy, X, y) - params = self._get_onedal_params(get_dtype(X)) + params = self._get_onedal_params(is_csr, get_dtype(X)) X_table, y_table = to_table(X, y) result = module.train(policy, params, X_table, y_table) @@ -151,10 +154,17 @@ def _create_model(self, module, policy): def _infer(self, X, module, queue): _check_is_fitted(self) + sparsity_enabled = daal_check_version((2024, "P", 700)) X = _check_array( - X, dtype=[np.float64, np.float32], force_all_finite=True, ensure_2d=False + X, + dtype=[np.float64, np.float32], + accept_sparse=sparsity_enabled, + force_all_finite=True, + ensure_2d=False, + accept_large_sparse=sparsity_enabled, ) + is_csr = _is_csr(X) _check_n_features(self, X, False) X = make2d(X) @@ -166,7 +176,7 @@ def _infer(self, X, module, queue): model = self._create_model(module, policy) X = _convert_to_supported(policy, X) - params = self._get_onedal_params(get_dtype(X)) + params = self._get_onedal_params(is_csr, get_dtype(X)) X_table = to_table(X) result = module.infer(policy, params, model, X_table) diff --git a/onedal/linear_model/tests/test_incremental_ridge_regression.py b/onedal/linear_model/tests/test_incremental_ridge_regression.py new file mode 100644 index 0000000000..471f46e4f6 --- /dev/null +++ b/onedal/linear_model/tests/test_incremental_ridge_regression.py @@ -0,0 +1,107 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from daal4py.sklearn._utils import daal_check_version + +if daal_check_version((2024, "P", 600)): + import numpy as np + import pytest + from numpy.testing import assert_allclose, assert_array_equal + from sklearn.datasets import load_diabetes + from sklearn.metrics import mean_squared_error + from sklearn.model_selection import train_test_split + + from onedal.linear_model import IncrementalRidge + from onedal.tests.utils._device_selection import get_queues + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_diabetes(queue, dtype): + X, y = load_diabetes(return_X_y=True) + X, y = X.astype(dtype), y.astype(dtype) + X_train, X_test, y_train, y_test = train_test_split( + X, y, train_size=0.8, random_state=777 + ) + X_train_split = np.array_split(X_train, 2) + y_train_split = np.array_split(y_train, 2) + model = IncrementalRidge(fit_intercept=True, alpha=0.1) + for i in range(2): + model.partial_fit(X_train_split[i], y_train_split[i], queue=queue) + model.finalize_fit() + y_pred = model.predict(X_test, queue=queue) + assert_allclose(mean_squared_error(y_test, y_pred), 2388.775, rtol=1e-5) + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.skip(reason="pickling not implemented for oneDAL entities") + def test_pickle(queue, dtype): + # TODO Implement pickling for oneDAL entities + X, y = load_diabetes(return_X_y=True) + X, y = X.astype(dtype), y.astype(dtype) + model = IncrementalRidge(fit_intercept=True, alpha=0.5) + model.partial_fit(X, y, queue=queue) + model.finalize_fit() + expected = model.predict(X, queue=queue) + + import pickle + + dump = pickle.dumps(model) + model2 = pickle.loads(dump) + + assert isinstance(model2, model.__class__) + result = model2.predict(X, queue=queue) + + assert_array_equal(expected, result) + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("num_blocks", [1, 2, 10]) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_no_intercept_results(queue, num_blocks, dtype): + seed = 42 + n_features, n_targets = 19, 7 + n_train_samples, n_test_samples = 3500, 1999 + + gen = np.random.default_rng(seed) + + X = gen.random(size=(n_train_samples, n_features), dtype=dtype) + y = gen.random(size=(n_train_samples, n_targets), dtype=dtype) + X_split = np.array_split(X, num_blocks) + y_split = np.array_split(y, num_blocks) + alpha = 0.5 + + lambda_identity = alpha * np.eye(X.shape[1]) + inverse_term = np.linalg.inv(np.dot(X.T, X) + lambda_identity) + xt_y = np.dot(X.T, y) + coef = np.dot(inverse_term, xt_y) + + model = IncrementalRidge(fit_intercept=False, alpha=alpha) + for i in range(num_blocks): + model.partial_fit(X_split[i], y_split[i], queue=queue) + model.finalize_fit() + + if queue and queue.sycl_device.is_gpu: + tol = 5e-3 if model.coef_.dtype == np.float32 else 1e-5 + else: + tol = 2e-3 if model.coef_.dtype == np.float32 else 1e-5 + assert_allclose(coef, model.coef_.T, rtol=tol) + + Xt = gen.random(size=(n_test_samples, n_features), dtype=dtype) + gtr = Xt @ coef + + res = model.predict(Xt, queue=queue) + + tol = 2e-4 if res.dtype == np.float32 else 1e-7 + assert_allclose(gtr, res, rtol=tol) diff --git a/onedal/linear_model/tests/test_logistic_regression.py b/onedal/linear_model/tests/test_logistic_regression.py index ad1f4a4f27..7633950dd5 100644 --- a/onedal/linear_model/tests/test_logistic_regression.py +++ b/onedal/linear_model/tests/test_logistic_regression.py @@ -20,6 +20,7 @@ import numpy as np import pytest from numpy.testing import assert_allclose, assert_array_equal + from scipy.sparse import csr_matrix from sklearn.datasets import load_breast_cancer, make_classification from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split @@ -64,3 +65,30 @@ def test_pickle(queue, dtype): result = model2.predict(X, queue=queue) assert_array_equal(expected, result) + + +if daal_check_version((2024, "P", 700)): + + @pytest.mark.parametrize("queue", get_queues("gpu")) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize( + "dims", [(3007, 17, 0.05), (50000, 100, 0.01), (512, 10, 0.5)] + ) + def test_csr(queue, dtype, dims): + n, p, density = dims + X, y = make_classification(n, p, random_state=42) + np.random.seed(2007 + n + p) + mask = np.random.binomial(1, density, (n, p)) + X = X * mask + X_sp = csr_matrix(X) + model = LogisticRegression(fit_intercept=True, solver="newton-cg") + model.fit(X, y, queue=queue) + pred = model.predict(X, queue=queue) + + model_sp = LogisticRegression(fit_intercept=True, solver="newton-cg") + model_sp.fit(X_sp, y, queue=queue) + pred_sp = model_sp.predict(X_sp, queue=queue) + + assert_allclose(pred, pred_sp) + assert_allclose(model.coef_, model_sp.coef_) + assert_allclose(model.intercept_, model_sp.intercept_) diff --git a/onedal/neighbors/neighbors.cpp b/onedal/neighbors/neighbors.cpp index 876cd9db22..fe458fc0b5 100644 --- a/onedal/neighbors/neighbors.cpp +++ b/onedal/neighbors/neighbors.cpp @@ -130,6 +130,7 @@ auto get_onedal_result_options(const py::dict& params) { } } catch (std::regex_error& e) { + (void)e; ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(result_option); } diff --git a/onedal/primitives/optimizers.hpp b/onedal/primitives/optimizers.hpp index 35b8a4cbc9..18428b9ccb 100644 --- a/onedal/primitives/optimizers.hpp +++ b/onedal/primitives/optimizers.hpp @@ -26,10 +26,6 @@ namespace oneapi::dal::python { template auto get_optimizer_descriptor(const pybind11::dict& params) { - using float_t = typename Optimizer::float_t; - using method_t = typename Optimizer::method_t; - using task_t = typename Optimizer::task_t; - using newton_cg_desc_t = newton_cg::descriptor; auto optimizer = Optimizer{}; optimizer.set_tolerance(params["tol"].cast()); optimizer.set_max_iteration(params["max_iter"].cast()); diff --git a/onedal/primitives/pairwise_distances.hpp b/onedal/primitives/pairwise_distances.hpp index 00de691431..c94786f63a 100755 --- a/onedal/primitives/pairwise_distances.hpp +++ b/onedal/primitives/pairwise_distances.hpp @@ -32,8 +32,6 @@ auto get_distance_descriptor(const pybind11::dict& params) { using method_t = typename Distance::method_t; using task_t = typename Distance::task_t; using minkowski_desc_t = minkowski_distance::descriptor; - using chebyshev_desc_t = chebyshev_distance::descriptor; - using cosine_desc_t = cosine_distance::descriptor; auto distance = Distance{}; if constexpr (std::is_same_v) { diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index ca1d1a5d61..3f77ae086f 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -206,7 +206,7 @@ bool to_sklearn_tree_object_visitor::call(const df::split_node_info& this->node_ar_ptr[node_id].threshold = info.get_feature_value(); this->node_ar_ptr[node_id].impurity = info.get_impurity(); this->node_ar_ptr[node_id].n_node_samples = info.get_sample_count(); - this->node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count(); + this->node_ar_ptr[node_id].weighted_n_node_samples = static_cast(info.get_sample_count()); this->node_ar_ptr[node_id].missing_go_to_left = false; // wrap-up @@ -230,7 +230,7 @@ void to_sklearn_tree_object_visitor::_onLeafNode(const df::leaf_node_info< this->node_ar_ptr[node_id].impurity = info.get_impurity(); this->node_ar_ptr[node_id].n_node_samples = info.get_sample_count(); - this->node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count(); + this->node_ar_ptr[node_id].weighted_n_node_samples = static_cast(info.get_sample_count()); this->node_ar_ptr[node_id].missing_go_to_left = false; } @@ -253,7 +253,7 @@ bool to_sklearn_tree_object_visitor::call( std::size_t depth = static_cast(info.get_level()); const std::size_t label = info.get_response(); // these may be a slow accesses due to oneDAL abstraction - const double nNodeSampleCount = info.get_sample_count(); // do them only once + const double nNodeSampleCount = static_cast(info.get_sample_count()); // do them only once while(depth--) { diff --git a/scripts/build_backend.py b/scripts/build_backend.py index fe8ff31794..a914adb807 100755 --- a/scripts/build_backend.py +++ b/scripts/build_backend.py @@ -42,86 +42,6 @@ IS_WIN = True -def build_cpp( - cc, - cxx, - sources, - targetprefix, - targetname, - targetsuffix, - libs, - libdirs, - includes, - eca, - ela, - defines, - installpath="", -): - import shutil - import subprocess - from os.path import basename - - logger.info(f"building cpp target {targetname}...") - - include_dir_plat = ["-I" + incdir for incdir in includes] - if IS_WIN: - eca += ["/EHsc"] - lib_prefix = "" - lib_suffix = ".lib" - obj_ext = ".obj" - libdirs += [jp(get_paths()["data"], "libs")] - library_dir_plat = ["/link"] + [f"/LIBPATH:{libdir}" for libdir in libdirs] - additional_linker_opts = [ - "/DLL", - f"/OUT:{targetprefix}{targetname}{targetsuffix}", - ] - else: - eca += ["-fPIC"] - ela += ["-shared"] - lib_prefix = "-l" - lib_suffix = "" - obj_ext = ".o" - library_dir_plat = ["-L" + libdir for libdir in libdirs] - additional_linker_opts = ["-o", f"{targetprefix}{targetname}{targetsuffix}"] - eca += ["-c"] - libs = [f"{lib_prefix}{str(item)}{lib_suffix}" for item in libs] - - d4p_dir = os.getcwd() - build_dir = os.path.join(d4p_dir, f"build_{targetname}") - - if os.path.exists(build_dir): - shutil.rmtree(build_dir) - os.mkdir(build_dir) - os.chdir(build_dir) - - objfiles = [basename(f).replace(".cpp", obj_ext) for f in sources] - for i, cppfile in enumerate(sources): - if IS_WIN: - out = [f"/Fo{objfiles[i]}"] - else: - out = ["-o", objfiles[i]] - cmd = [cc] + include_dir_plat + eca + [f"{d4p_dir}/{cppfile}"] + out + defines - logger.info(subprocess.list2cmdline(cmd)) - subprocess.check_call(cmd) - - if IS_WIN: - cmd = [cxx] + ela + objfiles + library_dir_plat + libs + additional_linker_opts - else: - cmd = [cxx] + objfiles + library_dir_plat + ela + libs + additional_linker_opts - logger.info(subprocess.list2cmdline(cmd)) - subprocess.check_call(cmd) - shutil.copy( - f"{targetprefix}{targetname}{targetsuffix}", os.path.join(d4p_dir, installpath) - ) - if IS_WIN: - target_lib_suffix = targetsuffix.replace(".dll", ".lib") - shutil.copy( - f"{targetprefix}{targetname}{target_lib_suffix}", - os.path.join(d4p_dir, installpath), - ) - os.chdir(d4p_dir) - - def custom_build_cmake_clib( iface, cxx=None, onedal_major_binary_version=1, no_dist=True, use_parameters_lib=True ): diff --git a/scripts/version.py b/scripts/version.py index df4030fef8..834d48fd68 100755 --- a/scripts/version.py +++ b/scripts/version.py @@ -85,7 +85,7 @@ def get_onedal_shared_libs(dal_root): possible_aliases = [ lib_name, f"lib{lib_name}.so.{major_bin_version}", - f"lib{lib_name}.{major_bin_version}.dylib" + f"lib{lib_name}.{major_bin_version}.dylib", f"{lib_name}.{major_bin_version}.dll", ] if any(find_library(alias) for alias in possible_aliases): diff --git a/setup.py b/setup.py index af6b87657f..fc1a9a400e 100644 --- a/setup.py +++ b/setup.py @@ -327,29 +327,6 @@ def getpyexts(): ) exts.extend(cythonize(ext, nthreads=n_threads)) - if dpcpp: - if IS_LIN or IS_MAC: - runtime_oneapi_dirs = ["$ORIGIN/oneapi"] - elif IS_WIN: - runtime_oneapi_dirs = [] - - ext = Extension( - "daal4py._oneapi", - [ - os.path.abspath("src/oneapi/oneapi.pyx"), - ], - depends=["src/oneapi/oneapi.h", "src/oneapi/oneapi_backend.h"], - include_dirs=include_dir_plat + [np.get_include()], - extra_compile_args=eca, - extra_link_args=ela, - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], - libraries=["oneapi_backend"] + libraries_plat, - library_dirs=["daal4py/oneapi"] + ONEDAL_LIBDIRS, - runtime_library_dirs=runtime_oneapi_dirs, - language="c++", - ) - exts.extend(cythonize(ext, nthreads=n_threads)) - if not no_dist: mpi_include_dir = include_dir_plat + [np.get_include()] + MPI_INCDIRS mpi_depens = glob.glob(jp(os.path.abspath("src"), "*.h")) @@ -405,33 +382,6 @@ def gen_pyx(odir): gen_pyx(os.path.abspath("./build")) -def build_oneapi_backend(): - eca, ela, includes = get_build_options() - cc = "icx" - if IS_WIN: - cxx = "icx" - else: - cxx = "icpx" - eca = ["-fsycl"] + ["-fsycl-device-code-split=per_kernel"] + eca - ela = ["-fsycl"] + ["-fsycl-device-code-split=per_kernel"] + ela - - return build_backend.build_cpp( - cc=cc, - cxx=cxx, - sources=["src/oneapi/oneapi_backend.cpp"], - targetname="oneapi_backend", - targetprefix="" if IS_WIN else "lib", - targetsuffix=".dll" if IS_WIN else ".so", - libs=get_libs("daal") + ["OpenCL", "onedal_sycl"], - libdirs=ONEDAL_LIBDIRS, - includes=includes, - eca=eca, - ela=ela, - defines=[], - installpath="daal4py/oneapi/", - ) - - def get_onedal_py_libs(): ext_suffix = get_config_vars("EXT_SUFFIX")[0] libs = [f"_onedal_py_host{ext_suffix}", f"_onedal_py_dpc{ext_suffix}"] @@ -468,7 +418,6 @@ def run(self): use_parameters_lib=use_parameters_lib, ) if dpcpp: - build_oneapi_backend() if is_onedal_iface: build_backend.custom_build_cmake_clib( iface="dpc", @@ -532,7 +481,6 @@ def run(self): packages_with_tests = [ "daal4py", - "daal4py.oneapi", "daal4py.mb", "daal4py.sklearn", "daal4py.sklearn.cluster", @@ -647,11 +595,6 @@ def run(self): keywords=["machine learning", "scikit-learn", "data science", "data analytics"], packages=get_packages_with_tests(packages_with_tests), package_data={ - "daal4py.oneapi": [ - "liboneapi_backend.so", - "oneapi_backend.lib", - "oneapi_backend.dll", - ], "onedal": get_onedal_py_libs(), }, ext_modules=getpyexts(), diff --git a/sklearnex/dispatcher.py b/sklearnex/dispatcher.py index a155ac12fc..a4a62556f6 100644 --- a/sklearnex/dispatcher.py +++ b/sklearnex/dispatcher.py @@ -147,6 +147,7 @@ def get_patch_map_core(preview=False): from .linear_model import ( IncrementalLinearRegression as IncrementalLinearRegression_sklearnex, ) + from .linear_model import IncrementalRidge as IncrementalRidge_sklearnex from .linear_model import Lasso as Lasso_sklearnex from .linear_model import LinearRegression as LinearRegression_sklearnex from .linear_model import LogisticRegression as LogisticRegression_sklearnex @@ -408,6 +409,19 @@ def get_patch_map_core(preview=False): ] ] + if daal_check_version((2024, "P", 600)): + # IncrementalRidge + mapping["incrementalridge"] = [ + [ + ( + linear_model_module, + "IncrementalRidge", + IncrementalRidge_sklearnex, + ), + None, + ] + ] + # Configs mapping["set_config"] = [ [(base_module, "set_config", set_config_sklearnex), None] diff --git a/sklearnex/linear_model/__init__.py b/sklearnex/linear_model/__init__.py index 7c6ef5201b..2c9defc9e9 100755 --- a/sklearnex/linear_model/__init__.py +++ b/sklearnex/linear_model/__init__.py @@ -16,6 +16,7 @@ from .coordinate_descent import ElasticNet, Lasso from .incremental_linear import IncrementalLinearRegression +from .incremental_ridge import IncrementalRidge from .linear import LinearRegression from .logistic_regression import LogisticRegression from .ridge import Ridge @@ -23,6 +24,7 @@ __all__ = [ "ElasticNet", "IncrementalLinearRegression", + "IncrementalRidge", "Lasso", "LinearRegression", "LogisticRegression", diff --git a/sklearnex/linear_model/incremental_ridge.py b/sklearnex/linear_model/incremental_ridge.py new file mode 100644 index 0000000000..99dc473456 --- /dev/null +++ b/sklearnex/linear_model/incremental_ridge.py @@ -0,0 +1,418 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import numbers +import warnings + +import numpy as np +from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin +from sklearn.metrics import r2_score +from sklearn.utils import gen_batches +from sklearn.utils.validation import check_is_fitted, check_X_y + +from daal4py.sklearn._n_jobs_support import control_n_jobs +from daal4py.sklearn.utils.validation import sklearn_check_version + +if sklearn_check_version("1.2"): + from sklearn.utils._param_validation import Interval + +from onedal.linear_model import IncrementalRidge as onedal_IncrementalRidge + +from .._device_offload import dispatch, wrap_output_data +from .._utils import PatchingConditionsChain + + +@control_n_jobs( + decorated_methods=["fit", "partial_fit", "predict", "_onedal_finalize_fit"] +) +class IncrementalRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): + """ + Incremental estimator for Ridge Regression. + Allows to train Ridge Regression if data is splitted into batches. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + alpha : float, default=1.0 + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + n_jobs : int, default=None + The number of jobs to use for the computation. + + batch_size : int, default=None + The number of samples to use for each batch. Only used when calling + ``fit``. If ``batch_size`` is ``None``, then ``batch_size`` + is inferred from the data and set to ``5 * n_features``, to provide a + balance between approximation accuracy and memory consumption. + + Attributes + ---------- + coef_ : array of shape (n_features, ) or (n_targets, n_features) + Estimated coefficients for the ridge regression problem. + If multiple targets are passed during the fit (y 2D), this + is a 2D array of shape (n_targets, n_features), while if only + one target is passed, this is a 1D array of length n_features. + + intercept_ : float or array of shape (n_targets,) + Independent term in the linear model. Set to 0.0 if + `fit_intercept = False`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + n_samples_seen_ : int + The number of samples processed by the estimator. Will be reset on + new calls to fit, but increments across ``partial_fit`` calls. + It should be not less than `n_features_in_` if `fit_intercept` + is False and not less than `n_features_in_` + 1 if `fit_intercept` + is True to obtain regression coefficients. + + batch_size_ : int + Inferred batch size from ``batch_size``. + """ + + _onedal_incremental_ridge = staticmethod(onedal_IncrementalRidge) + + if sklearn_check_version("1.2"): + _parameter_constraints: dict = { + "fit_intercept": ["boolean"], + "alpha": [Interval(numbers.Real, 0, None, closed="left")], + "copy_X": ["boolean"], + "n_jobs": [Interval(numbers.Integral, -1, None, closed="left"), None], + "batch_size": [Interval(numbers.Integral, 1, None, closed="left"), None], + } + + def __init__( + self, fit_intercept=True, alpha=1.0, copy_X=True, n_jobs=None, batch_size=None + ): + self.fit_intercept = fit_intercept + self.alpha = alpha + self.copy_X = copy_X + self.n_jobs = n_jobs + self.batch_size = batch_size + + def _onedal_supported(self, method_name, *data): + patching_status = PatchingConditionsChain( + f"sklearn.linear_model.{self.__class__.__name__}.{method_name}" + ) + return patching_status + + _onedal_cpu_supported = _onedal_supported + _onedal_gpu_supported = _onedal_supported + + def _onedal_predict(self, X, queue=None): + if sklearn_check_version("1.2"): + self._validate_params() + + if sklearn_check_version("1.0"): + X = self._validate_data(X, accept_sparse=False, reset=False) + + assert hasattr(self, "_onedal_estimator") + if self._need_to_finalize: + self._onedal_finalize_fit() + return self._onedal_estimator.predict(X, queue) + + def _onedal_score(self, X, y, sample_weight=None, queue=None): + return r2_score( + y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight + ) + + def _onedal_partial_fit(self, X, y, check_input=True, queue=None): + first_pass = not hasattr(self, "n_samples_seen_") or self.n_samples_seen_ == 0 + + if sklearn_check_version("1.2"): + self._validate_params() + + if check_input: + if sklearn_check_version("1.0"): + X, y = self._validate_data( + X, + y, + dtype=[np.float64, np.float32], + reset=first_pass, + copy=self.copy_X, + multi_output=True, + force_all_finite=False, + ) + else: + check_X_y(X, y, multi_output=True, y_numeric=True) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + self.n_features_in_ = X.shape[1] + else: + self.n_samples_seen_ += X.shape[0] + onedal_params = { + "fit_intercept": self.fit_intercept, + "alpha": self.alpha, + "copy_X": self.copy_X, + } + if not hasattr(self, "_onedal_estimator"): + self._onedal_estimator = self._onedal_incremental_ridge(**onedal_params) + self._onedal_estimator.partial_fit(X, y, queue) + self._need_to_finalize = True + + def _onedal_finalize_fit(self): + assert hasattr(self, "_onedal_estimator") + is_underdetermined = self.n_samples_seen_ < self.n_features_in_ + int( + self.fit_intercept + ) + if is_underdetermined: + raise ValueError("Not enough samples to finalize") + self._onedal_estimator.finalize_fit() + self._save_attributes() + self._need_to_finalize = False + + def _onedal_fit(self, X, y, queue=None): + if sklearn_check_version("1.2"): + self._validate_params() + + if sklearn_check_version("1.0"): + X, y = self._validate_data( + X, + y, + dtype=[np.float64, np.float32], + copy=self.copy_X, + multi_output=True, + ensure_2d=True, + ) + else: + check_X_y(X, y, multi_output=True, y_numeric=True) + + n_samples, n_features = X.shape + + is_underdetermined = n_samples < n_features + int(self.fit_intercept) + if is_underdetermined: + raise ValueError("Not enough samples to run oneDAL backend") + + if self.batch_size is None: + self.batch_size_ = 5 * n_features + else: + self.batch_size_ = self.batch_size + + self.n_samples_seen_ = 0 + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator._reset() + + for batch in gen_batches(n_samples, self.batch_size_): + X_batch, y_batch = X[batch], y[batch] + self._onedal_partial_fit(X_batch, y_batch, check_input=False, queue=queue) + + if sklearn_check_version("1.2"): + self._validate_params() + + # finite check occurs on onedal side + self.n_features_in_ = n_features + + if n_samples == 1: + warnings.warn( + "Only one sample available. You may want to reshape your data array" + ) + + self._onedal_finalize_fit() + + return self + + def partial_fit(self, X, y, check_input=True): + """ + Incrementally fits the linear model with X and y. All of X and y is + processed as a single batch. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values, where `n_samples` is the number of samples and + `n_targets` is the number of targets. + + Returns + ------- + self : object + Returns the instance itself. + """ + + dispatch( + self, + "partial_fit", + { + "onedal": self.__class__._onedal_partial_fit, + "sklearn": None, + }, + X, + y, + check_input=check_input, + ) + return self + + def fit(self, X, y): + """ + Fit the model with X and y, using minibatches of size batch_size. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. It is necessary for + `n_samples` to be not less than `n_features` if `fit_intercept` + is False and not less than `n_features` + 1 if `fit_intercept` + is True + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values, where `n_samples` is the number of samples and + `n_targets` is the number of targets. + + Returns + ------- + self : object + Returns the instance itself. + """ + + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": None, + }, + X, + y, + ) + return self + + @wrap_output_data + def predict(self, X, y=None): + """ + Predict using the linear model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + Returns + ------- + array, shape (n_samples,) or (n_samples, n_targets) + Returns predicted values. + """ + check_is_fitted( + self, + msg=f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.", + ) + + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": None, + }, + X, + ) + + @wrap_output_data + def score(self, X, y, sample_weight=None): + """ + Return the coefficient of determination R^2 of the prediction. + + The coefficient R^2 is defined as (1 - u/v), where u is the residual + sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum + of squares ((y_true - y_true.mean()) ** 2).sum(). + The best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always + predicts the expected value of y, disregarding the input features, + would get a R^2 score of 0.0. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + True values for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + R^2 of self.predict(X) wrt. y. + """ + check_is_fitted( + self, + msg=f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.", + ) + + return dispatch( + self, + "score", + { + "onedal": self.__class__._onedal_score, + "sklearn": None, + }, + X, + y, + sample_weight=sample_weight, + ) + + @property + def coef_(self): + if hasattr(self, "_onedal_estimator") and self._need_to_finalize: + self._onedal_finalize_fit() + + return self._coef + + @coef_.setter + def coef_(self, value): + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.coef_ = value + # checking if the model is already fitted and if so, deleting the model + if hasattr(self._onedal_estimator, "_onedal_model"): + del self._onedal_estimator._onedal_model + self._coef = value + + @property + def intercept_(self): + if hasattr(self, "_onedal_estimator") and self._need_to_finalize: + self._onedal_finalize_fit() + + return self._intercept + + @intercept_.setter + def intercept_(self, value): + if hasattr(self, "_onedal_estimator"): + self._onedal_estimator.intercept_ = value + # checking if the model is already fitted and if so, deleting the model + if hasattr(self._onedal_estimator, "_onedal_model"): + del self._onedal_estimator._onedal_model + self._intercept = value + + def _save_attributes(self): + self.n_features_in_ = self._onedal_estimator.n_features_in_ + self._coef = self._onedal_estimator.coef_ + self._intercept = self._onedal_estimator.intercept_ diff --git a/sklearnex/linear_model/logistic_regression.py b/sklearnex/linear_model/logistic_regression.py index 107a442213..6658d8945c 100644 --- a/sklearnex/linear_model/logistic_regression.py +++ b/sklearnex/linear_model/logistic_regression.py @@ -39,6 +39,8 @@ from .._device_offload import dispatch, wrap_output_data from .._utils import PatchingConditionsChain, get_patch_message + _sparsity_enabled = daal_check_version((2024, "P", 700)) + class BaseLogisticRegression(ABC): def _save_attributes(self): assert hasattr(self, "_onedal_estimator") @@ -238,7 +240,7 @@ def _onedal_gpu_predict_supported(self, method_name, *data): [ (n_samples > 0, "Number of samples is less than 1."), ( - not any([issparse(i) for i in data]), + (not any([issparse(i) for i in data])) or _sparsity_enabled, "Sparse input is not supported.", ), (not model_is_sparse, "Sparse coefficients are not supported."), @@ -285,9 +287,21 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): assert sample_weight is None if sklearn_check_version("1.0"): - X, y = self._validate_data(X, y, dtype=[np.float64, np.float32]) + X, y = self._validate_data( + X, + y, + accept_sparse=_sparsity_enabled, + accept_large_sparse=_sparsity_enabled, + dtype=[np.float64, np.float32], + ) else: - X, y = check_X_y(X, y, dtype=[np.float64, np.float32]) + X, y = check_X_y( + X, + y, + accept_sparse=_sparsity_enabled, + accept_large_sparse=_sparsity_enabled, + dtype=[np.float64, np.float32], + ) self._initialize_onedal_estimator() try: @@ -308,9 +322,20 @@ def _onedal_predict(self, X, queue=None): check_is_fitted(self) if sklearn_check_version("1.0"): - X = self._validate_data(X, reset=False, dtype=[np.float64, np.float32]) + X = self._validate_data( + X, + reset=False, + accept_sparse=_sparsity_enabled, + accept_large_sparse=_sparsity_enabled, + dtype=[np.float64, np.float32], + ) else: - X = check_array(X, dtype=[np.float64, np.float32]) + X = check_array( + X, + accept_sparse=_sparsity_enabled, + accept_large_sparse=_sparsity_enabled, + dtype=[np.float64, np.float32], + ) assert hasattr(self, "_onedal_estimator") return self._onedal_estimator.predict(X, queue=queue) @@ -321,9 +346,20 @@ def _onedal_predict_proba(self, X, queue=None): check_is_fitted(self) if sklearn_check_version("1.0"): - X = self._validate_data(X, reset=False, dtype=[np.float64, np.float32]) + X = self._validate_data( + X, + reset=False, + accept_sparse=_sparsity_enabled, + accept_large_sparse=_sparsity_enabled, + dtype=[np.float64, np.float32], + ) else: - X = check_array(X, dtype=[np.float64, np.float32]) + X = check_array( + X, + accept_sparse=_sparsity_enabled, + accept_large_sparse=_sparsity_enabled, + dtype=[np.float64, np.float32], + ) assert hasattr(self, "_onedal_estimator") return self._onedal_estimator.predict_proba(X, queue=queue) @@ -334,9 +370,20 @@ def _onedal_predict_log_proba(self, X, queue=None): check_is_fitted(self) if sklearn_check_version("1.0"): - X = self._validate_data(X, reset=False, dtype=[np.float64, np.float32]) + X = self._validate_data( + X, + reset=False, + accept_sparse=_sparsity_enabled, + accept_large_sparse=_sparsity_enabled, + dtype=[np.float64, np.float32], + ) else: - X = check_array(X, dtype=[np.float64, np.float32]) + X = check_array( + X, + accept_sparse=_sparsity_enabled, + accept_large_sparse=_sparsity_enabled, + dtype=[np.float64, np.float32], + ) assert hasattr(self, "_onedal_estimator") return self._onedal_estimator.predict_log_proba(X, queue=queue) diff --git a/sklearnex/linear_model/tests/test_incremental_ridge.py b/sklearnex/linear_model/tests/test_incremental_ridge.py new file mode 100644 index 0000000000..adcd5349ed --- /dev/null +++ b/sklearnex/linear_model/tests/test_incremental_ridge.py @@ -0,0 +1,153 @@ +# =============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from daal4py.sklearn._utils import daal_check_version + +if daal_check_version((2024, "P", 600)): + import numpy as np + import pytest + from numpy.testing import assert_allclose + from sklearn.exceptions import NotFittedError + + from onedal.tests.utils._dataframes_support import ( + _as_numpy, + _convert_to_dataframe, + get_dataframes_and_queues, + ) + from sklearnex.linear_model import IncrementalRidge + + def _compute_ridge_coefficients(X, y, alpha, fit_intercept): + coefficients_manual, intercept_manual = None, None + if fit_intercept: + X_mean = np.mean(X, axis=0) + y_mean = np.mean(y) + X_centered = X - X_mean + y_centered = y - y_mean + + X_with_intercept = np.hstack([np.ones((X.shape[0], 1)), X_centered]) + lambda_identity = alpha * np.eye(X_with_intercept.shape[1]) + inverse_term = np.linalg.inv( + np.dot(X_with_intercept.T, X_with_intercept) + lambda_identity + ) + xt_y = np.dot(X_with_intercept.T, y_centered) + coefficients_manual = np.dot(inverse_term, xt_y) + + intercept_manual = y_mean - np.dot(X_mean, coefficients_manual[1:]) + coefficients_manual = coefficients_manual[1:] + else: + lambda_identity = alpha * np.eye(X.shape[1]) + inverse_term = np.linalg.inv(np.dot(X.T, X) + lambda_identity) + xt_y = np.dot(X.T, y) + coefficients_manual = np.dot(inverse_term, xt_y) + + return coefficients_manual, intercept_manual + + @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) + @pytest.mark.parametrize("batch_size", [10, 100, 1000]) + @pytest.mark.parametrize("alpha", [0.1, 0.5, 1.0]) + @pytest.mark.parametrize("fit_intercept", [True, False]) + def test_inc_ridge_fit_coefficients( + dataframe, queue, alpha, batch_size, fit_intercept + ): + sample_size, feature_size = 1000, 50 + X = np.random.rand(sample_size, feature_size) + y = np.random.rand(sample_size) + X_c = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y_c = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) + + inc_ridge = IncrementalRidge( + fit_intercept=fit_intercept, alpha=alpha, batch_size=batch_size + ) + inc_ridge.fit(X_c, y_c) + + coefficients_manual, intercept_manual = _compute_ridge_coefficients( + X, y, alpha, fit_intercept + ) + if fit_intercept: + assert_allclose(inc_ridge.intercept_, intercept_manual, rtol=1e-6, atol=1e-6) + + assert_allclose(inc_ridge.coef_, coefficients_manual, rtol=1e-6, atol=1e-6) + + @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) + @pytest.mark.parametrize("batch_size", [2, 5]) + @pytest.mark.parametrize("alpha", [0.1, 0.5, 1.0]) + def test_inc_ridge_partial_fit_coefficients(dataframe, queue, alpha, batch_size): + sample_size, feature_size = 1000, 50 + X = np.random.rand(sample_size, feature_size) + y = np.random.rand(sample_size) + X_split = np.array_split(X, batch_size) + y_split = np.array_split(y, batch_size) + + inc_ridge = IncrementalRidge(fit_intercept=False, alpha=alpha) + + for batch_index in range(len(X_split)): + X_c = _convert_to_dataframe( + X_split[batch_index], sycl_queue=queue, target_df=dataframe + ) + y_c = _convert_to_dataframe( + y_split[batch_index], sycl_queue=queue, target_df=dataframe + ) + inc_ridge.partial_fit(X_c, y_c) + + lambda_identity = alpha * np.eye(X.shape[1]) + inverse_term = np.linalg.inv(np.dot(X.T, X) + lambda_identity) + xt_y = np.dot(X.T, y) + coefficients_manual = np.dot(inverse_term, xt_y) + + assert_allclose(inc_ridge.coef_, coefficients_manual, rtol=1e-6, atol=1e-6) + + def test_inc_ridge_score_before_fit(): + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) + y = np.dot(X, np.array([1, 2])) + 3 + inc_ridge = IncrementalRidge(alpha=0.5) + with pytest.raises(NotFittedError): + inc_ridge.score(X, y) + + def test_inc_ridge_predict_before_fit(): + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) + inc_ridge = IncrementalRidge(alpha=0.5) + with pytest.raises(NotFittedError): + inc_ridge.predict(X) + + def test_inc_ridge_score_after_fit(): + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) + y = np.dot(X, np.array([1, 2])) + 3 + inc_ridge = IncrementalRidge(alpha=0.5) + inc_ridge.fit(X, y) + assert inc_ridge.score(X, y) >= 0.97 + + @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) + @pytest.mark.parametrize("fit_intercept", [True, False]) + def test_inc_ridge_predict_after_fit(dataframe, queue, fit_intercept): + sample_size, feature_size = 1000, 50 + X = np.random.rand(sample_size, feature_size) + y = np.random.rand(sample_size) + X_c = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + y_c = _convert_to_dataframe(y, sycl_queue=queue, target_df=dataframe) + + inc_ridge = IncrementalRidge(fit_intercept=fit_intercept, alpha=0.5) + inc_ridge.fit(X_c, y_c) + + y_pred = inc_ridge.predict(X_c) + + coefficients_manual, intercept_manual = _compute_ridge_coefficients( + X, y, 0.5, fit_intercept + ) + y_pred_manual = np.dot(X, coefficients_manual) + if fit_intercept: + y_pred_manual += intercept_manual + + assert_allclose(_as_numpy(y_pred), y_pred_manual, rtol=1e-6, atol=1e-6) diff --git a/sklearnex/linear_model/tests/test_logreg.py b/sklearnex/linear_model/tests/test_logreg.py index d75913f645..8ee05cd8b5 100755 --- a/sklearnex/linear_model/tests/test_logreg.py +++ b/sklearnex/linear_model/tests/test_logreg.py @@ -14,8 +14,11 @@ # limitations under the License. # =============================================================================== +import numpy as np import pytest -from sklearn.datasets import load_breast_cancer, load_iris +from numpy.testing import assert_allclose, assert_array_equal +from scipy.sparse import csr_matrix +from sklearn.datasets import load_breast_cancer, load_iris, make_classification from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split @@ -24,7 +27,9 @@ _as_numpy, _convert_to_dataframe, get_dataframes_and_queues, + get_queues, ) +from sklearnex import config_context def prepare_input(X, y, dataframe, queue): @@ -88,3 +93,41 @@ def test_sklearnex_binary_classification(dataframe, queue): y_pred = _as_numpy(logreg.predict(X_test)) assert accuracy_score(y_test, y_pred) > 0.95 + + +if daal_check_version((2024, "P", 700)): + + @pytest.mark.parametrize("queue", get_queues("gpu")) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize( + "dims", [(3007, 17, 0.05), (50000, 100, 0.01), (512, 10, 0.5)] + ) + def test_csr(queue, dtype, dims): + from sklearnex.linear_model import LogisticRegression + + n, p, density = dims + + # Create sparse dataset for classification + X, y = make_classification(n, p, random_state=42) + X = X.astype(dtype) + y = y.astype(dtype) + np.random.seed(2007 + n + p) + mask = np.random.binomial(1, density, (n, p)) + X = X * mask + X_sp = csr_matrix(X) + + model = LogisticRegression(fit_intercept=True, solver="newton-cg") + model_sp = LogisticRegression(fit_intercept=True, solver="newton-cg") + + with config_context(target_offload="gpu:0"): + model.fit(X, y) + pred = model.predict(X) + prob = model.predict_proba(X) + model_sp.fit(X_sp, y) + pred_sp = model_sp.predict(X_sp) + prob_sp = model_sp.predict_proba(X_sp) + + assert_allclose(pred, pred_sp) + assert_allclose(prob, prob_sp) + assert_allclose(model.coef_, model_sp.coef_, rtol=1e-4) + assert_allclose(model.intercept_, model_sp.intercept_, rtol=1e-4) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index b072fd7814..778f99d268 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -54,6 +54,7 @@ "IncrementalEmpiricalCovariance", # dataframe_f issues "IncrementalLinearRegression", # TODO fix memory leak issue in private CI for data_shape = (1000, 100), data_transform_function = dataframe_f "IncrementalPCA", # TODO fix memory leak issue in private CI for data_shape = (1000, 100), data_transform_function = dataframe_f + "IncrementalRidge", # TODO fix memory leak issue in private CI for data_shape = (1000, 100), data_transform_function = dataframe_f "LogisticRegression(solver='newton-cg')", # memory leak fortran (1000, 100) ) diff --git a/src/daal4py.cpp b/src/daal4py.cpp index b92d5dece1..0be14efc08 100755 --- a/src/daal4py.cpp +++ b/src/daal4py.cpp @@ -206,11 +206,6 @@ static PyObject * _make_nda_from_csr(daal::data_management::NumericTablePtr * pt return NULL; } -#ifdef _DPCPP_ - #include "oneapi/oneapi_api.h" -// Disable returning of sycl buffer from algorithms -// static int __oneAPI_imp = import__oneapi(); -#endif // Convert a oneDAL NT to a numpy nd-array // tries to avoid copying the data, instead we try to share the memory with DAAL PyObject * make_nda(daal::data_management::NumericTablePtr * ptr) diff --git a/src/oneapi/oneapi.h b/src/oneapi/oneapi.h deleted file mode 100755 index a9c053ecbd..0000000000 --- a/src/oneapi/oneapi.h +++ /dev/null @@ -1,90 +0,0 @@ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __ONEAPI_H_INCLUDED__ -#define __ONEAPI_H_INCLUDED__ - -#include "oneapi_backend.h" -#include "numpy/ndarraytypes.h" -#include "oneapi_api.h" - -static void * to_device(void * ptr, int typ, int * shape) -{ - switch (typ) - { - case NPY_DOUBLE: return to_device(reinterpret_cast(ptr), shape); break; - case NPY_FLOAT: return to_device(reinterpret_cast(ptr), shape); break; - case NPY_INT: return to_device(reinterpret_cast(ptr), shape); break; - default: throw std::invalid_argument("invalid input array type (must be double, float or int)"); - } -} - -template -inline void * to_daal_nt(void * ptr, int typ, int * shape) -{ - switch (typ) - { - case NPY_DOUBLE: return to_daal_nt(ptr, shape); break; - case NPY_FLOAT: return to_daal_nt(ptr, shape); break; - case NPY_INT: return to_daal_nt(ptr, shape); break; - default: throw std::invalid_argument("invalid input array type (must be double, float or int)"); - } -} - -static void * to_daal_sycl_nt(void * ptr, int typ, int * shape) -{ - return to_daal_nt(ptr, typ, shape); -} - -static void * to_daal_host_nt(void * ptr, int typ, int * shape) -{ - return to_daal_nt(ptr, typ, shape); -} - -static void delete_device_data(void * ptr, int typ) -{ - if (ptr == nullptr) - return; - - switch (typ) - { - case NPY_DOUBLE: delete_device_data(ptr); break; - case NPY_FLOAT: delete_device_data(ptr); break; - case NPY_INT: delete_device_data(ptr); break; - default: throw std::invalid_argument("invalid array type (must be double, float or int)"); - } -} - -static std::string to_std_string(PyObject * o) -{ - return PyUnicode_AsUTF8(o); -} - -void * c_make_py_from_sycltable(void * _ptr, int typ) -{ - auto ptr = reinterpret_cast(_ptr); - - switch (typ) - { - case NPY_DOUBLE: return fromdaalnt(ptr); break; - case NPY_FLOAT: return fromdaalnt(ptr); break; - case NPY_INT: return fromdaalnt(ptr); break; - default: throw std::invalid_argument("invalid output array type (must be double, float or int)"); - } - return NULL; -} - -#endif // __ONEAPI_H_INCLUDED__ diff --git a/src/oneapi/oneapi.pyx b/src/oneapi/oneapi.pyx deleted file mode 100644 index 06cc758ccb..0000000000 --- a/src/oneapi/oneapi.pyx +++ /dev/null @@ -1,176 +0,0 @@ -#=============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# distutils: language = c++ -# cython: language_level=2 - -from cpython.ref cimport Py_INCREF, PyObject -from libcpp cimport bool -from libcpp.string cimport string as std_string - - -cdef extern from "oneapi/oneapi.h": - cdef cppclass PySyclExecutionContext: - PySyclExecutionContext(const std_string & dev, const bool from_python) except + - void apply() except + - void * to_device(void *, int, int*) - void * to_daal_sycl_nt(void*, int, int*) - void * to_daal_host_nt(void*, int, int*) - void delete_device_data(void *, int) - - std_string to_std_string(PyObject * o) except + - - void * c_make_py_from_sycltable(void * ptr, int typ) except + - - - -cdef class sycl_execution_context: - cdef PySyclExecutionContext * c_ptr - cdef object dev - - def __cinit__(self, dev, from_python=True): - self.dev = dev - self.c_ptr = new PySyclExecutionContext(to_std_string(dev), from_python) - - def __dealloc__(self): - del self.c_ptr - - def apply(self): - self.c_ptr.apply() - - def get_device_name(self): - return self.dev - - -# thread-local storage - -from threading import local as threading_local - -_tls = threading_local() - -def _is_tls_initialized(): - return (getattr(_tls, 'initialized', None) is not None) and (_tls.initialized == True) - -def _initialize_tls(): - _tls._in_sycl_ctxt = False - _tls.initialized = True - _tls.ctxt = None - _tls.params = dict() - -def _set_in_sycl_ctxt(ctxt, **kwargs): - if not _is_tls_initialized(): - _initialize_tls() - _tls._in_sycl_ctxt = ctxt is not None - _tls.ctxt = ctxt - _tls.params = kwargs - - if ctxt is not None: - ctxt.apply() - -def _get_in_sycl_ctxt(): - if not _is_tls_initialized(): - _initialize_tls() - return _tls._in_sycl_ctxt - -def _get_sycl_ctxt(): - if not _is_tls_initialized(): - _initialize_tls() - return _tls.ctxt - -def _get_device_name_sycl_ctxt(): - if not _is_tls_initialized(): - _initialize_tls() - if _tls.ctxt is None: - return None - else: - return _tls.ctxt.get_device_name() - -def _get_sycl_ctxt_params(): - if not _is_tls_initialized(): - _initialize_tls() - return _tls.params - -def is_in_sycl_ctxt(): - return _get_in_sycl_ctxt() - - -from contextlib import contextmanager - - -@contextmanager -def sycl_context(dev='host', host_offload_on_fail=False, from_python=True): - # Code to acquire resource - prev_ctxt = _get_sycl_ctxt() - prev_params = _get_sycl_ctxt_params() - ctxt = sycl_execution_context(dev, from_python=from_python) - _set_in_sycl_ctxt(ctxt, host_offload_on_fail=host_offload_on_fail) - try: - yield ctxt - finally: - # Code to release resource - _set_in_sycl_ctxt(prev_ctxt, **prev_params) - del ctxt - - -cimport numpy as np - -import numpy as np - -from cpython.pycapsule cimport PyCapsule_New - - -cdef class sycl_buffer: - 'Sycl buffer for DAAL. A generic implementation needs to do much more.' - - cdef readonly long long device_data - cdef int typ - cdef int shape[2] - cdef object _ary - - - def __cinit__(self, ary=None): - self._ary = ary - if ary is not None: - assert ary.flags['C_CONTIGUOUS'] and ary.ndim == 2 - self.__inilz__(0, np.PyArray_TYPE(ary), ary.shape[0], ary.shape[1]) - - cpdef __inilz__(self, long long device_data, int t, int d1, int d2): - self.typ = t - self.shape[0] = d1 - self.shape[1] = d2 - self.device_data = device_data - - def __dealloc__(self): - delete_device_data(self.device_data, self.typ) - - def __2daalnt__(self): - if _get_device_name_sycl_ctxt() == 'gpu': - if self.device_data == 0: - assert self._ary is not None - self.device_data = to_device(np.PyArray_DATA(self._ary), self.typ, self.shape) - return PyCapsule_New(to_daal_sycl_nt(self.device_data, self.typ, self.shape), NULL, NULL) - else: - return PyCapsule_New(to_daal_host_nt(np.PyArray_DATA(self._ary), self.typ, self.shape), NULL, NULL) - -cdef api object make_py_from_sycltable(void * ptr, int typ, int d1, int d2): - if not _get_in_sycl_ctxt(): - return None - cdef void * device_data = c_make_py_from_sycltable(ptr, typ) - if device_data: - res = sycl_buffer.__new__(sycl_buffer) - res.__inilz__(device_data, typ, d1, d2) - return res - return None diff --git a/src/oneapi/oneapi_backend.cpp b/src/oneapi/oneapi_backend.cpp deleted file mode 100644 index e797099573..0000000000 --- a/src/oneapi/oneapi_backend.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/******************************************************************************* -* Copyright 2021 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "daal_sycl.h" -#ifndef DAAL_SYCL_INTERFACE - #include - #include -static_assert(false, "DAAL_SYCL_INTERFACE not defined") -#endif - -#include "oneapi_backend.h" - - PySyclExecutionContext::PySyclExecutionContext(const std::string & dev, const bool from_python) - : m_ctxt(NULL) -{ - if (dev == "gpu") -#if INTEL_DAAL_VERSION >= 20240000 - m_ctxt = new daal::services::SyclExecutionContext(cl::sycl::queue(cl::sycl::gpu_selector()), from_python); -#else // INTEL_DAAL_VERSION >= 20240000 - m_ctxt = new daal::services::SyclExecutionContext(cl::sycl::queue(cl::sycl::gpu_selector())); -#endif // INTEL_DAAL_VERSION >= 20240000 - else if (dev == "cpu") - m_ctxt = new daal::services::SyclExecutionContext(cl::sycl::queue(cl::sycl::cpu_selector())); - else if (dev == "host") - m_ctxt = new daal::services::SyclExecutionContext(cl::sycl::queue(cl::sycl::host_selector())); - else - { - throw std::runtime_error(std::string("Device is not supported: ") + dev); - } -} - -PySyclExecutionContext::~PySyclExecutionContext() -{ - daal::services::Environment::getInstance()->setDefaultExecutionContext(daal::services::CpuExecutionContext()); - delete m_ctxt; - m_ctxt = NULL; -} - -void PySyclExecutionContext::apply() -{ - daal::services::Environment::getInstance()->setDefaultExecutionContext(*m_ctxt); -} - -#if INTEL_DAAL_VERSION >= 20210200 -inline const sycl::queue & get_current_queue() -{ - auto & ctx = daal::services::Environment::getInstance()->getDefaultExecutionContext(); - auto * syclCtx = dynamic_cast(&ctx); - if (!syclCtx) - { - throw std::domain_error("Cannot get current queue outside sycl_context"); - } - return syclCtx->getQueue(); -} - -// take a raw array and convert to usm pointer -template -inline daal::services::SharedPtr * to_usm(T * ptr, int * shape) -{ - auto queue = get_current_queue(); - const std::int64_t count = shape[0] * shape[1]; - T * usm_host_ptr = sycl::malloc_host(count, queue); - T * usm_device_ptr = sycl::malloc_device(count, queue); - if (!usm_host_ptr || !usm_device_ptr) - { - sycl::free(usm_host_ptr, queue); - sycl::free(usm_device_ptr, queue); - throw std::runtime_error("internal error during allocating USM memory"); - } - - // TODO: avoid using usm_host_ptr and copy directly to usm_device_ptr - // It's a temporary solution till queue.memcpy() from non-usm memory does not work - int res = daal::services::internal::daal_memcpy_s(usm_host_ptr, sizeof(T) * count, ptr, sizeof(T) * count); - if (res) - { - sycl::free(usm_host_ptr, queue); - sycl::free(usm_device_ptr, queue); - throw std::runtime_error("internal error during data copying from host to USM memory"); - } - - try - { - auto event = queue.memcpy(usm_device_ptr, usm_host_ptr, sizeof(T) * count); - event.wait_and_throw(); - } - catch (std::exception & ex) - { - sycl::free(usm_host_ptr, queue); - sycl::free(usm_device_ptr, queue); - throw std::runtime_error("internal error during data copying from host to USM memory"); - } - - sycl::free(usm_host_ptr, queue); - return new daal::services::SharedPtr(usm_device_ptr, [q = queue](const void * data) { sycl::free(const_cast(data), q); }); -} - -template -inline void del_usm(void * ptr) -{ - auto * sh_ptr = reinterpret_cast *>(ptr); - sh_ptr->reset(); - delete sh_ptr; -} -#endif - -// take a raw array and convert to sycl buffer -template -inline sycl::buffer * to_sycl_buffer(T * ptr, int * shape) -{ - return new sycl::buffer(ptr, sycl::range<1>(shape[0] * shape[1]), { sycl::property::buffer::use_host_ptr() }); -} - -template -inline void del_sycl_buffer(void * ptr) -{ - auto * bf = reinterpret_cast *>(ptr); - delete bf; -} - -template -void * to_device(T * ptr, int * shape) -{ -#if INTEL_DAAL_VERSION >= 20210200 - return to_usm(ptr, shape); -#else - return to_sycl_buffer(ptr, shape); -#endif -} - -template -void delete_device_data(void * ptr) -{ -#if INTEL_DAAL_VERSION >= 20210200 - del_usm(ptr); -#else - del_sycl_buffer(ptr); -#endif -} - -// take a sycl buffer and convert ti oneDAL NT -template -daal::data_management::NumericTablePtr * to_daal_nt(void * ptr, int * shape) -{ - // ptr is SharedPtr* in case of USM pointer - // or just T* in case of host data - // or sycl::buffer* for previous oneDAL versions - - if constexpr (is_device_data) - { - typedef daal::data_management::SyclHomogenNumericTable TBL_T; -#if INTEL_DAAL_VERSION >= 20210200 - auto * usm_ptr = reinterpret_cast *>(ptr); - // we need to return a pointer to safely cross language boundaries - return new daal::data_management::NumericTablePtr(TBL_T::create(usm_ptr->get(), shape[1], shape[0], get_current_queue())); -#else - auto * buffer = reinterpret_cast *>(ptr); - return new daal::data_management::NumericTablePtr(TBL_T::create(*buffer, shape[1], shape[0])); -#endif - } - else - { - typedef daal::data_management::HomogenNumericTable TBL_T; - auto * host_ptr = reinterpret_cast(ptr); - // we need to return a pointer to safely cross language boundaries - return new daal::data_management::NumericTablePtr(TBL_T::create(host_ptr, shape[1], shape[0])); - } -} - -// return a device data from a SyclHomogenNumericTable -template -void * fromdaalnt(daal::data_management::NumericTablePtr * ptr) -{ - auto data = dynamic_cast *>((*ptr).get()); - if (data) - { - daal::data_management::BlockDescriptor block; - data->getBlockOfRows(0, data->getNumberOfRows(), daal::data_management::readOnly, block); - auto daalBuffer = block.getBuffer(); - -#if INTEL_DAAL_VERSION >= 20210200 - auto queue = get_current_queue(); - auto * usmPointer = new daal::services::SharedPtr(daalBuffer.toUSM(queue, daal::data_management::readOnly)); - data->releaseBlockOfRows(block); - return usmPointer; -#else - auto * syclBuffer = new sycl::buffer(daalBuffer.toSycl()); - data->releaseBlockOfRows(block); - return syclBuffer; -#endif - } - return NULL; -} - -template void * to_device(double * ptr, int * shape); -template void * to_device(float * ptr, int * shape); -template void * to_device(int * ptr, int * shape); - -template void delete_device_data(void * ptr); -template void delete_device_data(void * ptr); -template void delete_device_data(void * ptr); - -template daal::data_management::NumericTablePtr * to_daal_nt(void * ptr, int * shape); -template daal::data_management::NumericTablePtr * to_daal_nt(void * ptr, int * shape); -template daal::data_management::NumericTablePtr * to_daal_nt(void * ptr, int * shape); -template daal::data_management::NumericTablePtr * to_daal_nt(void * ptr, int * shape); -template daal::data_management::NumericTablePtr * to_daal_nt(void * ptr, int * shape); -template daal::data_management::NumericTablePtr * to_daal_nt(void * ptr, int * shape); - -template void * fromdaalnt(daal::data_management::NumericTablePtr * ptr); -template void * fromdaalnt(daal::data_management::NumericTablePtr * ptr); -template void * fromdaalnt(daal::data_management::NumericTablePtr * ptr); diff --git a/src/oneapi/oneapi_backend.h b/src/oneapi/oneapi_backend.h deleted file mode 100644 index d971b4f976..0000000000 --- a/src/oneapi/oneapi_backend.h +++ /dev/null @@ -1,55 +0,0 @@ -/******************************************************************************* -* Copyright 2021 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __ONEAPI_BACKEND_H_INCLUDED__ -#define __ONEAPI_BACKEND_H_INCLUDED__ - -#include "daal.h" - -#ifdef _WIN32 - #define _ONEAPI_BACKEND_EXPORT __declspec(dllexport) -#else - #define _ONEAPI_BACKEND_EXPORT -#endif - -class _ONEAPI_BACKEND_EXPORT PySyclExecutionContext -{ -public: - // Construct from given device provided as string - PySyclExecutionContext(const std::string & dev, const bool from_python); - ~PySyclExecutionContext(); - - void apply(); - -private: - daal::services::internal::ExecutionContext * m_ctxt; -}; - -template -_ONEAPI_BACKEND_EXPORT void * to_device(T * ptr, int * shape); - -template -_ONEAPI_BACKEND_EXPORT void delete_device_data(void * ptr); - -// take a sycl buffer and convert ti oneDAL NT -template -_ONEAPI_BACKEND_EXPORT daal::data_management::NumericTablePtr * to_daal_nt(void * ptr, int * shape); - -// return a device data from a SyclHomogenNumericTable -template -_ONEAPI_BACKEND_EXPORT void * fromdaalnt(daal::data_management::NumericTablePtr * ptr); - -#endif // __ONEAPI_BACKEND_H_INCLUDED__ diff --git a/tests/daal4py/sycl/bf_knn_classification.py b/tests/daal4py/sycl/bf_knn_classification.py deleted file mode 100644 index bf5bca7929..0000000000 --- a/tests/daal4py/sycl/bf_knn_classification.py +++ /dev/null @@ -1,141 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py BF KNN example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -# Common code for both CPU and GPU computations -def compute(train_data, train_labels, predict_data, nClasses): - # Create an algorithm object and call compute - train_algo = d4p.bf_knn_classification_training(nClasses=nClasses, fptype="float") - train_result = train_algo.compute(train_data, train_labels) - - # Create an algorithm object and call compute - predict_algo = d4p.bf_knn_classification_prediction(nClasses=nClasses, fptype="float") - predict_result = predict_algo.compute(predict_data, train_result.model) - return predict_result - - -def main(readcsv=read_csv, method="defaultDense"): - # Input data set parameters - train_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "k_nearest_neighbors_train.csv", - ) - predict_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "k_nearest_neighbors_test.csv", - ) - - # Read data. Let's use 5 features per observation - nFeatures = 5 - nClasses = 5 - train_data = readcsv(train_file, range(nFeatures), t=np.float32) - train_labels = readcsv(train_file, range(nFeatures, nFeatures + 1), t=np.float32) - predict_data = readcsv(predict_file, range(nFeatures), t=np.float32) - predict_labels = readcsv(predict_file, range(nFeatures, nFeatures + 1), t=np.float32) - - predict_result_classic = compute(train_data, train_labels, predict_data, nClasses) - - # We expect less than 170 mispredicted values - assert np.count_nonzero(predict_labels != predict_result_classic.prediction) < 170 - - train_data = to_numpy(train_data) - train_labels = to_numpy(train_labels) - predict_data = to_numpy(predict_data) - - if gpu_available: - with sycl_context("gpu"): - sycl_train_data = sycl_buffer(train_data) - sycl_train_labels = sycl_buffer(train_labels) - sycl_predict_data = sycl_buffer(predict_data) - - predict_result_gpu = compute( - sycl_train_data, sycl_train_labels, sycl_predict_data, nClasses - ) - assert np.allclose( - predict_result_gpu.prediction, predict_result_classic.prediction - ) - - return (predict_result_classic, predict_labels) - - -if __name__ == "__main__": - (predict_result, predict_labels) = main() - print("BF based KNN classification results:") - print("Ground truth(observations #30-34):\n", predict_labels[30:35]) - print( - "Classification results(observations #30-34):\n", predict_result.prediction[30:35] - ) diff --git a/tests/daal4py/sycl/covariance.py b/tests/daal4py/sycl/covariance.py deleted file mode 100644 index b3ea6be5e4..0000000000 --- a/tests/daal4py/sycl/covariance.py +++ /dev/null @@ -1,111 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py covariance example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Common code for both CPU and GPU computations -def compute(data, method): - # configure a covariance object - algo = d4p.covariance(method=method, fptype="float") - return algo.compute(data) - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv, method="defaultDense"): - infile = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "covcormoments_dense.csv", - ) - - # Load the data - data = readcsv(infile, range(10), t=np.float32) - - # Using of the classic way (computations on CPU) - result_classic = compute(data, method) - - data = to_numpy(data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_data = sycl_buffer(data) - result_gpu = compute(sycl_data, "defaultDense") - - assert np.allclose(result_classic.covariance, result_gpu.covariance) - assert np.allclose(result_classic.mean, result_gpu.mean) - assert np.allclose(result_classic.correlation, result_gpu.correlation) - - return result_classic - - -if __name__ == "__main__": - res = main() - print("Covariance matrix:\n", res.covariance) - print("Mean vector:\n", res.mean) - print("All looks good!") diff --git a/tests/daal4py/sycl/covariance_streaming.py b/tests/daal4py/sycl/covariance_streaming.py deleted file mode 100644 index ccc8487613..0000000000 --- a/tests/daal4py/sycl/covariance_streaming.py +++ /dev/null @@ -1,142 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py covariance example for streaming on shared memory systems - -import os - -# let's use a generator for getting stream from file (defined in stream.py) -import sys - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -sys.path.insert(0, "..") - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - -try: - import pandas - - def read_csv(f, c=None, s=0, n=None, t=np.float64): - return pandas.read_csv( - f, usecols=c, delimiter=",", header=None, skiprows=s, nrows=n, dtype=t - ) - -except Exception: - # fall back to numpy genfromtxt - def read_csv(f, c=None, s=0, n=np.iinfo(np.int64).max): - a = np.genfromtxt(f, usecols=c, delimiter=",", skip_header=s, max_rows=n) - if a.shape[0] == 0: - raise Exception("done") - if a.ndim == 1: - return a[:, np.newaxis] - return a - - -# a generator which reads a file in chunks -def read_next(file, chunksize, readcsv=read_csv): - assert os.path.isfile(file) - s = 0 - while True: - # if found a smaller chunk we set s to < 0 to indicate eof - if s < 0: - return - a = read_csv(file, s=s, n=chunksize) - # last chunk is usually smaller, if not, - # numpy will print warning in next iteration - if chunksize > a.shape[0]: - s = -1 - else: - s += a.shape[0] - yield a - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=None, method="defaultDense"): - infile = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "covcormoments_dense.csv", - ) - - # Using of the classic way (computations on CPU) - # configure a covariance object - algo = d4p.covariance(streaming=True, fptype="float") - # get the generator (defined in stream.py)... - rn = read_next(infile, 112, readcsv) - # ... and iterate through chunks/stream - for chunk in rn: - algo.compute(chunk) - # finalize computation - result_classic = algo.finalize() - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - # configure a covariance object - algo = d4p.covariance(streaming=True, fptype="float") - # get the generator (defined in stream.py)... - rn = read_next(infile, 112, readcsv) - # ... and iterate through chunks/stream - for chunk in rn: - sycl_chunk = sycl_buffer(to_numpy(chunk)) - algo.compute(sycl_chunk) - # finalize computation - result_gpu = algo.finalize() - assert np.allclose(result_classic.covariance, result_gpu.covariance) - assert np.allclose(result_classic.mean, result_gpu.mean) - assert np.allclose(result_classic.correlation, result_gpu.correlation) - - return result_classic - - -if __name__ == "__main__": - res = main() - print("Covariance matrix:\n", res.covariance) - print("Mean vector:\n", res.mean) - print("All looks good!") diff --git a/tests/daal4py/sycl/dbscan.py b/tests/daal4py/sycl/dbscan.py deleted file mode 100644 index 0937305590..0000000000 --- a/tests/daal4py/sycl/dbscan.py +++ /dev/null @@ -1,117 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py DBSCAN example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -# Common code for both CPU and GPU computations -def compute(data, minObservations, epsilon): - # configure dbscan main object: - # we also request the indices and observations of cluster cores - algo = d4p.dbscan( - minObservations=minObservations, - fptype="float", - epsilon=epsilon, - resultsToCompute="computeCoreIndices|computeCoreObservations", - memorySavingMode=True, - ) - # and compute - return algo.compute(data) - - -def main(readcsv=read_csv, method="defaultDense"): - infile = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "dbscan_dense.csv" - ) - epsilon = 0.04 - minObservations = 45 - - # Load the data - data = readcsv(infile, range(2), t=np.float32) - - result_classic = compute(data, minObservations, epsilon) - - data = to_numpy(data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_data = sycl_buffer(data) - result_gpu = compute(sycl_data, minObservations, epsilon) - assert np.allclose(result_classic.nClusters, result_gpu.nClusters) - assert np.allclose(result_classic.assignments, result_gpu.assignments) - assert np.allclose(result_classic.coreIndices, result_gpu.coreIndices) - assert np.allclose( - result_classic.coreObservations, result_gpu.coreObservations - ) - - return result_classic - - -if __name__ == "__main__": - result = main() - print("\nFirst 10 cluster assignments:\n", result.assignments[0:10]) - print("\nFirst 10 cluster core indices:\n", result.coreIndices[0:10]) - print("\nFirst 10 cluster core observations:\n", result.coreObservations[0:10]) - print("\nNumber of clusters:\n", result.nClusters) - print("All looks good!") diff --git a/tests/daal4py/sycl/decision_forest_classification.py b/tests/daal4py/sycl/decision_forest_classification.py deleted file mode 100644 index 8bbbf14c99..0000000000 --- a/tests/daal4py/sycl/decision_forest_classification.py +++ /dev/null @@ -1,169 +0,0 @@ -# ============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py Decision Forest Classification example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except Exception: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2, dtype=t) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(train_data, train_labels, predict_data, method="defaultDense"): - # Configure a training object (5 classes) - train_algo = d4p.decision_forest_classification_training( - 5, - fptype="float", - nTrees=10, - minObservationsInLeafNode=8, - featuresPerNode=3, - engine=d4p.engines_mt19937(seed=777), - varImportance="MDI", - bootstrap=True, - resultsToCompute="computeOutOfBagError", - method=method, - ) - # Training result provides (depending on parameters) model, - # outOfBagError, outOfBagErrorPerObservation and/or variableImportance - train_result = train_algo.compute(train_data, train_labels) - - # now predict using the model from the training above - predict_algo = d4p.decision_forest_classification_prediction( - nClasses=5, - fptype="float", - resultsToEvaluate="computeClassLabels|computeClassProbabilities", - votingMethod="unweighted", - ) - - predict_result = predict_algo.compute(predict_data, train_result.model) - - return train_result, predict_result - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except Exception: - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except Exception: - return data - - return data - - -def main(readcsv=read_csv, method="defaultDense"): - nFeatures = 3 - # input data file - train_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "df_classification_train.csv", - ) - predict_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "df_classification_test.csv", - ) - - # Read train data. Let's use 3 features per observation - train_data = readcsv(train_file, range(nFeatures), t=np.float32) - train_labels = readcsv(train_file, range(nFeatures, nFeatures + 1), t=np.float32) - # Read test data (with same #features) - predict_data = readcsv(predict_file, range(nFeatures), t=np.float32) - predict_labels = readcsv(predict_file, range(nFeatures, nFeatures + 1), t=np.float32) - - # Using of the classic way (computations on CPU) - train_result, predict_result = compute( - train_data, train_labels, predict_data, "defaultDense" - ) - assert predict_result.prediction.shape == (predict_labels.shape[0], 1) - assert (np.mean(predict_result.prediction != predict_labels) < 0.03).any() - - train_data = to_numpy(train_data) - train_labels = to_numpy(train_labels) - predict_data = to_numpy(predict_data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_train_data = sycl_buffer(train_data) - sycl_train_labels = sycl_buffer(train_labels) - sycl_predict_data = sycl_buffer(predict_data) - train_result, predict_result = compute( - sycl_train_data, sycl_train_labels, sycl_predict_data, "hist" - ) - assert predict_result.prediction.shape == (predict_labels.shape[0], 1) - assert (np.mean(predict_result.prediction != predict_labels) < 0.03).any() - - return (train_result, predict_result, predict_labels) - - -if __name__ == "__main__": - (train_result, predict_result, plabels) = main() - print("\nVariable importance results:\n", train_result.variableImportance) - print("\nOOB error:\n", train_result.outOfBagError) - print( - "\nDecision forest prediction results (first 10 rows):\n", - predict_result.prediction[0:10], - ) - print( - "\nDecision forest probabilities results (first 10 rows):\n", - predict_result.probabilities[0:10], - ) - print("\nGround truth (first 10 rows):\n", plabels[0:10]) - print("All looks good!") diff --git a/tests/daal4py/sycl/decision_forest_classification_hist.py b/tests/daal4py/sycl/decision_forest_classification_hist.py deleted file mode 100755 index 360ee86f6d..0000000000 --- a/tests/daal4py/sycl/decision_forest_classification_hist.py +++ /dev/null @@ -1,170 +0,0 @@ -# ============================================================================== -# Copyright 2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py Decision Forest Classification example of Hist method for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except Exception: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2, dtype=t) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(train_data, train_labels, predict_data): - # Configure a training object (5 classes) - train_algo = d4p.decision_forest_classification_training( - 5, - fptype="float", - method="hist", - maxBins=256, - minBinSize=1, - nTrees=10, - minObservationsInLeafNode=8, - featuresPerNode=3, - engine=d4p.engines_mt19937(seed=777), - varImportance="MDI", - bootstrap=True, - resultsToCompute="computeOutOfBagError", - ) - - # Training result provides (depending on parameters) model, - # outOfBagError, outOfBagErrorPerObservation and/or variableImportance - train_result = train_algo.compute(train_data, train_labels) - - # now predict using the model from the training above - predict_algo = d4p.decision_forest_classification_prediction( - nClasses=5, - fptype="float", - resultsToEvaluate="computeClassLabels|computeClassProbabilities", - votingMethod="unweighted", - ) - - predict_result = predict_algo.compute(predict_data, train_result.model) - - return train_result, predict_result - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except Exception: - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except Exception: - return data - - return data - - -def main(readcsv=read_csv): - nFeatures = 3 - # input data file - train_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "df_classification_train.csv", - ) - predict_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "df_classification_test.csv", - ) - - # Read train data. Let's use 3 features per observation - train_data = readcsv(train_file, range(nFeatures), t=np.float32) - train_labels = readcsv(train_file, range(nFeatures, nFeatures + 1), t=np.float32) - # Read test data (with same #features) - predict_data = readcsv(predict_file, range(nFeatures), t=np.float32) - predict_labels = readcsv(predict_file, range(nFeatures, nFeatures + 1), t=np.float32) - - # Using of the classic way (computations on CPU) - train_result, predict_result = compute(train_data, train_labels, predict_data) - assert predict_result.prediction.shape == (predict_labels.shape[0], 1) - assert (np.mean(predict_result.prediction != predict_labels) < 0.04).any() - - train_data = to_numpy(train_data) - train_labels = to_numpy(train_labels) - predict_data = to_numpy(predict_data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_train_data = sycl_buffer(train_data) - sycl_train_labels = sycl_buffer(train_labels) - sycl_predict_data = sycl_buffer(predict_data) - train_result, predict_result = compute( - sycl_train_data, sycl_train_labels, sycl_predict_data - ) - assert predict_result.prediction.shape == (predict_labels.shape[0], 1) - assert (np.mean(predict_result.prediction != predict_labels) < 0.03).any() - - return (train_result, predict_result, predict_labels) - - -if __name__ == "__main__": - (train_result, predict_result, plabels) = main() - print("\nVariable importance results:\n", train_result.variableImportance) - print("\nOOB error:\n", train_result.outOfBagError) - print( - "\nDecision forest prediction results (first 10 rows):\n", - predict_result.prediction[0:10], - ) - print( - "\nDecision forest probabilities results (first 10 rows):\n", - predict_result.probabilities[0:10], - ) - print("\nGround truth (first 10 rows):\n", plabels[0:10]) - print("All looks good!") diff --git a/tests/daal4py/sycl/decision_forest_regression.py b/tests/daal4py/sycl/decision_forest_regression.py deleted file mode 100644 index 3ec552cf3a..0000000000 --- a/tests/daal4py/sycl/decision_forest_regression.py +++ /dev/null @@ -1,152 +0,0 @@ -# ============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py Decision Forest Regression example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except Exception: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2, dtype=t) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(train_data, train_labels, predict_data, method="defaultDense"): - # Configure a training object - train_algo = d4p.decision_forest_regression_training( - nTrees=100, - fptype="float", - engine=d4p.engines_mt2203(seed=777), - varImportance="MDA_Raw", - bootstrap=True, - resultsToCompute="computeOutOfBagError|computeOutOfBagErrorPerObservation", - method=method, - ) - # Training result provides (depending on parameters) model, - # outOfBagError, outOfBagErrorPerObservation and/or variableImportance - train_result = train_algo.compute(train_data, train_labels) - - # now predict using the model from the training above - predict_algo = d4p.decision_forest_regression_prediction(fptype="float") - - predict_result = predict_algo.compute(predict_data, train_result.model) - - return train_result, predict_result - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except Exception: - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except Exception: - return data - - return data - - -def main(readcsv=read_csv, method="defaultDense"): - nFeatures = 13 - # input data file - train_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "df_regression_train.csv", - ) - predict_file = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "df_regression_test.csv" - ) - - # Read train data. Let's use 3 features per observation - train_data = readcsv(train_file, range(nFeatures), t=np.float32) - train_labels = readcsv(train_file, range(nFeatures, nFeatures + 1), t=np.float32) - # Read test data (with same #features) - predict_data = readcsv(predict_file, range(nFeatures), t=np.float32) - predict_labels = readcsv(predict_file, range(nFeatures, nFeatures + 1), t=np.float32) - - # Using of the classic way (computations on CPU) - train_result, predict_result = compute( - train_data, train_labels, predict_data, "defaultDense" - ) - assert predict_result.prediction.shape == (predict_labels.shape[0], 1) - assert (np.square(predict_result.prediction - predict_labels).mean() < 18).any() - - train_data = to_numpy(train_data) - train_labels = to_numpy(train_labels) - predict_data = to_numpy(predict_data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_train_data = sycl_buffer(train_data) - sycl_train_labels = sycl_buffer(train_labels) - sycl_predict_data = sycl_buffer(predict_data) - train_result, predict_result = compute( - sycl_train_data, sycl_train_labels, sycl_predict_data, "hist" - ) - assert predict_result.prediction.shape == (predict_labels.shape[0], 1) - assert ( - np.square(predict_result.prediction - predict_labels).mean() < 18 - ).any() - - return (train_result, predict_result, predict_labels) - - -if __name__ == "__main__": - (train_result, predict_result, plabels) = main() - print("\nVariable importance results:\n", train_result.variableImportance) - print("\nOOB error:\n", train_result.outOfBagError) - print( - "\nDecision forest prediction results (first 10 rows):\n", - predict_result.prediction[0:10], - ) - print("\nGround truth (first 10 rows):\n", plabels[0:10]) - print("All looks good!") diff --git a/tests/daal4py/sycl/decision_forest_regression_hist.py b/tests/daal4py/sycl/decision_forest_regression_hist.py deleted file mode 100755 index 93dd04a0be..0000000000 --- a/tests/daal4py/sycl/decision_forest_regression_hist.py +++ /dev/null @@ -1,153 +0,0 @@ -# ============================================================================== -# Copyright 2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py Decision Forest Regression example of Hist method for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except Exception: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2, dtype=t) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(train_data, train_labels, predict_data): - # Configure a training object - train_algo = d4p.decision_forest_regression_training( - method="hist", - maxBins=256, - minBinSize=1, - nTrees=100, - fptype="float", - varImportance="MDA_Raw", - bootstrap=True, - engine=d4p.engines_mt2203(seed=777), - resultsToCompute="computeOutOfBagError|computeOutOfBagErrorPerObservation", - ) - - # Training result provides (depending on parameters) model, - # outOfBagError, outOfBagErrorPerObservation and/or variableImportance - train_result = train_algo.compute(train_data, train_labels) - - # now predict using the model from the training above - predict_algo = d4p.decision_forest_regression_prediction(fptype="float") - - predict_result = predict_algo.compute(predict_data, train_result.model) - - return train_result, predict_result - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except Exception: - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except Exception: - return data - - return data - - -def main(readcsv=read_csv): - nFeatures = 13 - # input data file - train_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "df_regression_train.csv", - ) - predict_file = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "df_regression_test.csv" - ) - - # Read train data. Let's use 3 features per observation - train_data = readcsv(train_file, range(nFeatures), t=np.float32) - train_labels = readcsv(train_file, range(nFeatures, nFeatures + 1), t=np.float32) - # Read test data (with same #features) - predict_data = readcsv(predict_file, range(nFeatures), t=np.float32) - predict_labels = readcsv(predict_file, range(nFeatures, nFeatures + 1), t=np.float32) - - # Using of the classic way (computations on CPU) - train_result, predict_result = compute(train_data, train_labels, predict_data) - assert predict_result.prediction.shape == (predict_labels.shape[0], 1) - assert (np.square(predict_result.prediction - predict_labels).mean() < 19).any() - - train_data = to_numpy(train_data) - train_labels = to_numpy(train_labels) - predict_data = to_numpy(predict_data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_train_data = sycl_buffer(train_data) - sycl_train_labels = sycl_buffer(train_labels) - sycl_predict_data = sycl_buffer(predict_data) - train_result, predict_result = compute( - sycl_train_data, sycl_train_labels, sycl_predict_data - ) - assert predict_result.prediction.shape == (predict_labels.shape[0], 1) - assert ( - np.square(predict_result.prediction - predict_labels).mean() < 18 - ).any() - - return (train_result, predict_result, predict_labels) - - -if __name__ == "__main__": - (train_result, predict_result, plabels) = main() - print("\nVariable importance results:\n", train_result.variableImportance) - print("\nOOB error:\n", train_result.outOfBagError) - print( - "\nDecision forest prediction results (first 10 rows):\n", - predict_result.prediction[0:10], - ) - print("\nGround truth (first 10 rows):\n", plabels[0:10]) - print("All looks good!") diff --git a/tests/daal4py/sycl/gradient_boosted_regression.py b/tests/daal4py/sycl/gradient_boosted_regression.py deleted file mode 100644 index 87d5c6026b..0000000000 --- a/tests/daal4py/sycl/gradient_boosted_regression.py +++ /dev/null @@ -1,138 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py Gradient Bossting Regression example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=np.float32) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2, dtype=np.float32) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(train_indep_data, train_dep_data, test_indep_data, maxIterations): - # Configure a training object - train_algo = d4p.gbt_regression_training(maxIterations=maxIterations, fptype="float") - train_result = train_algo.compute(train_indep_data, train_dep_data) - # Now let's do some prediction - predict_algo = d4p.gbt_regression_prediction(fptype="float") - # now predict using the model from the training above - return predict_algo.compute(test_indep_data, train_result.model) - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv, method="defaultDense"): - maxIterations = 200 - - # input data file - infile = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "df_regression_train.csv", - ) - testfile = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "df_regression_test.csv" - ) - - # Read data. Let's use 13 features per observation - train_indep_data = readcsv(infile, range(13), t=np.float32) - train_dep_data = readcsv(infile, range(13, 14), t=np.float32) - # read test data (with same #features) - test_indep_data = readcsv(testfile, range(13), t=np.float32) - - # Using of the classic way (computations on CPU) - result_classic = compute( - train_indep_data, train_dep_data, test_indep_data, maxIterations - ) - - train_indep_data = to_numpy(train_indep_data) - train_dep_data = to_numpy(train_dep_data) - test_indep_data = to_numpy(test_indep_data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_train_indep_data = sycl_buffer(train_indep_data) - sycl_train_dep_data = sycl_buffer(train_dep_data) - sycl_test_indep_data = sycl_buffer(test_indep_data) - _ = compute( - sycl_train_indep_data, - sycl_train_dep_data, - sycl_test_indep_data, - maxIterations, - ) - - test_dep_data = np.loadtxt( - testfile, usecols=range(13, 14), delimiter=",", ndmin=2, dtype=np.float32 - ) - - return (result_classic, test_dep_data) - - -if __name__ == "__main__": - (predict_result, test_dep_data) = main() - print( - "\nGradient boosted trees prediction results (first 10 rows):\n", - predict_result.prediction[0:10], - ) - print("\nGround truth (first 10 rows):\n", test_dep_data[0:10]) - print("All looks good!") diff --git a/tests/daal4py/sycl/kmeans.py b/tests/daal4py/sycl/kmeans.py deleted file mode 100644 index 2bac97ef87..0000000000 --- a/tests/daal4py/sycl/kmeans.py +++ /dev/null @@ -1,123 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py K-Means example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(data, nClusters, maxIter, method): - # configure kmeans init object - initrain_algo = d4p.kmeans_init(nClusters, method=method, fptype="float") - # compute initial centroids - initrain_result = initrain_algo.compute(data) - - # configure kmeans main object: we also request the cluster assignments - algo = d4p.kmeans(nClusters, maxIter, assignFlag=True, fptype="float") - # compute the clusters/centroids - return algo.compute(data, initrain_result.centroids) - - # Note: we could have done this in just one line: - # return d4p.kmeans(nClusters, maxIter, assignFlag=True).compute( - # data, d4p.kmeans_init(nClusters, method=method).compute(data).centroids - # ) - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv, method="randomDense"): - infile = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "kmeans_dense.csv" - ) - nClusters = 20 - maxIter = 5 - - # Load the data - data = readcsv(infile, range(20), t=np.float32) - - # Using of the classic way (computations on CPU) - result_classic = compute(data, nClusters, maxIter, method) - - data = to_numpy(data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_data = sycl_buffer(data) - result_gpu = compute(sycl_data, nClusters, maxIter, method) - assert np.allclose(result_classic.centroids, result_gpu.centroids) - assert np.allclose(result_classic.assignments, result_gpu.assignments) - assert np.isclose(result_classic.objectiveFunction, result_gpu.objectiveFunction) - - # Kmeans result objects provide assignments (if requested), - # centroids, goalFunction, nIterations and objectiveFunction - assert result_classic.centroids.shape[0] == nClusters - assert result_classic.assignments.shape == (data.shape[0], 1) - assert result_classic.nIterations <= maxIter - - return result_classic - - -if __name__ == "__main__": - result = main() - print("\nFirst 10 cluster assignments:\n", result.assignments[0:10]) - print("\nFirst 10 dimensions of centroids:\n", result.centroids[:, 0:10]) - print("\nObjective function value:\n", result.objectiveFunction) - print("All looks good!") diff --git a/tests/daal4py/sycl/linear_regression.py b/tests/daal4py/sycl/linear_regression.py deleted file mode 100644 index cb353822fe..0000000000 --- a/tests/daal4py/sycl/linear_regression.py +++ /dev/null @@ -1,146 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py Linear Regression example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(train_indep_data, train_dep_data, test_indep_data): - # Configure a Linear regression training object - train_algo = d4p.linear_regression_training(interceptFlag=True, fptype="float") - # Now train/compute, the result provides the model for prediction - train_result = train_algo.compute(train_indep_data, train_dep_data) - # Now let's do some prediction - predict_algo = d4p.linear_regression_prediction(fptype="float") - # now predict using the model from the training above - return predict_algo.compute(test_indep_data, train_result.model), train_result - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv, method="defaultDense"): - # read training data. Let's have 10 independent, - # and 2 dependent variables (for each observation) - trainfile = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "linear_regression_train.csv", - ) - train_indep_data = readcsv(trainfile, range(10), t=np.float32) - train_dep_data = readcsv(trainfile, range(10, 12), t=np.float32) - - # read testing data - testfile = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "linear_regression_test.csv", - ) - test_indep_data = readcsv(testfile, range(10), t=np.float32) - test_dep_data = readcsv(testfile, range(10, 12), t=np.float32) - - # Using of the classic way (computations on CPU) - result_classic, train_result = compute( - train_indep_data, train_dep_data, test_indep_data - ) - - train_indep_data = to_numpy(train_indep_data) - train_dep_data = to_numpy(train_dep_data) - test_indep_data = to_numpy(test_indep_data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_train_indep_data = sycl_buffer(train_indep_data) - sycl_train_dep_data = sycl_buffer(train_dep_data) - sycl_test_indep_data = sycl_buffer(test_indep_data) - result_gpu, _ = compute( - sycl_train_indep_data, sycl_train_dep_data, sycl_test_indep_data - ) - assert np.allclose(result_classic.prediction, result_gpu.prediction, atol=1e-1) - - # The prediction result provides prediction - assert result_classic.prediction.shape == ( - test_dep_data.shape[0], - test_dep_data.shape[1], - ) - - return (train_result, result_classic, test_dep_data) - - -if __name__ == "__main__": - (train_result, predict_result, test_dep_data) = main() - print("\nLinear Regression coefficients:\n", train_result.model.Beta) - print( - "\nLinear Regression prediction results: (first 10 rows):\n", - predict_result.prediction[0:10], - ) - print("\nGround truth (first 10 rows):\n", test_dep_data[0:10]) - print("All looks good!") diff --git a/tests/daal4py/sycl/log_reg_binary_dense.py b/tests/daal4py/sycl/log_reg_binary_dense.py deleted file mode 100644 index d1676a2271..0000000000 --- a/tests/daal4py/sycl/log_reg_binary_dense.py +++ /dev/null @@ -1,135 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py logistic regression example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(train_data, train_labels, predict_data, nClasses): - # set parameters and train - train_alg = d4p.logistic_regression_training( - nClasses=nClasses, interceptFlag=True, fptype="float" - ) - train_result = train_alg.compute(train_data, train_labels) - # set parameters and compute predictions - predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses, fptype="float") - return predict_alg.compute(predict_data, train_result.model), train_result - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv, method="defaultDense"): - nClasses = 2 - nFeatures = 20 - - # read training data from file with 20 features per observation and 1 class label - trainfile = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "binary_cls_train.csv" - ) - train_data = readcsv(trainfile, range(nFeatures), t=np.float32) - train_labels = readcsv(trainfile, range(nFeatures, nFeatures + 1), t=np.float32) - - # read testing data from file with 20 features per observation - testfile = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "binary_cls_test.csv" - ) - predict_data = readcsv(testfile, range(nFeatures), t=np.float32) - predict_labels = readcsv(testfile, range(nFeatures, nFeatures + 1), t=np.float32) - - # Using of the classic way (computations on CPU) - result_classic, train_result = compute( - train_data, train_labels, predict_data, nClasses - ) - - train_data = to_numpy(train_data) - train_labels = to_numpy(train_labels) - predict_data = to_numpy(predict_data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_train_data = sycl_buffer(train_data) - sycl_train_labels = sycl_buffer(train_labels) - sycl_predict_data = sycl_buffer(predict_data) - result_gpu, _ = compute( - sycl_train_data, sycl_train_labels, sycl_predict_data, nClasses - ) - - assert np.mean(result_classic.prediction != result_gpu.prediction) < 0.2 - - # the prediction result provides prediction - assert result_classic.prediction.shape == ( - predict_data.shape[0], - train_labels.shape[1], - ) - - return (train_result, result_classic, predict_labels) - - -if __name__ == "__main__": - (train_result, predict_result, predict_labels) = main() - print("\nLogistic Regression coefficients:\n", train_result.model.Beta) - print( - "\nLogistic regression prediction results (first 10 rows):\n", - predict_result.prediction[0:10], - ) - print("\nGround truth (first 10 rows):\n", predict_labels[0:10]) - print("All looks good!") diff --git a/tests/daal4py/sycl/log_reg_dense.py b/tests/daal4py/sycl/log_reg_dense.py deleted file mode 100644 index 19884afa55..0000000000 --- a/tests/daal4py/sycl/log_reg_dense.py +++ /dev/null @@ -1,162 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py logistic regression example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(train_data, train_labels, predict_data, nClasses): - # set parameters and train - train_alg = d4p.logistic_regression_training( - nClasses=nClasses, - fptype="float", - penaltyL1=0.1, - penaltyL2=0.1, - interceptFlag=True, - ) - train_result = train_alg.compute(train_data, train_labels) - # set parameters and compute predictions - predict_alg = d4p.logistic_regression_prediction( - nClasses=nClasses, - fptype="float", - resultsToEvaluate="computeClassLabels|computeClassProbabilities|" - "computeClassLogProbabilities", - ) - return predict_alg.compute(predict_data, train_result.model), train_result - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv, method="defaultDense"): - nClasses = 5 - nFeatures = 6 - - # read training data from file with 6 features per observation and 1 class label - trainfile = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "logreg_train.csv" - ) - train_data = readcsv(trainfile, range(nFeatures), t=np.float32) - train_labels = readcsv(trainfile, range(nFeatures, nFeatures + 1), t=np.float32) - - # read testing data from file with 6 features per observation - testfile = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "logreg_test.csv" - ) - predict_data = readcsv(testfile, range(nFeatures), t=np.float32) - - # Using of the classic way (computations on CPU) - result_classic, train_result = compute( - train_data, train_labels, predict_data, nClasses - ) - - train_data = to_numpy(train_data) - train_labels = to_numpy(train_labels) - predict_data = to_numpy(predict_data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_train_data = sycl_buffer(train_data) - sycl_train_labels = sycl_buffer(train_labels) - sycl_predict_data = sycl_buffer(predict_data) - result_gpu, _ = compute( - sycl_train_data, sycl_train_labels, sycl_predict_data, nClasses - ) - assert np.allclose(result_classic.prediction, result_gpu.prediction) - assert np.allclose( - result_classic.probabilities, result_gpu.probabilities, atol=1e-3 - ) - assert np.allclose( - result_classic.logProbabilities, result_gpu.logProbabilities, atol=1e-2 - ) - - # the prediction result provides prediction, probabilities and logProbabilities - assert result_classic.probabilities.shape == (predict_data.shape[0], nClasses) - assert result_classic.logProbabilities.shape == (predict_data.shape[0], nClasses) - predict_labels = np.loadtxt( - testfile, usecols=range(nFeatures, nFeatures + 1), delimiter=",", ndmin=2 - ) - assert ( - np.count_nonzero(result_classic.prediction - predict_labels) - / predict_labels.shape[0] - < 0.025 - ) - - return (train_result, result_classic, predict_labels) - - -if __name__ == "__main__": - (train_result, predict_result, predict_labels) = main() - print("\nLogistic Regression coefficients:\n", train_result.model.Beta) - print( - "\nLogistic regression prediction results (first 10 rows):\n", - predict_result.prediction[0:10], - ) - print("\nGround truth (first 10 rows):\n", predict_labels[0:10]) - print( - "\nLogistic regression prediction probabilities (first 10 rows):\n", - predict_result.probabilities[0:10], - ) - print( - "\nLogistic regression prediction log probabilities (first 10 rows):\n", - predict_result.logProbabilities[0:10], - ) - print("All looks good!") diff --git a/tests/daal4py/sycl/low_order_moms_dense.py b/tests/daal4py/sycl/low_order_moms_dense.py deleted file mode 100644 index 699e5b4d83..0000000000 --- a/tests/daal4py/sycl/low_order_moms_dense.py +++ /dev/null @@ -1,145 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py low order moments example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(data, method): - alg = d4p.low_order_moments(method=method, fptype="float") - return alg.compute(data) - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv, method="defaultDense"): - # read data from file - file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "covcormoments_dense.csv", - ) - data = readcsv(file, range(10), t=np.float32) - - # Using of the classic way (computations on CPU) - result_classic = compute(data, method) - - data = to_numpy(data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_data = sycl_buffer(data) - result_gpu = compute(sycl_data, "defaultDense") - for name in [ - "minimum", - "maximum", - "sum", - "sumSquares", - "sumSquaresCentered", - "mean", - "secondOrderRawMoment", - "variance", - "standardDeviation", - "variation", - ]: - assert np.allclose(getattr(result_classic, name), getattr(result_gpu, name)) - - # result provides minimum, maximum, sum, sumSquares, sumSquaresCentered, - # mean, secondOrderRawMoment, variance, standardDeviation, variation - assert all( - getattr(result_classic, name).shape == (1, data.shape[1]) - for name in [ - "minimum", - "maximum", - "sum", - "sumSquares", - "sumSquaresCentered", - "mean", - "secondOrderRawMoment", - "variance", - "standardDeviation", - "variation", - ] - ) - - return result_classic - - -if __name__ == "__main__": - res = main() - # print results - print("\nMinimum:\n", res.minimum) - print("\nMaximum:\n", res.maximum) - print("\nSum:\n", res.sum) - print("\nSum of squares:\n", res.sumSquares) - print("\nSum of squared difference from the means:\n", res.sumSquaresCentered) - print("\nMean:\n", res.mean) - print("\nSecond order raw moment:\n", res.secondOrderRawMoment) - print("\nVariance:\n", res.variance) - print("\nStandard deviation:\n", res.standardDeviation) - print("\nVariation:\n", res.variation) - print("All looks good!") diff --git a/tests/daal4py/sycl/low_order_moms_streaming.py b/tests/daal4py/sycl/low_order_moms_streaming.py deleted file mode 100644 index 475e81bfdc..0000000000 --- a/tests/daal4py/sycl/low_order_moms_streaming.py +++ /dev/null @@ -1,162 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py low order moments example for streaming on shared memory systems - -import os - -# let's use a generator for getting stream from file (defined in stream.py) -import sys - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -sys.path.insert(0, "..") - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - -try: - import pandas - - def read_csv(f, c=None, s=0, n=None, t=np.float64): - return pandas.read_csv( - f, usecols=c, delimiter=",", header=None, skiprows=s, nrows=n, dtype=t - ) - -except Exception: - # fall back to numpy genfromtxt - def read_csv(f, c=None, s=0, n=np.iinfo(np.int64).max): - a = np.genfromtxt(f, usecols=c, delimiter=",", skip_header=s, max_rows=n) - if a.shape[0] == 0: - raise Exception("done") - if a.ndim == 1: - return a[:, np.newaxis] - return a - - -# a generator which reads a file in chunks -def read_next(file, chunksize, readcsv=read_csv): - assert os.path.isfile(file) - s = 0 - while True: - # if found a smaller chunk we set s to < 0 to indicate eof - if s < 0: - return - a = read_csv(file, s=s, n=chunksize) - # last chunk is usually smaller, if not, - # numpy will print warning in next iteration - if chunksize > a.shape[0]: - s = -1 - else: - s += a.shape[0] - yield a - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=None, method="defaultDense"): - # read data from file - infile = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "covcormoments_dense.csv", - ) - - # Using of the classic way (computations on CPU) - # Configure a low order moments object for streaming - algo = d4p.low_order_moments(streaming=True, fptype="float") - # get the generator (defined in stream.py)... - rn = read_next(infile, 55, readcsv) - # ... and iterate through chunks/stream - for chunk in rn: - algo.compute(chunk) - # finalize computation - result_classic = algo.finalize() - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - # Configure a low order moments object for streaming - algo = d4p.low_order_moments(streaming=True, fptype="float") - # get the generator (defined in stream.py)... - rn = read_next(infile, 55, readcsv) - # ... and iterate through chunks/stream - for chunk in rn: - sycl_chunk = sycl_buffer(to_numpy(chunk)) - algo.compute(sycl_chunk) - # finalize computation - result_gpu = algo.finalize() - for name in [ - "minimum", - "maximum", - "sum", - "sumSquares", - "sumSquaresCentered", - "mean", - "secondOrderRawMoment", - "variance", - "standardDeviation", - "variation", - ]: - assert np.allclose(getattr(result_classic, name), getattr(result_gpu, name)) - - return result_classic - - -if __name__ == "__main__": - res = main() - # print results - print("\nMinimum:\n", res.minimum) - print("\nMaximum:\n", res.maximum) - print("\nSum:\n", res.sum) - print("\nSum of squares:\n", res.sumSquares) - print("\nSum of squared difference from the means:\n", res.sumSquaresCentered) - print("\nMean:\n", res.mean) - print("\nSecond order raw moment:\n", res.secondOrderRawMoment) - print("\nVariance:\n", res.variance) - print("\nStandard deviation:\n", res.standardDeviation) - print("\nVariation:\n", res.variation) - print("All looks good!") diff --git a/tests/daal4py/sycl/pca.py b/tests/daal4py/sycl/pca.py deleted file mode 100644 index feb4dc5db1..0000000000 --- a/tests/daal4py/sycl/pca.py +++ /dev/null @@ -1,122 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py PCA example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c=None, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c=None, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(data): - # 'normalization' is an optional parameter to PCA; - # we use z-score which could be configured differently - zscore = d4p.normalization_zscore(fptype="float") - # configure a PCA object - algo = d4p.pca( - fptype="float", - resultsToCompute="mean|variance|eigenvalue", - isDeterministic=True, - normalization=zscore, - ) - return algo.compute(data) - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv, method="svdDense"): - infile = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "pca_normalized.csv" - ) - - # Load the data - data = readcsv(infile, t=np.float32) - - # Using of the classic way (computations on CPU) - result_classic = compute(data) - - data = to_numpy(data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_data = sycl_buffer(data) - result_gpu = compute(sycl_data) - assert np.allclose(result_classic.eigenvalues, result_gpu.eigenvalues, atol=1e-5) - assert np.allclose( - result_classic.eigenvectors, result_gpu.eigenvectors, atol=1e-5 - ) - assert np.allclose(result_classic.means, result_gpu.means, atol=1e-5) - assert np.allclose(result_classic.variances, result_gpu.variances, atol=1e-5) - - # PCA result objects provide eigenvalues, eigenvectors, means and variances - assert result_classic.eigenvalues.shape == (1, data.shape[1]) - assert result_classic.eigenvectors.shape == (data.shape[1], data.shape[1]) - assert result_classic.means.shape == (1, data.shape[1]) - assert result_classic.variances.shape == (1, data.shape[1]) - - return result_classic - - -if __name__ == "__main__": - result = main() - print("\nEigenvalues:\n", result.eigenvalues) - print("\nEigenvectors:\n", result.eigenvectors) - print("\nMeans:\n", result.means) - print("\nVariances:\n", result.variances) - print("All looks good!") diff --git a/tests/daal4py/sycl/pca_transform.py b/tests/daal4py/sycl/pca_transform.py deleted file mode 100644 index 787966bcad..0000000000 --- a/tests/daal4py/sycl/pca_transform.py +++ /dev/null @@ -1,107 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py PCA example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Commone code for both CPU and GPU computations -def compute(data, nComponents): - # configure a PCA object and perform PCA - pca_algo = d4p.pca( - isDeterministic=True, fptype="float", resultsToCompute="mean|variance|eigenvalue" - ) - pca_res = pca_algo.compute(data) - # Apply transform with whitening because means and eigenvalues are provided - pcatrans_algo = d4p.pca_transform(fptype="float", nComponents=nComponents) - return pcatrans_algo.compute(data, pca_res.eigenvectors, pca_res.dataForTransform) - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv, method="svdDense"): - dataFileName = os.path.join( - "..", "..", "..", "examples", "daal4py", "data", "batch", "pca_transform.csv" - ) - nComponents = 2 - - # read data - data = readcsv(dataFileName, range(3), t=np.float32) - - # Using of the classic way (computations on CPU) - result_classic = compute(data, nComponents) - - data = to_numpy(data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_data = sycl_buffer(data) - result_gpu = compute(sycl_data, nComponents) - assert np.allclose(result_classic.transformedData, result_gpu.transformedData) - - return result_classic - - -if __name__ == "__main__": - pcatrans_res = main() - # print results of tranform - print(pcatrans_res) - print("All looks good!") diff --git a/tests/daal4py/sycl/sklearn_sycl.py b/tests/daal4py/sycl/sklearn_sycl.py deleted file mode 100644 index 5d29f243a6..0000000000 --- a/tests/daal4py/sycl/sklearn_sycl.py +++ /dev/null @@ -1,191 +0,0 @@ -# ============================================================================== -# Copyright 2014 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py Scikit-Learn examples for GPU -# run like this: -# python -m sklearnex ./sklearn_sycl.py - -import numpy as np -from sklearn.cluster import DBSCAN, KMeans -from sklearn.datasets import load_iris -from sklearn.linear_model import LinearRegression, LogisticRegression - -dpctl_available = False -try: - import dpctl - - from sklearnex._config import config_context - - dpctl_available = True -except ImportError: - try: - from daal4py.oneapi import sycl_context - - print("*" * 80) - print("\ndpctl package not found, switched to daal4py package\n") - print("*" * 80) - except ImportError: - print("\nRequired packages not found, aborting...\n") - exit() - - -gpu_available = False -if not dpctl_available: - try: - with sycl_context("gpu"): - gpu_available = True - except Exception: - gpu_available = False - - -def k_means_init_x(): - print("KMeans init=X[:2]") - X = np.array( - [[1.0, 2.0], [1.0, 4.0], [1.0, 0.0], [10.0, 2.0], [10.0, 4.0], [10.0, 0.0]], - dtype=np.float32, - ) - kmeans = KMeans(n_clusters=2, random_state=0, init=X[:2]).fit(X) - print("kmeans.labels_") - print(kmeans.labels_) - print("kmeans.predict([[0, 0], [12, 3]])") - print(kmeans.predict(np.array([[0, 0], [12, 3]], dtype=np.float32))) - print("kmeans.cluster_centers_") - print(kmeans.cluster_centers_) - - -def k_means_random(): - print("KMeans init='random'") - X = np.array( - [[1.0, 2.0], [1.0, 4.0], [1.0, 0.0], [10.0, 2.0], [10.0, 4.0], [10.0, 0.0]], - dtype=np.float32, - ) - kmeans = KMeans(n_clusters=2, random_state=0, init="random").fit(X) - print("kmeans.labels_") - print(kmeans.labels_) - print("kmeans.predict([[0, 0], [12, 3]])") - print(kmeans.predict(np.array([[0, 0], [12, 3]], dtype=np.float32))) - print("kmeans.cluster_centers_") - print(kmeans.cluster_centers_) - - -def linear_regression(): - print("LinearRegression") - X = np.array([[1.0, 1.0], [1.0, 2.0], [2.0, 2.0], [2.0, 3.0]], dtype=np.float32) - # y = 1 * x_0 + 2 * x_1 + 3 - y = np.dot(X, np.array([1, 2], dtype=np.float32)) + 3 - reg = LinearRegression().fit(X, y) - print("reg.score(X, y)") - print(reg.score(X, y)) - print("reg.coef_") - print(reg.coef_) - print("reg.intercept_") - print(reg.intercept_) - print("reg.predict(np.array([[3, 5]], dtype=np.float32))") - print(reg.predict(np.array([[3, 5]], dtype=np.float32))) - - -def logistic_regression_lbfgs(): - print("LogisticRegression solver='lbfgs'") - X, y = load_iris(return_X_y=True) - clf = LogisticRegression(random_state=0, solver="lbfgs").fit( - X.astype("float32"), y.astype("float32") - ) - print("clf.predict(X[:2, :])") - print(clf.predict(X[:2, :])) - print("clf.predict_proba(X[:2, :])") - print(clf.predict_proba(X[:2, :])) - print("clf.score(X, y)") - print(clf.score(X, y)) - - -def logistic_regression_newton(): - print("LogisticRegression solver='newton-cg'") - X, y = load_iris(return_X_y=True) - clf = LogisticRegression(random_state=0, solver="newton-cg").fit( - X.astype("float32"), y.astype("float32") - ) - print("clf.predict(X[:2, :])") - print(clf.predict(X[:2, :])) - print("clf.predict_proba(X[:2, :])") - print(clf.predict_proba(X[:2, :])) - print("clf.score(X, y)") - print(clf.score(X, y)) - - -def dbscan(): - print("DBSCAN") - X = np.array( - [[1.0, 2.0], [2.0, 2.0], [2.0, 3.0], [8.0, 7.0], [8.0, 8.0], [25.0, 80.0]], - dtype=np.float32, - ) - clustering = DBSCAN(eps=3, min_samples=2).fit(X) - print("clustering.labels_") - print(clustering.labels_) - print("clustering") - print(clustering) - - -def get_context(device): - if dpctl_available: - return config_context(target_offload=device) - return sycl_context(device) - - -def device_type_to_str(queue): - if queue is None: - return "cpu" - - from dpctl import device_type - - if queue.sycl_device.device_type == device_type.cpu: - return "cpu" - if queue.sycl_device.device_type == device_type.gpu: - return "gpu" - return "unknown" - - -if __name__ == "__main__": - examples = [ - k_means_init_x, - k_means_random, - linear_regression, - logistic_regression_lbfgs, - logistic_regression_newton, - dbscan, - ] - devices = [] - - if dpctl_available: - devices.append(None) - if dpctl.has_gpu_devices(): - devices.append(dpctl.SyclQueue("gpu")) - - else: - if gpu_available: - devices.append("gpu") - - for device in devices: - for e in examples: - print("*" * 80) - if dpctl_available: - print("device context:", device_type_to_str(device)) - else: - print("device context:", device) - with get_context(device): - e() - print("*" * 80) - - print("All looks good!") diff --git a/tests/daal4py/sycl/svm.py b/tests/daal4py/sycl/svm.py deleted file mode 100755 index 2b76529928..0000000000 --- a/tests/daal4py/sycl/svm.py +++ /dev/null @@ -1,157 +0,0 @@ -# ============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# daal4py SVM example for shared memory systems - -import os - -import numpy as np - -import daal4py as d4p -from daal4py.oneapi import sycl_buffer - -# let's try to use pandas' fast csv reader -try: - import pandas - - def read_csv(f, c, t=np.float64): - return pandas.read_csv(f, usecols=c, delimiter=",", header=None, dtype=t) - -except ImportError: - # fall back to numpy loadtxt - def read_csv(f, c, t=np.float64): - return np.loadtxt(f, usecols=c, delimiter=",", ndmin=2) - - -try: - from daal4py.oneapi import sycl_context - - with sycl_context("gpu"): - gpu_available = True -except Exception: - gpu_available = False - - -# Common code for both CPU and GPU computations -def compute(train_indep_data, train_dep_data, test_indep_data, method="defaultDense"): - # Configure a SVM object to use linear kernel - kernel_function = d4p.kernel_function_linear( - fptype="float", method="defaultDense", k=1.0, b=0.0 - ) - train_algo = d4p.svm_training( - fptype="float", - method=method, - kernel=kernel_function, - C=1.0, - accuracyThreshold=1e-3, - tau=1e-8, - cacheSize=600000000, - ) - - train_result = train_algo.compute(train_indep_data, train_dep_data) - - # Create an algorithm object and call compute - predict_algo = d4p.svm_prediction(fptype="float", kernel=kernel_function) - predict_result = predict_algo.compute(test_indep_data, train_result.model) - decision_result = predict_result.prediction - predict_labels = np.where(decision_result >= 0, 1, -1) - return predict_labels, decision_result - - -# At this moment with sycl we are working only with numpy arrays -def to_numpy(data): - try: - from pandas import DataFrame - - if isinstance(data, DataFrame): - return np.ascontiguousarray(data.values) - except ImportError: - pass - try: - from scipy.sparse import csr_matrix - - if isinstance(data, csr_matrix): - return data.toarray() - except ImportError: - pass - return data - - -def main(readcsv=read_csv): - # input data file - train_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "svm_two_class_train_dense.csv", - ) - predict_file = os.path.join( - "..", - "..", - "..", - "examples", - "daal4py", - "data", - "batch", - "svm_two_class_test_dense.csv", - ) - - nFeatures = 20 - train_data = readcsv(train_file, range(nFeatures), t=np.float32) - train_labels = readcsv(train_file, range(nFeatures, nFeatures + 1), t=np.float32) - predict_data = readcsv(predict_file, range(nFeatures), t=np.float32) - predict_labels = readcsv(predict_file, range(nFeatures, nFeatures + 1), t=np.float32) - - predict_result_classic, decision_function_classic = compute( - train_data, train_labels, predict_data, "boser" - ) - - train_data = to_numpy(train_data) - train_labels = to_numpy(train_labels) - predict_data = to_numpy(predict_data) - - # It is possible to specify to make the computations on GPU - if gpu_available: - with sycl_context("gpu"): - sycl_train_data = sycl_buffer(train_data) - sycl_train_labels = sycl_buffer(train_labels) - sycl_predict_data = sycl_buffer(predict_data) - - predict_result_gpu, decision_function_gpu = compute( - sycl_train_data, sycl_train_labels, sycl_predict_data, "thunder" - ) - # assert np.allclose(predict_result_gpu, predict_result_classic) - - return predict_labels, predict_result_classic, decision_function_classic - - -if __name__ == "__main__": - predict_labels, predict_result, decision_function = main() - np.set_printoptions(precision=0) - print( - "\nSVM classification decision function (first 10 observations):\n", - decision_function[0:10], - ) - print( - "\nSVM classification predict result (first 10 observations):\n", - predict_result[0:10], - ) - print("\nGround truth (first 10 observations):\n", predict_labels[0:10]) - print("All looks good!") diff --git a/tests/run_examples.py b/tests/run_examples.py index 71f3fede0a..d44a1bceb4 100755 --- a/tests/run_examples.py +++ b/tests/run_examples.py @@ -27,6 +27,7 @@ from daal4py import __has_dist__ from daal4py.sklearn._utils import get_daal_version +from onedal._device_offload import dpctl_available print("Starting examples validation") # First item is major version - 2021, @@ -75,27 +76,17 @@ (jp(tests_rootdir, "daal4py"), jp(logdir, "daal4py")), ] -available_devices = [] +available_devices = ["cpu"] -try: - from daal4py.oneapi import sycl_context +gpu_available = False +if dpctl_available: + import dpctl - sycl_extention_available = True -except ModuleNotFoundError: - sycl_extention_available = False -print("Sycl extensions available: {}".format(sycl_extention_available)) + if dpctl.has_gpu_devices(): + gpu_available = True + available_devices.append("gpu") -if sycl_extention_available: - try: - with sycl_context("gpu"): - gpu_available = True - available_devices.append("gpu") - except RuntimeError: - gpu_available = False - available_devices.append("cpu") - # validate that host and cpu devices avaialbe for logging reasons. Examples and - # vaidaton logic assumes that host and cpu devices are always available - print("Sycl gpu device: {}".format(gpu_available)) +print("GPU device available: {}".format(gpu_available)) def check_version(rule, target): @@ -149,8 +140,6 @@ def check_library(rule): req_version["knn_bf_regression_spmd.py"] = (2023, "P", 100) req_version["linear_regression_spmd.py"] = (2023, "P", 100) req_version["logistic_regression_spmd.py"] = (2024, "P", 400) -# Timeout on PVC, bumped the req version to deselect -req_version["sycl/gradient_boosted_regression.py"] = (2024, "P", 600) req_device = defaultdict(lambda: []) req_device["basic_statistics_spmd.py"] = ["gpu"] @@ -170,7 +159,6 @@ def check_library(rule): req_device["random_forest_classifier_spmd.py"] = ["gpu"] req_device["random_forest_regressor_dpnp.py"] = ["gpu"] req_device["random_forest_regressor_spmd.py"] = ["gpu"] -req_device["sycl/gradient_boosted_regression.py"] = ["gpu"] req_library = defaultdict(lambda: []) req_library["basic_statistics_spmd.py"] = ["dpctl", "mpi4py"] @@ -211,20 +199,6 @@ def check_library(rule): def get_exe_cmd(ex, args): - if os.path.dirname(ex).endswith("sycl"): - if not sycl_extention_available: - return None - if not check_version( - req_version["sycl/" + os.path.basename(ex)], get_daal_version() - ): - return None - if not check_device( - req_device["sycl/" + os.path.basename(ex)], available_devices - ): - return None - if not check_os(req_os["sycl/" + os.path.basename(ex)], system_os): - return None - if os.path.dirname(ex).endswith("daal4py") or os.path.dirname(ex).endswith("mb"): if args.nodaal4py: return None From 1f6411400200f87436dcc8b8d886bf1512d86284 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 29 Aug 2024 08:12:44 -0700 Subject: [PATCH 102/130] update daal version --- onedal/cluster/kmeans.cpp | 4 ++-- onedal/cluster/kmeans_init.cpp | 12 ++++++------ sklearnex/cluster/k_means.py | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp index 6fdefebd4b..a78c76238b 100644 --- a/onedal/cluster/kmeans.cpp +++ b/onedal/cluster/kmeans.cpp @@ -38,9 +38,9 @@ struct method2t { const auto method = params["method"].cast(); ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_dense", ops, Float, method::lloyd_dense); -#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_csr", ops, Float, method::lloyd_csr); -#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } diff --git a/onedal/cluster/kmeans_init.cpp b/onedal/cluster/kmeans_init.cpp index d973f177ad..464b656b7d 100644 --- a/onedal/cluster/kmeans_init.cpp +++ b/onedal/cluster/kmeans_init.cpp @@ -43,10 +43,10 @@ struct method2t { ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); ONEDAL_PARAM_DISPATCH_VALUE(method, "random_dense", ops, Float, method::random_dense); ONEDAL_PARAM_DISPATCH_VALUE(method, "plus_plus_dense", ops, Float, method::plus_plus_dense); -#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 ONEDAL_PARAM_DISPATCH_VALUE(method, "random_csr", ops, Float, method::random_csr); ONEDAL_PARAM_DISPATCH_VALUE(method, "plus_plus_csr", ops, Float, method::plus_plus_csr); -#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240600 +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240700 ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); } @@ -86,7 +86,7 @@ struct descriptor_creator= 20240600 +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 template struct descriptor_creator{}; } }; -#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240600 +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240700 struct params2desc { template @@ -128,12 +128,12 @@ struct params2desc { const auto local_trials_count = params["local_trials_count"].cast(); desc.set_local_trials_count(local_trials_count); } -#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240600 +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 if constexpr (std::is_same_v) { const auto local_trials_count = params["local_trials_count"].cast(); desc.set_local_trials_count(local_trials_count); } -#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240600 +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION>=20240700 return desc; } }; diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 9b40da1e58..146ec9e8fb 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -104,7 +104,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): correct_count = self.n_clusters < sample_count is_data_supported = ( - _is_csr(X) and daal_check_version((2024, "P", 600)) + _is_csr(X) and daal_check_version((2024, "P", 700)) ) or not issparse(X) _acceptable_sample_weights = True @@ -129,7 +129,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): ), ( is_data_supported, - "Supported data formats: Dense, CSR (oneDAL version >= 2024.6.0).", + "Supported data formats: Dense, CSR (oneDAL version >= 2024.7.0).", ), ] ) @@ -177,7 +177,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): def _onedal_predict_supported(self, method_name, X, sample_weight=None): class_name = self.__class__.__name__ is_data_supported = ( - _is_csr(X) and daal_check_version((2024, "P", 600)) + _is_csr(X) and daal_check_version((2024, "P", 700)) ) or not issparse(X) patching_status = PatchingConditionsChain( f"sklearn.cluster.{class_name}.predict" @@ -202,7 +202,7 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): ), ( is_data_supported, - "Supported data formats: Dense, CSR (oneDAL version >= 2024.6.0).", + "Supported data formats: Dense, CSR (oneDAL version >= 2024.7.0).", ), ( _acceptable_sample_weights, From 596909ac0f00afc168801f4ba972f01b8cf6338e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 29 Aug 2024 08:35:40 -0700 Subject: [PATCH 103/130] refactor deselected tests --- deselected_tests.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 25c7fe72e1..ce069bd128 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -383,6 +383,8 @@ deselected_tests: - model_selection/tests/test_classification_threshold.py::test_fit_and_score_over_thresholds_sample_weight >=1.5 - model_selection/tests/test_classification_threshold.py::test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence >=1.5 + # -------------------------------------------------------- + # No need to test daal4py patching reduced_tests: - cluster/tests/test_affinity_propagation.py - cluster/tests/test_bicluster.py @@ -729,8 +731,6 @@ gpu: - svm/tests/test_svm.py::test_unfitted - tests/test_common.py::test_estimators[SVC()-check_estimators_unfitted] -preview: - - cluster/tests/test_k_means.py::test_kmeans_elkan_results - - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2 - - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2 - - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3 + # -------------------------------------------------------- + # The following tests currently fail for preview namespace +#preview: From 39d1888fffe5d994fd31e7eb3b98beac1e80b77d Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 29 Aug 2024 09:54:32 -0700 Subject: [PATCH 104/130] update daal check --- sklearnex/tests/test_run_to_run_stability.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/tests/test_run_to_run_stability.py b/sklearnex/tests/test_run_to_run_stability.py index 4930eede68..79ea6ccef8 100755 --- a/sklearnex/tests/test_run_to_run_stability.py +++ b/sklearnex/tests/test_run_to_run_stability.py @@ -122,7 +122,7 @@ def _run_test(estimator, method, datasets): SVC(), *( [] - if not daal_check_version((2024, "P", 600)) + if not daal_check_version((2024, "P", 700)) else [ KMeans(), KMeans(init="random"), From c3f783bc0f3459325076e1235ae72742b4d45795 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 30 Aug 2024 07:57:09 -0700 Subject: [PATCH 105/130] address comments --- deselected_tests.yaml | 10 -------- sklearnex/tests/test_run_to_run_stability.py | 25 +++++++------------- 2 files changed, 9 insertions(+), 26 deletions(-) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index 7cdb0aa373..7ff0c5fbe0 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -370,12 +370,9 @@ deselected_tests: - tests/test_common.py::test_estimators[IncrementalPCA()-check_estimators_pickle(readonly_memmap=True)] - tests/test_common.py::test_estimators[IncrementalRidge()-check_estimators_pickle] - tests/test_common.py::test_estimators[IncrementalRidge()-check_estimators_pickle(readonly_memmap=True)] - - tests/test_common.py::test_estimators[IncrementalRidge()-check_estimators_pickle] - - tests/test_common.py::test_estimators[IncrementalRidge()-check_estimators_pickle(readonly_memmap=True)] # There are not enough data to run onedal backend - tests/test_common.py::test_estimators[IncrementalLinearRegression()-check_fit2d_1sample] - tests/test_common.py::test_estimators[IncrementalRidge()-check_fit2d_1sample] - - tests/test_common.py::test_estimators[IncrementalRidge()-check_fit2d_1sample] # Deselection of LogisticRegression tests over accuracy comparisons with sample_weights # and without. Because scikit-learn-intelex does not support sample_weights, it's doing @@ -466,9 +463,6 @@ gpu: - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] - model_selection/tests/test_search.py::test_unsupervised_grid_search - - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-normal] - - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs] - - model_selection/tests/test_search.py::test_unsupervised_grid_search - ensemble/tests/test_bagging.py::test_gridsearch - ensemble/tests/test_bagging.py::test_estimators_samples @@ -730,7 +724,3 @@ gpu: # RuntimeError: Device support is not implemented, failing as result of fallback to cpu false - svm/tests/test_svm.py::test_unfitted - tests/test_common.py::test_estimators[SVC()-check_estimators_unfitted] - - # -------------------------------------------------------- - # The following tests currently fail for preview namespace -#preview: diff --git a/sklearnex/tests/test_run_to_run_stability.py b/sklearnex/tests/test_run_to_run_stability.py index 79ea6ccef8..375be06918 100755 --- a/sklearnex/tests/test_run_to_run_stability.py +++ b/sklearnex/tests/test_run_to_run_stability.py @@ -115,23 +115,16 @@ def _run_test(estimator, method, datasets): ) -SPARSE_INSTANCES = _sklearn_clone_dict( - { - str(i): i - for i in [ - SVC(), - *( - [] - if not daal_check_version((2024, "P", 700)) - else [ - KMeans(), - KMeans(init="random"), - KMeans(init="k-means++"), - ] - ), +_sparse_instances = [SVC()] +if not daal_check_version((2024, "P", 700)): # Not testing for < 2024.7.0 + _sparse_instances.extend( + [ + KMeans(), + KMeans(init="random"), + KMeans(init="k-means++"), ] - } -) + ) +SPARSE_INSTANCES = _sklearn_clone_dict({str(i): i for i in _sparse_instances}) STABILITY_INSTANCES = _sklearn_clone_dict( { From d80d042b1782e2ef96287d50d6dcce5a8c640386 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 30 Aug 2024 10:25:56 -0700 Subject: [PATCH 106/130] address comments --- onedal/cluster/kmeans.py | 33 +++++++++++++++++---------------- sklearnex/cluster/k_means.py | 10 ++++++++-- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index acbefcb2cb..9c02cf3f27 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -14,6 +14,7 @@ # limitations under the License. # ============================================================================== +import logging import warnings from abc import ABC @@ -165,20 +166,6 @@ def _get_onedal_params(self, is_csr=False, dtype=np.float32, result_options=None "result_options": "" if result_options is None else result_options, } - def _get_params_and_input(self, X, is_csr, policy): - X = _check_array( - X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False - ) - X = _convert_to_supported(policy, X) - dtype = get_dtype(X) - X_table = to_table(X) - - self._check_params_vs_input(X_table, is_csr, policy, dtype=dtype) - - params = self._get_onedal_params(is_csr, dtype) - - return (params, X_table, dtype) - def _init_centroids_onedal( self, X_table, @@ -192,7 +179,11 @@ def _init_centroids_onedal( n_clusters = self.n_clusters if n_centroids is None else n_centroids # Use host policy for KMeans init, only for csr data # as oneDAL KMeansInit for CSR data is not implemented on GPU - init_policy = self._get_policy(None, None) if is_csr else policy + if is_csr: + init_policy = self._get_policy(None, None) + logging.getLogger("sklearnex").info("Running Sparse KMeansInit on CPU") + else: + init_policy = policy if isinstance(init, str) and init == "k-means++": if not is_csr: @@ -236,6 +227,7 @@ def _init_centroids_onedal( def _init_centroids_sklearn(self, X, init, random_state, policy, dtype=np.float32): # For oneDAL versions < 2023.2 or callable init, # using the scikit-learn implementation + logging.getLogger("sklearnex").info("Computing KMeansInit with Stock sklearn") n_samples = X.shape[0] if isinstance(init, str) and init == "k-means++": @@ -283,7 +275,16 @@ def _fit_backend( def _fit(self, X, module, queue=None): policy = self._get_policy(queue, X) is_csr = _is_csr(X) - _, X_table, dtype = self._get_params_and_input(X, is_csr, policy) + X = _check_array( + X, dtype=[np.float64, np.float32], accept_sparse="csr", force_all_finite=False + ) + X = _convert_to_supported(policy, X) + dtype = get_dtype(X) + X_table = to_table(X) + + self._check_params_vs_input(X_table, is_csr, policy, dtype=dtype) + + params = self._get_onedal_params(is_csr, dtype) self.n_features_in_ = X_table.column_count diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 146ec9e8fb..0f4e27815a 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -44,8 +44,6 @@ @control_n_jobs(decorated_methods=["fit", "predict", "transform", "fit_transform"]) class KMeans(sklearn_KMeans): __doc__ = sklearn_KMeans.__doc__ - n_iter_, inertia_ = None, None - labels_, cluster_centers_ = None, None if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_KMeans._parameter_constraints} @@ -101,6 +99,10 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): sample_count = _num_samples(X) self._algorithm = self.algorithm supported_algs = ["auto", "full", "lloyd", "elkan"] + if self.algorithm == "elkan": + logging.getLogger("sklearnex").info( + "oneDAL does not elkan, using lloyd algorithm instead." + ) correct_count = self.n_clusters < sample_count is_data_supported = ( @@ -184,6 +186,10 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): ) supported_algs = ["auto", "full", "lloyd", "elkan"] + if self.algorithm == "elkan": + logging.getLogger("sklearnex").info( + "oneDAL does not elkan, using lloyd algorithm instead." + ) _acceptable_sample_weights = True if sample_weight is not None: From 1c5d4dbc3cb61908bfa018e3ef27b8b12056db57 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 30 Aug 2024 12:26:09 -0700 Subject: [PATCH 107/130] test fix --- sklearnex/tests/test_run_to_run_stability.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/tests/test_run_to_run_stability.py b/sklearnex/tests/test_run_to_run_stability.py index 375be06918..0d81652f7b 100755 --- a/sklearnex/tests/test_run_to_run_stability.py +++ b/sklearnex/tests/test_run_to_run_stability.py @@ -116,7 +116,7 @@ def _run_test(estimator, method, datasets): _sparse_instances = [SVC()] -if not daal_check_version((2024, "P", 700)): # Not testing for < 2024.7.0 +if daal_check_version((2024, "P", 700)): # Test for > 2024.7.0 _sparse_instances.extend( [ KMeans(), From 499521a36bf42bde52ef7ec815318943befbc364 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 2 Sep 2024 14:35:33 -0700 Subject: [PATCH 108/130] address comments --- onedal/cluster/kmeans.py | 23 ++++++++--------------- sklearnex/cluster/k_means.py | 18 +++++++----------- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 9c02cf3f27..d0d30d0ec8 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -21,18 +21,13 @@ import numpy as np from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype -from onedal import _backend +from onedal import _backend, _is_spmd_backend from onedal.basic_statistics import BasicStatistics -try: +if _is_spmd_backend: from onedal.spmd.basic_statistics import BasicStatistics as BasicStatistics_SPMD -except ImportError: - BasicStatistics_SPMD = None -try: - from ..common._policy import _DataParallelInteropPolicy as dp_policy -except ImportError: - dp_policy = None + from ..common._spmd_policy import _SPMDDataParallelInteropPolicy as spmd_policy if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit @@ -93,14 +88,10 @@ def _tolerance(self, X_table, rtol, is_csr, policy, dtype): return rtol dummy = to_table(None) - _is_host_policy = isinstance(policy, host_policy) - _is_dp_policy = dp_policy is not None and isinstance(policy, dp_policy) - if _is_host_policy or _is_dp_policy: - bs = BasicStatistics("variance") - elif BasicStatistics_SPMD is not None: + if _is_spmd_backend and isinstance(policy, spmd_policy): bs = BasicStatistics_SPMD("variance") else: - raise ImportError("Failed to import BasicStatistics from onedal.spmd") + bs = BasicStatistics("variance") res = bs._compute_raw(X_table, dummy, policy, dtype, is_csr) mean_var = from_table(res["variance"]).mean() @@ -395,7 +386,9 @@ def _predict(self, X, module, queue=None, result_options=None): result = module.infer(policy, params, self.model_, X_table) - if result_options: # This is only set for score function + if ( + result_options == "compute_exact_objective_function" + ): # This is only set for score function return result.objective_function_value * (-1) else: return from_table(result.responses).ravel() diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 0f4e27815a..3962a12acd 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -20,6 +20,7 @@ if daal_check_version((2023, "P", 200)): + import numbers import warnings import numpy as np @@ -101,7 +102,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): supported_algs = ["auto", "full", "lloyd", "elkan"] if self.algorithm == "elkan": logging.getLogger("sklearnex").info( - "oneDAL does not elkan, using lloyd algorithm instead." + "oneDAL does not support 'elkan', using 'lloyd' algorithm instead." ) correct_count = self.n_clusters < sample_count @@ -110,7 +111,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): ) or not issparse(X) _acceptable_sample_weights = True - if sample_weight is not None: + if sample_weight is not None or not isinstance(sample_weight, numbers.Number): sample_weight = _check_sample_weight( sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None ) @@ -122,7 +123,7 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): [ ( self.algorithm in supported_algs, - "Only lloyd algorithm is supported, elkan is computed using lloyd", + "Only 'lloyd' algorithm is supported, 'elkan' is computed using lloyd", ), (correct_count, "n_clusters is smaller than number of samples"), ( @@ -163,11 +164,6 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): dtype=[np.float64, np.float32], ) - if sklearn_check_version("1.2"): - self._check_params_vs_input(X) - else: - self._check_params(X) - self._n_features_out = self.n_clusters self._initialize_onedal_estimator() @@ -188,11 +184,11 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): supported_algs = ["auto", "full", "lloyd", "elkan"] if self.algorithm == "elkan": logging.getLogger("sklearnex").info( - "oneDAL does not elkan, using lloyd algorithm instead." + "oneDAL does not support 'elkan', using 'lloyd' algorithm instead." ) _acceptable_sample_weights = True - if sample_weight is not None: + if sample_weight is not None or not isinstance(sample_weight, numbers.Number): sample_weight = _check_sample_weight( sample_weight, X, dtype=X.dtype if hasattr(X, "dtype") else None ) @@ -204,7 +200,7 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): [ ( self.algorithm in supported_algs, - "Only lloyd algorithm is supported, elkan is computed using lloyd.", + "Only 'lloyd' algorithm is supported, 'elkan' is computed using lloyd.", ), ( is_data_supported, From 3d36e8eea38fa87ec52209ba4a5d1f54ff5a7c10 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 2 Sep 2024 15:03:58 -0700 Subject: [PATCH 109/130] minor --- onedal/cluster/kmeans.py | 3 +-- sklearnex/cluster/k_means.py | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index d0d30d0ec8..a1b8feac18 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -20,7 +20,7 @@ import numpy as np -from daal4py.sklearn._utils import daal_check_version, get_dtype, parse_dtype +from daal4py.sklearn._utils import daal_check_version, get_dtype from onedal import _backend, _is_spmd_backend from onedal.basic_statistics import BasicStatistics @@ -39,7 +39,6 @@ from ..common._base import BaseEstimator as onedal_BaseEstimator from ..common._mixin import ClusterMixin, TransformerMixin -from ..common._policy import _HostInteropPolicy as host_policy from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import _check_array, _is_arraylike_not_scalar, _is_csr diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 3962a12acd..d0ce1ceba5 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -164,6 +164,11 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): dtype=[np.float64, np.float32], ) + if sklearn_check_version("1.2"): + self._check_params_vs_input(X) + else: + self._check_params(X) + self._n_features_out = self.n_clusters self._initialize_onedal_estimator() From bca7518abd2bc59240a31484b62301e21a907155 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 2 Sep 2024 21:27:41 -0700 Subject: [PATCH 110/130] refactor --- onedal/cluster/kmeans.py | 16 ++++++---------- sklearnex/cluster/k_means.py | 5 ----- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index a1b8feac18..a43e9d17ff 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -21,14 +21,9 @@ import numpy as np from daal4py.sklearn._utils import daal_check_version, get_dtype -from onedal import _backend, _is_spmd_backend +from onedal import _backend from onedal.basic_statistics import BasicStatistics -if _is_spmd_backend: - from onedal.spmd.basic_statistics import BasicStatistics as BasicStatistics_SPMD - - from ..common._spmd_policy import _SPMDDataParallelInteropPolicy as spmd_policy - if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit @@ -81,16 +76,17 @@ def _validate_center_shape(self, X, centers): def _get_kmeans_init(self, cluster_count, seed, algorithm): return KMeansInit(cluster_count=cluster_count, seed=seed, algorithm=algorithm) + # Get appropriate backend (required for SPMD) + def _get_basic_statistics_backend(self, result_options): + return BasicStatistics(result_options) + def _tolerance(self, X_table, rtol, is_csr, policy, dtype): """Compute absolute tolerance from the relative tolerance""" if rtol == 0.0: return rtol dummy = to_table(None) - if _is_spmd_backend and isinstance(policy, spmd_policy): - bs = BasicStatistics_SPMD("variance") - else: - bs = BasicStatistics("variance") + bs = self._get_basic_statistics_backend("variance") res = bs._compute_raw(X_table, dummy, policy, dtype, is_csr) mean_var = from_table(res["variance"]).mean() diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index d0ce1ceba5..3962a12acd 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -164,11 +164,6 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): dtype=[np.float64, np.float32], ) - if sklearn_check_version("1.2"): - self._check_params_vs_input(X) - else: - self._check_params(X) - self._n_features_out = self.n_clusters self._initialize_onedal_estimator() From f649cb275e8f817166b658df566df7a8c734087f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Mon, 2 Sep 2024 21:30:47 -0700 Subject: [PATCH 111/130] refactor --- onedal/cluster/kmeans.py | 1 - 1 file changed, 1 deletion(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index a43e9d17ff..0a57f4ddba 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -138,7 +138,6 @@ def _check_params_vs_input( stacklevel=2, ) self._n_init = 1 - assert self.algorithm == "lloyd" def _get_onedal_params(self, is_csr=False, dtype=np.float32, result_options=None): thr = self._tol if hasattr(self, "_tol") else self.tol From 2c4fc1b00e85cb8bdda817bf61a0d8be4cc0e7d7 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 3 Sep 2024 08:27:52 -0700 Subject: [PATCH 112/130] refactor --- onedal/cluster/kmeans.py | 9 +++++++-- sklearnex/cluster/k_means.py | 7 +++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 0a57f4ddba..0187a9daa6 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -21,9 +21,12 @@ import numpy as np from daal4py.sklearn._utils import daal_check_version, get_dtype -from onedal import _backend +from onedal import _backend, _is_spmd_backend from onedal.basic_statistics import BasicStatistics +if _is_spmd_backend: + from ..common._spmd_policy import _SPMDDataParallelInteropPolicy as spmd_policy + if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit @@ -138,6 +141,7 @@ def _check_params_vs_input( stacklevel=2, ) self._n_init = 1 + assert self.algorithm == "lloyd" def _get_onedal_params(self, is_csr=False, dtype=np.float32, result_options=None): thr = self._tol if hasattr(self, "_tol") else self.tol @@ -267,7 +271,8 @@ def _fit(self, X, module, queue=None): dtype = get_dtype(X) X_table = to_table(X) - self._check_params_vs_input(X_table, is_csr, policy, dtype=dtype) + if _is_spmd_backend and isinstance(policy, spmd_policy): + self._check_params_vs_input(X_table, is_csr, policy, dtype=dtype) params = self._get_onedal_params(is_csr, dtype) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 3962a12acd..2beac914bd 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -164,6 +164,11 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): dtype=[np.float64, np.float32], ) + if sklearn_check_version("1.2"): + self._check_params_vs_input(X) + else: + self._check_params(X) + self._n_features_out = self.n_clusters self._initialize_onedal_estimator() @@ -181,6 +186,8 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): f"sklearn.cluster.{class_name}.predict" ) + # algorithm "auto" has been deprecated since 1.1, + # algorithm "full" has been replaced by "lloyd" supported_algs = ["auto", "full", "lloyd", "elkan"] if self.algorithm == "elkan": logging.getLogger("sklearnex").info( From 20df2c2ce41c7cc35edd324b333667bce04c3831 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 3 Sep 2024 09:39:43 -0700 Subject: [PATCH 113/130] ci fix --- onedal/cluster/kmeans.py | 8 ++------ sklearnex/cluster/k_means.py | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 0187a9daa6..a43e9d17ff 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -21,12 +21,9 @@ import numpy as np from daal4py.sklearn._utils import daal_check_version, get_dtype -from onedal import _backend, _is_spmd_backend +from onedal import _backend from onedal.basic_statistics import BasicStatistics -if _is_spmd_backend: - from ..common._spmd_policy import _SPMDDataParallelInteropPolicy as spmd_policy - if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit @@ -271,8 +268,7 @@ def _fit(self, X, module, queue=None): dtype = get_dtype(X) X_table = to_table(X) - if _is_spmd_backend and isinstance(policy, spmd_policy): - self._check_params_vs_input(X_table, is_csr, policy, dtype=dtype) + self._check_params_vs_input(X_table, is_csr, policy, dtype=dtype) params = self._get_onedal_params(is_csr, dtype) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 2beac914bd..cb3c1b3a70 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -164,9 +164,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): dtype=[np.float64, np.float32], ) - if sklearn_check_version("1.2"): - self._check_params_vs_input(X) - else: + if not sklearn_check_version("1.2"): self._check_params(X) self._n_features_out = self.n_clusters From a6cb0ee4faae6141126f90efbe47f3d0c5f37025 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 3 Sep 2024 10:16:00 -0700 Subject: [PATCH 114/130] ci fix --- sklearnex/cluster/k_means.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index cb3c1b3a70..91feafc06b 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -157,6 +157,29 @@ def fit(self, X, y=None, sample_weight=None): return self + def _validate_algorithm(self, X): + if self.algorithm not in ("lloyd", "elkan", "auto", "full"): + raise ValueError( + "Algorithm must be either 'lloyd' or 'elkan', " + f"got {self.algorithm} instead." + ) + + self._algorithm = self.algorithm + if self._algorithm == "elkan" and self.n_clusters == 1: + warnings.warn( + "algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'lloyd' instead.", + RuntimeWarning, + ) + self._algorithm = "lloyd" + elif self._algorithm in ["auto", "full"] and sklearn_check_version("1.1"): + warnings.warn( + f"algorithm='{self._algorithm}' is deprecated, it will be " + "removed in 1.3. Using 'lloyd' instead.", + FutureWarning, + ) + self._algorithm = "lloyd" + def _onedal_fit(self, X, _, sample_weight, queue=None): X = self._validate_data( X, @@ -164,13 +187,11 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): dtype=[np.float64, np.float32], ) - if not sklearn_check_version("1.2"): - self._check_params(X) + self._validate_algorithm(X) self._n_features_out = self.n_clusters self._initialize_onedal_estimator() - self._n_threads = _openmp_effective_n_threads() self._onedal_estimator.fit(X, queue=queue) self._save_attributes() From 2cd54f26258f78cd3d2bf9de3b1cbcb6d042510d Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 3 Sep 2024 10:36:27 -0700 Subject: [PATCH 115/130] minor --- sklearnex/cluster/k_means.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 91feafc06b..a082824fed 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -192,6 +192,7 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._n_features_out = self.n_clusters self._initialize_onedal_estimator() + self._n_threads = _openmp_effective_n_threads() self._onedal_estimator.fit(X, queue=queue) self._save_attributes() From 28ccee9cd9c1bffe2af4b901552d15b95436349b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 3 Sep 2024 11:25:32 -0700 Subject: [PATCH 116/130] update checks --- sklearnex/cluster/k_means.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index a082824fed..77d42a643c 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -139,9 +139,32 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): return patching_status - def fit(self, X, y=None, sample_weight=None): + def _validate_params(self): if sklearn_check_version("1.2"): - self._validate_params() + super()._validate_params() + else: + if self.n_init <= 0: + raise ValueError(f"n_init should be > 0, got {self.n_init} instead.") + self._n_init = self.n_init + if self.max_iter <= 0: + raise ValueError( + f"max_iter should be > 0, got {self.max_iter} instead." + ) + if not ( + _is_arraylike_not_scalar(self.init) + or callable(self.init) + or ( + isinstance(self.init, str) + and self.init in ["k-means++", "random"] + ) + ): + raise ValueError( + "init should be either 'k-means++', 'random', an array-like or a " + f"callable, got '{self.init}' instead." + ) + + def fit(self, X, y=None, sample_weight=None): + self._validate_params() dispatch( self, @@ -246,8 +269,7 @@ def _onedal_predict_supported(self, method_name, X, sample_weight=None): @wrap_output_data def predict(self, X): - if sklearn_check_version("1.2"): - self._validate_params() + self._validate_params() return dispatch( self, @@ -267,8 +289,7 @@ def predict( X, sample_weight="deprecated" if sklearn_check_version("1.3") else None, ): - if sklearn_check_version("1.2"): - self._validate_params() + self._validate_params() return dispatch( self, From 6f336cae0bc683a80745f3506c5e6d3196d44ee7 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 3 Sep 2024 15:28:53 -0700 Subject: [PATCH 117/130] import --- sklearnex/cluster/k_means.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 77d42a643c..ee35089668 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -30,6 +30,7 @@ from sklearn.utils.validation import ( _check_sample_weight, _deprecate_positional_args, + _is_arraylike_not_scalar, _num_samples, check_is_fitted, ) From ebec4c907fd74262c389c35feea157bc32d07a6e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 3 Sep 2024 17:37:43 -0700 Subject: [PATCH 118/130] fix import --- sklearnex/cluster/k_means.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index ee35089668..b1a48debfa 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -30,11 +30,13 @@ from sklearn.utils.validation import ( _check_sample_weight, _deprecate_positional_args, - _is_arraylike_not_scalar, _num_samples, check_is_fitted, ) + if sklearn_check_version("1.1"): + from sklearn.utils.validation import _is_arraylike_not_scalar + from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version from onedal.cluster import KMeans as onedal_KMeans @@ -151,18 +153,19 @@ def _validate_params(self): raise ValueError( f"max_iter should be > 0, got {self.max_iter} instead." ) - if not ( - _is_arraylike_not_scalar(self.init) - or callable(self.init) - or ( - isinstance(self.init, str) - and self.init in ["k-means++", "random"] - ) - ): - raise ValueError( - "init should be either 'k-means++', 'random', an array-like or a " - f"callable, got '{self.init}' instead." - ) + if sklearn_check_version("1.1"): + if not ( + _is_arraylike_not_scalar(self.init) + or callable(self.init) + or ( + isinstance(self.init, str) + and self.init in ["k-means++", "random"] + ) + ): + raise ValueError( + "init should be either 'k-means++', 'random', an array-like or a " + f"callable, got '{self.init}' instead." + ) def fit(self, X, y=None, sample_weight=None): self._validate_params() From e5508604953dace0b0bcbc39a5f48b586ebfe7e9 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 3 Sep 2024 18:59:51 -0700 Subject: [PATCH 119/130] refactor --- sklearnex/cluster/k_means.py | 62 +++++------------------------------- 1 file changed, 8 insertions(+), 54 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index b1a48debfa..c36b73dbfb 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -34,9 +34,6 @@ check_is_fitted, ) - if sklearn_check_version("1.1"): - from sklearn.utils.validation import _is_arraylike_not_scalar - from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version from onedal.cluster import KMeans as onedal_KMeans @@ -142,33 +139,9 @@ def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): return patching_status - def _validate_params(self): - if sklearn_check_version("1.2"): - super()._validate_params() - else: - if self.n_init <= 0: - raise ValueError(f"n_init should be > 0, got {self.n_init} instead.") - self._n_init = self.n_init - if self.max_iter <= 0: - raise ValueError( - f"max_iter should be > 0, got {self.max_iter} instead." - ) - if sklearn_check_version("1.1"): - if not ( - _is_arraylike_not_scalar(self.init) - or callable(self.init) - or ( - isinstance(self.init, str) - and self.init in ["k-means++", "random"] - ) - ): - raise ValueError( - "init should be either 'k-means++', 'random', an array-like or a " - f"callable, got '{self.init}' instead." - ) - def fit(self, X, y=None, sample_weight=None): - self._validate_params() + if sklearn_check_version("1.2"): + self._validate_params() dispatch( self, @@ -184,29 +157,6 @@ def fit(self, X, y=None, sample_weight=None): return self - def _validate_algorithm(self, X): - if self.algorithm not in ("lloyd", "elkan", "auto", "full"): - raise ValueError( - "Algorithm must be either 'lloyd' or 'elkan', " - f"got {self.algorithm} instead." - ) - - self._algorithm = self.algorithm - if self._algorithm == "elkan" and self.n_clusters == 1: - warnings.warn( - "algorithm='elkan' doesn't make sense for a single " - "cluster. Using 'lloyd' instead.", - RuntimeWarning, - ) - self._algorithm = "lloyd" - elif self._algorithm in ["auto", "full"] and sklearn_check_version("1.1"): - warnings.warn( - f"algorithm='{self._algorithm}' is deprecated, it will be " - "removed in 1.3. Using 'lloyd' instead.", - FutureWarning, - ) - self._algorithm = "lloyd" - def _onedal_fit(self, X, _, sample_weight, queue=None): X = self._validate_data( X, @@ -214,7 +164,10 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): dtype=[np.float64, np.float32], ) - self._validate_algorithm(X) + if sklearn_check_version("1.2"): + self._check_params_vs_input(X) + else: + self._check_params(X) self._n_features_out = self.n_clusters @@ -293,7 +246,8 @@ def predict( X, sample_weight="deprecated" if sklearn_check_version("1.3") else None, ): - self._validate_params() + if sklearn_check_version("1.2"): + self._validate_params() return dispatch( self, From 247548cc76e072dac0bbc83af3e8cc74aa0a3cb0 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 01:49:23 -0700 Subject: [PATCH 120/130] update test --- sklearnex/cluster/tests/test_kmeans.py | 190 +++++++++++++++++++++++-- 1 file changed, 180 insertions(+), 10 deletions(-) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index f92361f1b9..cb4f72396b 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -17,34 +17,204 @@ import numpy as np import pytest from numpy.testing import assert_allclose +from scipy.sparse import csr_matrix +from sklearn.datasets import make_blobs from daal4py.sklearn._utils import daal_check_version from onedal.tests.utils._dataframes_support import ( _as_numpy, _convert_to_dataframe, get_dataframes_and_queues, + get_queues, ) +def generate_dense_dataset(): + np.random.seed(0) + X, _ = make_blobs( + n_samples=100, n_features=3, centers=3, cluster_std=1.0, random_state=42 + ) + X[X < 0] = 0 # Replace negative elements with 0 + return X + + +def convert_to_sparse(X): + return csr_matrix(X) + + @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) -def test_sklearnex_import(dataframe, queue): +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize("init", ["k-means++", "random"]) +def test_sklearnex_import_for_dense_data(dataframe, queue, algorithm, init): from sklearnex.cluster import KMeans - X_train = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) - X_test = np.array([[0, 0], [12, 3]]) - X_train = _convert_to_dataframe(X_train, sycl_queue=queue, target_df=dataframe) - X_test = _convert_to_dataframe(X_test, sycl_queue=queue, target_df=dataframe) + X_dense = generate_dense_dataset() + X_dense_df = _convert_to_dataframe(X_dense, sycl_queue=queue, target_df=dataframe) + + kmeans_dense = KMeans( + n_clusters=3, random_state=0, algorithm=algorithm, init=init + ).fit(X_dense_df) - kmeans = KMeans(n_clusters=2, random_state=0).fit(X_train) if daal_check_version((2023, "P", 200)): - assert "sklearnex" in kmeans.__module__ + assert "sklearnex" in kmeans_dense.__module__ + else: + assert "daal4py" in kmeans_dense.__module__ + + +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize("init", ["k-means++", "random"]) +def test_sklearnex_import_for_sparse_data(queue, algorithm, init): + from sklearnex.cluster import KMeans + + X_dense = generate_dense_dataset() + X_sparse = convert_to_sparse(X_dense) + + kmeans_sparse = KMeans( + n_clusters=3, random_state=0, algorithm=algorithm, init=init + ).fit(X_sparse) + + if daal_check_version((2024, "P", 700)): + assert "sklearnex" in kmeans_sparse.__module__ else: - assert "daal4py" in kmeans.__module__ + assert "sklearn." in kmeans_sparse.__module__ + + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +def test_results_on_dense_gold_data(dataframe, queue, algorithm): + from sklearnex.cluster import KMeans + + X_train = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) + X_test = np.array([[0, 0], [12, 3]]) + X_train_df = _convert_to_dataframe(X_train, sycl_queue=queue, target_df=dataframe) + X_test_df = _convert_to_dataframe(X_test, sycl_queue=queue, target_df=dataframe) + + kmeans = KMeans(n_clusters=2, random_state=0, algorithm=algorithm).fit(X_train_df) - result_cluster_labels = kmeans.predict(X_test) if queue and queue.sycl_device.is_gpu: # KMeans Init Dense GPU implementation is different from CPU expected_cluster_labels = np.array([0, 1], dtype=np.int32) + expected_cluster_centers = np.array([[1.0, 2.0], [10.0, 2.0]], dtype=np.float32) + expected_inertia = 15.0 + expected_n_iter = 1 else: expected_cluster_labels = np.array([1, 0], dtype=np.int32) - assert_allclose(expected_cluster_labels, _as_numpy(result_cluster_labels)) + expected_cluster_centers = np.array([[10.0, 2.0], [1.0, 2.0]], dtype=np.float32) + expected_inertia = 16.0 + expected_n_iter = 2 + + assert_allclose(expected_cluster_labels, _as_numpy(kmeans.predict(X_test_df))) + assert_allclose(expected_cluster_centers, _as_numpy(kmeans.cluster_centers_)) + assert expected_inertia == kmeans.inertia_ + assert expected_n_iter == kmeans.n_iter_ + + +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("init", ["k-means++", "random"]) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize("n_init", ["auto", 1, 10]) +def test_dense_vs_sparse_cpu(queue, init, algorithm, n_init): + from sklearnex.cluster import KMeans + + X_dense = generate_dense_dataset() + X_sparse = convert_to_sparse(X_dense) + + kmeans_dense = KMeans( + n_clusters=3, random_state=0, init=init, algorithm=algorithm, n_init=n_init + ).fit(X_dense) + kmeans_sparse = KMeans( + n_clusters=3, random_state=0, init=init, algorithm=algorithm, n_init=n_init + ).fit(X_sparse) + + assert_allclose( + kmeans_dense.cluster_centers_, + kmeans_sparse.cluster_centers_, + ) + + +@pytest.mark.parametrize("queue", get_queues("gpu")) +@pytest.mark.parametrize("init", ["k-means++", "random"]) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize("n_init", ["auto", 1, 10]) +def test_dense_vs_sparse_gpu(queue, init, algorithm, n_init): + from sklearnex.cluster import KMeans + + X_dense = generate_dense_dataset() + X_sparse = convert_to_sparse(X_dense) + + with config_context(target_offload="gpu:0"): + kmeans_dense = KMeans( + n_clusters=3, random_state=0, init=init, algorithm=algorithm, n_init=n_init + ).fit(X_dense) + kmeans_sparse = KMeans( + n_clusters=3, random_state=0, init=init, algorithm=algorithm, n_init=n_init + ).fit(X_sparse) + + assert_allclose( + kmeans_dense.cluster_centers_, + kmeans_sparse.cluster_centers_, + ) + + +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize("n_init", ["auto", 1, 10]) +def test_dense_vs_sparse_for_arraylike_init_cpu(queue, algorithm, n_init): + from sklearnex.cluster import KMeans + + X_dense = generate_dense_dataset() + init_centers = X_dense[:3] + X_sparse = convert_to_sparse(X_dense) + + kmeans_dense = KMeans( + n_clusters=3, + random_state=0, + init=init_centers, + algorithm=algorithm, + n_init=n_init, + ).fit(X_dense) + kmeans_sparse = KMeans( + n_clusters=3, + random_state=0, + init=init_centers, + algorithm=algorithm, + n_init=n_init, + ).fit(X_sparse) + + assert_allclose( + kmeans_dense.cluster_centers_, + kmeans_sparse.cluster_centers_, + ) + + +@pytest.mark.parametrize("queue", get_queues("gpu")) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize("n_init", ["auto", 1, 10]) +def test_dense_vs_sparse_for_arraylike_init_gpu(queue, algorithm, n_init): + from sklearnex.cluster import KMeans + + X_dense = generate_dense_dataset() + init_centers = X_dense[:3] + X_sparse = convert_to_sparse(X_dense) + + with config_context(target_offload="gpu:0"): + kmeans_dense = KMeans( + n_clusters=3, + random_state=0, + init=init_centers, + algorithm=algorithm, + n_init=n_init, + ).fit(X_dense) + kmeans_sparse = KMeans( + n_clusters=3, + random_state=0, + init=init_centers, + algorithm=algorithm, + n_init=n_init, + ).fit(X_sparse) + + assert_allclose( + kmeans_dense.cluster_centers_, + kmeans_sparse.cluster_centers_, + ) From b19c019b3d449995b0a97b8a120bddadee6cae56 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 10:20:32 -0700 Subject: [PATCH 121/130] update test --- sklearnex/cluster/tests/test_kmeans.py | 138 ++++++------------------- 1 file changed, 31 insertions(+), 107 deletions(-) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index cb4f72396b..aec52eca16 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -27,28 +27,30 @@ get_dataframes_and_queues, get_queues, ) +from sklearnex import config_context -def generate_dense_dataset(): - np.random.seed(0) +def generate_dense_dataset(n_samples, n_features, density, n_clusters): + np.random.seed(2024 + n_samples + n_features + n_clusters) X, _ = make_blobs( - n_samples=100, n_features=3, centers=3, cluster_std=1.0, random_state=42 + n_samples=n_samples, + n_features=n_features, + centers=n_clusters, + cluster_std=1.0, + random_state=42, ) - X[X < 0] = 0 # Replace negative elements with 0 + mask = np.random.binomial(1, density, (n_samples, n_features)) + X = X * mask return X -def convert_to_sparse(X): - return csr_matrix(X) - - @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) @pytest.mark.parametrize("init", ["k-means++", "random"]) def test_sklearnex_import_for_dense_data(dataframe, queue, algorithm, init): from sklearnex.cluster import KMeans - X_dense = generate_dense_dataset() + X_dense = generate_dense_dataset(1000, 10, 0.5, 3) X_dense_df = _convert_to_dataframe(X_dense, sycl_queue=queue, target_df=dataframe) kmeans_dense = KMeans( @@ -67,8 +69,8 @@ def test_sklearnex_import_for_dense_data(dataframe, queue, algorithm, init): def test_sklearnex_import_for_sparse_data(queue, algorithm, init): from sklearnex.cluster import KMeans - X_dense = generate_dense_dataset() - X_sparse = convert_to_sparse(X_dense) + X_dense = generate_dense_dataset(1000, 10, 0.5, 3) + X_sparse = csr_matrix(X_dense) kmeans_sparse = KMeans( n_clusters=3, random_state=0, algorithm=algorithm, init=init @@ -96,8 +98,8 @@ def test_results_on_dense_gold_data(dataframe, queue, algorithm): # KMeans Init Dense GPU implementation is different from CPU expected_cluster_labels = np.array([0, 1], dtype=np.int32) expected_cluster_centers = np.array([[1.0, 2.0], [10.0, 2.0]], dtype=np.float32) - expected_inertia = 15.0 - expected_n_iter = 1 + expected_inertia = 16.0 + expected_n_iter = 2 else: expected_cluster_labels = np.array([1, 0], dtype=np.int32) expected_cluster_centers = np.array([[10.0, 2.0], [1.0, 2.0]], dtype=np.float32) @@ -110,111 +112,33 @@ def test_results_on_dense_gold_data(dataframe, queue, algorithm): assert expected_n_iter == kmeans.n_iter_ -@pytest.mark.parametrize("queue", get_queues("cpu")) -@pytest.mark.parametrize("init", ["k-means++", "random"]) -@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) -@pytest.mark.parametrize("n_init", ["auto", 1, 10]) -def test_dense_vs_sparse_cpu(queue, init, algorithm, n_init): - from sklearnex.cluster import KMeans - - X_dense = generate_dense_dataset() - X_sparse = convert_to_sparse(X_dense) - - kmeans_dense = KMeans( - n_clusters=3, random_state=0, init=init, algorithm=algorithm, n_init=n_init - ).fit(X_dense) - kmeans_sparse = KMeans( - n_clusters=3, random_state=0, init=init, algorithm=algorithm, n_init=n_init - ).fit(X_sparse) - - assert_allclose( - kmeans_dense.cluster_centers_, - kmeans_sparse.cluster_centers_, - ) - - -@pytest.mark.parametrize("queue", get_queues("gpu")) -@pytest.mark.parametrize("init", ["k-means++", "random"]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("init", ["k-means++", "random", "arraylike"]) @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) -@pytest.mark.parametrize("n_init", ["auto", 1, 10]) -def test_dense_vs_sparse_gpu(queue, init, algorithm, n_init): +@pytest.mark.parametrize( + "dims", [(1000, 10, 0.95, 3), (50000, 100, 0.75, 10), (10000, 10, 0.8, 5)] +) +def test_dense_vs_sparse(queue, init, algorithm, dims): from sklearnex.cluster import KMeans - X_dense = generate_dense_dataset() - X_sparse = convert_to_sparse(X_dense) - - with config_context(target_offload="gpu:0"): - kmeans_dense = KMeans( - n_clusters=3, random_state=0, init=init, algorithm=algorithm, n_init=n_init - ).fit(X_dense) - kmeans_sparse = KMeans( - n_clusters=3, random_state=0, init=init, algorithm=algorithm, n_init=n_init - ).fit(X_sparse) - - assert_allclose( - kmeans_dense.cluster_centers_, - kmeans_sparse.cluster_centers_, - ) - - -@pytest.mark.parametrize("queue", get_queues("cpu")) -@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) -@pytest.mark.parametrize("n_init", ["auto", 1, 10]) -def test_dense_vs_sparse_for_arraylike_init_cpu(queue, algorithm, n_init): - from sklearnex.cluster import KMeans + # For higher level of sparsity (smaller density) the test will fail + # This is because random initialization of centroids may choose isolated initial centroids + n_samples, n_features, density, n_clusters = dims + X_dense = generate_dense_dataset(n_samples, n_features, density, n_clusters) + X_sparse = csr_matrix(X_dense) - X_dense = generate_dense_dataset() - init_centers = X_dense[:3] - X_sparse = convert_to_sparse(X_dense) + if init == "arraylike": + np.random.seed(2024 + n_samples + n_features + n_clusters) + init = X_dense[np.random.choice(n_samples, size=n_clusters, replace=False)] kmeans_dense = KMeans( - n_clusters=3, - random_state=0, - init=init_centers, - algorithm=algorithm, - n_init=n_init, + n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm ).fit(X_dense) kmeans_sparse = KMeans( - n_clusters=3, - random_state=0, - init=init_centers, - algorithm=algorithm, - n_init=n_init, + n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm ).fit(X_sparse) assert_allclose( kmeans_dense.cluster_centers_, kmeans_sparse.cluster_centers_, ) - - -@pytest.mark.parametrize("queue", get_queues("gpu")) -@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) -@pytest.mark.parametrize("n_init", ["auto", 1, 10]) -def test_dense_vs_sparse_for_arraylike_init_gpu(queue, algorithm, n_init): - from sklearnex.cluster import KMeans - - X_dense = generate_dense_dataset() - init_centers = X_dense[:3] - X_sparse = convert_to_sparse(X_dense) - - with config_context(target_offload="gpu:0"): - kmeans_dense = KMeans( - n_clusters=3, - random_state=0, - init=init_centers, - algorithm=algorithm, - n_init=n_init, - ).fit(X_dense) - kmeans_sparse = KMeans( - n_clusters=3, - random_state=0, - init=init_centers, - algorithm=algorithm, - n_init=n_init, - ).fit(X_sparse) - - assert_allclose( - kmeans_dense.cluster_centers_, - kmeans_sparse.cluster_centers_, - ) From a295198fe2c9b0204e08da42ee703b82526bb589 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 11:39:21 -0700 Subject: [PATCH 122/130] ci fixes --- onedal/cluster/kmeans.py | 11 ---- sklearnex/cluster/tests/test_kmeans.py | 86 ++++++++++++-------------- 2 files changed, 41 insertions(+), 56 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index a43e9d17ff..3a310c3f70 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -106,17 +106,6 @@ def _check_params_vs_input( self._tol = self._tolerance(X_table, self.tol, is_csr, policy, dtype) self._n_init = self.n_init - if self._n_init == "warn": - warnings.warn( - ( - "The default value of `n_init` will change from " - f"{default_n_init} to 'auto' in 1.4. Set the value of `n_init`" - " explicitly to suppress the warning" - ), - FutureWarning, - stacklevel=2, - ) - self._n_init = default_n_init if self._n_init == "auto": if isinstance(self.init, str) and self.init == "k-means++": self._n_init = 1 diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index aec52eca16..b74ebc736e 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -62,25 +62,21 @@ def test_sklearnex_import_for_dense_data(dataframe, queue, algorithm, init): else: assert "daal4py" in kmeans_dense.__module__ +if daal_check_version((2024, "P", 700)): + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) + @pytest.mark.parametrize("init", ["k-means++", "random"]) + def test_sklearnex_import_for_sparse_data(queue, algorithm, init): + from sklearnex.cluster import KMeans -@pytest.mark.parametrize("queue", get_queues()) -@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) -@pytest.mark.parametrize("init", ["k-means++", "random"]) -def test_sklearnex_import_for_sparse_data(queue, algorithm, init): - from sklearnex.cluster import KMeans - - X_dense = generate_dense_dataset(1000, 10, 0.5, 3) - X_sparse = csr_matrix(X_dense) + X_dense = generate_dense_dataset(1000, 10, 0.5, 3) + X_sparse = csr_matrix(X_dense) - kmeans_sparse = KMeans( - n_clusters=3, random_state=0, algorithm=algorithm, init=init - ).fit(X_sparse) + kmeans_sparse = KMeans( + n_clusters=3, random_state=0, algorithm=algorithm, init=init + ).fit(X_sparse) - if daal_check_version((2024, "P", 700)): assert "sklearnex" in kmeans_sparse.__module__ - else: - assert "sklearn." in kmeans_sparse.__module__ - @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) @@ -111,34 +107,34 @@ def test_results_on_dense_gold_data(dataframe, queue, algorithm): assert expected_inertia == kmeans.inertia_ assert expected_n_iter == kmeans.n_iter_ - -@pytest.mark.parametrize("queue", get_queues()) -@pytest.mark.parametrize("init", ["k-means++", "random", "arraylike"]) -@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) -@pytest.mark.parametrize( - "dims", [(1000, 10, 0.95, 3), (50000, 100, 0.75, 10), (10000, 10, 0.8, 5)] -) -def test_dense_vs_sparse(queue, init, algorithm, dims): - from sklearnex.cluster import KMeans - - # For higher level of sparsity (smaller density) the test will fail - # This is because random initialization of centroids may choose isolated initial centroids - n_samples, n_features, density, n_clusters = dims - X_dense = generate_dense_dataset(n_samples, n_features, density, n_clusters) - X_sparse = csr_matrix(X_dense) - - if init == "arraylike": - np.random.seed(2024 + n_samples + n_features + n_clusters) - init = X_dense[np.random.choice(n_samples, size=n_clusters, replace=False)] - - kmeans_dense = KMeans( - n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm - ).fit(X_dense) - kmeans_sparse = KMeans( - n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm - ).fit(X_sparse) - - assert_allclose( - kmeans_dense.cluster_centers_, - kmeans_sparse.cluster_centers_, +if daal_check_version((2024, "P", 700)): + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("init", ["k-means++", "random", "arraylike"]) + @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) + @pytest.mark.parametrize( + "dims", [(1000, 10, 0.95, 3), (50000, 100, 0.75, 10), (10000, 10, 0.8, 5)] ) + def test_dense_vs_sparse(queue, init, algorithm, dims): + from sklearnex.cluster import KMeans + + # For higher level of sparsity (smaller density) the test will fail + # This is because random initialization of centroids may choose isolated ones + n_samples, n_features, density, n_clusters = dims + X_dense = generate_dense_dataset(n_samples, n_features, density, n_clusters) + X_sparse = csr_matrix(X_dense) + + if init == "arraylike": + np.random.seed(2024 + n_samples + n_features + n_clusters) + init = X_dense[np.random.choice(n_samples, size=n_clusters, replace=False)] + + kmeans_dense = KMeans( + n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm + ).fit(X_dense) + kmeans_sparse = KMeans( + n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm + ).fit(X_sparse) + + assert_allclose( + kmeans_dense.cluster_centers_, + kmeans_sparse.cluster_centers_, + ) From 740349312367ae5d8475b721064e4b1b413971ad Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 11:40:09 -0700 Subject: [PATCH 123/130] lint --- sklearnex/cluster/tests/test_kmeans.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index b74ebc736e..30f59fb4b5 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -62,7 +62,9 @@ def test_sklearnex_import_for_dense_data(dataframe, queue, algorithm, init): else: assert "daal4py" in kmeans_dense.__module__ + if daal_check_version((2024, "P", 700)): + @pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) @pytest.mark.parametrize("init", ["k-means++", "random"]) @@ -78,6 +80,7 @@ def test_sklearnex_import_for_sparse_data(queue, algorithm, init): assert "sklearnex" in kmeans_sparse.__module__ + @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) def test_results_on_dense_gold_data(dataframe, queue, algorithm): @@ -107,7 +110,9 @@ def test_results_on_dense_gold_data(dataframe, queue, algorithm): assert expected_inertia == kmeans.inertia_ assert expected_n_iter == kmeans.n_iter_ + if daal_check_version((2024, "P", 700)): + @pytest.mark.parametrize("queue", get_queues()) @pytest.mark.parametrize("init", ["k-means++", "random", "arraylike"]) @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) From 848b9dbd2477447d2c9dd2f7ef12588a76dc8232 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 12:13:47 -0700 Subject: [PATCH 124/130] minor --- onedal/cluster/kmeans.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 3a310c3f70..a43e9d17ff 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -106,6 +106,17 @@ def _check_params_vs_input( self._tol = self._tolerance(X_table, self.tol, is_csr, policy, dtype) self._n_init = self.n_init + if self._n_init == "warn": + warnings.warn( + ( + "The default value of `n_init` will change from " + f"{default_n_init} to 'auto' in 1.4. Set the value of `n_init`" + " explicitly to suppress the warning" + ), + FutureWarning, + stacklevel=2, + ) + self._n_init = default_n_init if self._n_init == "auto": if isinstance(self.init, str) and self.init == "k-means++": self._n_init = 1 From c77f5977620c561588fe15f9e95638ecff042793 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 12:21:52 -0700 Subject: [PATCH 125/130] minor --- onedal/cluster/kmeans.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index a43e9d17ff..c747389055 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -106,17 +106,6 @@ def _check_params_vs_input( self._tol = self._tolerance(X_table, self.tol, is_csr, policy, dtype) self._n_init = self.n_init - if self._n_init == "warn": - warnings.warn( - ( - "The default value of `n_init` will change from " - f"{default_n_init} to 'auto' in 1.4. Set the value of `n_init`" - " explicitly to suppress the warning" - ), - FutureWarning, - stacklevel=2, - ) - self._n_init = default_n_init if self._n_init == "auto": if isinstance(self.init, str) and self.init == "k-means++": self._n_init = 1 @@ -531,7 +520,7 @@ def k_means( n_clusters, *, init="k-means++", - n_init="warn", + n_init="auto", max_iter=300, verbose=False, tol=1e-4, From 5fddeda88b8ad87fce1bcefe784e35192e6e046e Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 17:28:18 -0700 Subject: [PATCH 126/130] ci fix --- onedal/cluster/kmeans.py | 13 +++++++++++++ sklearnex/cluster/tests/test_kmeans.py | 3 +-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index c747389055..d7a9e88f82 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -105,7 +105,20 @@ def _check_params_vs_input( # tol self._tol = self._tolerance(X_table, self.tol, is_csr, policy, dtype) + # n-init + # TODO(1.4): Remove self._n_init = self.n_init + if self._n_init == "warn": + warnings.warn( + ( + "The default value of `n_init` will change from " + f"{default_n_init} to 'auto' in 1.4. Set the value of `n_init`" + " explicitly to suppress the warning" + ), + FutureWarning, + stacklevel=2, + ) + self._n_init = default_n_init if self._n_init == "auto": if isinstance(self.init, str) and self.init == "k-means++": self._n_init = 1 diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 30f59fb4b5..838e98c5ea 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -122,8 +122,7 @@ def test_results_on_dense_gold_data(dataframe, queue, algorithm): def test_dense_vs_sparse(queue, init, algorithm, dims): from sklearnex.cluster import KMeans - # For higher level of sparsity (smaller density) the test will fail - # This is because random initialization of centroids may choose isolated ones + # For higher level of sparsity (smaller density) the test may fail n_samples, n_features, density, n_clusters = dims X_dense = generate_dense_dataset(n_samples, n_features, density, n_clusters) X_sparse = csr_matrix(X_dense) From 365a766ab54b9a9253a9963140f10f123df855a3 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 19:39:08 -0700 Subject: [PATCH 127/130] fix ci --- sklearnex/cluster/k_means.py | 2 +- sklearnex/cluster/tests/test_kmeans.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index c36b73dbfb..49a09454b9 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -64,7 +64,7 @@ def __init__( verbose=0, random_state=None, copy_x=True, - algorithm="lloyd" if sklearn_check_version("1.1") else "auto", + algorithm="lloyd" if sklearn_check_version("1.2") else "auto", ): super().__init__( n_clusters=n_clusters, diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 838e98c5ea..84e596435b 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -98,17 +98,14 @@ def test_results_on_dense_gold_data(dataframe, queue, algorithm): expected_cluster_labels = np.array([0, 1], dtype=np.int32) expected_cluster_centers = np.array([[1.0, 2.0], [10.0, 2.0]], dtype=np.float32) expected_inertia = 16.0 - expected_n_iter = 2 else: expected_cluster_labels = np.array([1, 0], dtype=np.int32) expected_cluster_centers = np.array([[10.0, 2.0], [1.0, 2.0]], dtype=np.float32) expected_inertia = 16.0 - expected_n_iter = 2 assert_allclose(expected_cluster_labels, _as_numpy(kmeans.predict(X_test_df))) assert_allclose(expected_cluster_centers, _as_numpy(kmeans.cluster_centers_)) assert expected_inertia == kmeans.inertia_ - assert expected_n_iter == kmeans.n_iter_ if daal_check_version((2024, "P", 700)): From 6542ec089de25e1035d7412144cc3ddf05f1e8d1 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 20:16:24 -0700 Subject: [PATCH 128/130] fix ci --- sklearnex/cluster/k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 49a09454b9..c36b73dbfb 100644 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -64,7 +64,7 @@ def __init__( verbose=0, random_state=None, copy_x=True, - algorithm="lloyd" if sklearn_check_version("1.2") else "auto", + algorithm="lloyd" if sklearn_check_version("1.1") else "auto", ): super().__init__( n_clusters=n_clusters, From b598059252a1b09eff625d6cb7c20385c1509646 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 21:13:16 -0700 Subject: [PATCH 129/130] fix ci --- sklearnex/cluster/tests/test_kmeans.py | 93 ++++++++++++++------------ 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 84e596435b..f8d1566926 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -20,7 +20,7 @@ from scipy.sparse import csr_matrix from sklearn.datasets import make_blobs -from daal4py.sklearn._utils import daal_check_version +from daal4py.sklearn._utils import daal_check_version, sklearn_check_version from onedal.tests.utils._dataframes_support import ( _as_numpy, _convert_to_dataframe, @@ -63,27 +63,32 @@ def test_sklearnex_import_for_dense_data(dataframe, queue, algorithm, init): assert "daal4py" in kmeans_dense.__module__ -if daal_check_version((2024, "P", 700)): - - @pytest.mark.parametrize("queue", get_queues()) - @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) - @pytest.mark.parametrize("init", ["k-means++", "random"]) - def test_sklearnex_import_for_sparse_data(queue, algorithm, init): - from sklearnex.cluster import KMeans +@pytest.mark.skipif( + not daal_check_version((2024, "P", 700)), + reason="Sparse data requires oneDAL>=2024.7.0", +) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize("init", ["k-means++", "random"]) +def test_sklearnex_import_for_sparse_data(queue, algorithm, init): + from sklearnex.cluster import KMeans - X_dense = generate_dense_dataset(1000, 10, 0.5, 3) - X_sparse = csr_matrix(X_dense) + X_dense = generate_dense_dataset(1000, 10, 0.5, 3) + X_sparse = csr_matrix(X_dense) - kmeans_sparse = KMeans( - n_clusters=3, random_state=0, algorithm=algorithm, init=init - ).fit(X_sparse) + kmeans_sparse = KMeans( + n_clusters=3, random_state=0, algorithm=algorithm, init=init + ).fit(X_sparse) - assert "sklearnex" in kmeans_sparse.__module__ + assert "sklearnex" in kmeans_sparse.__module__ @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) def test_results_on_dense_gold_data(dataframe, queue, algorithm): + if not sklearn_check_version("1.1") and algorithm == "lloyd": + pytest.skip("lloyd requires sklearn>=1.1.") + from sklearnex.cluster import KMeans X_train = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) @@ -108,34 +113,36 @@ def test_results_on_dense_gold_data(dataframe, queue, algorithm): assert expected_inertia == kmeans.inertia_ -if daal_check_version((2024, "P", 700)): +@pytest.mark.skipif( + not daal_check_version((2024, "P", 700)), + reason="Sparse data requires oneDAL>=2024.7.0", +) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("init", ["k-means++", "random", "arraylike"]) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize( + "dims", [(1000, 10, 0.95, 3), (50000, 100, 0.75, 10), (10000, 10, 0.8, 5)] +) +def test_dense_vs_sparse(queue, init, algorithm, dims): + from sklearnex.cluster import KMeans - @pytest.mark.parametrize("queue", get_queues()) - @pytest.mark.parametrize("init", ["k-means++", "random", "arraylike"]) - @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) - @pytest.mark.parametrize( - "dims", [(1000, 10, 0.95, 3), (50000, 100, 0.75, 10), (10000, 10, 0.8, 5)] + # For higher level of sparsity (smaller density) the test may fail + n_samples, n_features, density, n_clusters = dims + X_dense = generate_dense_dataset(n_samples, n_features, density, n_clusters) + X_sparse = csr_matrix(X_dense) + + if init == "arraylike": + np.random.seed(2024 + n_samples + n_features + n_clusters) + init = X_dense[np.random.choice(n_samples, size=n_clusters, replace=False)] + + kmeans_dense = KMeans( + n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm + ).fit(X_dense) + kmeans_sparse = KMeans( + n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm + ).fit(X_sparse) + + assert_allclose( + kmeans_dense.cluster_centers_, + kmeans_sparse.cluster_centers_, ) - def test_dense_vs_sparse(queue, init, algorithm, dims): - from sklearnex.cluster import KMeans - - # For higher level of sparsity (smaller density) the test may fail - n_samples, n_features, density, n_clusters = dims - X_dense = generate_dense_dataset(n_samples, n_features, density, n_clusters) - X_sparse = csr_matrix(X_dense) - - if init == "arraylike": - np.random.seed(2024 + n_samples + n_features + n_clusters) - init = X_dense[np.random.choice(n_samples, size=n_clusters, replace=False)] - - kmeans_dense = KMeans( - n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm - ).fit(X_dense) - kmeans_sparse = KMeans( - n_clusters=n_clusters, random_state=0, init=init, algorithm=algorithm - ).fit(X_sparse) - - assert_allclose( - kmeans_dense.cluster_centers_, - kmeans_sparse.cluster_centers_, - ) From 25850401a930905f2d384a55339ba622b3dbcc20 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 5 Sep 2024 21:50:23 -0700 Subject: [PATCH 130/130] fix ci --- sklearnex/cluster/tests/test_kmeans.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index f8d1566926..e12211eb70 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -48,6 +48,8 @@ def generate_dense_dataset(n_samples, n_features, density, n_clusters): @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) @pytest.mark.parametrize("init", ["k-means++", "random"]) def test_sklearnex_import_for_dense_data(dataframe, queue, algorithm, init): + if not sklearn_check_version("1.1") and algorithm == "lloyd": + pytest.skip("lloyd requires sklearn>=1.1.") from sklearnex.cluster import KMeans X_dense = generate_dense_dataset(1000, 10, 0.5, 3)