From 2f52a21c1c70af9ac9696a4b9a0525711d137ee7 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 21 May 2024 06:03:15 -0700 Subject: [PATCH 1/7] bug fix in kmeans sparse init --- .../src/algorithms/kmeans/kmeans_plusplus_init_impl.i | 10 ++++++++-- .../kmeans_init/backend/cpu/compute_kernel_dense.cpp | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i index 158a906d572..3c9c3fe2ac0 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i @@ -294,13 +294,19 @@ public: { algorithmFPType sumOfDist2 = algorithmFPType(0); size_t csrCursor = 0u; + algorithmFPType pLastAddedCenterSumSq = algorithmFPType(0.); + for (size_t iCol = 0u; iCol < dim; iCol++){ + pLastAddedCenterSumSq += pLastAddedCenter[iCol] * pLastAddedCenter[iCol]; + } + for (size_t iRow = 0u; iRow < nRowsToProcess; iRow++) { - algorithmFPType dist2 = algorithmFPType(0); + algorithmFPType dist2 = pLastAddedCenterSumSq; const size_t nValues = rowIdx[iRow + 1] - rowIdx[iRow]; for (size_t i = 0u; i < nValues; i++, csrCursor++) { - dist2 += (pData[csrCursor] - pLastAddedCenter[colIdx[csrCursor] - 1]) * (pData[csrCursor] - pLastAddedCenter[colIdx[csrCursor] - 1]); + dist2 += pData[csrCursor] * pData[csrCursor]; + dist2 -= 2 * pData[csrCursor] * pLastAddedCenter[colIdx[csrCursor] - 1]; } if (aWeights) { diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp index b4347518dc1..d81a46a45e9 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp @@ -42,10 +42,12 @@ static compute_result call_daal_kernel(const context_cpu& ctx, const table& data) { const std::int64_t column_count = data.get_column_count(); const std::int64_t cluster_count = desc.get_cluster_count(); + const std::int64_t trial_count = desc.get_local_trials_count(); daal_kmeans_init::Parameter par(dal::detail::integral_cast(cluster_count), 0, dal::detail::integral_cast(desc.get_seed())); + par.nTrials = dal::detail::integral_cast(trial_count); const auto daal_data = interop::convert_to_daal_table(data); const std::size_t len_input = 1; From eee6863bcc0733607c345f3fb6aae00e031a5428 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 21 May 2024 09:58:33 -0700 Subject: [PATCH 2/7] format and error fix --- .../src/algorithms/kmeans/kmeans_plusplus_init_impl.i | 9 +++++---- cpp/oneapi/dal/algo/kmeans_init/common.cpp | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i index 3c9c3fe2ac0..89ba8f786df 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i @@ -292,10 +292,11 @@ public: const algorithmFPType * const pLastAddedCenter, const algorithmFPType * const aWeights, const algorithmFPType * const pDistSqBest) { - algorithmFPType sumOfDist2 = algorithmFPType(0); - size_t csrCursor = 0u; - algorithmFPType pLastAddedCenterSumSq = algorithmFPType(0.); - for (size_t iCol = 0u; iCol < dim; iCol++){ + algorithmFPType sumOfDist2 = algorithmFPType(0); + size_t csrCursor = 0u; + algorithmFPType pLastAddedCenterSumSq = algorithmFPType(0.); + for (size_t iCol = 0u; iCol < dim; iCol++) + { pLastAddedCenterSumSq += pLastAddedCenter[iCol] * pLastAddedCenter[iCol]; } diff --git a/cpp/oneapi/dal/algo/kmeans_init/common.cpp b/cpp/oneapi/dal/algo/kmeans_init/common.cpp index c84d4b6965f..9797ac47754 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/common.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/common.cpp @@ -23,7 +23,7 @@ namespace v1 { template class descriptor_impl : public base { public: - std::int64_t local_trials_count = -1; + std::int64_t local_trials_count = 2; std::int64_t cluster_count = 2; std::int64_t seed = 777; }; From 328407d5946e78bec899e1ce41b8288b23b4e2fa Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Tue, 21 May 2024 11:29:09 -0700 Subject: [PATCH 3/7] align trial_count computation --- .../kmeans_init/backend/cpu/compute_kernel_dense.cpp | 10 ++++++++-- cpp/oneapi/dal/algo/kmeans_init/common.cpp | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp index d81a46a45e9..deb2428b102 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp @@ -14,6 +14,8 @@ * limitations under the License. *******************************************************************************/ +#include + #include #include "oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel.hpp" @@ -42,12 +44,16 @@ static compute_result call_daal_kernel(const context_cpu& ctx, const table& data) { const std::int64_t column_count = data.get_column_count(); const std::int64_t cluster_count = desc.get_cluster_count(); - const std::int64_t trial_count = desc.get_local_trials_count(); + + const std::int64_t init_trial_count = desc.get_local_trials_count(); + const auto additional = std::log(cluster_count); + const auto proposed = 2 + std::int64_t(additional); + const std::int64_t trial_count = (init_trial_count == -1) ? proposed : init_trial_count; daal_kmeans_init::Parameter par(dal::detail::integral_cast(cluster_count), 0, dal::detail::integral_cast(desc.get_seed())); - par.nTrials = dal::detail::integral_cast(trial_count); + par.nTrials = trial_count; const auto daal_data = interop::convert_to_daal_table(data); const std::size_t len_input = 1; diff --git a/cpp/oneapi/dal/algo/kmeans_init/common.cpp b/cpp/oneapi/dal/algo/kmeans_init/common.cpp index 9797ac47754..c84d4b6965f 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/common.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/common.cpp @@ -23,7 +23,7 @@ namespace v1 { template class descriptor_impl : public base { public: - std::int64_t local_trials_count = 2; + std::int64_t local_trials_count = -1; std::int64_t cluster_count = 2; std::int64_t seed = 777; }; From 137dee45c6d445575628a1d2df371c601ea0c8d5 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Thu, 23 May 2024 11:37:56 -0700 Subject: [PATCH 4/7] address review --- .../src/algorithms/kmeans/kmeans_plusplus_init_impl.i | 3 +-- .../kmeans_init/backend/cpu/compute_kernel_dense.cpp | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i index 89ba8f786df..c1e1b3ccfaf 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i @@ -306,8 +306,7 @@ public: const size_t nValues = rowIdx[iRow + 1] - rowIdx[iRow]; for (size_t i = 0u; i < nValues; i++, csrCursor++) { - dist2 += pData[csrCursor] * pData[csrCursor]; - dist2 -= 2 * pData[csrCursor] * pLastAddedCenter[colIdx[csrCursor] - 1]; + dist2 += pData[csrCursor] * pData[csrCursor] - 2 * pData[csrCursor] * pLastAddedCenter[colIdx[csrCursor] - 1]; } if (aWeights) { diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp index deb2428b102..05dc0b71fa4 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp @@ -45,10 +45,12 @@ static compute_result call_daal_kernel(const context_cpu& ctx, const std::int64_t column_count = data.get_column_count(); const std::int64_t cluster_count = desc.get_cluster_count(); - const std::int64_t init_trial_count = desc.get_local_trials_count(); - const auto additional = std::log(cluster_count); - const auto proposed = 2 + std::int64_t(additional); - const std::int64_t trial_count = (init_trial_count == -1) ? proposed : init_trial_count; + std::int64_t trial_count = desc.get_local_trials_count(); + if (trial_count == -1) + { + const auto additional = std::log(cluster_count); + trial_count = 2 + std::int64_t(additional); + } daal_kmeans_init::Parameter par(dal::detail::integral_cast(cluster_count), 0, From bc5138b513305179a5cd770db87582561608fc7f Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 24 May 2024 06:59:53 -0700 Subject: [PATCH 5/7] add comment and refactor --- cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i | 4 ++++ .../dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i index c1e1b3ccfaf..7e3167e5ce1 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i @@ -300,10 +300,14 @@ public: pLastAddedCenterSumSq += pLastAddedCenter[iCol] * pLastAddedCenter[iCol]; } + //calculate (weighted if weight is provided) distance from last added center to the rows in the block for (size_t iRow = 0u; iRow < nRowsToProcess; iRow++) { algorithmFPType dist2 = pLastAddedCenterSumSq; const size_t nValues = rowIdx[iRow + 1] - rowIdx[iRow]; + + //distance from the lastAddedCenter to the current row + //using the formula: dist2 = x^2 + y^2 - 2xy for (size_t i = 0u; i < nValues; i++, csrCursor++) { dist2 += pData[csrCursor] * pData[csrCursor] - 2 * pData[csrCursor] * pLastAddedCenter[colIdx[csrCursor] - 1]; diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp index 05dc0b71fa4..638f225b584 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp @@ -46,8 +46,7 @@ static compute_result call_daal_kernel(const context_cpu& ctx, const std::int64_t cluster_count = desc.get_cluster_count(); std::int64_t trial_count = desc.get_local_trials_count(); - if (trial_count == -1) - { + if (trial_count == -1) { const auto additional = std::log(cluster_count); trial_count = 2 + std::int64_t(additional); } From a5891d0773e0a3431eab4fd4c7779616f461fd5b Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 24 May 2024 09:51:40 -0700 Subject: [PATCH 6/7] update comments --- cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i | 5 ++--- .../algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i index 7e3167e5ce1..a89d3f0bfea 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i @@ -287,6 +287,7 @@ public: return _csr->releaseSparseBlock(block); } + //calculate distance from current trial center to the rows in the block and update min distance algorithmFPType updateMinDistForITrials(algorithmFPType * const pDistSq, size_t iTrials, size_t nRowsToProcess, const algorithmFPType * const pData, const size_t * const colIdx, const size_t * const rowIdx, const algorithmFPType * const pLastAddedCenter, const algorithmFPType * const aWeights, @@ -300,14 +301,12 @@ public: pLastAddedCenterSumSq += pLastAddedCenter[iCol] * pLastAddedCenter[iCol]; } - //calculate (weighted if weight is provided) distance from last added center to the rows in the block for (size_t iRow = 0u; iRow < nRowsToProcess; iRow++) { algorithmFPType dist2 = pLastAddedCenterSumSq; const size_t nValues = rowIdx[iRow + 1] - rowIdx[iRow]; - //distance from the lastAddedCenter to the current row - //using the formula: dist2 = x^2 + y^2 - 2xy + //distance from the lastAddedCenter to the current row, dist2 = x^2 + y^2 - 2xy for (size_t i = 0u; i < nValues; i++, csrCursor++) { dist2 += pData[csrCursor] * pData[csrCursor] - 2 * pData[csrCursor] * pLastAddedCenter[colIdx[csrCursor] - 1]; diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp index 638f225b584..bde263e984a 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp @@ -45,6 +45,7 @@ static compute_result call_daal_kernel(const context_cpu& ctx, const std::int64_t column_count = data.get_column_count(); const std::int64_t cluster_count = desc.get_cluster_count(); + //number of trials to pick each centroid from, 2 + int(ln(cluster_count)) if not set by user std::int64_t trial_count = desc.get_local_trials_count(); if (trial_count == -1) { const auto additional = std::log(cluster_count); From d7c233308cfc4aa5622fc708a7c58fbc02c0fea9 Mon Sep 17 00:00:00 2001 From: Md Shafiul Alam Date: Fri, 24 May 2024 10:03:53 -0700 Subject: [PATCH 7/7] update comments --- .../dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp index bde263e984a..3a58e13a566 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp @@ -45,7 +45,8 @@ static compute_result call_daal_kernel(const context_cpu& ctx, const std::int64_t column_count = data.get_column_count(); const std::int64_t cluster_count = desc.get_cluster_count(); - //number of trials to pick each centroid from, 2 + int(ln(cluster_count)) if not set by user + //number of trials to pick each centroid from, 2 + int(ln(cluster_count)) works better than vanilla kmeans++ + //https://github.com/scikit-learn/scikit-learn/blob/a63b021310ba13ea39ad3555f550d8aeec3002c5/sklearn/cluster/_kmeans.py#L108 std::int64_t trial_count = desc.get_local_trials_count(); if (trial_count == -1) { const auto additional = std::log(cluster_count);