diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i index 158a906d572..a89d3f0bfea 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i @@ -287,20 +287,29 @@ public: return _csr->releaseSparseBlock(block); } + //calculate distance from current trial center to the rows in the block and update min distance algorithmFPType updateMinDistForITrials(algorithmFPType * const pDistSq, size_t iTrials, size_t nRowsToProcess, const algorithmFPType * const pData, const size_t * const colIdx, const size_t * const rowIdx, const algorithmFPType * const pLastAddedCenter, const algorithmFPType * const aWeights, const algorithmFPType * const pDistSqBest) { - algorithmFPType sumOfDist2 = algorithmFPType(0); - size_t csrCursor = 0u; + algorithmFPType sumOfDist2 = algorithmFPType(0); + size_t csrCursor = 0u; + algorithmFPType pLastAddedCenterSumSq = algorithmFPType(0.); + for (size_t iCol = 0u; iCol < dim; iCol++) + { + pLastAddedCenterSumSq += pLastAddedCenter[iCol] * pLastAddedCenter[iCol]; + } + for (size_t iRow = 0u; iRow < nRowsToProcess; iRow++) { - algorithmFPType dist2 = algorithmFPType(0); + algorithmFPType dist2 = pLastAddedCenterSumSq; const size_t nValues = rowIdx[iRow + 1] - rowIdx[iRow]; + + //distance from the lastAddedCenter to the current row, dist2 = x^2 + y^2 - 2xy for (size_t i = 0u; i < nValues; i++, csrCursor++) { - dist2 += (pData[csrCursor] - pLastAddedCenter[colIdx[csrCursor] - 1]) * (pData[csrCursor] - pLastAddedCenter[colIdx[csrCursor] - 1]); + dist2 += pData[csrCursor] * pData[csrCursor] - 2 * pData[csrCursor] * pLastAddedCenter[colIdx[csrCursor] - 1]; } if (aWeights) { diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp index b4347518dc1..3a58e13a566 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel_dense.cpp @@ -14,6 +14,8 @@ * limitations under the License. *******************************************************************************/ +#include + #include #include "oneapi/dal/algo/kmeans_init/backend/cpu/compute_kernel.hpp" @@ -43,9 +45,18 @@ static compute_result call_daal_kernel(const context_cpu& ctx, const std::int64_t column_count = data.get_column_count(); const std::int64_t cluster_count = desc.get_cluster_count(); + //number of trials to pick each centroid from, 2 + int(ln(cluster_count)) works better than vanilla kmeans++ + //https://github.com/scikit-learn/scikit-learn/blob/a63b021310ba13ea39ad3555f550d8aeec3002c5/sklearn/cluster/_kmeans.py#L108 + std::int64_t trial_count = desc.get_local_trials_count(); + if (trial_count == -1) { + const auto additional = std::log(cluster_count); + trial_count = 2 + std::int64_t(additional); + } + daal_kmeans_init::Parameter par(dal::detail::integral_cast(cluster_count), 0, dal::detail::integral_cast(desc.get_seed())); + par.nTrials = trial_count; const auto daal_data = interop::convert_to_daal_table(data); const std::size_t len_input = 1;