diff --git a/cpp/oneapi/dal/algo/basic_statistics/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp b/cpp/oneapi/dal/algo/basic_statistics/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp index aa64aacf5bb..429b666279e 100644 --- a/cpp/oneapi/dal/algo/basic_statistics/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/basic_statistics/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp @@ -153,16 +153,21 @@ result_t finalize_compute_kernel_dense_impl::operator()(const descriptor_ const auto nobs_nd = pr::table2ndarray_1d(q, input.get_partial_n_rows()); auto rows_count_global = nobs_nd.get_data()[0]; + auto is_distributed = (comm_.get_rank_count() > 1); { ONEDAL_PROFILER_TASK(allreduce_rows_count_global); - comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait(); + if (is_distributed) { + comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait(); + } } if (res_op.test(result_options::min)) { ONEDAL_ASSERT(input.get_partial_min().get_column_count() == column_count); const auto min = pr::table2ndarray_1d(q, input.get_partial_min(), sycl::usm::alloc::device); - { comm_.allreduce(min.flatten(q, {}), spmd::reduce_op::min).wait(); } + if (is_distributed) { + comm_.allreduce(min.flatten(q, {}), spmd::reduce_op::min).wait(); + } res.set_min(homogen_table::wrap(min.flatten(q, {}), 1, column_count)); } @@ -178,46 +183,46 @@ result_t finalize_compute_kernel_dense_impl::operator()(const descriptor_ if (res_op_partial.test(result_options::sum)) { auto sums_nd = pr::table2ndarray_1d(q, input.get_partial_sum(), sycl::usm::alloc::device); - if (comm_.get_rank_count() > 1) { + auto sums2_nd = pr::table2ndarray_1d(q, + input.get_partial_sum_squares(), + sycl::usm::alloc::device); + + auto sums2cent_nd = pr::table2ndarray_1d(q, + input.get_partial_sum_squares_centered(), + sycl::usm::alloc::device); + if (is_distributed) { auto sums_nd_copy = pr::ndarray::empty(q, { column_count }, sycl::usm::alloc::device); auto copy_event = copy(q, sums_nd_copy, sums_nd, {}); copy_event.wait_and_throw(); sums_nd = sums_nd_copy; - } - { - ONEDAL_PROFILER_TASK(allreduce_sums, q); - comm_.allreduce(sums_nd.flatten(q, {}), spmd::reduce_op::sum).wait(); - } - auto sums2_nd = pr::table2ndarray_1d(q, - input.get_partial_sum_squares(), - sycl::usm::alloc::device); - if (comm_.get_rank_count() > 1) { + { + ONEDAL_PROFILER_TASK(allreduce_sums, q); + comm_.allreduce(sums_nd.flatten(q, {}), spmd::reduce_op::sum).wait(); + } + auto sums2_nd_copy = pr::ndarray::empty(q, { column_count }, sycl::usm::alloc::device); - auto copy_event = copy(q, sums2_nd_copy, sums2_nd, {}); + copy_event = copy(q, sums2_nd_copy, sums2_nd, {}); copy_event.wait_and_throw(); sums2_nd = sums2_nd_copy; - } - { - ONEDAL_PROFILER_TASK(allreduce_sums, q); - comm_.allreduce(sums2_nd.flatten(q, {}), spmd::reduce_op::sum).wait(); - } - auto sums2cent_nd = pr::table2ndarray_1d(q, - input.get_partial_sum_squares_centered(), - sycl::usm::alloc::device); - if (comm_.get_rank_count() > 1) { + + { + ONEDAL_PROFILER_TASK(allreduce_sums, q); + comm_.allreduce(sums2_nd.flatten(q, {}), spmd::reduce_op::sum).wait(); + } auto sums2cent_nd_copy = pr::ndarray::empty(q, { column_count }, sycl::usm::alloc::device); - auto copy_event = copy(q, sums2cent_nd_copy, sums2cent_nd, {}); + copy_event = copy(q, sums2cent_nd_copy, sums2cent_nd, {}); copy_event.wait_and_throw(); sums2cent_nd = sums2cent_nd_copy; + { + ONEDAL_PROFILER_TASK(allreduce_sums, q); + comm_.allreduce(sums2cent_nd.flatten(q, {}), spmd::reduce_op::sum).wait(); + } } - { - ONEDAL_PROFILER_TASK(allreduce_sums, q); - comm_.allreduce(sums2cent_nd.flatten(q, {}), spmd::reduce_op::sum).wait(); - } + auto [result_means, result_variance, result_raw_moment, diff --git a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp index 37e264c4b71..3df7f3c2d8d 100644 --- a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp @@ -66,41 +66,38 @@ result_t finalize_compute_kernel_dense_impl::operator()(const descriptor_ const auto nobs_host = pr::table2ndarray(q, input.get_partial_n_rows()); auto rows_count_global = nobs_host.get_data()[0]; - { - ONEDAL_PROFILER_TASK(allreduce_rows_count_global); - comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait(); - } - - ONEDAL_ASSERT(rows_count_global > 0); - auto sums = pr::table2ndarray_1d(q, input.get_partial_sum(), sycl::usm::alloc::device); + auto xtx = + pr::table2ndarray(q, input.get_partial_crossproduct(), sycl::usm::alloc::device); + if (comm_.get_rank_count() > 1) { + { + ONEDAL_PROFILER_TASK(allreduce_rows_count_global); + comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait(); + } auto sums_copy = pr::ndarray::empty(q, { column_count }, sycl::usm::alloc::device); auto copy_event = copy(q, sums_copy, sums, {}); copy_event.wait_and_throw(); sums = sums_copy; - } - { - ONEDAL_PROFILER_TASK(allreduce_sums, q); - comm_.allreduce(sums.flatten(q, {}), spmd::reduce_op::sum).wait(); - } + { + ONEDAL_PROFILER_TASK(allreduce_sums, q); + comm_.allreduce(sums.flatten(q, {}), spmd::reduce_op::sum).wait(); + } - auto xtx = - pr::table2ndarray(q, input.get_partial_crossproduct(), sycl::usm::alloc::device); - if (comm_.get_rank_count() > 1) { auto xtx_copy = pr::ndarray::empty(q, { column_count, column_count }, sycl::usm::alloc::device); - auto copy_event = copy(q, xtx_copy, xtx, {}); + copy_event = copy(q, xtx_copy, xtx, {}); copy_event.wait_and_throw(); xtx = xtx_copy; + { + ONEDAL_PROFILER_TASK(allreduce_xtx, q); + comm_.allreduce(xtx.flatten(q, {}), spmd::reduce_op::sum).wait(); + } } - { - ONEDAL_PROFILER_TASK(allreduce_xtx, q); - comm_.allreduce(xtx.flatten(q, {}), spmd::reduce_op::sum).wait(); - } + ONEDAL_ASSERT(rows_count_global > 0); if (desc.get_result_options().test(result_options::cov_matrix)) { auto [cov, cov_event] = diff --git a/cpp/oneapi/dal/algo/linear_regression/backend/gpu/finalize_train_kernel_norm_eq_impl_dpc.cpp b/cpp/oneapi/dal/algo/linear_regression/backend/gpu/finalize_train_kernel_norm_eq_impl_dpc.cpp index 0d6fe6af069..f6c6dd54091 100644 --- a/cpp/oneapi/dal/algo/linear_regression/backend/gpu/finalize_train_kernel_norm_eq_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/backend/gpu/finalize_train_kernel_norm_eq_impl_dpc.cpp @@ -52,33 +52,29 @@ train_result finalize_train_kernel_norm_eq_impl::operator()( const pr::ndshape<2> betas_shape{ response_count, feature_count + 1 }; auto xtx_nd = pr::table2ndarray(q, input.get_partial_xtx(), sycl::usm::alloc::device); - if (comm_.get_rank_count() > 1) { - auto xtx_nd_copy = pr::ndarray::empty(q, xtx_shape, sycl::usm::alloc::device); - auto copy_event = copy(q, xtx_nd_copy, xtx_nd, {}); - copy_event.wait_and_throw(); - xtx_nd = xtx_nd_copy; - } auto xty_nd = pr::table2ndarray(q, input.get_partial_xty(), sycl::usm::alloc::device); - if (comm_.get_rank_count() > 1) { - auto xty_nd_copy = - pr::ndarray::empty(q, betas_shape, sycl::usm::alloc::device); - auto copy_event = copy(q, xty_nd_copy, xty_nd, {}); - copy_event.wait_and_throw(); - xty_nd = xty_nd_copy; - } const auto betas_size = check_mul_overflow(response_count, feature_count + 1); auto betas_arr = array::zeros(q, betas_size, alloc); if (comm_.get_rank_count() > 1) { + auto xtx_nd_copy = pr::ndarray::empty(q, xtx_shape, sycl::usm::alloc::device); + auto copy_event = copy(q, xtx_nd_copy, xtx_nd, {}); + copy_event.wait_and_throw(); + xtx_nd = xtx_nd_copy; { ONEDAL_PROFILER_TASK(xtx_allreduce); auto xtx_arr = dal::array::wrap(q, xtx_nd.get_mutable_data(), xtx_nd.get_count()); comm_.allreduce(xtx_arr).wait(); } + auto xty_nd_copy = + pr::ndarray::empty(q, betas_shape, sycl::usm::alloc::device); + copy_event = copy(q, xty_nd_copy, xty_nd, {}); + copy_event.wait_and_throw(); + xty_nd = xty_nd_copy; { ONEDAL_PROFILER_TASK(xty_allreduce); auto xty_arr = diff --git a/cpp/oneapi/dal/algo/pca/backend/gpu/finalize_train_kernel_cov_impl_dpc.cpp b/cpp/oneapi/dal/algo/pca/backend/gpu/finalize_train_kernel_cov_impl_dpc.cpp index 5ff9a1b00d6..12862ab04ba 100644 --- a/cpp/oneapi/dal/algo/pca/backend/gpu/finalize_train_kernel_cov_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/pca/backend/gpu/finalize_train_kernel_cov_impl_dpc.cpp @@ -59,43 +59,41 @@ result_t finalize_train_kernel_cov_impl::operator()(const descriptor_t& d const auto nobs_host = pr::table2ndarray(q, input.get_partial_n_rows()); auto rows_count_global = nobs_host.get_data()[0]; - { - ONEDAL_PROFILER_TASK(allreduce_rows_count_global); - comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait(); - } - auto sums = pr::table2ndarray_1d(q, input.get_partial_sum(), sycl::usm::alloc::device); + auto xtx = + pr::table2ndarray(q, input.get_partial_crossproduct(), sycl::usm::alloc::device); if (comm_.get_rank_count() > 1) { + { + ONEDAL_PROFILER_TASK(allreduce_rows_count_global); + comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait(); + } auto sums_copy = pr::ndarray::empty(q, { column_count }, sycl::usm::alloc::device); auto copy_event = copy(q, sums_copy, sums, {}); copy_event.wait_and_throw(); sums = sums_copy; - } - { - ONEDAL_PROFILER_TASK(allreduce_sums, q); - comm_.allreduce(sums.flatten(q, {}), spmd::reduce_op::sum).wait(); - } - - if (desc.get_result_options().test(result_options::means)) { - auto [means, means_event] = compute_means(q, sums, rows_count_global, {}); - result.set_means(homogen_table::wrap(means.flatten(q, { means_event }), 1, column_count)); - } - - auto xtx = - pr::table2ndarray(q, input.get_partial_crossproduct(), sycl::usm::alloc::device); - if (comm_.get_rank_count() > 1) { auto xtx_copy = pr::ndarray::empty(q, { column_count, column_count }, sycl::usm::alloc::device); - auto copy_event = copy(q, xtx_copy, xtx, {}); + copy_event = copy(q, xtx_copy, xtx, {}); copy_event.wait_and_throw(); xtx = xtx_copy; + + { + ONEDAL_PROFILER_TASK(allreduce_sums, q); + comm_.allreduce(sums.flatten(q, {}), spmd::reduce_op::sum).wait(); + } + + { + ONEDAL_PROFILER_TASK(allreduce_xtx, q); + comm_.allreduce(xtx.flatten(q, {}), spmd::reduce_op::sum).wait(); + } } - { - ONEDAL_PROFILER_TASK(allreduce_xtx, q); - comm_.allreduce(xtx.flatten(q, {}), spmd::reduce_op::sum).wait(); + + if (desc.get_result_options().test(result_options::means)) { + auto [means, means_event] = compute_means(q, sums, rows_count_global, {}); + result.set_means(homogen_table::wrap(means.flatten(q, { means_event }), 1, column_count)); } auto [cov, cov_event] = compute_covariance(q, rows_count_global, xtx, sums, {});