From b3c4d41ce862a03ea0ee6a102437d12a2d22cedf Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Tue, 12 Nov 2024 19:29:04 +0100
Subject: [PATCH 01/17] add spectral decomposition fallback on GPU

---
 .../dal/algo/linear_regression/test/batch.cpp |   3 +-
 .../algo/linear_regression/test/fixture.hpp   |  78 +++----
 .../backend/primitives/lapack/solve_dpc.cpp   | 191 +++++++++++++++++-
 3 files changed, 229 insertions(+), 43 deletions(-)

diff --git a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp
index a5585202c31..33f690d2110 100644
--- a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp
+++ b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp
@@ -51,9 +51,10 @@ TEMPLATE_LIST_TEST_M(lr_batch_test, "LR common flow", "[lr][batch]", lr_types) {
 }
 
 TEMPLATE_LIST_TEST_M(lr_batch_test, "LR with non-PSD matrix", "[lr][batch-nonpsd]", lr_types) {
-    SKIP_IF(this->non_psd_system_not_supported_on_device());
+    SKIP_IF(this->not_float64_friendly());
 
     this->generate(777);
+
     this->run_and_check_linear_indefinite();
     this->run_and_check_linear_indefinite_multioutput();
 }
diff --git a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp
index e1e54092552..bfba2abf9e5 100644
--- a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp
+++ b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp
@@ -65,10 +65,6 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         return static_cast<Derived*>(this);
     }
 
-    bool non_psd_system_not_supported_on_device() {
-        return this->get_policy().is_gpu();
-    }
-
     table compute_responses(const table& beta, const table& bias, const table& data) const {
         const auto s_count = data.get_row_count();
 
@@ -299,18 +295,20 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
     here is not positive-definite, thus it has an infinite number of possible solutions. The
     solution here is the one with minimum norm, which is typically more desirable. */
     void run_and_check_linear_indefinite(double tol = 1e-3) {
-        const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442,  0.9202309,
-                             0.57710379,  -0.63646365, 0.54195222, -0.31659545, -0.32238912,
-                             0.09716732,  -1.52593041, 1.1921661,  -0.67108968, 1.00026942 };
-        const double y[] = { 0.13632112, 1.53203308, -0.65996941 };
+        const auto dtype =
+            std::is_same<float_t, double>::value ? data_type::float64 : data_type::float32;
+        const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442,  0.9202309,
+                              0.57710379,  -0.63646365, 0.54195222, -0.31659545, -0.32238912,
+                              0.09716732,  -1.52593041, 1.1921661,  -0.67108968, 1.00026942 };
+        const float_t y[] = { 0.13632112, 1.53203308, -0.65996941 };
         auto X_tbl = oneapi::dal::detail::homogen_table_builder()
-                         .set_data_type(data_type::float64)
+                         .set_data_type(dtype)
                          .set_layout(data_layout::row_major)
                          .allocate(3, 5)
                          .copy_data(X, 3, 5)
                          .build();
         auto y_tbl = oneapi::dal::detail::homogen_table_builder()
-                         .set_data_type(data_type::float64)
+                         .set_data_type(dtype)
                          .set_layout(data_layout::row_major)
                          .allocate(3, 1)
                          .copy_data(y, 3, 1)
@@ -321,20 +319,20 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         const auto coefs = train_res.get_coefficients();
 
         if (desc.get_result_options().test(result_options::intercept)) {
-            const double expected_beta[] = { 0.27785494,
-                                             0.53011669,
-                                             0.34352259,
-                                             0.40506216,
-                                             -1.26026447 };
-            const double expected_intercept[] = { 1.24485441 };
+            const float_t expected_beta[] = { 0.27785494,
+                                              0.53011669,
+                                              0.34352259,
+                                              0.40506216,
+                                              -1.26026447 };
+            const float_t expected_intercept[] = { 1.24485441 };
             const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder()
-                                               .set_data_type(data_type::float64)
+                                               .set_data_type(dtype)
                                                .set_layout(data_layout::row_major)
                                                .allocate(1, 5)
                                                .copy_data(expected_beta, 1, 5)
                                                .build();
             const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder()
-                                                    .set_data_type(data_type::float64)
+                                                    .set_data_type(dtype)
                                                     .set_layout(data_layout::row_major)
                                                     .allocate(1, 1)
                                                     .copy_data(expected_intercept, 1, 1)
@@ -351,13 +349,13 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         }
 
         else {
-            const double expected_beta[] = { 0.38217445,
-                                             0.2732197,
-                                             1.87135517,
-                                             0.63458468,
-                                             -2.08473134 };
+            const float_t expected_beta[] = { 0.38217445,
+                                              0.2732197,
+                                              1.87135517,
+                                              0.63458468,
+                                              -2.08473134 };
             const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder()
-                                               .set_data_type(data_type::float64)
+                                               .set_data_type(dtype)
                                                .set_layout(data_layout::row_major)
                                                .allocate(1, 5)
                                                .copy_data(expected_beta, 1, 5)
@@ -369,19 +367,21 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
     }
 
     void run_and_check_linear_indefinite_multioutput(double tol = 1e-3) {
-        const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442,  0.9202309,
-                             0.57710379,  -0.63646365, 0.54195222, -0.31659545, -0.32238912,
-                             0.09716732,  -1.52593041, 1.1921661,  -0.67108968, 1.00026942 };
-        const double y[] = { 0.13632112,  1.53203308, -0.65996941,
-                             -0.31179486, 0.33776913, -2.2074711 };
+        const auto dtype =
+            std::is_same<float_t, double>::value ? data_type::float64 : data_type::float32;
+        const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442,  0.9202309,
+                              0.57710379,  -0.63646365, 0.54195222, -0.31659545, -0.32238912,
+                              0.09716732,  -1.52593041, 1.1921661,  -0.67108968, 1.00026942 };
+        const float_t y[] = { 0.13632112,  1.53203308, -0.65996941,
+                              -0.31179486, 0.33776913, -2.2074711 };
         auto X_tbl = oneapi::dal::detail::homogen_table_builder()
-                         .set_data_type(data_type::float64)
+                         .set_data_type(dtype)
                          .set_layout(data_layout::row_major)
                          .allocate(3, 5)
                          .copy_data(X, 3, 5)
                          .build();
         auto y_tbl = oneapi::dal::detail::homogen_table_builder()
-                         .set_data_type(data_type::float64)
+                         .set_data_type(dtype)
                          .set_layout(data_layout::row_major)
                          .allocate(3, 2)
                          .copy_data(y, 3, 2)
@@ -392,19 +392,19 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         const auto coefs = train_res.get_coefficients();
 
         if (desc.get_result_options().test(result_options::intercept)) {
-            const double expected_beta[] = {
+            const float_t expected_beta[] = {
                 -0.18692112, -0.20034801, -0.09590892, -0.13672683, 0.56229012,
                 -0.97006008, 1.39413595,  0.49238012,  1.11041239,  -0.79213452,
             };
-            const double expected_intercept[] = { -0.48964358, 0.96467681 };
+            const float_t expected_intercept[] = { -0.48964358, 0.96467681 };
             const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder()
-                                               .set_data_type(data_type::float64)
+                                               .set_data_type(dtype)
                                                .set_layout(data_layout::row_major)
                                                .allocate(2, 5)
                                                .copy_data(expected_beta, 2, 5)
                                                .build();
             const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder()
-                                                    .set_data_type(data_type::float64)
+                                                    .set_data_type(dtype)
                                                     .set_layout(data_layout::row_major)
                                                     .allocate(1, 2)
                                                     .copy_data(expected_intercept, 1, 2)
@@ -421,11 +421,11 @@ class lr_test : public te::crtp_algo_fixture<TestType, Derived> {
         }
 
         else {
-            const double expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585,
-                                             0.88658098,  -0.88921961, 1.19505839,  1.67634561,
-                                             1.2882766,   -1.43103981 };
+            const float_t expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585,
+                                              0.88658098,  -0.88921961, 1.19505839,  1.67634561,
+                                              1.2882766,   -1.43103981 };
             const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder()
-                                               .set_data_type(data_type::float64)
+                                               .set_data_type(dtype)
                                                .set_layout(data_layout::row_major)
                                                .allocate(2, 5)
                                                .copy_data(expected_beta, 2, 5)
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 223f6753f69..77ffd3e2540 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -14,6 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
+#include <limits>
+
 #include "oneapi/dal/backend/primitives/lapack.hpp"
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
 #include "oneapi/dal/backend/primitives/ndindexer.hpp"
@@ -59,6 +61,167 @@ inline sycl::event beta_copy_transform(sycl::queue& queue,
     });
 }
 
+/*
+This is an adaptation of the CPU version, which can be found in this file:
+cpp/daal/src/algorithms/service_kernel_math.h
+
+It solves a linear system A*x = b
+where 'b' might be a matrix (multiple RHS)
+
+It is intended as a fallback for solving linear regression, where these
+matrices are obtained as follows:
+    A = t(X)*X
+    b = t(X)*y
+
+It can handle systems that are not positive semi-definite, so it's used
+as a fallback when Cholesky fails or when it involves too small numbers
+(which tends to happen when floating point error results in a slightly
+positive matrix when it should have zero determinant in theory).
+*/
+template <mkl::uplo uplo, typename Float>
+sycl::event solve_spectral_decomposition(
+    sycl::queue& queue,
+    ndview<Float, 2>& A, // t(X)*X, LHS, gets overwritten
+    sycl::event& event_A,
+    ndview<Float, 2>& b, // t(X)*y, RHS, solution is outputted here
+    sycl::event& event_b,
+    const std::int64_t dim_A,
+    const std::int64_t nrhs) {
+    constexpr auto alloc = sycl::usm::alloc::device;
+
+    /* Decompose: A = Q * diag(l) * t(Q) */
+    /* Note: for NRHS>1, this will overallocate in order to reuse the memory as buffer later on */
+    auto eigenvalues = array<Float>::empty(queue, dim_A * nrhs, alloc);
+    auto eigenvalues_view = ndview<Float, 1>::wrap(eigenvalues);
+    sycl::event syevd_event =
+        syevd<mkl::job::vec, uplo, Float>(queue, dim_A, A, dim_A, eigenvalues_view, { event_A });
+    const Float eps = std::numeric_limits<Float>::epsilon();
+
+    /* Discard too small singular values */
+    std::int64_t num_discarded;
+    {
+        /* This is placed inside a block because the array created here is not used any further */
+        auto eigenvalues_cpu = eigenvalues_view.to_host(queue, { syevd_event });
+        const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data();
+        const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1];
+        if (largest_ev <= eps) {
+            throw std::runtime_error(
+                "Could not solve linear system. Problem has too small singular values.");
+        }
+        const Float component_threshold = eps * largest_ev;
+        for (num_discarded = 0; num_discarded < dim_A - 1; num_discarded++) {
+            if (eigenvalues_cpu_ptr[num_discarded] > component_threshold)
+                break;
+        }
+    }
+
+    /* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */
+    std::int64_t num_taken = dim_A - num_discarded;
+    auto ev_mutable = eigenvalues.get_mutable_data();
+    sycl::event inv_sqrt_eigenvalues_event = queue.submit([&](sycl::handler& h) {
+        h.depends_on(syevd_event);
+        h.parallel_for(num_taken, [=](const auto& i) {
+            const std::size_t ix = i + num_discarded;
+            ev_mutable[ix] = sycl::sqrt(Float(1) / ev_mutable[ix]);
+        });
+    });
+
+    auto Q_mutable = A.get_mutable_data();
+    sycl::event inv_sqrt_eigenvectors_event = queue.submit([&](sycl::handler& h) {
+        const auto range = oneapi::dal::backend::make_range_2d(num_taken, dim_A);
+        h.depends_on(inv_sqrt_eigenvalues_event);
+        h.parallel_for(range, [=](sycl::id<2> id) {
+            const std::size_t col = id[0] + num_discarded;
+            const std::size_t row = id[1];
+            Q_mutable[row + col * dim_A] *= ev_mutable[col];
+        });
+    });
+
+    /* Now calculate the actual solution: Qis * Qis' * B */
+    const std::int64_t eigenvectors_offset = num_discarded * dim_A;
+    if (nrhs == 1) {
+        sycl::event gemv_right_event =
+            mkl::blas::column_major::gemv(queue,
+                                          mkl::transpose::trans,
+                                          dim_A,
+                                          num_taken,
+                                          Float(1),
+                                          Q_mutable + eigenvectors_offset,
+                                          dim_A,
+                                          b.get_data(),
+                                          1,
+                                          Float(0),
+                                          ev_mutable,
+                                          1,
+                                          { inv_sqrt_eigenvectors_event, event_b });
+        return mkl::blas::column_major::gemv(queue,
+                                             mkl::transpose::nontrans,
+                                             dim_A,
+                                             num_taken,
+                                             Float(1),
+                                             Q_mutable + eigenvectors_offset,
+                                             dim_A,
+                                             ev_mutable,
+                                             1,
+                                             Float(0),
+                                             b.get_mutable_data(),
+                                             1,
+                                             { gemv_right_event });
+    }
+
+    else {
+        sycl::event gemm_right_event =
+            mkl::blas::column_major::gemm(queue,
+                                          mkl::transpose::trans,
+                                          mkl::transpose::nontrans,
+                                          num_taken,
+                                          nrhs,
+                                          dim_A,
+                                          Float(1),
+                                          Q_mutable + eigenvectors_offset,
+                                          dim_A,
+                                          b.get_data(),
+                                          dim_A,
+                                          Float(0),
+                                          ev_mutable,
+                                          num_taken,
+                                          { inv_sqrt_eigenvectors_event, event_b });
+        return mkl::blas::column_major::gemm(queue,
+                                             mkl::transpose::nontrans,
+                                             mkl::transpose::nontrans,
+                                             dim_A,
+                                             nrhs,
+                                             num_taken,
+                                             Float(1),
+                                             Q_mutable + eigenvectors_offset,
+                                             dim_A,
+                                             ev_mutable,
+                                             num_taken,
+                                             Float(0),
+                                             b.get_mutable_data(),
+                                             dim_A,
+                                             { gemm_right_event });
+    }
+}
+
+/* Returns the minimum value among entries in the diagonal of a square matrix */
+template <typename Float>
+Float diagonal_minimum(sycl::queue& queue,
+                       const Float* Matrix,
+                       const std::int64_t dim_matrix,
+                       sycl::event& event_Matrix) {
+    constexpr auto alloc = sycl::usm::alloc::device;
+    auto diag_min_holder = array<Float>::empty(queue, 1, alloc);
+    sycl::event diag_min_event = queue.submit([&](sycl::handler& h) {
+        auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>());
+        h.depends_on(event_Matrix);
+        h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) {
+            min_obj.combine(Matrix[i * (dim_matrix + 1)]);
+        });
+    });
+    return ndview<Float, 1>::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event });
+}
+
 template <mkl::uplo uplo, bool beta, typename Float, ndorder xlayout, ndorder ylayout>
 sycl::event solve_system(sycl::queue& queue,
                          const ndview<Float, 2, xlayout>& xtx,
@@ -70,12 +233,34 @@ sycl::event solve_system(sycl::queue& queue,
 
     auto [nxty, xty_event] = copy<ndorder::c, Float, ylayout, alloc>(queue, xty, dependencies);
     auto [nxtx, xtx_event] = copy<ndorder::c, Float, xlayout, alloc>(queue, xtx, dependencies);
+    const std::int64_t dim_xtx = xtx.get_dimension(0);
+
+    sycl::event solution_event;
 
     opt_array<Float> dummy{};
-    auto potrf_event = potrf_factorization<uplo>(queue, nxtx, dummy, { xtx_event });
-    auto potrs_event = potrs_solution<uplo>(queue, nxtx, nxty, dummy, { potrf_event, xty_event });
+    try {
+        sycl::event potrf_event = potrf_factorization<uplo>(queue, nxtx, dummy, { xtx_event });
+        const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event);
+        if (diag_min <= 1e-6)
+            throw mkl::exception("");
+        solution_event = potrs_solution<uplo>(queue, nxtx, nxty, dummy, { potrf_event, xty_event });
+    }
+    catch (mkl::exception& ex) {
+        const std::int64_t nrhs = nxty.get_dimension(0);
+        /* Note: this templated version of 'copy' reuses the layout that was specified in the previous copy */
+        sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies);
+        sycl::event xty_event_new = copy(queue, nxty, xty, dependencies);
+
+        solution_event = solve_spectral_decomposition<uplo, Float>(queue,
+                                                                   nxtx,
+                                                                   xtx_event_new,
+                                                                   nxty,
+                                                                   xty_event_new,
+                                                                   dim_xtx,
+                                                                   nrhs);
+    }
 
-    return beta_copy_transform<beta>(queue, nxty, final_xty, { potrs_event });
+    return beta_copy_transform<beta>(queue, nxty, final_xty, { solution_event });
 }
 
 #define INSTANTIATE(U, B, F, XL, YL)                                 \

From d528fab78c415ba29aa81aaa49428835f021de4f Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Thu, 14 Nov 2024 17:51:06 +0100
Subject: [PATCH 02/17] fix failing min diagonal

---
 .../backend/primitives/lapack/solve_dpc.cpp   | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 77ffd3e2540..a0f42ecadbe 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -211,15 +211,18 @@ Float diagonal_minimum(sycl::queue& queue,
                        const std::int64_t dim_matrix,
                        sycl::event& event_Matrix) {
     constexpr auto alloc = sycl::usm::alloc::device;
-    auto diag_min_holder = array<Float>::empty(queue, 1, alloc);
-    sycl::event diag_min_event = queue.submit([&](sycl::handler& h) {
-        auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>());
-        h.depends_on(event_Matrix);
-        h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) {
-            min_obj.combine(Matrix[i * (dim_matrix + 1)]);
-        });
-    });
-    return ndview<Float, 1>::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event });
+    auto idx_min_holder = array<std::int64_t>::empty(queue, 1, alloc);
+    sycl::event diag_min_event = mkl::blas::column_major::iamin(
+        queue,
+        dim_matrix,
+        Matrix,
+        dim_matrix + 1,
+        idx_min_holder.get_mutable_data(),
+        mkl::index_base::zero,
+        { event_Matrix }
+    );
+    const std::int64_t idx_min = ndview<std::int64_t, 1>::wrap(idx_min_holder).at_device(queue, 0, { diag_min_event });
+    return ndview<Float, 1>::wrap(Matrix, dim_matrix * dim_matrix).at_device(queue, idx_min * (dim_matrix + 1), { event_Matrix });
 }
 
 template <mkl::uplo uplo, bool beta, typename Float, ndorder xlayout, ndorder ylayout>

From 5255e648bdd0d9e605da55eee9270cb9da5ceeae Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Fri, 15 Nov 2024 08:54:58 +0100
Subject: [PATCH 03/17] switch back to sycl reduction for diag min

---
 .../backend/primitives/lapack/solve_dpc.cpp   | 31 ++++++++++---------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index a0f42ecadbe..305f74a4a67 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -211,18 +211,21 @@ Float diagonal_minimum(sycl::queue& queue,
                        const std::int64_t dim_matrix,
                        sycl::event& event_Matrix) {
     constexpr auto alloc = sycl::usm::alloc::device;
-    auto idx_min_holder = array<std::int64_t>::empty(queue, 1, alloc);
-    sycl::event diag_min_event = mkl::blas::column_major::iamin(
-        queue,
-        dim_matrix,
-        Matrix,
-        dim_matrix + 1,
-        idx_min_holder.get_mutable_data(),
-        mkl::index_base::zero,
-        { event_Matrix }
-    );
-    const std::int64_t idx_min = ndview<std::int64_t, 1>::wrap(idx_min_holder).at_device(queue, 0, { diag_min_event });
-    return ndview<Float, 1>::wrap(Matrix, dim_matrix * dim_matrix).at_device(queue, idx_min * (dim_matrix + 1), { event_Matrix });
+    auto diag_min_holder = array<Float>::empty(queue, 1, alloc);
+    sycl::event diag_min_holder_init = queue.submit([&](sycl::handler& h) {
+        Float* diag_min_ptr = diag_min_holder.get_mutable_data();
+        h.parallel_for(1, [=](const auto& i) {
+            diag_min_ptr[i] = std::numeric_limits<Float>::infinity();
+        });
+    });
+    sycl::event diag_min_event = queue.submit([&](sycl::handler& h) {
+        auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>());
+        h.depends_on({ diag_min_holder_init, event_Matrix });
+        h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) {
+            min_obj.combine(Matrix[i * (dim_matrix + 1)]);
+        });
+    });
+    return ndview<Float, 1>::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event });
 }
 
 template <mkl::uplo uplo, bool beta, typename Float, ndorder xlayout, ndorder ylayout>
@@ -245,10 +248,10 @@ sycl::event solve_system(sycl::queue& queue,
         sycl::event potrf_event = potrf_factorization<uplo>(queue, nxtx, dummy, { xtx_event });
         const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event);
         if (diag_min <= 1e-6)
-            throw mkl::exception("");
+            throw mkl::lapack::computation_error("", "", 0);
         solution_event = potrs_solution<uplo>(queue, nxtx, nxty, dummy, { potrf_event, xty_event });
     }
-    catch (mkl::exception& ex) {
+    catch (mkl::lapack::computation_error& ex) {
         const std::int64_t nrhs = nxty.get_dimension(0);
         /* Note: this templated version of 'copy' reuses the layout that was specified in the previous copy */
         sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies);

From 141b65b4c2e42aac0d0d27ac4117b3eb9e555b6c Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Fri, 15 Nov 2024 09:03:49 +0100
Subject: [PATCH 04/17] try another way

---
 .../dal/backend/primitives/lapack/solve_dpc.cpp      | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 305f74a4a67..5dc20bfd768 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -212,15 +212,11 @@ Float diagonal_minimum(sycl::queue& queue,
                        sycl::event& event_Matrix) {
     constexpr auto alloc = sycl::usm::alloc::device;
     auto diag_min_holder = array<Float>::empty(queue, 1, alloc);
-    sycl::event diag_min_holder_init = queue.submit([&](sycl::handler& h) {
-        Float* diag_min_ptr = diag_min_holder.get_mutable_data();
-        h.parallel_for(1, [=](const auto& i) {
-            diag_min_ptr[i] = std::numeric_limits<Float>::infinity();
-        });
-    });
     sycl::event diag_min_event = queue.submit([&](sycl::handler& h) {
-        auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>());
-        h.depends_on({ diag_min_holder_init, event_Matrix });
+        auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(),
+                                             sycl::minimum<>(),
+                                             sycl::property::reduction::initialize_to_identity());
+        h.depends_on({ event_Matrix });
         h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) {
             min_obj.combine(Matrix[i * (dim_matrix + 1)]);
         });

From e88ed4717a84281210202343b67952fbdf93cc44 Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Fri, 15 Nov 2024 13:09:25 +0100
Subject: [PATCH 05/17] use ndarray instead of array for allocation

---
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 5dc20bfd768..401a304cfbc 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -91,17 +91,16 @@ sycl::event solve_spectral_decomposition(
 
     /* Decompose: A = Q * diag(l) * t(Q) */
     /* Note: for NRHS>1, this will overallocate in order to reuse the memory as buffer later on */
-    auto eigenvalues = array<Float>::empty(queue, dim_A * nrhs, alloc);
-    auto eigenvalues_view = ndview<Float, 1>::wrap(eigenvalues);
+    auto eigenvalues = ndarray<Float, 1>::empty(queue, dim_A * nrhs, alloc);
     sycl::event syevd_event =
-        syevd<mkl::job::vec, uplo, Float>(queue, dim_A, A, dim_A, eigenvalues_view, { event_A });
+        syevd<mkl::job::vec, uplo, Float>(queue, dim_A, A, dim_A, eigenvalues, { event_A });
     const Float eps = std::numeric_limits<Float>::epsilon();
 
     /* Discard too small singular values */
     std::int64_t num_discarded;
     {
         /* This is placed inside a block because the array created here is not used any further */
-        auto eigenvalues_cpu = eigenvalues_view.to_host(queue, { syevd_event });
+        auto eigenvalues_cpu = eigenvalues.to_host(queue, { syevd_event });
         const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data();
         const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1];
         if (largest_ev <= eps) {

From a3254527c2d4c5e242823f8f96ae6537cbc0fa27 Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Fri, 15 Nov 2024 16:52:21 +0100
Subject: [PATCH 06/17] avoid copying unused entries in array

---
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 401a304cfbc..efa497e73fe 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -100,7 +100,10 @@ sycl::event solve_spectral_decomposition(
     std::int64_t num_discarded;
     {
         /* This is placed inside a block because the array created here is not used any further */
-        auto eigenvalues_cpu = eigenvalues.to_host(queue, { syevd_event });
+        /* Note: the array for 'eigenvalues' is allocated with size [dimA, nrhs],
+        but by this point, only 'dimA' elements will have been written to it. */
+        auto eigenvalues_cpu =
+            ndview<Float, 1>::wrap(eigenvalues.get_data(), dim_A).to_host(queue, { syevd_event });
         const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data();
         const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1];
         if (largest_ev <= eps) {

From 6addc0355e0d7f61cbd591f282b414eafb7fdd7b Mon Sep 17 00:00:00 2001
From: david-cortes-intel <david.cortes@intel.com>
Date: Mon, 18 Nov 2024 15:07:26 +0100
Subject: [PATCH 07/17] Update
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp

Co-authored-by: Victoriya Fedotova <viktoria.nn@gmail.com>
---
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index efa497e73fe..1942c2cb1df 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -124,7 +124,7 @@ sycl::event solve_spectral_decomposition(
         h.depends_on(syevd_event);
         h.parallel_for(num_taken, [=](const auto& i) {
             const std::size_t ix = i + num_discarded;
-            ev_mutable[ix] = sycl::sqrt(Float(1) / ev_mutable[ix]);
+            ev_mutable[ix] = sycl::rsqrt(ev_mutable[ix]);
         });
     });
 

From 8cc9ca187851a5cc4add852117a1ef30fcc67c75 Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Mon, 18 Nov 2024 15:06:43 +0100
Subject: [PATCH 08/17] follow internal logic for error messages

---
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 3 +--
 cpp/oneapi/dal/detail/error_messages.cpp               | 2 ++
 cpp/oneapi/dal/detail/error_messages.hpp               | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 1942c2cb1df..6cf3d51c7e1 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -107,8 +107,7 @@ sycl::event solve_spectral_decomposition(
         const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data();
         const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1];
         if (largest_ev <= eps) {
-            throw std::runtime_error(
-                "Could not solve linear system. Problem has too small singular values.");
+            throw internal_error{ dal::detail::error_messages::too_small_singular_values() };
         }
         const Float component_threshold = eps * largest_ev;
         for (num_discarded = 0; num_discarded < dim_A - 1; num_discarded++) {
diff --git a/cpp/oneapi/dal/detail/error_messages.cpp b/cpp/oneapi/dal/detail/error_messages.cpp
index 20ce68e0ef2..85cbb969099 100644
--- a/cpp/oneapi/dal/detail/error_messages.cpp
+++ b/cpp/oneapi/dal/detail/error_messages.cpp
@@ -303,6 +303,8 @@ MSG(input_y_is_empty, "Input y is empty")
 /* Linear Regression */
 MSG(intercept_result_option_requires_intercept_flag,
     "Intercept result option requires intercept flag")
+MSG(too_small_singular_values,
+    "Linear system has too small singular values, cannot calculate a solution")
 
 /* Logistic Regression */
 MSG(class_count_neq_two,
diff --git a/cpp/oneapi/dal/detail/error_messages.hpp b/cpp/oneapi/dal/detail/error_messages.hpp
index dbb9c8ba6ec..926e89321b0 100644
--- a/cpp/oneapi/dal/detail/error_messages.hpp
+++ b/cpp/oneapi/dal/detail/error_messages.hpp
@@ -242,6 +242,7 @@ class ONEDAL_EXPORT error_messages {
 
     /* Linear Regression */
     MSG(intercept_result_option_requires_intercept_flag);
+    MSG(too_small_singular_values);
 
     /* Logistic Regression */
     MSG(class_count_neq_two);

From e39f2ac33744d32c9328cee3b4e94188fd7d0365 Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Mon, 18 Nov 2024 15:11:51 +0100
Subject: [PATCH 09/17] add comment about the try-catch block

---
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 6cf3d51c7e1..78bd7d21960 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -241,6 +241,10 @@ sycl::event solve_system(sycl::queue& queue,
     sycl::event solution_event;
 
     opt_array<Float> dummy{};
+    /// Note: this is wrapped in a try-catch block in order to catch a potential exception
+    /// thrown by oneMKL when the Cholesky factorization ('potrs') fails due to the matrix
+    /// not being positive-definite. In such case, it will then fall back to spectral
+    /// decomposition-based inversion, which is able to handle such problems.
     try {
         sycl::event potrf_event = potrf_factorization<uplo>(queue, nxtx, dummy, { xtx_event });
         const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event);

From 9a4d13eef18ceb9c6be0d69e12f059621363468c Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Mon, 18 Nov 2024 15:13:34 +0100
Subject: [PATCH 10/17] add manual wait-and-throw call where appropriate

---
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 78bd7d21960..f5b372001a2 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -247,6 +247,7 @@ sycl::event solve_system(sycl::queue& queue,
     /// decomposition-based inversion, which is able to handle such problems.
     try {
         sycl::event potrf_event = potrf_factorization<uplo>(queue, nxtx, dummy, { xtx_event });
+        queue.wait_and_throw();
         const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event);
         if (diag_min <= 1e-6)
             throw mkl::lapack::computation_error("", "", 0);

From 9a1f48d2fc78116fb63ebd3226263f8c2b5e1258 Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Mon, 18 Nov 2024 16:17:50 +0100
Subject: [PATCH 11/17] use internal wrappers for blas calls

---
 .../backend/primitives/lapack/solve_dpc.cpp   | 97 ++++++++-----------
 1 file changed, 41 insertions(+), 56 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index f5b372001a2..e7576b9cf32 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -16,9 +16,12 @@
 
 #include <limits>
 
+#include "oneapi/dal/backend/primitives/blas/gemm.hpp"
+#include "oneapi/dal/backend/primitives/blas/gemv.hpp"
 #include "oneapi/dal/backend/primitives/lapack.hpp"
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
 #include "oneapi/dal/backend/primitives/ndindexer.hpp"
+#include "oneapi/dal/detail/error_messages.hpp"
 
 namespace oneapi::dal::backend::primitives {
 
@@ -141,67 +144,49 @@ sycl::event solve_spectral_decomposition(
     /* Now calculate the actual solution: Qis * Qis' * B */
     const std::int64_t eigenvectors_offset = num_discarded * dim_A;
     if (nrhs == 1) {
+        auto ev_mutable_vec_view = ndview<Float, 1>::wrap(ev_mutable, num_taken);
+        auto b_mutable_vec_view = ndview<Float, 1>::wrap(b.get_mutable_data(), dim_A);
         sycl::event gemv_right_event =
-            mkl::blas::column_major::gemv(queue,
-                                          mkl::transpose::trans,
-                                          dim_A,
-                                          num_taken,
-                                          Float(1),
-                                          Q_mutable + eigenvectors_offset,
-                                          dim_A,
-                                          b.get_data(),
-                                          1,
-                                          Float(0),
-                                          ev_mutable,
-                                          1,
-                                          { inv_sqrt_eigenvectors_event, event_b });
-        return mkl::blas::column_major::gemv(queue,
-                                             mkl::transpose::nontrans,
-                                             dim_A,
-                                             num_taken,
-                                             Float(1),
-                                             Q_mutable + eigenvectors_offset,
-                                             dim_A,
-                                             ev_mutable,
-                                             1,
-                                             Float(0),
-                                             b.get_mutable_data(),
-                                             1,
-                                             { gemv_right_event });
+            gemv(queue,
+                 ndview<Float, 2, ndorder::c>::wrap(Q_mutable + eigenvectors_offset,
+                                                    ndshape<2>(num_taken, dim_A)),
+                 b_mutable_vec_view,
+                 ev_mutable_vec_view,
+                 Float(1),
+                 Float(0),
+                 { inv_sqrt_eigenvectors_event, event_b });
+        return gemv(queue,
+                    ndview<Float, 2, ndorder::f>::wrap(Q_mutable + eigenvectors_offset,
+                                                       ndshape<2>(dim_A, num_taken)),
+                    ev_mutable_vec_view,
+                    b_mutable_vec_view,
+                    Float(1),
+                    Float(0),
+                    { gemv_right_event });
     }
 
     else {
+        auto ev_mutable_mat_view =
+            ndview<Float, 2, ndorder::c>::wrap(ev_mutable, ndshape<2>(nrhs, num_taken));
+        auto b_mutable_mat_view =
+            ndview<Float, 2, ndorder::c>::wrap(b.get_mutable_data(), ndshape<2>(nrhs, dim_A));
         sycl::event gemm_right_event =
-            mkl::blas::column_major::gemm(queue,
-                                          mkl::transpose::trans,
-                                          mkl::transpose::nontrans,
-                                          num_taken,
-                                          nrhs,
-                                          dim_A,
-                                          Float(1),
-                                          Q_mutable + eigenvectors_offset,
-                                          dim_A,
-                                          b.get_data(),
-                                          dim_A,
-                                          Float(0),
-                                          ev_mutable,
-                                          num_taken,
-                                          { inv_sqrt_eigenvectors_event, event_b });
-        return mkl::blas::column_major::gemm(queue,
-                                             mkl::transpose::nontrans,
-                                             mkl::transpose::nontrans,
-                                             dim_A,
-                                             nrhs,
-                                             num_taken,
-                                             Float(1),
-                                             Q_mutable + eigenvectors_offset,
-                                             dim_A,
-                                             ev_mutable,
-                                             num_taken,
-                                             Float(0),
-                                             b.get_mutable_data(),
-                                             dim_A,
-                                             { gemm_right_event });
+            gemm(queue,
+                 b_mutable_mat_view,
+                 ndview<Float, 2, ndorder::f>::wrap(Q_mutable + eigenvectors_offset,
+                                                    ndshape<2>(dim_A, num_taken)),
+                 ev_mutable_mat_view,
+                 Float(1),
+                 Float(0),
+                 { inv_sqrt_eigenvectors_event, event_b });
+        return gemm(queue,
+                    ev_mutable_mat_view,
+                    ndview<Float, 2, ndorder::c>::wrap(Q_mutable + eigenvectors_offset,
+                                                       ndshape<2>(num_taken, dim_A)),
+                    b_mutable_mat_view,
+                    Float(1),
+                    Float(0),
+                    { gemm_right_event });
     }
 }
 

From 8b3d1c1392a09ce58e2e42c66b7f96ff52c6bc75 Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Mon, 18 Nov 2024 16:19:27 +0100
Subject: [PATCH 12/17] add comment about dimensions of inputs

---
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index e7576b9cf32..9d335198a53 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -84,9 +84,10 @@ positive matrix when it should have zero determinant in theory).
 template <mkl::uplo uplo, typename Float>
 sycl::event solve_spectral_decomposition(
     sycl::queue& queue,
-    ndview<Float, 2>& A, // t(X)*X, LHS, gets overwritten
+    ndview<Float, 2>& A, // t(X)*X, LHS, gets overwritten, dim=[dim_A, dim_A] (symmetric)
     sycl::event& event_A,
-    ndview<Float, 2>& b, // t(X)*y, RHS, solution is outputted here
+    ndview<Float, 2>&
+        b, // t(X)*y, RHS, solution is outputted here, dim=[dim_A, nrhs] in colmajor order
     sycl::event& event_b,
     const std::int64_t dim_A,
     const std::int64_t nrhs) {

From e86b5bde7929e0af69d2a4933048a242e21e880c Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Mon, 18 Nov 2024 16:21:27 +0100
Subject: [PATCH 13/17] change format of comments

---
 .../backend/primitives/lapack/solve_dpc.cpp   | 56 +++++++++----------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 9d335198a53..173c92f8ffe 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -64,48 +64,46 @@ inline sycl::event beta_copy_transform(sycl::queue& queue,
     });
 }
 
-/*
-This is an adaptation of the CPU version, which can be found in this file:
-cpp/daal/src/algorithms/service_kernel_math.h
-
-It solves a linear system A*x = b
-where 'b' might be a matrix (multiple RHS)
-
-It is intended as a fallback for solving linear regression, where these
-matrices are obtained as follows:
-    A = t(X)*X
-    b = t(X)*y
-
-It can handle systems that are not positive semi-definite, so it's used
-as a fallback when Cholesky fails or when it involves too small numbers
-(which tends to happen when floating point error results in a slightly
-positive matrix when it should have zero determinant in theory).
-*/
+/// This is an adaptation of the CPU version, which can be found in this file:
+/// cpp/daal/src/algorithms/service_kernel_math.h
+///
+/// It solves a linear system A*x = b
+/// where 'b' might be a matrix (multiple RHS)
+///
+/// It is intended as a fallback for solving linear regression, where these
+/// matrices are obtained as follows:
+///     A = t(X)*X
+///     b = t(X)*y
+///
+/// It can handle systems that are not positive semi-definite, so it's used
+/// as a fallback when Cholesky fails or when it involves too small numbers
+/// (which tends to happen when floating point error results in a slightly
+/// positive matrix when it should have zero determinant in theory).
 template <mkl::uplo uplo, typename Float>
 sycl::event solve_spectral_decomposition(
     sycl::queue& queue,
-    ndview<Float, 2>& A, // t(X)*X, LHS, gets overwritten, dim=[dim_A, dim_A] (symmetric)
+    ndview<Float, 2>& A, /// t(X)*X, LHS, gets overwritten, dim=[dim_A, dim_A] (symmetric)
     sycl::event& event_A,
     ndview<Float, 2>&
-        b, // t(X)*y, RHS, solution is outputted here, dim=[dim_A, nrhs] in colmajor order
+        b, /// t(X)*y, RHS, solution is outputted here, dim=[dim_A, nrhs] in colmajor order
     sycl::event& event_b,
     const std::int64_t dim_A,
     const std::int64_t nrhs) {
     constexpr auto alloc = sycl::usm::alloc::device;
 
-    /* Decompose: A = Q * diag(l) * t(Q) */
-    /* Note: for NRHS>1, this will overallocate in order to reuse the memory as buffer later on */
+    /// Decompose: A = Q * diag(l) * t(Q)
+    /// Note: for NRHS>1, this will overallocate in order to reuse the memory as buffer later on
     auto eigenvalues = ndarray<Float, 1>::empty(queue, dim_A * nrhs, alloc);
     sycl::event syevd_event =
         syevd<mkl::job::vec, uplo, Float>(queue, dim_A, A, dim_A, eigenvalues, { event_A });
     const Float eps = std::numeric_limits<Float>::epsilon();
 
-    /* Discard too small singular values */
+    /// Discard too small singular values
     std::int64_t num_discarded;
     {
-        /* This is placed inside a block because the array created here is not used any further */
-        /* Note: the array for 'eigenvalues' is allocated with size [dimA, nrhs],
-        but by this point, only 'dimA' elements will have been written to it. */
+        /// This is placed inside a block because the array created here is not used any further
+        /// Note: the array for 'eigenvalues' is allocated with size [dimA, nrhs],
+        /// but by this point, only 'dimA' elements will have been written to it.
         auto eigenvalues_cpu =
             ndview<Float, 1>::wrap(eigenvalues.get_data(), dim_A).to_host(queue, { syevd_event });
         const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data();
@@ -120,7 +118,7 @@ sycl::event solve_spectral_decomposition(
         }
     }
 
-    /* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */
+    /// Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l))
     std::int64_t num_taken = dim_A - num_discarded;
     auto ev_mutable = eigenvalues.get_mutable_data();
     sycl::event inv_sqrt_eigenvalues_event = queue.submit([&](sycl::handler& h) {
@@ -142,7 +140,7 @@ sycl::event solve_spectral_decomposition(
         });
     });
 
-    /* Now calculate the actual solution: Qis * Qis' * B */
+    /// Now calculate the actual solution: Qis * Qis' * B
     const std::int64_t eigenvectors_offset = num_discarded * dim_A;
     if (nrhs == 1) {
         auto ev_mutable_vec_view = ndview<Float, 1>::wrap(ev_mutable, num_taken);
@@ -191,7 +189,7 @@ sycl::event solve_spectral_decomposition(
     }
 }
 
-/* Returns the minimum value among entries in the diagonal of a square matrix */
+/// Returns the minimum value among entries in the diagonal of a square matrix
 template <typename Float>
 Float diagonal_minimum(sycl::queue& queue,
                        const Float* Matrix,
@@ -241,7 +239,7 @@ sycl::event solve_system(sycl::queue& queue,
     }
     catch (mkl::lapack::computation_error& ex) {
         const std::int64_t nrhs = nxty.get_dimension(0);
-        /* Note: this templated version of 'copy' reuses the layout that was specified in the previous copy */
+        /// Note: this templated version of 'copy' reuses the layout that was specified in the previous copy
         sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies);
         sycl::event xty_event_new = copy(queue, nxty, xty, dependencies);
 

From 15b686d38224129c79bfd3be9cd6f877f5b54eb0 Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Tue, 19 Nov 2024 13:41:20 +0100
Subject: [PATCH 14/17] add comment about threshold for diagonal minimum

---
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 8 +++++++-
 oneDAL.code-workspace                                  | 7 +++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 oneDAL.code-workspace

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 173c92f8ffe..1fd7883c5d1 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -233,7 +233,13 @@ sycl::event solve_system(sycl::queue& queue,
         sycl::event potrf_event = potrf_factorization<uplo>(queue, nxtx, dummy, { xtx_event });
         queue.wait_and_throw();
         const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event);
-        if (diag_min <= 1e-6)
+        /// Note: this threshold was chosen as a guess for when Cholesky factorization might
+        /// succeed despite the matrix in theory not being positive-definite, or succeed but
+        /// having too large numerical inaccuracies. In such cases, there should be too small
+        /// singular values that the fallback will discard, but there's no guaranteed match
+        /// between singular values and entries in the Cholesky diagonal. This is just a guess.
+        const Float threshold_diagonal_min = 1e-6f;
+        if (diag_min <= threshold_diagonal_min)
             throw mkl::lapack::computation_error("", "", 0);
         solution_event = potrs_solution<uplo>(queue, nxtx, nxty, dummy, { potrf_event, xty_event });
     }
diff --git a/oneDAL.code-workspace b/oneDAL.code-workspace
new file mode 100644
index 00000000000..362d7c25bb4
--- /dev/null
+++ b/oneDAL.code-workspace
@@ -0,0 +1,7 @@
+{
+	"folders": [
+		{
+			"path": "."
+		}
+	]
+}
\ No newline at end of file

From 4cff7b9328c7aa5bd2ab36ccd8676d34d8d92e9f Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Tue, 19 Nov 2024 13:45:56 +0100
Subject: [PATCH 15/17] avoid manually throwing MKL error

---
 cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index 1fd7883c5d1..ef49ea497e1 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -238,12 +238,17 @@ sycl::event solve_system(sycl::queue& queue,
         /// having too large numerical inaccuracies. In such cases, there should be too small
         /// singular values that the fallback will discard, but there's no guaranteed match
         /// between singular values and entries in the Cholesky diagonal. This is just a guess.
-        const Float threshold_diagonal_min = 1e-6f;
+        const Float threshold_diagonal_min = 1e-6;
         if (diag_min <= threshold_diagonal_min)
-            throw mkl::lapack::computation_error("", "", 0);
+            goto fallback_solver;
         solution_event = potrs_solution<uplo>(queue, nxtx, nxty, dummy, { potrf_event, xty_event });
     }
     catch (mkl::lapack::computation_error& ex) {
+        goto fallback_solver;
+    }
+    /// Note: this block is structured so that it will only be entered into through a 'goto'
+    if (false) {
+    fallback_solver:
         const std::int64_t nrhs = nxty.get_dimension(0);
         /// Note: this templated version of 'copy' reuses the layout that was specified in the previous copy
         sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies);

From 58ebb83893b3ea5d0095d4cb37a3e948c55617b9 Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Tue, 19 Nov 2024 13:49:18 +0100
Subject: [PATCH 16/17] remove accidentally commited file

---
 oneDAL.code-workspace | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 oneDAL.code-workspace

diff --git a/oneDAL.code-workspace b/oneDAL.code-workspace
deleted file mode 100644
index 362d7c25bb4..00000000000
--- a/oneDAL.code-workspace
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-	"folders": [
-		{
-			"path": "."
-		}
-	]
-}
\ No newline at end of file

From 4e241dcfa8c92b063afc944e3510bdc61fab3a6f Mon Sep 17 00:00:00 2001
From: David Cortes <david.cortes@intel.com>
Date: Tue, 19 Nov 2024 15:08:59 +0100
Subject: [PATCH 17/17] wrap procedure in a function

---
 .../backend/primitives/lapack/solve_dpc.cpp   | 52 ++++++++++++-------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
index ef49ea497e1..12eb654bbe4 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp
@@ -209,6 +209,30 @@ Float diagonal_minimum(sycl::queue& queue,
     return ndview<Float, 1>::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event });
 }
 
+template <mkl::uplo uplo, typename Float, ndorder xlayout, ndorder ylayout>
+sycl::event solve_with_fallback(
+    sycl::queue& queue,
+    const ndview<Float, 2, xlayout>& xtx,
+    const ndview<Float, 2, ylayout>& xty,
+    ndview<Float, 2, ndorder::c>& nxtx,
+    ndview<Float, 2, ndorder::c>& nxty, /// solution will be written here
+    const event_vector& dependencies) {
+    const std::int64_t dim_xtx = xtx.get_dimension(0);
+    const std::int64_t nrhs = nxty.get_dimension(0);
+    /// Note: this templated version of 'copy' reuses a layout that should have been
+    /// specified in a previous copy before the fallback.
+    sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies);
+    sycl::event xty_event_new = copy(queue, nxty, xty, dependencies);
+
+    return solve_spectral_decomposition<uplo, Float>(queue,
+                                                     nxtx,
+                                                     xtx_event_new,
+                                                     nxty,
+                                                     xty_event_new,
+                                                     dim_xtx,
+                                                     nrhs);
+}
+
 template <mkl::uplo uplo, bool beta, typename Float, ndorder xlayout, ndorder ylayout>
 sycl::event solve_system(sycl::queue& queue,
                          const ndview<Float, 2, xlayout>& xtx,
@@ -239,28 +263,16 @@ sycl::event solve_system(sycl::queue& queue,
         /// singular values that the fallback will discard, but there's no guaranteed match
         /// between singular values and entries in the Cholesky diagonal. This is just a guess.
         const Float threshold_diagonal_min = 1e-6;
-        if (diag_min <= threshold_diagonal_min)
-            goto fallback_solver;
-        solution_event = potrs_solution<uplo>(queue, nxtx, nxty, dummy, { potrf_event, xty_event });
+        if (diag_min > threshold_diagonal_min) {
+            solution_event =
+                potrs_solution<uplo>(queue, nxtx, nxty, dummy, { potrf_event, xty_event });
+        }
+        else {
+            solution_event = solve_with_fallback<uplo>(queue, xtx, xty, nxtx, nxty, dependencies);
+        }
     }
     catch (mkl::lapack::computation_error& ex) {
-        goto fallback_solver;
-    }
-    /// Note: this block is structured so that it will only be entered into through a 'goto'
-    if (false) {
-    fallback_solver:
-        const std::int64_t nrhs = nxty.get_dimension(0);
-        /// Note: this templated version of 'copy' reuses the layout that was specified in the previous copy
-        sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies);
-        sycl::event xty_event_new = copy(queue, nxty, xty, dependencies);
-
-        solution_event = solve_spectral_decomposition<uplo, Float>(queue,
-                                                                   nxtx,
-                                                                   xtx_event_new,
-                                                                   nxty,
-                                                                   xty_event_new,
-                                                                   dim_xtx,
-                                                                   nrhs);
+        solution_event = solve_with_fallback<uplo>(queue, xtx, xty, nxtx, nxty, dependencies);
     }
 
     return beta_copy_transform<beta>(queue, nxty, final_xty, { solution_event });