diff --git a/batched/dense/impl/KokkosBatched_Gbtrf_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Gbtrf_Serial_Impl.hpp new file mode 100644 index 0000000000..7d8b754907 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Gbtrf_Serial_Impl.hpp @@ -0,0 +1,93 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GBTRF_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_GBTRF_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Gbtrf_Serial_Internal.hpp" + +namespace KokkosBatched { +namespace Impl { +template +KOKKOS_INLINE_FUNCTION static int checkGbtrfInput([[maybe_unused]] const ABViewType &AB, [[maybe_unused]] const int kl, + [[maybe_unused]] const int ku, [[maybe_unused]] const int m) { + static_assert(Kokkos::is_view_v, "KokkosBatched::gbtrf: ABViewType is not a Kokkos::View."); + static_assert(ABViewType::rank == 2, "KokkosBatched::gbtrs: ABViewType must have rank 2."); +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (m < 0) { + Kokkos::printf( + "KokkosBatched::gbtrf: input parameter m must not be less than 0: m " + "= " + "%d\n", + m); + return 1; + } + + if (kl < 0) { + Kokkos::printf( + "KokkosBatched::gbtrf: input parameter kl must not be less than 0: kl " + "= " + "%d\n", + kl); + return 1; + } + + if (ku < 0) { + Kokkos::printf( + "KokkosBatched::gbtrf: input parameter ku must not be less than 0: ku " + "= " + "%d\n", + ku); + return 1; + } + + const int lda = AB.extent(0); + if (lda < (2 * kl + ku + 1)) { + Kokkos::printf( + "KokkosBatched::gbtrs: leading dimension of A must be smaller than 2 * " + "kl + ku + 1: " + "lda = %d, kl = %d, ku = %d\n", + lda, kl, ku); + return 1; + } +#endif + return 0; +} +} // namespace Impl + +template <> +struct SerialGbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &AB, const PivViewType &piv, const int kl, const int ku, + const int m = -1) { + // default: m == n + int n = AB.extent(1); + int m_tmp = m > 0 ? m : n; + + auto info = Impl::checkGbtrfInput(AB, kl, ku, m_tmp); + if (info) return info; + + // Quick return if possible + if (m_tmp == 0 || n == 0) return 0; + + return Impl::SerialGbtrfInternal::invoke(AB, piv, kl, ku, m_tmp); + } +}; + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_GBTRF_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Gbtrf_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Gbtrf_Serial_Internal.hpp new file mode 100644 index 0000000000..1764f386b7 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Gbtrf_Serial_Internal.hpp @@ -0,0 +1,109 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GBTRF_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_GBTRF_SERIAL_INTERNAL_HPP_ + +#include +#include +#include +#include +#include "KokkosBatched_Ger_Serial_Internal.hpp" + +namespace KokkosBatched { +namespace Impl { +template +struct SerialGbtrfInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &AB, const PivViewType &piv, const int kl, const int ku, + const int m); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialGbtrfInternal::invoke(const ABViewType &AB, + const PivViewType &piv, const int kl, + const int ku, const int m) { + const int n = AB.extent(1); + // Upper bandwidth of U factor + const int kv = ku + kl; + + // Gaussian elimination with partial pivoting + // Set fill-in elements in columns KU+2 to KV to zero + for (int j = ku + 1; j < Kokkos::min(kv, n); ++j) { + for (int i = kv - j + 1; i < kl; ++i) { + AB(i, j) = 0; + } + } + + // JU is the index of the last column affected by the current stage of + // the factorization + int ju = 0; + int info = 0; + for (int j = 0; j < Kokkos::min(m, n); ++j) { + // Set fill-in elements in column J+KV to zero + if (j + kv < n) { + for (int i = 0; i < kl; ++i) { + AB(i, j + kv) = 0; + } + } + + // Find pivot and test for singularity. KM is the number of subdiagonals + // elements in the current column. + int km = Kokkos::min(kl, m - j - 1); + auto cur_col_AB = Kokkos::subview(AB, Kokkos::pair(kv, kv + km + 1), j); + int jp = SerialIamax::invoke(cur_col_AB); + piv(j) = jp + j; + + if (AB(kv + jp, j) == 0) { + // If pivot is zero, set INFO to the index of the pivot unless a + // zero pivot has already been found. + if (info == 0) info = j + 1; + } else { + ju = Kokkos::max(ju, Kokkos::min(j + ku + jp, n - 1)); + + // Apply the interchange to columns J to JU + if (jp != 0) { + for (int k = 0; k < ju - j + 1; ++k) { + Kokkos::kokkos_swap(AB(kv + jp - k, j + k), AB(kv - k, j + k)); + } + } + + if (km > 0) { + // Compute multipliers + const auto alpha = 1.0 / AB(kv, j); + auto sub_col_AB = Kokkos::subview(AB, Kokkos::pair(kv + 1, kv + km + 1), j); + KokkosBlas::SerialScale::invoke(alpha, sub_col_AB); + + // Update trailing submatrix within the band + if (ju > j) { + auto x = Kokkos::subview(AB, Kokkos::pair(kv + 1, kv + km + 1), j); + + // dger or zgeru with alpha = -1.0 + const int abs0 = AB.stride(0), abs1 = AB.stride(1); + Impl::SerialGerInternal::invoke(KokkosBlas::Impl::OpID(), km, ju - j, -1.0, &AB(kv + 1, j), abs0, + &AB(kv - 1, j + 1), (abs1 - abs0), &AB(kv, j + 1), abs0, (abs1 - abs0)); + } + } + } + } + return 0; +} + +} // namespace Impl +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_GBTRF_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/src/KokkosBatched_Gbtrf.hpp b/batched/dense/src/KokkosBatched_Gbtrf.hpp new file mode 100644 index 0000000000..893dba3bac --- /dev/null +++ b/batched/dense/src/KokkosBatched_Gbtrf.hpp @@ -0,0 +1,54 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_GBTRF_HPP_ +#define KOKKOSBATCHED_GBTRF_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Gbtrf: +/// Compute a LU factorization of a m-by-n band matrix A using partial +/// pivoting with row interchanges. +/// \tparam ArgAlgo: Type indicating the blocked (KokkosBatched::Algo::Gbtrf::Blocked) or unblocked +/// (KokkosBatched::Algo::Gbtrf::Unblocked) algorithm to be used +/// +/// \tparam ABViewType: Input type for the matrix, needs to be a 2D view +/// \tparam PivViewType: Input type for the pivot indices, needs to be a 1D view +/// +/// \param Ab [inout]: A is a ldab by n band matrix, a rank 2 view +/// \param piv [out]: On exit, the pivot indices, a rank 1 view +/// \param kl [in]: The number of subdiagonals within the band of A. kl >= 0 +/// \param ku [in]: The number of superdiagonals within the band of A. ku >= 0 +/// \param m [in]: The number of rows of the matrix A. (optional, default is -1, corresponding to m == n) +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialGbtrf { + static_assert(std::is_same_v, "KokkosBatched::gbtrf: Use Algo::Gbtrf::Unblocked"); + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &Ab, const PivViewType &piv, const int kl, const int ku, + const int m = -1); +}; +} // namespace KokkosBatched + +#include "KokkosBatched_Gbtrf_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_GBTRF_HPP_ diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index 731fd03cc4..8a4f03e88b 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -66,6 +66,7 @@ #include "Test_Batched_SerialGer.hpp" #include "Test_Batched_SerialSyr.hpp" #include "Test_Batched_SerialLacgv.hpp" +#include "Test_Batched_SerialGbtrf.hpp" // Team Kernels #include "Test_Batched_TeamAxpy.hpp" diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index 9e6a1e6c7c..8045a0605f 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -269,7 +269,8 @@ void create_banded_pds_matrix(InViewType& in, OutViewType& out, int k = 1, bool } /// \brief Converts a banded matrix to a full matrix. -/// Takes a banded matrix in banded storage and converts it to a full matrix. +/// Takes a upper/lower triangular banded matrix in banded storage +/// and converts it to a full matrix. /// /// \tparam InViewType: Input type for the matrix, needs to be a 3D view /// \tparam OutViewType: Output type for the matrix, needs to be a 3D view @@ -309,6 +310,73 @@ void banded_to_full(InViewType& in, OutViewType& out, int k = 1) { Kokkos::deep_copy(out, h_out); } +/// \brief Converts a banded matrix to a full matrix. +/// Takes a banded matrix in banded storage and converts it to a full matrix. +/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// +/// \param in [in]: Input batched banded matrix, a rank 3 view +/// \param out [out]: Output batched full matrix, a rank 3 view +/// \param kl [in]: Number of subdiagonals for within the band of A (default is 1) +/// \param ku [in]: Number of superdiagonals for within the band of A (default is 1) +/// +template +void banded_to_full(InViewType& in, OutViewType& out, int kl = 1, int ku = 1) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int Nb = out.extent(0), m = out.extent(1), n = out.extent(2); + + Kokkos::deep_copy(h_in, in); + assert(in.extent(0) == out.extent(0)); + assert(in.extent(1) == static_cast(2 * kl + ku + 1)); + assert(in.extent(2) == out.extent(2)); + + for (int i0 = 0; i0 < Nb; i0++) { + for (int i1 = 0; i1 < m; i1++) { + for (int i2 = Kokkos::max(0, i1 - ku); i2 < Kokkos::min(n, i1 + kl + ku + 1); i2++) { + auto row_in_banded = kl + ku + i1 - i2; + h_out(i0, i1, i2) = h_in(i0, row_in_banded, i2); + } + } + } + Kokkos::deep_copy(out, h_out); +} + +/// \brief Converts a full matrix to a banded matrix. +/// Takes a full matrix and converts it to a banded matrix. +/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// +/// \param in [in]: Input batched full matrix, a rank 3 view +/// \param out [out]: Output batched banded matrix, a rank 3 view +/// \param kl [in]: Number of subdiagonals for within the band of A (default is 1) +/// \param ku [in]: Number of superdiagonals for within the band of A (default is 1) +/// +template +void full_to_banded(InViewType& in, OutViewType& out, int kl = 1, int ku = 1) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int Nb = in.extent(0), m = in.extent(1), n = in.extent(2); + + Kokkos::deep_copy(h_in, in); + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == static_cast(2 * kl + ku + 1)); + assert(out.extent(2) == in.extent(2)); + + for (int i0 = 0; i0 < Nb; i0++) { + for (int i1 = 0; i1 < m; i1++) { + for (int i2 = Kokkos::max(0, i1 - ku); i2 < Kokkos::min(n, i1 + kl + 1); i2++) { + // Do not use the element from the full matrix if it is outside the band + auto row_in_banded = kl + ku + i1 - i2; + h_out(i0, row_in_banded, i2) = h_in(i0, i1, i2); + } + } + } + Kokkos::deep_copy(out, h_out); +} + /// \brief Create a triangular matrix from an input matrix: /// Copies the input matrix into the upper/lower triangular of the output matrix specified /// by the parameter k. Zero out elements below/above the k-th diagonal. @@ -325,15 +393,15 @@ void banded_to_full(InViewType& in, OutViewType& out, int k = 1) { /// template void create_triangular_matrix(InViewType& in, OutViewType& out, int k = 0) { - auto h_in = Kokkos::create_mirror_view(in); - auto h_out = Kokkos::create_mirror_view(out); - const int N = in.extent(0), BlkSize = in.extent(1); + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int Nb = in.extent(0), m = in.extent(1), n = in.extent(2); Kokkos::deep_copy(h_in, in); Kokkos::deep_copy(h_out, 0.0); - for (int i0 = 0; i0 < N; i0++) { - for (int i1 = 0; i1 < BlkSize; i1++) { - for (int i2 = 0; i2 < BlkSize; i2++) { + for (int i0 = 0; i0 < Nb; i0++) { + for (int i1 = 0; i1 < m; i1++) { + for (int i2 = 0; i2 < n; i2++) { if constexpr (std::is_same_v) { // Upper // Zero out elements below the k-th diagonal @@ -344,7 +412,8 @@ void create_triangular_matrix(InViewType& in, OutViewType& out, int k = 0) { h_out(i0, i1, i2) = i2 > i1 + k ? 0.0 : h_in(i0, i1, i2); } } - + } + for (int i1 = 0; i1 < Kokkos::min(m, n); i1++) { if constexpr (std::is_same_v) { // Unit diagonal h_out(i0, i1, i1) = 1.0; diff --git a/batched/dense/unit_test/Test_Batched_SerialGbtrf.hpp b/batched/dense/unit_test/Test_Batched_SerialGbtrf.hpp new file mode 100644 index 0000000000..8a23b35fba --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialGbtrf.hpp @@ -0,0 +1,519 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include +#include +#include +#include +#include "Test_Batched_DenseUtils.hpp" + +namespace Test { +namespace Gbtrf { + +template +struct Functor_BatchedSerialGbtrf { + using execution_space = typename DeviceType::execution_space; + ABViewType m_ab; + PivViewType m_ipiv; + int m_kl, m_ku, m_m; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGbtrf(const ABViewType &ab, const PivViewType &ipiv, int kl, int ku, int m) + : m_ab(ab), m_ipiv(ipiv), m_kl(kl), m_ku(ku), m_m(m) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k, int &info) const { + auto sub_ab = Kokkos::subview(m_ab, k, Kokkos::ALL(), Kokkos::ALL()); + auto sub_ipiv = Kokkos::subview(m_ipiv, k, Kokkos::ALL()); + + info += KokkosBatched::SerialGbtrf::invoke(sub_ab, sub_ipiv, m_kl, m_ku, m_m); + } + + inline int run() { + using value_type = typename ABViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialGbtrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, m_ab.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGetrf { + using execution_space = typename DeviceType::execution_space; + AViewType m_a; + PivViewType m_ipiv; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGetrf(const AViewType &a, const PivViewType &ipiv) : m_a(a), m_ipiv(ipiv) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto sub_a = Kokkos::subview(m_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto sub_ipiv = Kokkos::subview(m_ipiv, k, Kokkos::ALL()); + + KokkosBatched::SerialGetrf::invoke(sub_a, sub_ipiv); + } + + void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialGbtrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, m_a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +/// \brief Implementation details of batched gbtrf analytical test +/// +/// \param Nb [in] Batch size of matrices +/// \param BlkSize [in] Block size of matrix A +/// 4x4 matrix +/// which satisfies PA = LU +/// P = [[0, 0, 1, 0], +/// [1, 0, 0, 0], +/// [0, 1, 0, 0], +/// [0, 0, 0, 1]] +/// A: [[1, -3, -2, 0], AB: [[0, 0, 0, 0], +/// [-1, 1, -3, -2], [0, 0, 0, 0], +/// [2, -1, 1, -3], [0, 0, -2, -2], +/// [0, 2, -1, 1]] [0, -3, -3, -3], +/// [1, 1, 1, 1], +/// [-1, -1, -1, 0], +/// [2, 2, 0, 0]] +/// +/// L = [[1, 0, 0, 0], LUB: [[0, 0, 0, 0], +/// [0.5, 1, 0, 0], [0, 0, 0, -3], +/// [-0.5, -0.2, 1, 0], [0, 0, 1, 1.5], +/// [0, -0.8, 1, 1]] [0, -1, -2.5, -3.2], +/// U = [[2, -1, 1, -3. ], [2, -2.5, -3, 5.4], +/// [0, -2.5, -2.5, 1.5 ], [-0.5, -0.2, 1, 0], +/// [0, 0, -3, -3.2], [0.5, -0.8, 0, 0]] +/// [0, 0, 0, 5.4]] +/// Note P is obtained by piv = [2 2 2 3] +/// +/// 3x4 matrix +/// which satisfies PA = LU +/// P = [[0, 1, 0], +/// [0, 0, 1], +/// [1, 0, 0]] +/// A: [[1, -3, -2, 0], AB: [[ 0, 0, 0, 0], +/// [-1, 1, -3, -2], [ 0, 0, 0, 0], +/// [2, -1, 1, -3]] [ 0, 0, -2, -2], +/// [ 0, -3, -3, -3], +/// [ 1, 1, 1, 0], +/// [-1, -1, 0, 0], +/// [ 2, 0, 0, 0]] +/// +/// L = [[1, 0, 0, 0], LUB: [[0, 0, 0, 0], +/// [0.5, 1, 0, 0], [0, 0, 0, -3], +/// [-0.5, -0.2, 1, 0]] [0, 0, 1, 1.5], +/// [0, -1, -2.5, -3.2], +/// U = [[2, -1, 1, -3. ], [2, -2.5, -3, 0], +/// [0, -2.5, -2.5, 1.5 ], [-0.5, -0.2, 0, 0], +/// [0, 0, -3, -3.2], [0.5, 0, 0, 0]] +/// [0, 0, 0, 0]] +/// Note P is obtained by piv = [2 2 2] +/// For simplicity, we represent the matrix U in 3x4 form instead of 4x4 +/// +/// 5x4 matrix +/// which satisfies PA = LU +/// P = [[0. 1. 0. 0. 0.], +/// [0. 0. 1. 0. 0.], +/// [1. 0. 0. 0. 0.], +/// [0. 0. 0. 1. 0.], +/// [0. 0. 0. 0. 1.]] +/// A: [[1, -3, -2, 0], AB: [[ 0, 0, 0, 0], +/// [-1, 1, -3, -2], [ 0, 0, 0, 0], +/// [2, -1, 1, -3], [ 0, 0, -2, -2], +/// [0, 2, -1, 1], [ 0, -3, -3, -3], +/// [0, 0, 0, 0]] [ 1, 1, 1, 1], +/// [-1, -1, -1, 0], +/// [ 2, 2, 0, 0]] +/// +/// L = [[1, 0, 0, 0], LUB: [[0, 0, 0, 0], +/// [0.5, 1, 0, 0], [0, 0, 0, -3], +/// [-0.5, -0.2, 1, 0], [0, 0, 1, 1.5], +/// [ 0, -0.8, 1, 1], [0, -1, -2.5, -3.2], +/// [ 0, 0, 0, 0]] [2, -2.5, -3, 5.4], +/// U = [[2, -1, 1, -3. ], [-0.5, -0.2, 1, 0], +/// [0, -2.5, -2.5, 1.5 ], [0.5, -0.8, 0, 0]] +/// [0, 0, -3, -3.2], +/// [0, 0, 0, 5.4]] +/// Note P is obtained by piv = [2 2 2 3] +/// For simplicity, we represent the matrix U in 5x4 form instead of 4x4 +template +void impl_test_batched_gbtrf_analytical(const int Nb) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View3DType = Kokkos::View; + using PivView2DType = Kokkos::View; + + const int BlkSize = 4, kl = 2, ku = 2; + const int ldab = 2 * kl + ku + 1; + View3DType A0("A0", Nb, BlkSize, BlkSize), NL0("NL0", Nb, BlkSize, BlkSize), NL0_ref("NL0_ref", Nb, BlkSize, BlkSize), + U0("U0", Nb, BlkSize, BlkSize), U0_ref("U0_ref", Nb, BlkSize, BlkSize); + View3DType A1("A1", Nb, BlkSize - 1, BlkSize), NL1("NL1", Nb, BlkSize - 1, BlkSize), + U1("U1", Nb, BlkSize - 1, BlkSize), NL1_ref("NL1_ref", Nb, BlkSize - 1, BlkSize), + U1_ref("U1_ref", Nb, BlkSize - 1, BlkSize); + View3DType A2("A2", Nb, BlkSize + 1, BlkSize), NL2("NL2", Nb, BlkSize + 1, BlkSize), + U2("U2", Nb, BlkSize + 1, BlkSize), NL2_ref("NL2_ref", Nb, BlkSize + 1, BlkSize), + U2_ref("U2_ref", Nb, BlkSize + 1, BlkSize); + + View3DType AB0("AB0", Nb, ldab, BlkSize), AB1("AB1", Nb, ldab, BlkSize), AB2("AB2", Nb, ldab, BlkSize); + PivView2DType ipiv0("ipiv0", Nb, BlkSize), ipiv0_ref("ipiv0_ref", Nb, BlkSize), ipiv1("ipiv1", Nb, BlkSize - 1), + ipiv1_ref("ipiv1_ref", Nb, BlkSize - 1), ipiv2("ipiv2", Nb, BlkSize), ipiv2_ref("ipiv2_ref", Nb, BlkSize); + + // Only filling A2 and deep_copy from its subview + auto h_A2 = Kokkos::create_mirror_view(A2); + auto h_NL2_ref = Kokkos::create_mirror_view(NL2_ref); + auto h_U2_ref = Kokkos::create_mirror_view(U2_ref); + auto h_ipiv2_ref = Kokkos::create_mirror_view(ipiv2_ref); + + for (int ib = 0; ib < Nb; ib++) { + h_A2(ib, 0, 0) = 1; + h_A2(ib, 0, 1) = -3; + h_A2(ib, 0, 2) = -2; + h_A2(ib, 0, 3) = 0; + h_A2(ib, 1, 0) = -1; + h_A2(ib, 1, 1) = 1; + h_A2(ib, 1, 2) = -3; + h_A2(ib, 1, 3) = -2; + h_A2(ib, 2, 0) = 2; + h_A2(ib, 2, 1) = -1; + h_A2(ib, 2, 2) = 1; + h_A2(ib, 2, 3) = -3; + h_A2(ib, 3, 0) = 0; + h_A2(ib, 3, 1) = 2; + h_A2(ib, 3, 2) = -1; + h_A2(ib, 3, 3) = 1; + h_A2(ib, 4, 0) = 0; + h_A2(ib, 4, 1) = 0; + + h_U2_ref(ib, 0, 0) = 2; + h_U2_ref(ib, 0, 1) = -1; + h_U2_ref(ib, 0, 2) = 1; + h_U2_ref(ib, 0, 3) = -3; + h_U2_ref(ib, 1, 1) = -2.5; + h_U2_ref(ib, 1, 2) = -2.5; + h_U2_ref(ib, 1, 3) = 1.5; + h_U2_ref(ib, 2, 2) = -3; + h_U2_ref(ib, 2, 3) = -3.2; + h_U2_ref(ib, 3, 3) = 5.4; + + h_NL2_ref(ib, 1, 0) = 0.5; + h_NL2_ref(ib, 2, 0) = -0.5; + h_NL2_ref(ib, 2, 1) = -0.2; + h_NL2_ref(ib, 3, 1) = -0.8; + h_NL2_ref(ib, 3, 2) = 1.0; + + h_ipiv2_ref(ib, 0) = 2; + h_ipiv2_ref(ib, 1) = 2; + h_ipiv2_ref(ib, 2) = 2; + h_ipiv2_ref(ib, 3) = 3; + } + + // Copy matrix A2 to device + Kokkos::deep_copy(A2, h_A2); + + // Copy submatrix of A2 to device + auto A2_m3 = Kokkos::subview(A2, Kokkos::ALL, Kokkos::pair(0, BlkSize - 1), Kokkos::ALL); + auto A2_m4 = Kokkos::subview(A2, Kokkos::ALL, Kokkos::pair(0, BlkSize), Kokkos::ALL); + auto h_NL2_ref_m3 = Kokkos::subview(h_NL2_ref, Kokkos::ALL, Kokkos::pair(0, BlkSize - 1), Kokkos::ALL); + auto h_NL2_ref_m4 = Kokkos::subview(h_NL2_ref, Kokkos::ALL, Kokkos::pair(0, BlkSize), Kokkos::ALL); + auto h_U2_ref_m3 = Kokkos::subview(h_U2_ref, Kokkos::ALL, Kokkos::pair(0, BlkSize - 1), Kokkos::ALL); + auto h_U2_ref_m4 = Kokkos::subview(h_U2_ref, Kokkos::ALL, Kokkos::pair(0, BlkSize), Kokkos::ALL); + + Kokkos::deep_copy(A0, A2_m4); // Extract 4x4 matrix + Kokkos::deep_copy(A1, A2_m3); // Extract 3x4 matrix + + auto h_NL0_ref = Kokkos::create_mirror_view(NL0_ref); + auto h_NL1_ref = Kokkos::create_mirror_view(NL1_ref); + auto h_U0_ref = Kokkos::create_mirror_view(U0_ref); + auto h_U1_ref = Kokkos::create_mirror_view(U1_ref); + Kokkos::deep_copy(h_NL0_ref, h_NL2_ref_m4); // Extract 4x4 matrix + Kokkos::deep_copy(h_NL1_ref, h_NL2_ref_m3); // Extract 3x4 matrix + Kokkos::deep_copy(h_U0_ref, h_U2_ref_m4); // Extract 4x4 matrix + Kokkos::deep_copy(h_U1_ref, h_U2_ref_m3); // Extract 3x4 matrix + + // Copy submatrix of ipiv2 to device + auto h_ipiv0_ref = Kokkos::create_mirror_view(ipiv0_ref); + auto h_ipiv1_ref = Kokkos::create_mirror_view(ipiv1_ref); + auto h_ipiv2_m3 = Kokkos::subview(h_ipiv2_ref, Kokkos::ALL, Kokkos::pair(0, BlkSize - 1)); + Kokkos::deep_copy(h_ipiv0_ref, h_ipiv2_ref); + Kokkos::deep_copy(h_ipiv1_ref, h_ipiv2_m3); + + // Convert into banded storage + full_to_banded(A0, AB0, kl, ku); + full_to_banded(A1, AB1, kl, ku); + full_to_banded(A2, AB2, kl, ku); + + // gbtrf to factorize matrix A = P * L * U + auto info0 = + Functor_BatchedSerialGbtrf(AB0, ipiv0, kl, ku, BlkSize).run(); + auto info1 = + Functor_BatchedSerialGbtrf(AB1, ipiv1, kl, ku, BlkSize - 1) + .run(); + auto info2 = + Functor_BatchedSerialGbtrf(AB2, ipiv2, kl, ku, BlkSize + 1) + .run(); + + Kokkos::fence(); + EXPECT_EQ(info0, 0); + EXPECT_EQ(info1, 0); + EXPECT_EQ(info2, 0); + + // Extract matrix U and L from AB + // first convert it to the full matrix (stored in A) + banded_to_full(AB0, A0, kl, ku); + banded_to_full(AB1, A1, kl, ku); + banded_to_full(AB2, A2, kl, ku); + + // Copy upper triangular components to U + create_triangular_matrix(A0, U0); + create_triangular_matrix(A1, U1); + create_triangular_matrix(A2, U2); + + // Extract L + // Apply pivot at host + auto h_ipiv0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), ipiv0); + auto h_ipiv1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), ipiv1); + auto h_ipiv2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), ipiv2); + auto h_A0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A0); + auto h_A1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A1); + Kokkos::deep_copy(h_A2, A2); + for (int ib = 0; ib < Nb; ib++) { + for (int j = 0; j < BlkSize - 1; j++) { + for (int i = j + 1; i < BlkSize - 1; i++) { + Kokkos::kokkos_swap(h_A0(ib, i, j), h_A0(ib, h_ipiv0(ib, i), j)); + } + } + + for (int j = 0; j < BlkSize - 2; j++) { + for (int i = j + 1; i < BlkSize - 1; i++) { + Kokkos::kokkos_swap(h_A1(ib, i, j), h_A1(ib, h_ipiv1(ib, i), j)); + } + } + + for (int j = 0; j < BlkSize; j++) { + for (int i = j + 1; i < BlkSize - 1; i++) { + Kokkos::kokkos_swap(h_A2(ib, i, j), h_A2(ib, h_ipiv2(ib, i), j)); + } + } + } + Kokkos::deep_copy(A0, h_A0); + Kokkos::deep_copy(A1, h_A1); + Kokkos::deep_copy(A2, h_A2); + + // Copy non-diagonal lower triangular components to NL + create_triangular_matrix(A0, NL0, + -1); + create_triangular_matrix(A1, NL1, + -1); + create_triangular_matrix(A2, NL2, + -1); + + // Check if U, NL and ipiv have expected values + auto h_U0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U0); + auto h_U1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U1); + auto h_U2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U2); + auto h_NL0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), NL0); + auto h_NL1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), NL1); + auto h_NL2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), NL2); + + RealType eps = 1.0e1 * ats::epsilon(); + for (int ib = 0; ib < Nb; ib++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_EQ(h_ipiv0(ib, j), h_ipiv0_ref(ib, j)); + EXPECT_EQ(h_ipiv2(ib, j), h_ipiv2_ref(ib, j)); + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_U0(ib, i, j), h_U0_ref(ib, i, j), eps, "h_U0"); + EXPECT_NEAR_KK(h_NL0(ib, i, j), h_NL0_ref(ib, i, j), eps, "h_NL0"); + } + for (int i = 0; i < BlkSize - 1; i++) { + EXPECT_NEAR_KK(h_U1(ib, i, j), h_U1_ref(ib, i, j), eps, "h_U1"); + EXPECT_NEAR_KK(h_NL1(ib, i, j), h_NL1_ref(ib, i, j), eps, "h_NL1"); + } + for (int i = 0; i < BlkSize + 1; i++) { + EXPECT_NEAR_KK(h_U2(ib, i, j), h_U2_ref(ib, i, j), eps, "h_U2"); + EXPECT_NEAR_KK(h_NL2(ib, i, j), h_NL2_ref(ib, i, j), eps, "h_NL2"); + } + } + for (int j = 0; j < BlkSize - 1; j++) { + EXPECT_EQ(h_ipiv1(ib, j), h_ipiv1_ref(ib, j)); + } + } +} + +/// \brief Implementation details of batched gbtrf test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +template +void impl_test_batched_gbtrf(const int Nb, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View3DType = Kokkos::View; + using PivView2DType = Kokkos::View; + + const int kl = 2, ku = 2; + const int ldab = 2 * kl + ku + 1; + View3DType A("A", Nb, BlkSize, BlkSize), LU("LU", Nb, BlkSize, BlkSize), NL("NL", Nb, BlkSize, BlkSize), + NL_ref("NL_ref", Nb, BlkSize, BlkSize), U("U", Nb, BlkSize, BlkSize), U_ref("U_ref", Nb, BlkSize, BlkSize); + + View3DType AB("AB", Nb, ldab, BlkSize), AB_upper("AB_upper", Nb, kl + ku + 1, BlkSize); + PivView2DType ipiv("ipiv", Nb, BlkSize), ipiv_ref("ipiv_ref", Nb, BlkSize); + + // Create a random matrix A and make it Positive Definite Symmetric + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + // Initialize LU with random matrix + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(LU, rand_pool, randStart, randEnd); + + full_to_banded(LU, AB, kl, ku); // In banded storage + banded_to_full(AB, A, kl, ku); // In full storage + + Kokkos::deep_copy(LU, A); // for getrf + + // gbtrf to factorize matrix A = P * L * U + Functor_BatchedSerialGbtrf(AB, ipiv, kl, ku, BlkSize).run(); + + // Extract matrix U and L from AB + // first convert it to the full matrix (stored in A) + banded_to_full(AB, A, kl, ku); + + // Copy upper triangular components to U + create_triangular_matrix(A, U); + + // Apply pivot at host + auto h_ipiv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), ipiv); + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + for (int ib = 0; ib < Nb; ib++) { + for (int j = 0; j < BlkSize - 1; j++) { + for (int i = j + 1; i < BlkSize - 1; i++) { + Kokkos::kokkos_swap(h_A(ib, i, j), h_A(ib, h_ipiv(ib, i), j)); + } + } + } + Kokkos::deep_copy(A, h_A); + + // Copy non-diagonal lower triangular components to NL + create_triangular_matrix(A, NL, -1); + + // Reference is made by getrf + // getrf to factorize matrix A = P * L * U + Functor_BatchedSerialGetrf(LU, ipiv_ref).run(); + + // Copy non-diagonal lower triangular components to NL + create_triangular_matrix(LU, NL_ref, + -1); + + // Copy upper triangular components to U_ref + create_triangular_matrix(LU, U_ref); + + // Check if U, NL and ipiv have expected values + auto h_U = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U); + auto h_U_ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U_ref); + auto h_NL = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), NL); + auto h_NL_ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), NL_ref); + auto h_ipiv_ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), ipiv_ref); + + RealType eps = 1.0e3 * ats::epsilon(); + for (int ib = 0; ib < Nb; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_EQ(h_ipiv(ib, i), h_ipiv_ref(ib, i)); + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_U(ib, i, j), h_U_ref(ib, i, j), eps); + EXPECT_NEAR_KK(h_NL(ib, i, j), h_NL_ref(ib, i, j), eps); + } + } + } +} + +} // namespace Gbtrf +} // namespace Test + +template +int test_batched_gbtrf() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Gbtrf::impl_test_batched_gbtrf_analytical(1); + Test::Gbtrf::impl_test_batched_gbtrf_analytical(2); + for (int i = 0; i < 10; i++) { + Test::Gbtrf::impl_test_batched_gbtrf(1, i); + Test::Gbtrf::impl_test_batched_gbtrf(2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Gbtrf::impl_test_batched_gbtrf_analytical(1); + Test::Gbtrf::impl_test_batched_gbtrf_analytical(2); + for (int i = 0; i < 10; i++) { + Test::Gbtrf::impl_test_batched_gbtrf(1, i); + Test::Gbtrf::impl_test_batched_gbtrf(2, i); + } + } +#endif + + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_gbtrf_float) { + using algo_tag_type = typename KokkosBatched::Algo::Gbtrf::Unblocked; + + test_batched_gbtrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_gbtrf_double) { + using algo_tag_type = typename KokkosBatched::Algo::Gbtrf::Unblocked; + + test_batched_gbtrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_gbtrf_fcomplex) { + using algo_tag_type = typename KokkosBatched::Algo::Gbtrf::Unblocked; + + test_batched_gbtrf, algo_tag_type>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_gbtrf_dcomplex) { + using algo_tag_type = typename KokkosBatched::Algo::Gbtrf::Unblocked; + + test_batched_gbtrf, algo_tag_type>(); +} +#endif diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 30015b46ce..094fe232bf 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -147,6 +147,7 @@ struct Algo { using Tbsv = Level2; using Pbtrf = Level2; using Pbtrs = Level2; + using Gbtrf = Level2; }; namespace Impl {