diff --git a/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Impl.hpp new file mode 100644 index 0000000000..1687d82333 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Impl.hpp @@ -0,0 +1,81 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRF_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PBTRF_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Pbtrf_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkPbtrfInput([[maybe_unused]] const ABViewType &Ab) { + static_assert(Kokkos::is_view_v, "KokkosBatched::pbtrf: ABViewType is not a Kokkos::View."); + static_assert(ABViewType::rank == 2, "KokkosBatched::pbtrf: ABViewType must have rank 2."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int kd = Ab.extent(0) - 1; + if (kd < 0) { + Kokkos::printf( + "KokkosBatched::pbtrf: input parameter kd must not be less than 0: kd " + "= " + "%d\n", + kd); + return 1; + } +#endif + return 0; +} + +//// Lower //// +template <> +struct SerialPbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &Ab) { + // Quick return if possible + const int n = Ab.extent(1); + if (n == 0) return 0; + + auto info = checkPbtrfInput(Ab); + if (info) return info; + + const int kd = Ab.extent(0) - 1; + return SerialPbtrfInternalLower::invoke(n, Ab.data(), Ab.stride_0(), Ab.stride_1(), kd); + } +}; + +//// Upper //// +template <> +struct SerialPbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &Ab) { + // Quick return if possible + const int n = Ab.extent(1); + if (n == 0) return 0; + + auto info = checkPbtrfInput(Ab); + if (info) return info; + + const int kd = Ab.extent(0) - 1; + return SerialPbtrfInternalUpper::invoke(n, Ab.data(), Ab.stride_0(), Ab.stride_1(), kd); + } +}; + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRF_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Internal.hpp new file mode 100644 index 0000000000..0a4ed7d697 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Internal.hpp @@ -0,0 +1,272 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_PBTRF_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PBTRF_SERIAL_INTERNAL_HPP_ + +#include "KokkosBatched_Util.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" + +namespace KokkosBatched { + +/// +/// Serial Internal Impl +/// ==================== + +/// +/// Lower +/// + +template +struct SerialPbtrfInternalLower { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, const int as0, const int as1, + const int kd); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, + const int as1, const int kd); +}; + +/// +/// Real matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalLower::invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, + const int as0, const int as1, + const int kd) { + // Compute the Cholesky factorization A = L*L'. + for (int j = 0; j < an; ++j) { + auto a_jj = AB[0 * as0 + j * as1]; + + // Check if L (j, j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + return j + 1; + } +#endif + + a_jj = Kokkos::sqrt(a_jj); + AB[0 * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of column J and update the + // trailing submatrix within the band. + int kn = Kokkos::min(an - j - 1, kd); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[1 * as0 + j * as1]), 1); + + // syr (lower) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + 1) * as0 + j * as1]; + if (x_k != 0) { + auto temp = -1.0 * x_k; + for (int i = k; i < kn; ++i) { + auto x_i = AB[(i + 1) * as0 + j * as1]; + AB[i * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } + } + } + } + + return 0; +} + +/// +/// Complex matrix +/// +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalLower::invoke( + const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, const int as1, const int kd) { + // Compute the Cholesky factorization A = L*L**H + for (int j = 0; j < an; ++j) { + auto a_jj = AB[0 * as0 + j * as1].real(); + + // Check if L (j, j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + AB[0 * as0 + j * as1] = a_jj; + return j + 1; + } +#endif + + a_jj = Kokkos::sqrt(a_jj); + AB[0 * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of column J and update the + // trailing submatrix within the band. + int kn = Kokkos::min(kd, an - j - 1); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[1 * as0 + j * as1]), 1); + + // zher (lower) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + 1) * as0 + j * as1]; + if (x_k != 0) { + auto temp = -1.0 * Kokkos::conj(x_k); + AB[k * as0 + (j + 1) * as1] = AB[k * as0 + (j + 1) * as1].real() + (temp * x_k).real(); + for (int i = k + 1; i < kn; ++i) { + auto x_i = AB[(i + 1) * as0 + j * as1]; + AB[i * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } else { + AB[k * as0 + (j + 1) * as1] = AB[k * as0 + (j + 1) * as1].real(); + } + } + } + } + + return 0; +} + +/// +/// Upper +/// + +template +struct SerialPbtrfInternalUpper { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, const int as0, const int as1, + const int kd); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, + const int as1, const int kd); +}; + +/// +/// Real matrix +/// +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalUpper::invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, + const int as0, const int as1, + const int kd) { + // Compute the Cholesky factorization A = U'*U. + for (int j = 0; j < an; ++j) { + auto a_jj = AB[kd * as0 + j * as1]; + + // Check if U (j,j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + return j + 1; + } +#endif + a_jj = Kokkos::sqrt(a_jj); + AB[kd * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of row J and update the + // trailing submatrix within the band. + int kn = Kokkos::min(kd, an - j - 1); + int kld = Kokkos::max(1, as0 - 1); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[(kd - 1) * as0 + (j + 1) * as1]), kld); + + // syr (upper) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + kd - 1) * as0 + (j + 1 - k) * as1]; + if (x_k != 0) { + auto temp = -1.0 * x_k; + for (int i = 0; i < k + 1; ++i) { + auto x_i = AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]; + AB[(kd + i) * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } + } + } + } + + return 0; +} + +/// +/// Complex matrix +/// +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalUpper::invoke( + const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, const int as1, const int kd) { + // Compute the Cholesky factorization A = U**H * U. + for (int j = 0; j < an; ++j) { + auto a_jj = AB[kd * as0 + j * as1].real(); + + // Check if U (j,j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + AB[kd * as0 + j * as1] = a_jj; + return j + 1; + } +#endif + + a_jj = Kokkos::sqrt(a_jj); + AB[kd * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of row J and update the + // trailing submatrix within the band. + int kn = Kokkos::min(kd, an - j - 1); + int kld = Kokkos::max(1, as0 - 1); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[(kd - 1) * as0 + (j + 1) * as1]), kld); + + // zlacgv to diagonal elements + for (int i = 0; i < kn; ++i) { + AB[(i + kd - 1) * as0 + (j + 1 - i) * as1] = Kokkos::conj(AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]); + } + + // zher (upper) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + kd - 1) * as0 + (j + 1 - k) * as1]; + if (x_k != 0) { + auto temp = -1.0 * Kokkos::conj(x_k); + for (int i = 0; i < k + 1; ++i) { + auto x_i = AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]; + AB[(kd + i) * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } + } + + // zlacgv to diagonal elements + for (int i = 0; i < kn; ++i) { + AB[(i + kd - 1) * as0 + (j + 1 - i) * as1] = Kokkos::conj(AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]); + } + } + } + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRF_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/src/KokkosBatched_Pbtrf.hpp b/batched/dense/src/KokkosBatched_Pbtrf.hpp new file mode 100644 index 0000000000..879dfc0db3 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Pbtrf.hpp @@ -0,0 +1,54 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRF_HPP_ +#define KOKKOSBATCHED_PBTRF_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pbtrf: +/// Compute the Cholesky factorization U**H * U (or L * L**H) of a real +/// symmetric (or complex Hermitian) positive definite banded matrix A_l +/// for all l = 0, ..., +/// The factorization has the form +/// A = U**T * U , if ArgUplo = KokkosBatched::Uplo::Upper, or +/// A = L * L**T, if ArgUplo = KokkosBatched::Uplo::Lower, +/// where U is an upper triangular matrix, U**T is the transpose of U, and +/// L is lower triangular. +/// This is the unblocked version of the algorithm, calling Level 2 BLAS. +/// +/// \tparam ABViewType: Input type for a banded matrix, needs to be a 2D +/// view +/// +/// \param ab [inout]: ab is a ldab by n banded matrix, with ( kd + 1 ) diagonals +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialPbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &ab); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pbtrf_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PBTRF_HPP_ diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index 56ed46f592..a6a3a25c45 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -55,6 +55,9 @@ #include "Test_Batched_SerialPttrs.hpp" #include "Test_Batched_SerialPttrs_Real.hpp" #include "Test_Batched_SerialPttrs_Complex.hpp" +#include "Test_Batched_SerialPbtrf.hpp" +#include "Test_Batched_SerialPbtrf_Real.hpp" +#include "Test_Batched_SerialPbtrf_Complex.hpp" // Team Kernels #include "Test_Batched_TeamAxpy.hpp" diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index f536f220d3..eccdba50d1 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -145,6 +145,170 @@ void create_diagonal_matrix(InViewType& in, OutViewType& out, int k = 0) { Kokkos::deep_copy(out, h_out); } +/// \brief Creates a positive definite symmetric (PDS) matrix. +/// Takes a full random matrix and converts it to a full pds matrix. +/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// +/// \param in [in]: Input batched banded matrix, a rank 3 view +/// \param out [out]: Output batched full matrix, a rank 3 view +/// +template +void random_to_pds(InViewType& in, OutViewType& out) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + using value_type = typename InViewType::non_const_value_type; + + for (std::size_t i = 0; i < InViewType::rank(); i++) { + assert(out.extent(i) == in.extent(i)); + } + + Kokkos::deep_copy(h_in, in); + Kokkos::deep_copy(h_out, 0.0); + + for (int i0 = 0; i0 < N; i0++) { + // Make a hermitian matrix + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + if (i1 == i2) { + // Diagonal elements must be real + h_out(i0, i1, i2) = Kokkos::ArithTraits::real(h_in(i0, i1, i2)); + } else { + // Off-diagonal elements are complex and Hermitian + h_out(i0, i1, i2) = h_in(i0, i1, i2); + h_out(i0, i2, i1) = Kokkos::ArithTraits::conj(h_in(i0, i1, i2)); + } + } + } + + // Make matrix diagonal dominant + for (int i1 = 0; i1 < BlkSize; i1++) { + value_type sum = 0; + for (int i2 = 0; i2 < BlkSize; i2++) { + if (i1 != i2) { + sum += Kokkos::abs(h_out(i0, i1, i2)); + } + } + h_out(i0, i1, i1) = sum + 1.0; + } + } + Kokkos::deep_copy(out, h_out); +} + +/// \brief Creates a banded positive definite symmetric (PDS) matrix. +/// Takes a full diagonal dominant matrix and converts it to a banded pds matrix either +/// in banded or full storage. +/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// \tparam UploType: Type indicating whether the matrix is upper or lower triangular +/// +/// \param in [in]: Input batched banded matrix, a rank 3 view +/// \param out [out]: Output batched full matrix, a rank 3 view +/// \param k [in]: Number of sub/super-diagonals for lower/upper triangular (default is 1) +/// \param band_storage [in]: Boolean flag indicating whether the output should be in banded storage format (default is +/// true) +template +void create_banded_pds_matrix(InViewType& in, OutViewType& out, int k = 1, bool band_storage = true) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + using value_type = typename InViewType::non_const_value_type; + const int N = in.extent(0), BlkSize = in.extent(1); + + Kokkos::deep_copy(h_in, in); + + if (band_storage) { + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == static_cast(k + 1)); + assert(out.extent(2) == in.extent(2)); + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + h_out(i0, k - i1, i2) = h_in(i0, i2 - i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = 0; i2 < BlkSize - i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i2 + i1, i2); + } + } + } + } + } else { + for (std::size_t i = 0; i < InViewType::rank(); i++) { + assert(out.extent(i) == in.extent(i)); + } + + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = i1; i2 < Kokkos::min(i1 + k + 1, BlkSize); i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + h_out(i0, i2, i1) = Kokkos::ArithTraits::conj(h_in(i0, i1, i2)); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = Kokkos::max(0, i1 - k); i2 <= i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + h_out(i0, i2, i1) = Kokkos::ArithTraits::conj(h_in(i0, i1, i2)); + } + } + } + } + } + Kokkos::deep_copy(out, h_out); +} + +/// \brief Converts a banded matrix to a full matrix. +/// Takes a banded matrix in banded storage and converts it to a full matrix. +/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// \tparam UploType: Type indicating whether the matrix is upper or lower triangular +/// +/// \param in [in]: Input batched banded matrix, a rank 3 view +/// \param out [out]: Output batched full matrix, a rank 3 view +/// \param k [in]: Number of sub/super-diagonals for lower/upper triangular (default is 1) +/// +template +void banded_to_full(InViewType& in, OutViewType& out, int k = 1) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(2); + + Kokkos::deep_copy(h_in, in); + assert(in.extent(0) == out.extent(0)); + assert(in.extent(1) == static_cast(k + 1)); + assert(in.extent(2) == out.extent(2)); + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + h_out(i0, i2 - i1, i2) = h_in(i0, k - i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = 0; i2 < BlkSize - i1; i2++) { + h_out(i0, i2 + i1, i2) = h_in(i0, i1, i2); + } + } + } + } + Kokkos::deep_copy(out, h_out); +} + } // namespace KokkosBatched #endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/batched/dense/unit_test/Test_Batched_SerialPbtrf.hpp b/batched/dense/unit_test/Test_Batched_SerialPbtrf.hpp new file mode 100644 index 0000000000..0b16fab242 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPbtrf.hpp @@ -0,0 +1,322 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pbtrf.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pbtrf { + +template +struct ParamTag { + using uplo = U; +}; + +template +struct Functor_BatchedSerialPbtrf { + using execution_space = typename DeviceType::execution_space; + ABViewType _ab; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPbtrf(const ABViewType &ab) : _ab(ab) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k, int &info) const { + auto sub_ab = Kokkos::subview(_ab, k, Kokkos::ALL(), Kokkos::ALL()); + + info += KokkosBatched::SerialPbtrf::invoke(sub_ab); + } + + inline int run() { + using value_type = typename ABViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _ab.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + CViewType _c; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, const BViewType &b, const ScalarType beta, + const CViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pbtrf test +/// Confirm A = U**H * U or L * L**H, where +/// For full storage, +/// A: [[4, 1], +/// [1, 4]] +/// L: [[sqrt(4), 0], +/// [1/sqrt(4), sqrt(4 - (1/sqrt(4))**2)] +/// U: [[sqrt(4), 1/sqrt(4)], +/// [0, sqrt(4 - (1/sqrt(4))**2)] +/// +/// For lower banded storage, Ab = Lb * Lb**H +/// Ab: [[4, 4], +/// [1, 0]] +/// Lb: [[sqrt(4), sqrt(4 - (1/sqrt(4))**2)], +/// [1/sqrt(4), 0]] +/// +/// For upper banded storage, Ab = Ub**H * Ub +/// Ab: [[0, 1], +/// [4, 4]] +/// Ub: [[0, 1/sqrt(4)], +/// [sqrt(4), sqrt(4 - (1/sqrt(4))**2)]] +/// \param N [in] Batch size of AB +void impl_test_batched_pbtrf_analytical(const int N) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View3DType = Kokkos::View; + + constexpr int BlkSize = 2, k = 1; + View3DType A("A", N, BlkSize, BlkSize), Ab("Ab", N, k + 1, BlkSize), + Ab_ref("Ab_ref", N, k + 1, BlkSize); // Banded matrix + + auto h_A = Kokkos::create_mirror_view(A); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_A(ib, i, j) = i == j ? 4.0 : 1.0; + } + } + } + + Kokkos::deep_copy(A, h_A); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_triangular_matrix(A, Ab, k, true); + + // Make a reference using the naive Cholesky decomposition + // Cholesky decomposition for full storage + // l_kk = np.sqrt( a_kk - sum_{i=1}^{k-1}( l_ik^2 ) ) + // l_ik = 1/l_kk * ( a_ik - sum_{j=1}^{k-1}( l_ij * l_kj ) ) + auto h_Ab_ref = Kokkos::create_mirror_view(Ab_ref); + auto h_Ab = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ab); + if (std::is_same_v) { + // A = U**H * U + for (int ib = 0; ib < N; ib++) { + h_Ab_ref(ib, 1, 0) = Kokkos::sqrt(h_Ab(ib, 1, 0)); + h_Ab_ref(ib, 0, 1) = 1.0 / h_Ab_ref(ib, 1, 0); + h_Ab_ref(ib, 1, 1) = Kokkos::sqrt(h_Ab(ib, 1, 1) - h_Ab_ref(ib, 0, 1) * h_Ab_ref(ib, 0, 1)); + } + } else { + // A = L * L**H + for (int ib = 0; ib < N; ib++) { + h_Ab_ref(ib, 0, 0) = Kokkos::sqrt(h_Ab(ib, 0, 0)); + h_Ab_ref(ib, 1, 0) = 1.0 / h_Ab_ref(ib, 0, 0); + h_Ab_ref(ib, 0, 1) = Kokkos::sqrt(h_Ab(ib, 0, 1) - h_Ab_ref(ib, 1, 0) * h_Ab_ref(ib, 1, 0)); + } + } + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + auto info = Functor_BatchedSerialPbtrf(Ab).run(); + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + // Check if Ab == Ub or Lb + Kokkos::deep_copy(h_Ab, Ab); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < k + 1; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_Ab(ib, i, j), h_Ab_ref(ib, i, j), eps); + } + } + } +} + +template +/// \brief Implementation details of batched pbtrs test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pbtrf(const int N, const int k, const int BlkSize) { + using View3DType = Kokkos::View; + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize), + Ab("Ab", N, k + 1, BlkSize); // Banded matrix + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + // Initialize A_reconst with random matrix + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + + // Make the matrix Positive Definite Symmetric and Diagonal dominant + random_to_pds(A, A_reconst); + Kokkos::deep_copy(A, 0.0); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_pds_matrix(A_reconst, A, k, false); + + create_banded_triangular_matrix(A_reconst, Ab, k, true); + + // Clear matrix + Kokkos::deep_copy(A_reconst, 0.0); + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + auto info = Functor_BatchedSerialPbtrf(Ab).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + if (std::is_same_v) { + // A = U**H * U + View3DType U("U", N, BlkSize, BlkSize), Uc("Uc", N, BlkSize, BlkSize); + banded_to_full(Ab, U, k); + + // Compute the complex conjugate of U + // U -> conj(U) + auto h_U = Kokkos::create_mirror_view(U); + auto h_Uc = Kokkos::create_mirror_view(Uc); + Kokkos::deep_copy(h_U, U); + Kokkos::deep_copy(h_Uc, Uc); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_Uc(ib, i, j) = Kokkos::ArithTraits::conj(h_U(ib, i, j)); + } + } + } + Kokkos::deep_copy(Uc, h_Uc); + + // Create conjugate of U + Functor_BatchedSerialGemm(1.0, Uc, U, 0.0, A_reconst) + .run(); + } else { + // A = L * L**H + View3DType L("L", N, BlkSize, BlkSize), Lc("Lc", N, BlkSize, BlkSize); + banded_to_full(Ab, L, k); + + // Compute the complex conjugate of L + // L -> conj(L) + auto h_L = Kokkos::create_mirror_view(L); + auto h_Lc = Kokkos::create_mirror_view(Lc); + Kokkos::deep_copy(h_L, L); + Kokkos::deep_copy(h_Lc, Lc); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_Lc(ib, i, j) = Kokkos::ArithTraits::conj(h_L(ib, i, j)); + } + } + } + Kokkos::deep_copy(Lc, h_Lc); + + // Create conjugate of L + Functor_BatchedSerialGemm(1.0, L, Lc, 0.0, A_reconst) + .run(); + } + + // this eps is about 10^-14 + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check if A = U**H * U or A = L * L**H + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +} // namespace Pbtrf +} // namespace Test + +template +int test_batched_pbtrf() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Pbtrf::impl_test_batched_pbtrf_analytical(1); + Test::Pbtrf::impl_test_batched_pbtrf_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrf::impl_test_batched_pbtrf(1, k, i); + Test::Pbtrf::impl_test_batched_pbtrf(2, k, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Pbtrf::impl_test_batched_pbtrf_analytical(1); + Test::Pbtrf::impl_test_batched_pbtrf_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrf::impl_test_batched_pbtrf(1, k, i); + Test::Pbtrf::impl_test_batched_pbtrf(2, k, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialPbtrf_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialPbtrf_Complex.hpp new file mode 100644 index 0000000000..3aebb8ffa5 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPbtrf_Complex.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pbtrf_l_fcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_fcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrf_l_dcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_dcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPbtrf_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialPbtrf_Real.hpp new file mode 100644 index 0000000000..e1b77416f5 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPbtrf_Real.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pbtrf_l_float) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_float) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrf_l_double) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_double) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +#endif diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 964d82c7bb..67e833de7e 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -119,6 +119,7 @@ struct Algo { using Trsv = Level2; using ApplyQ = Level2; using Tbsv = Level2; + using Pbtrf = Level2; }; namespace Impl {