Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement batched serial gbtrf #2489

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions batched/dense/impl/KokkosBatched_Gbtrf_Serial_Impl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#ifndef KOKKOSBATCHED_GBTRF_SERIAL_IMPL_HPP_
#define KOKKOSBATCHED_GBTRF_SERIAL_IMPL_HPP_

#include <KokkosBatched_Util.hpp>
#include "KokkosBatched_Gbtrf_Serial_Internal.hpp"

namespace KokkosBatched {
namespace Impl {
template <typename ABViewType>
KOKKOS_INLINE_FUNCTION static int checkGbtrfInput([[maybe_unused]] const ABViewType &AB, [[maybe_unused]] const int kl,
[[maybe_unused]] const int ku, [[maybe_unused]] const int m) {
static_assert(Kokkos::is_view_v<ABViewType>, "KokkosBatched::gbtrf: ABViewType is not a Kokkos::View.");
static_assert(ABViewType::rank == 2, "KokkosBatched::gbtrs: ABViewType must have rank 2.");
#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
if (m < 0) {
Kokkos::printf(
"KokkosBatched::gbtrf: input parameter m must not be less than 0: m "
"= "
"%d\n",
m);
return 1;
}

if (kl < 0) {
Kokkos::printf(
"KokkosBatched::gbtrf: input parameter kl must not be less than 0: kl "
"= "
"%d\n",
kl);
return 1;
}

if (ku < 0) {
Kokkos::printf(
"KokkosBatched::gbtrf: input parameter ku must not be less than 0: ku "
"= "
"%d\n",
ku);
return 1;
}

const int lda = AB.extent(0);
if (lda < (2 * kl + ku + 1)) {
Kokkos::printf(
"KokkosBatched::gbtrs: leading dimension of A must be smaller than 2 * "
"kl + ku + 1: "
"lda = %d, kl = %d, ku = %d\n",
lda, kl, ku);
return 1;
}
#endif
return 0;
}
} // namespace Impl

template <>
struct SerialGbtrf<Algo::Gbtrf::Unblocked> {
template <typename ABViewType, typename PivViewType>
KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &AB, const PivViewType &piv, const int kl, const int ku,
const int m = -1) {
// default: m == n
int n = AB.extent(1);
int m_tmp = m > 0 ? m : n;

auto info = Impl::checkGbtrfInput(AB, kl, ku, m_tmp);
if (info) return info;

// Quick return if possible
if (m_tmp == 0 || n == 0) return 0;

return Impl::SerialGbtrfInternal<Algo::Gbtrf::Unblocked>::invoke(AB, piv, kl, ku, m_tmp);
}
};

} // namespace KokkosBatched

#endif // KOKKOSBATCHED_GBTRF_SERIAL_IMPL_HPP_
109 changes: 109 additions & 0 deletions batched/dense/impl/KokkosBatched_Gbtrf_Serial_Internal.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#ifndef KOKKOSBATCHED_GBTRF_SERIAL_INTERNAL_HPP_
#define KOKKOSBATCHED_GBTRF_SERIAL_INTERNAL_HPP_

#include <Kokkos_Swap.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBlas1_scal.hpp>
#include <KokkosBatched_Iamax.hpp>
#include "KokkosBatched_Ger_Serial_Internal.hpp"

namespace KokkosBatched {
namespace Impl {
template <typename AlgoType>
struct SerialGbtrfInternal {
template <typename ABViewType, typename PivViewType>
KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &AB, const PivViewType &piv, const int kl, const int ku,
const int m);
};

template <>
template <typename ABViewType, typename PivViewType>
KOKKOS_INLINE_FUNCTION int SerialGbtrfInternal<Algo::Gbtrf::Unblocked>::invoke(const ABViewType &AB,
const PivViewType &piv, const int kl,
const int ku, const int m) {
const int n = AB.extent(1);
// Upper bandwidth of U factor
const int kv = ku + kl;

// Gaussian elimination with partial pivoting
// Set fill-in elements in columns KU+2 to KV to zero
for (int j = ku + 1; j < Kokkos::min(kv, n); ++j) {
for (int i = kv - j + 1; i < kl; ++i) {
AB(i, j) = 0;
}
}

// JU is the index of the last column affected by the current stage of
// the factorization
int ju = 0;
int info = 0;
for (int j = 0; j < Kokkos::min(m, n); ++j) {
// Set fill-in elements in column J+KV to zero
if (j + kv < n) {
for (int i = 0; i < kl; ++i) {
AB(i, j + kv) = 0;
}
}

// Find pivot and test for singularity. KM is the number of subdiagonals
// elements in the current column.
int km = Kokkos::min(kl, m - j - 1);
auto cur_col_AB = Kokkos::subview(AB, Kokkos::pair<int, int>(kv, kv + km + 1), j);
int jp = SerialIamax::invoke(cur_col_AB);
piv(j) = jp + j;

if (AB(kv + jp, j) == 0) {
// If pivot is zero, set INFO to the index of the pivot unless a
// zero pivot has already been found.
if (info == 0) info = j + 1;
} else {
ju = Kokkos::max(ju, Kokkos::min(j + ku + jp, n - 1));

// Apply the interchange to columns J to JU
if (jp != 0) {
for (int k = 0; k < ju - j + 1; ++k) {
Kokkos::kokkos_swap(AB(kv + jp - k, j + k), AB(kv - k, j + k));
}
}

if (km > 0) {
// Compute multipliers
const auto alpha = 1.0 / AB(kv, j);
auto sub_col_AB = Kokkos::subview(AB, Kokkos::pair<int, int>(kv + 1, kv + km + 1), j);
KokkosBlas::SerialScale::invoke(alpha, sub_col_AB);

// Update trailing submatrix within the band
if (ju > j) {
auto x = Kokkos::subview(AB, Kokkos::pair<int, int>(kv + 1, kv + km + 1), j);

// dger or zgeru with alpha = -1.0
const int abs0 = AB.stride(0), abs1 = AB.stride(1);
Impl::SerialGerInternal::invoke(KokkosBlas::Impl::OpID(), km, ju - j, -1.0, &AB(kv + 1, j), abs0,
&AB(kv - 1, j + 1), (abs1 - abs0), &AB(kv, j + 1), abs0, (abs1 - abs0));
}
}
}
}
return 0;
}

} // namespace Impl
} // namespace KokkosBatched

#endif // KOKKOSBATCHED_GBTRF_SERIAL_INTERNAL_HPP_
54 changes: 54 additions & 0 deletions batched/dense/src/KokkosBatched_Gbtrf.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#ifndef KOKKOSBATCHED_GBTRF_HPP_
#define KOKKOSBATCHED_GBTRF_HPP_

#include <KokkosBatched_Util.hpp>

/// \author Yuuichi Asahi ([email protected])

namespace KokkosBatched {

/// \brief Serial Batched Gbtrf:
/// Compute a LU factorization of a m-by-n band matrix A using partial
/// pivoting with row interchanges.
/// \tparam ArgAlgo: Type indicating the blocked (KokkosBatched::Algo::Gbtrf::Blocked) or unblocked
/// (KokkosBatched::Algo::Gbtrf::Unblocked) algorithm to be used
///
/// \tparam ABViewType: Input type for the matrix, needs to be a 2D view
/// \tparam PivViewType: Input type for the pivot indices, needs to be a 1D view
///
/// \param Ab [inout]: A is a ldab by n band matrix, a rank 2 view
/// \param piv [out]: On exit, the pivot indices, a rank 1 view
/// \param kl [in]: The number of subdiagonals within the band of A. kl >= 0
/// \param ku [in]: The number of superdiagonals within the band of A. ku >= 0
/// \param m [in]: The number of rows of the matrix A. (optional, default is -1, corresponding to m == n)
///
/// No nested parallel_for is used inside of the function.
///

template <typename ArgAlgo>
struct SerialGbtrf {
static_assert(std::is_same_v<ArgAlgo, Algo::Gbtrf::Unblocked>, "KokkosBatched::gbtrf: Use Algo::Gbtrf::Unblocked");
template <typename ABViewType, typename PivViewType>
KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &Ab, const PivViewType &piv, const int kl, const int ku,
const int m = -1);
};
} // namespace KokkosBatched

#include "KokkosBatched_Gbtrf_Serial_Impl.hpp"

#endif // KOKKOSBATCHED_GBTRF_HPP_
1 change: 1 addition & 0 deletions batched/dense/unit_test/Test_Batched_Dense.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
#include "Test_Batched_SerialGer.hpp"
#include "Test_Batched_SerialSyr.hpp"
#include "Test_Batched_SerialLacgv.hpp"
#include "Test_Batched_SerialGbtrf.hpp"

// Team Kernels
#include "Test_Batched_TeamAxpy.hpp"
Expand Down
85 changes: 77 additions & 8 deletions batched/dense/unit_test/Test_Batched_DenseUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ void create_banded_pds_matrix(InViewType& in, OutViewType& out, int k = 1, bool
}

/// \brief Converts a banded matrix to a full matrix.
/// Takes a banded matrix in banded storage and converts it to a full matrix.
/// Takes a upper/lower triangular banded matrix in banded storage
/// and converts it to a full matrix.
///
/// \tparam InViewType: Input type for the matrix, needs to be a 3D view
/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view
Expand Down Expand Up @@ -309,6 +310,73 @@ void banded_to_full(InViewType& in, OutViewType& out, int k = 1) {
Kokkos::deep_copy(out, h_out);
}

/// \brief Converts a banded matrix to a full matrix.
/// Takes a banded matrix in banded storage and converts it to a full matrix.
///
/// \tparam InViewType: Input type for the matrix, needs to be a 3D view
/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view
///
/// \param in [in]: Input batched banded matrix, a rank 3 view
/// \param out [out]: Output batched full matrix, a rank 3 view
/// \param kl [in]: Number of subdiagonals for within the band of A (default is 1)
/// \param ku [in]: Number of superdiagonals for within the band of A (default is 1)
///
template <typename InViewType, typename OutViewType>
void banded_to_full(InViewType& in, OutViewType& out, int kl = 1, int ku = 1) {
auto h_in = Kokkos::create_mirror_view(in);
auto h_out = Kokkos::create_mirror_view(out);
const int Nb = out.extent(0), m = out.extent(1), n = out.extent(2);

Kokkos::deep_copy(h_in, in);
assert(in.extent(0) == out.extent(0));
assert(in.extent(1) == static_cast<std::size_t>(2 * kl + ku + 1));
assert(in.extent(2) == out.extent(2));

for (int i0 = 0; i0 < Nb; i0++) {
for (int i1 = 0; i1 < m; i1++) {
for (int i2 = Kokkos::max(0, i1 - ku); i2 < Kokkos::min(n, i1 + kl + ku + 1); i2++) {
auto row_in_banded = kl + ku + i1 - i2;
h_out(i0, i1, i2) = h_in(i0, row_in_banded, i2);
}
}
}
Kokkos::deep_copy(out, h_out);
}

/// \brief Converts a full matrix to a banded matrix.
/// Takes a full matrix and converts it to a banded matrix.
///
/// \tparam InViewType: Input type for the matrix, needs to be a 3D view
/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view
///
/// \param in [in]: Input batched full matrix, a rank 3 view
/// \param out [out]: Output batched banded matrix, a rank 3 view
/// \param kl [in]: Number of subdiagonals for within the band of A (default is 1)
/// \param ku [in]: Number of superdiagonals for within the band of A (default is 1)
///
template <typename InViewType, typename OutViewType>
void full_to_banded(InViewType& in, OutViewType& out, int kl = 1, int ku = 1) {
auto h_in = Kokkos::create_mirror_view(in);
auto h_out = Kokkos::create_mirror_view(out);
const int Nb = in.extent(0), m = in.extent(1), n = in.extent(2);

Kokkos::deep_copy(h_in, in);
assert(out.extent(0) == in.extent(0));
assert(out.extent(1) == static_cast<std::size_t>(2 * kl + ku + 1));
assert(out.extent(2) == in.extent(2));

for (int i0 = 0; i0 < Nb; i0++) {
for (int i1 = 0; i1 < m; i1++) {
for (int i2 = Kokkos::max(0, i1 - ku); i2 < Kokkos::min(n, i1 + kl + 1); i2++) {
// Do not use the element from the full matrix if it is outside the band
auto row_in_banded = kl + ku + i1 - i2;
h_out(i0, row_in_banded, i2) = h_in(i0, i1, i2);
}
}
}
Kokkos::deep_copy(out, h_out);
}

/// \brief Create a triangular matrix from an input matrix:
/// Copies the input matrix into the upper/lower triangular of the output matrix specified
/// by the parameter k. Zero out elements below/above the k-th diagonal.
Expand All @@ -325,15 +393,15 @@ void banded_to_full(InViewType& in, OutViewType& out, int k = 1) {
///
template <typename InViewType, typename OutViewType, typename UploType, typename DiagType>
void create_triangular_matrix(InViewType& in, OutViewType& out, int k = 0) {
auto h_in = Kokkos::create_mirror_view(in);
auto h_out = Kokkos::create_mirror_view(out);
const int N = in.extent(0), BlkSize = in.extent(1);
auto h_in = Kokkos::create_mirror_view(in);
auto h_out = Kokkos::create_mirror_view(out);
const int Nb = in.extent(0), m = in.extent(1), n = in.extent(2);

Kokkos::deep_copy(h_in, in);
Kokkos::deep_copy(h_out, 0.0);
for (int i0 = 0; i0 < N; i0++) {
for (int i1 = 0; i1 < BlkSize; i1++) {
for (int i2 = 0; i2 < BlkSize; i2++) {
for (int i0 = 0; i0 < Nb; i0++) {
for (int i1 = 0; i1 < m; i1++) {
for (int i2 = 0; i2 < n; i2++) {
if constexpr (std::is_same_v<UploType, KokkosBatched::Uplo::Upper>) {
// Upper
// Zero out elements below the k-th diagonal
Expand All @@ -344,7 +412,8 @@ void create_triangular_matrix(InViewType& in, OutViewType& out, int k = 0) {
h_out(i0, i1, i2) = i2 > i1 + k ? 0.0 : h_in(i0, i1, i2);
}
}

}
for (int i1 = 0; i1 < Kokkos::min(m, n); i1++) {
if constexpr (std::is_same_v<DiagType, KokkosBatched::Diag::Unit>) {
// Unit diagonal
h_out(i0, i1, i1) = 1.0;
Expand Down
Loading
Loading