From 1b2eb9b8c49d1a05e60e388dac65d4193417290a Mon Sep 17 00:00:00 2001
From: Christian Glusa <caglusa@sandia.gov>
Date: Tue, 5 Nov 2024 15:22:02 -0700
Subject: [PATCH 1/3] MueLu: Fix for phase 2b

Compute aggWeight locally, per vertex inside the expansion kernel, instead of
accumulating it in a global per-aggregate array.

Signed-off-by: Christian Glusa <caglusa@sandia.gov>
---
 ...MueLu_AggregationPhase2bAlgorithm_decl.hpp |  18 +-
 .../MueLu_AggregationPhase2bAlgorithm_def.hpp | 357 ++++++++----------
 .../MueLu_UncoupledAggregationFactory_def.hpp |   3 +-
 .../unit_tests_kokkos/Aggregates_kokkos.cpp   |   1 +
 4 files changed, 173 insertions(+), 206 deletions(-)
diff --git a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_AggregationPhase2bAlgorithm_decl.hpp b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_AggregationPhase2bAlgorithm_decl.hpp
index b1dc9913bbb1..dd86190c590a 100644
--- a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_AggregationPhase2bAlgorithm_decl.hpp
+++ b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_AggregationPhase2bAlgorithm_decl.hpp
@@ -73,17 +73,13 @@ class AggregationPhase2bAlgorithm : public MueLu::AggregationAlgorithmBase<Local
                        typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType& aggStat,
                        LO& numNonAggregatedNodes) const;
 
-  void BuildAggregatesRandom(const ParameterList& params,
-                             const LWGraph_kokkos& graph,
-                             Aggregates& aggregates,
-                             typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType& aggStat,
-                             LO& numNonAggregatedNodes) const;
-
-  void BuildAggregatesDeterministic(const ParameterList& params,
-                                    const LWGraph_kokkos& graph,
-                                    Aggregates& aggregates,
-                                    typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType& aggStat,
-                                    LO& numNonAggregatedNodes) const;
+  template <bool deterministic>
+  void BuildAggregates(const ParameterList& params,
+                       const LWGraph_kokkos graph,
+                       Aggregates& aggregates,
+                       typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType aggStat,
+                       LO& numNonAggregatedNodes) const;
+
   //@}
 
   std::string description() const { return "Phase 2b (expansion)"; }
diff --git a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_AggregationPhase2bAlgorithm_def.hpp b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_AggregationPhase2bAlgorithm_def.hpp
index ca6802c31f5f..7e2292cdccba 100644
--- a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_AggregationPhase2bAlgorithm_def.hpp
+++ b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_AggregationPhase2bAlgorithm_def.hpp
@@ -39,12 +39,12 @@ void AggregationPhase2bAlgorithm<LocalOrdinal, GlobalOrdinal, Node>::BuildAggreg
 
   LO numLocalAggregates = aggregates.GetNumAggregates();
 
-  const int defaultConnectWeight = 100;
-  const int penaltyConnectWeight = 10;
+  const LO defaultConnectWeight = 100;
+  const LO penaltyConnectWeight = 10;
 
-  std::vector<int> aggWeight(numLocalAggregates, 0);
-  std::vector<int> connectWeight(numRows, defaultConnectWeight);
-  std::vector<int> aggPenalties(numRows, 0);
+  std::vector<LO> aggWeight(numLocalAggregates, 0);
+  std::vector<LO> connectWeight(numRows, defaultConnectWeight);
+  std::vector<LO> aggPenalties(numRows, 0);
 
   // We do this cycle twice.
   // I don't know why, but ML does it too
@@ -118,24 +118,126 @@ void AggregationPhase2bAlgorithm<LocalOrdinal, GlobalOrdinal, Node>::
                     LO& numNonAggregatedNodes) const {
   if (params.get<bool>("aggregation: deterministic")) {
     Monitor m(*this, "BuildAggregatesDeterministic");
-    BuildAggregatesDeterministic(params, graph, aggregates, aggStat, numNonAggregatedNodes);
+    BuildAggregates<true>(params, graph, aggregates, aggStat, numNonAggregatedNodes);
   } else {
     Monitor m(*this, "BuildAggregatesRandom");
-    BuildAggregatesRandom(params, graph, aggregates, aggStat, numNonAggregatedNodes);
+    BuildAggregates<false>(params, graph, aggregates, aggStat, numNonAggregatedNodes);
   }
 
 }  // BuildAggregates
 
-template <class LO, class GO, class Node>
-void AggregationPhase2bAlgorithm<LO, GO, Node>::
-    BuildAggregatesRandom(const ParameterList& params,
-                          const LWGraph_kokkos& graph,
-                          Aggregates& aggregates,
-                          typename AggregationAlgorithmBase<LO, GO, Node>::AggStatType& aggStat,
-                          LO& numNonAggregatedNodes) const {
+// Kernel for aggregation phase 2b (expansion): each READY row of the current color scores every neighboring local aggregate as
+// (sum of connectWeight over the row's neighbors in that aggregate) - aggPenalties(aggregate) and joins the best one if its score is >= 0.
+template <class AggStatType, class ProcWinnerType, class Vertex2AggType, class ColorsType, class LocalGraphType, class AggPenaltyType, class LO, bool deterministic, bool matchMLbehavior>
+class ExpansionFunctor {
+ private:
+  AggStatType aggStat;
+  ProcWinnerType procWinner;
+  Vertex2AggType vertex2AggId;
+  ColorsType colors;
+  LocalGraphType lclLWGraph;
+  AggPenaltyType aggPenalties;
+  AggPenaltyType aggPenaltyUpdates;
+  AggPenaltyType connectWeight;
+  LO penaltyConnectWeight;
+  LO color;
+  LO myRank;
+
+ public:
+  // Constructor that also binds aggPenaltyUpdates_, the view incremented by operator() when deterministic == true.
+  ExpansionFunctor(AggStatType& aggStat_, ProcWinnerType& procWinner_, Vertex2AggType& vertex2AggId_, ColorsType& colors_, LocalGraphType& lclLWGraph_, AggPenaltyType& aggPenalties_, AggPenaltyType& aggPenaltyUpdates_, AggPenaltyType& connectWeight_, LO penaltyConnectWeight_, LO color_, LO rank_)
+    : aggStat(aggStat_)
+    , procWinner(procWinner_)
+    , vertex2AggId(vertex2AggId_)
+    , colors(colors_)
+    , lclLWGraph(lclLWGraph_)
+    , aggPenalties(aggPenalties_)
+    , aggPenaltyUpdates(aggPenaltyUpdates_)
+    , connectWeight(connectWeight_)
+    , penaltyConnectWeight(penaltyConnectWeight_)
+    , color(color_)
+    , myRank(rank_) {}
+
+  // Constructor without aggPenaltyUpdates (the member stays default-constructed); operator() increments aggPenalties directly when deterministic == false.
+  ExpansionFunctor(AggStatType& aggStat_, ProcWinnerType& procWinner_, Vertex2AggType& vertex2AggId_, ColorsType& colors_, LocalGraphType& lclLWGraph_, AggPenaltyType& aggPenalties_, AggPenaltyType& connectWeight_, LO penaltyConnectWeight_, LO color_, LO rank_)
+    : aggStat(aggStat_)
+    , procWinner(procWinner_)
+    , vertex2AggId(vertex2AggId_)
+    , colors(colors_)
+    , lclLWGraph(lclLWGraph_)
+    , aggPenalties(aggPenalties_)
+    , connectWeight(connectWeight_)
+    , penaltyConnectWeight(penaltyConnectWeight_)
+    , color(color_)
+    , myRank(rank_) {}
+
+  // Reduction body: row i is processed only if it is READY and has the current color; tmpNumAggregated counts the rows aggregated here.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const LO& i, LO& tmpNumAggregated) const {
+    if (aggStat(i) != READY || colors(i) != color)
+      return;
+
+    int bestScore   = -100000;
+    int bestAggId   = -1;
+    int bestConnect = -1;
+
+    auto neighOfINode = lclLWGraph.getNeighborVertices(i);
+    for (int j = 0; j < neighOfINode.length; j++) {
+      LO neigh = neighOfINode(j);
+      if (lclLWGraph.isLocalNeighborVertex(neigh) && (aggStat(neigh) == AGGREGATED)) {
+        // Weight of this neighbor's aggregate as seen from row i only: summed connectWeight of i's local, AGGREGATED neighbors in it.
+        auto aggId   = vertex2AggId(neigh, 0);
+        LO aggWeight = 0;
+        for (int k = 0; k < neighOfINode.length; k++) {
+          LO neigh2 = neighOfINode(k);
+          if (lclLWGraph.isLocalNeighborVertex(neigh2) && (aggStat(neigh2) == AGGREGATED) && (vertex2AggId(neigh2, 0) == aggId))
+            aggWeight += connectWeight(neigh2);
+        }
+
+        // With matchMLbehavior, a zero weight ends the processing of row i before anything has been written for it.
+        if (matchMLbehavior && (aggWeight == 0))
+          return;
+
+        int score = aggWeight - aggPenalties(aggId);
+        if (score > bestScore) {
+          bestAggId   = aggId;
+          bestScore   = score;
+          bestConnect = connectWeight(neigh);
+        } else if (aggId == bestAggId && connectWeight(neigh) > bestConnect) {
+          bestConnect = connectWeight(neigh);
+        }
+      }
+    }
+    if (bestScore >= 0) {
+      aggStat(i)         = AGGREGATED;
+      vertex2AggId(i, 0) = bestAggId;
+      procWinner(i, 0)   = myRank;
+
+      // Deterministic mode records the penalty increment in aggPenaltyUpdates instead of aggPenalties.
+      if constexpr (deterministic) {
+        Kokkos::atomic_add(&aggPenaltyUpdates(bestAggId), 1);
+      } else {
+        Kokkos::atomic_add(&aggPenalties(bestAggId), 1);
+      }
+      connectWeight(i) = bestConnect - penaltyConnectWeight;
+      tmpNumAggregated++;
+    }
+  }
+};
+
+template <class LocalOrdinal, class GlobalOrdinal, class Node>
+template <bool deterministic>
+void AggregationPhase2bAlgorithm<LocalOrdinal, GlobalOrdinal, Node>::
+    BuildAggregates(const ParameterList& params,
+                    const LWGraph_kokkos graph,
+                    Aggregates& aggregates,
+                    typename AggregationAlgorithmBase<LocalOrdinal, GlobalOrdinal, Node>::AggStatType aggStat,
+                    LO& numNonAggregatedNodes) const {
   using device_type     = typename LWGraph_kokkos::device_type;
   using execution_space = typename LWGraph_kokkos::execution_space;
 
+  bool matchMLbehavior = params.get<bool>("aggregation: match ML phase2b");
+
   const LO numRows = graph.GetNodeNumVertices();
   const int myRank = graph.GetComm()->getRank();
 
@@ -145,14 +247,14 @@ void AggregationPhase2bAlgorithm<LO, GO, Node>::
   const LO numColors          = aggregates.GetGraphNumColors();
   const LO numLocalAggregates = aggregates.GetNumAggregates();
 
-  auto lclLWGraph = graph;
-
   const LO defaultConnectWeight = 100;
   const LO penaltyConnectWeight = 10;
 
-  Kokkos::View<LO*, device_type> aggWeight(Kokkos::ViewAllocateWithoutInitializing("aggWeight"), numLocalAggregates);  // This gets re-initialized at the start of each "color" loop
   Kokkos::View<LO*, device_type> connectWeight(Kokkos::ViewAllocateWithoutInitializing("connectWeight"), numRows);
   Kokkos::View<LO*, device_type> aggPenalties("aggPenalties", numLocalAggregates);  // This gets initialized to zero here
+  Kokkos::View<LO*, device_type> aggPenaltyUpdates;
+  // if constexpr (deterministic)
+  aggPenaltyUpdates = Kokkos::View<LO*, device_type>("aggPenaltyUpdates", numLocalAggregates);
 
   Kokkos::deep_copy(connectWeight, defaultConnectWeight);
 
@@ -170,190 +272,59 @@ void AggregationPhase2bAlgorithm<LO, GO, Node>::
   }
   for (int iter = 0; iter < maxIters; ++iter) {
     for (LO color = 1; color <= numColors; ++color) {
-      Kokkos::deep_copy(aggWeight, 0);
-
       // the reduce counts how many nodes are aggregated by this phase,
       // which will then be subtracted from numNonAggregatedNodes
       LO numAggregated = 0;
-      Kokkos::parallel_reduce(
-          "Aggregation Phase 2b: aggregates expansion",
-          Kokkos::RangePolicy<execution_space>(0, numRows),
-          KOKKOS_LAMBDA(const LO i, LO& tmpNumAggregated) {
-            if (aggStat(i) != READY || colors(i) != color)
-              return;
-
-            auto neighOfINode = lclLWGraph.getNeighborVertices(i);
-            for (int j = 0; j < neighOfINode.length; j++) {
-              LO neigh = neighOfINode(j);
-
-              // We don't check (neigh != i), as it is covered by checking
-              // (aggStat[neigh] == AGGREGATED)
-              if (lclLWGraph.isLocalNeighborVertex(neigh) &&
-                  aggStat(neigh) == AGGREGATED)
-                Kokkos::atomic_add(&aggWeight(vertex2AggId(neigh, 0)),
-                                   connectWeight(neigh));
-            }
-
-            int bestScore   = -100000;
-            int bestAggId   = -1;
-            int bestConnect = -1;
-
-            for (int j = 0; j < neighOfINode.length; j++) {
-              LO neigh = neighOfINode(j);
-
-              if (lclLWGraph.isLocalNeighborVertex(neigh) &&
-                  aggStat(neigh) == AGGREGATED) {
-                auto aggId = vertex2AggId(neigh, 0);
-                int score  = aggWeight(aggId) - aggPenalties(aggId);
-
-                if (score > bestScore) {
-                  bestAggId   = aggId;
-                  bestScore   = score;
-                  bestConnect = connectWeight(neigh);
-
-                } else if (aggId == bestAggId &&
-                           connectWeight(neigh) > bestConnect) {
-                  bestConnect = connectWeight(neigh);
-                }
-              }
-            }
-            if (bestScore >= 0) {
-              aggStat(i)         = AGGREGATED;
-              vertex2AggId(i, 0) = bestAggId;
-              procWinner(i, 0)   = myRank;
-
-              Kokkos::atomic_add(&aggPenalties(bestAggId), 1);
-              connectWeight(i) = bestConnect - penaltyConnectWeight;
-              tmpNumAggregated++;
-            }
-          },
-          numAggregated);  // parallel_for
-      numNonAggregatedNodes -= numAggregated;
-    }
-  }  // loop over maxIters
 
-}  // BuildAggregatesRandom
-
-template <class LO, class GO, class Node>
-void AggregationPhase2bAlgorithm<LO, GO, Node>::
-    BuildAggregatesDeterministic(const ParameterList& params,
-                                 const LWGraph_kokkos& graph,
-                                 Aggregates& aggregates,
-                                 typename AggregationAlgorithmBase<LO, GO, Node>::AggStatType& aggStat,
-                                 LO& numNonAggregatedNodes) const {
-  using device_type     = typename LWGraph_kokkos::device_type;
-  using execution_space = typename LWGraph_kokkos::execution_space;
-
-  const LO numRows = graph.GetNodeNumVertices();
-  const int myRank = graph.GetComm()->getRank();
-
-  auto vertex2AggId     = aggregates.GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadWrite);
-  auto procWinner       = aggregates.GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadWrite);
-  auto colors           = aggregates.GetGraphColors();
-  const LO numColors    = aggregates.GetGraphNumColors();
-  LO numLocalAggregates = aggregates.GetNumAggregates();
-
-  auto lclLWGraph = graph;
-
-  const int defaultConnectWeight = 100;
-  const int penaltyConnectWeight = 10;
-
-  Kokkos::View<int*, device_type> connectWeight(Kokkos::ViewAllocateWithoutInitializing("connectWeight"), numRows);
-  Kokkos::View<int*, device_type> aggWeight(Kokkos::ViewAllocateWithoutInitializing("aggWeight"), numLocalAggregates);  // This gets re-initialized at the start of each "color" loop
-  Kokkos::View<int*, device_type> aggPenaltyUpdates("aggPenaltyUpdates", numLocalAggregates);
-  Kokkos::View<int*, device_type> aggPenalties("aggPenalties", numLocalAggregates);
-
-  Kokkos::deep_copy(connectWeight, defaultConnectWeight);
+      if constexpr (deterministic) {
+        if (matchMLbehavior) {
+          auto functor = ExpansionFunctor<decltype(aggStat), decltype(procWinner), decltype(vertex2AggId), decltype(colors), decltype(graph), decltype(aggPenalties), LO, true, true>(aggStat, procWinner, vertex2AggId, colors, graph, aggPenalties, aggPenaltyUpdates, connectWeight, penaltyConnectWeight, color, myRank);
+
+          Kokkos::parallel_reduce("Aggregation Phase 2b: aggregates expansion",
+                                  Kokkos::RangePolicy<execution_space>(0, numRows),
+                                  functor,
+                                  numAggregated);
+        } else {
+          auto functor = ExpansionFunctor<decltype(aggStat), decltype(procWinner), decltype(vertex2AggId), decltype(colors), decltype(graph), decltype(aggPenalties), LO, true, false>(aggStat, procWinner, vertex2AggId, colors, graph, aggPenalties, aggPenaltyUpdates, connectWeight, penaltyConnectWeight, color, myRank);
+
+          Kokkos::parallel_reduce("Aggregation Phase 2b: aggregates expansion",
+                                  Kokkos::RangePolicy<execution_space>(0, numRows),
+                                  functor,
+                                  numAggregated);
+        }
+      } else {
+        if (matchMLbehavior) {
+          auto functor = ExpansionFunctor<decltype(aggStat), decltype(procWinner), decltype(vertex2AggId), decltype(colors), decltype(graph), decltype(aggPenalties), LO, false, true>(aggStat, procWinner, vertex2AggId, colors, graph, aggPenalties, connectWeight, penaltyConnectWeight, color, myRank);
+
+          Kokkos::parallel_reduce("Aggregation Phase 2b: aggregates expansion",
+                                  Kokkos::RangePolicy<execution_space>(0, numRows),
+                                  functor,
+                                  numAggregated);
+        } else {
+          auto functor = ExpansionFunctor<decltype(aggStat), decltype(procWinner), decltype(vertex2AggId), decltype(colors), decltype(graph), decltype(aggPenalties), LO, false, false>(aggStat, procWinner, vertex2AggId, colors, graph, aggPenalties, connectWeight, penaltyConnectWeight, color, myRank);
+
+          Kokkos::parallel_reduce("Aggregation Phase 2b: aggregates expansion",
+                                  Kokkos::RangePolicy<execution_space>(0, numRows),
+                                  functor,
+                                  numAggregated);
+        }
+      }
 
-  // We do this cycle twice.
-  // I don't know why, but ML does it too
-  // taw: by running the aggregation routine more than once there is a chance that also
-  // non-aggregated nodes with a node distance of two are added to existing aggregates.
-  // Assuming that the aggregate size is 3 in each direction running the algorithm only twice
-  // should be sufficient.
-  int maxIters             = 2;
-  int maxNodesPerAggregate = params.get<int>("aggregation: max agg size");
-  if (maxNodesPerAggregate == std::numeric_limits<int>::max()) {
-    maxIters = 1;
-  }
-  for (int iter = 0; iter < maxIters; ++iter) {
-    for (LO color = 1; color <= numColors; color++) {
-      Kokkos::deep_copy(aggWeight, 0);
+      if constexpr (deterministic) {
+        Kokkos::parallel_for(
+            "Aggregation Phase 2b: updating agg penalties",
+            Kokkos::RangePolicy<execution_space>(0, numLocalAggregates),
+            KOKKOS_LAMBDA(const LO agg) {
+              aggPenalties(agg) += aggPenaltyUpdates(agg);
+              aggPenaltyUpdates(agg) = 0;
+            });
+      }
 
-      // the reduce counts how many nodes are aggregated by this phase,
-      // which will then be subtracted from numNonAggregatedNodes
-      LO numAggregated = 0;
-      Kokkos::parallel_for(
-          "Aggregation Phase 2b: updating agg weights",
-          Kokkos::RangePolicy<execution_space>(0, numRows),
-          KOKKOS_LAMBDA(const LO i) {
-            if (aggStat(i) != READY || colors(i) != color)
-              return;
-            auto neighOfINode = lclLWGraph.getNeighborVertices(i);
-            for (int j = 0; j < neighOfINode.length; j++) {
-              LO neigh = neighOfINode(j);
-              // We don't check (neigh != i), as it is covered by checking
-              // (aggStat[neigh] == AGGREGATED)
-              if (lclLWGraph.isLocalNeighborVertex(neigh) &&
-                  aggStat(neigh) == AGGREGATED)
-                Kokkos::atomic_add(&aggWeight(vertex2AggId(neigh, 0)),
-                                   connectWeight(neigh));
-            }
-          });
-
-      Kokkos::parallel_reduce(
-          "Aggregation Phase 2b: aggregates expansion",
-          Kokkos::RangePolicy<execution_space>(0, numRows),
-          KOKKOS_LAMBDA(const LO i, LO& tmpNumAggregated) {
-            if (aggStat(i) != READY || colors(i) != color)
-              return;
-            int bestScore   = -100000;
-            int bestAggId   = -1;
-            int bestConnect = -1;
-
-            auto neighOfINode = lclLWGraph.getNeighborVertices(i);
-            for (int j = 0; j < neighOfINode.length; j++) {
-              LO neigh = neighOfINode(j);
-
-              if (lclLWGraph.isLocalNeighborVertex(neigh) &&
-                  aggStat(neigh) == AGGREGATED) {
-                auto aggId = vertex2AggId(neigh, 0);
-                int score  = aggWeight(aggId) - aggPenalties(aggId);
-
-                if (score > bestScore) {
-                  bestAggId   = aggId;
-                  bestScore   = score;
-                  bestConnect = connectWeight(neigh);
-
-                } else if (aggId == bestAggId &&
-                           connectWeight(neigh) > bestConnect) {
-                  bestConnect = connectWeight(neigh);
-                }
-              }
-            }
-            if (bestScore >= 0) {
-              aggStat(i)         = AGGREGATED;
-              vertex2AggId(i, 0) = bestAggId;
-              procWinner(i, 0)   = myRank;
-
-              Kokkos::atomic_add(&aggPenaltyUpdates(bestAggId), 1);
-              connectWeight(i) = bestConnect - penaltyConnectWeight;
-              tmpNumAggregated++;
-            }
-          },
-          numAggregated);  // parallel_reduce
-
-      Kokkos::parallel_for(
-          "Aggregation Phase 2b: updating agg penalties",
-          Kokkos::RangePolicy<execution_space>(0, numLocalAggregates),
-          KOKKOS_LAMBDA(const LO agg) {
-            aggPenalties(agg) += aggPenaltyUpdates(agg);
-            aggPenaltyUpdates(agg) = 0;
-          });
       numNonAggregatedNodes -= numAggregated;
     }
-  }  // loop over k
-}  // BuildAggregatesDeterministic
+  }  // loop over maxIters
+
+}  // BuildAggregates
 
 }  // namespace MueLu
 
diff --git a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp
index 386451d1cfc3..170506f48d39 100644
--- a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp
+++ b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp
@@ -190,9 +190,8 @@ void UncoupledAggregationFactory<LocalOrdinal, GlobalOrdinal, Node>::Build(Level
     runOnHost    = false;
 
     TEUCHOS_TEST_FOR_EXCEPTION(pL.get<bool>("aggregation: use interface aggregation"), std::invalid_argument, "Option: 'aggregation: use interface aggregation' is not supported in the Kokkos version of uncoupled aggregation");
-    // Sanity Checking: match ML behavior is not supported in UncoupledAggregation_Kokkos in Phase 1 or Phase 2b, but is in 2a
+    // Sanity check: match ML behavior is not supported in UncoupledAggregation_Kokkos in Phase 1, but it is in Phases 2a and 2b
     TEUCHOS_TEST_FOR_EXCEPTION(pL.get<bool>("aggregation: match ML phase1"), std::invalid_argument, "Option: 'aggregation: match ML phase1' is not supported in the Kokkos version of uncoupled aggregation");
-    TEUCHOS_TEST_FOR_EXCEPTION(pL.get<bool>("aggregation: match ML phase2b"), std::invalid_argument, "Option: 'aggregation: match ML phase2b' is not supported in the Kokkos version of uncoupled aggregation");
   }
 
   // Build
diff --git a/packages/muelu/test/unit_tests_kokkos/Aggregates_kokkos.cpp b/packages/muelu/test/unit_tests_kokkos/Aggregates_kokkos.cpp
index cca61d42e331..5ea76cbaec3c 100644
--- a/packages/muelu/test/unit_tests_kokkos/Aggregates_kokkos.cpp
+++ b/packages/muelu/test/unit_tests_kokkos/Aggregates_kokkos.cpp
@@ -102,6 +102,7 @@ void gimmeUncoupledAggregates(const Teuchos::RCP<Xpetra::Matrix<Scalar, LocalOrd
   params.set<bool>("aggregation: deterministic", false);
 
   params.set<bool>("aggregation: match ML phase2a", true);
+  params.set<bool>("aggregation: match ML phase2b", false);
   params.set<bool>("aggregation: error on nodes with no on-rank neighbors", false);
   params.set<bool>("aggregation: phase3 avoid singletons", false);
 

From 7f3e984a4f14e36466fbb488c7bf9addbe42ff37 Mon Sep 17 00:00:00 2001
From: Christian Glusa <caglusa@sandia.gov>
Date: Tue, 19 Nov 2024 17:31:32 -0700
Subject: [PATCH 2/3] MueLu: Rebase gold files

Signed-off-by: Christian Glusa <caglusa@sandia.gov>
---
 .../kokkos/Output/operator_solve_1_np1_tpetra.gold          | 6 +++---
 .../kokkos/Output/operator_solve_1_np4_tpetra.gold          | 6 +++---
 .../kokkos/Output/operator_solve_5_np1_tpetra.gold          | 6 +++---
 .../kokkos/Output/operator_solve_5_np4_tpetra.gold          | 6 +++---
 .../kokkos/Output/operator_solve_6_np1_tpetra.gold          | 6 +++---
 .../kokkos/Output/operator_solve_6_np4_tpetra.gold          | 6 +++---
 6 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/packages/muelu/test/interface/kokkos/Output/operator_solve_1_np1_tpetra.gold b/packages/muelu/test/interface/kokkos/Output/operator_solve_1_np1_tpetra.gold
index 61ecae9e7780..186ca4496970 100644
--- a/packages/muelu/test/interface/kokkos/Output/operator_solve_1_np1_tpetra.gold
+++ b/packages/muelu/test/interface/kokkos/Output/operator_solve_1_np1_tpetra.gold
@@ -120,14 +120,14 @@ Cycle type          = V
 level  rows   nnz    nnz/row  c ratio  procs
   0  10000  49600  4.96                  1  
   1  1700   14928  8.78     5.88         1  
-  2  192    1682   8.76     8.85         1  
-  3  24     200    8.33     8.00         1  
+  2  192    1674   8.72     8.85         1  
+  3  24     190    7.92     8.00         1  
 
 Smoother (level 0) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [10000, 10000], Global nnz: 49600}
 
 Smoother (level 1) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [1700, 1700], Global nnz: 14928}
 
-Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [192, 192], Global nnz: 1682}
+Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [192, 192], Global nnz: 1674}
 
 Smoother (level 3) pre  : <Direct> solver interface
 Smoother (level 3) post : no smoother
diff --git a/packages/muelu/test/interface/kokkos/Output/operator_solve_1_np4_tpetra.gold b/packages/muelu/test/interface/kokkos/Output/operator_solve_1_np4_tpetra.gold
index 0117cfa2fd4d..df9604a89b67 100644
--- a/packages/muelu/test/interface/kokkos/Output/operator_solve_1_np4_tpetra.gold
+++ b/packages/muelu/test/interface/kokkos/Output/operator_solve_1_np4_tpetra.gold
@@ -120,14 +120,14 @@ Cycle type          = V
 level  rows   nnz    nnz/row  c ratio  procs
   0  10000  49600  4.96                  4  
   1  1700   15318  9.01     5.88         4  
-  2  216    2158   9.99     7.87         4  
-  3  32     446    13.94    6.75         4  
+  2  216    2150   9.95     7.87         4  
+  3  32     434    13.56    6.75         4  
 
 Smoother (level 0) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [10000, 10000], Global nnz: 49600}
 
 Smoother (level 1) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [1700, 1700], Global nnz: 15318}
 
-Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [216, 216], Global nnz: 2158}
+Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [216, 216], Global nnz: 2150}
 
 Smoother (level 3) pre  : <Direct> solver interface
 Smoother (level 3) post : no smoother
diff --git a/packages/muelu/test/interface/kokkos/Output/operator_solve_5_np1_tpetra.gold b/packages/muelu/test/interface/kokkos/Output/operator_solve_5_np1_tpetra.gold
index 622442f2328f..0b51e98970e5 100644
--- a/packages/muelu/test/interface/kokkos/Output/operator_solve_5_np1_tpetra.gold
+++ b/packages/muelu/test/interface/kokkos/Output/operator_solve_5_np1_tpetra.gold
@@ -95,14 +95,14 @@ Cycle type          = V
 level  rows   nnz    nnz/row  c ratio  procs
   0  10000  49600  4.96                  1  
   1  1700   14928  8.78     5.88         1  
-  2  192    1682   8.76     8.85         1  
-  3  24     200    8.33     8.00         1  
+  2  192    1674   8.72     8.85         1  
+  3  24     190    7.92     8.00         1  
 
 Smoother (level 0) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [10000, 10000], Global nnz: 49600}
 
 Smoother (level 1) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [1700, 1700], Global nnz: 14928}
 
-Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [192, 192], Global nnz: 1682}
+Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [192, 192], Global nnz: 1674}
 
 Smoother (level 3) pre  : <Direct> solver interface
 Smoother (level 3) post : no smoother
diff --git a/packages/muelu/test/interface/kokkos/Output/operator_solve_5_np4_tpetra.gold b/packages/muelu/test/interface/kokkos/Output/operator_solve_5_np4_tpetra.gold
index a747e7c913f7..baf6c048c339 100644
--- a/packages/muelu/test/interface/kokkos/Output/operator_solve_5_np4_tpetra.gold
+++ b/packages/muelu/test/interface/kokkos/Output/operator_solve_5_np4_tpetra.gold
@@ -95,14 +95,14 @@ Cycle type          = V
 level  rows   nnz    nnz/row  c ratio  procs
   0  10000  49600  4.96                  4  
   1  1700   15318  9.01     5.88         4  
-  2  216    2158   9.99     7.87         4  
-  3  32     446    13.94    6.75         4  
+  2  216    2150   9.95     7.87         4  
+  3  32     434    13.56    6.75         4  
 
 Smoother (level 0) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [10000, 10000], Global nnz: 49600}
 
 Smoother (level 1) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [1700, 1700], Global nnz: 15318}
 
-Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [216, 216], Global nnz: 2158}
+Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [216, 216], Global nnz: 2150}
 
 Smoother (level 3) pre  : <Direct> solver interface
 Smoother (level 3) post : no smoother
diff --git a/packages/muelu/test/interface/kokkos/Output/operator_solve_6_np1_tpetra.gold b/packages/muelu/test/interface/kokkos/Output/operator_solve_6_np1_tpetra.gold
index fe5914900fb0..49df428e7ac1 100644
--- a/packages/muelu/test/interface/kokkos/Output/operator_solve_6_np1_tpetra.gold
+++ b/packages/muelu/test/interface/kokkos/Output/operator_solve_6_np1_tpetra.gold
@@ -100,14 +100,14 @@ Cycle type          = V
 level  rows   nnz    nnz/row  c ratio  procs
   0  10000  49600  4.96                  1  
   1  1700   14928  8.78     5.88         1  
-  2  192    1682   8.76     8.85         1  
-  3  24     200    8.33     8.00         1  
+  2  192    1674   8.72     8.85         1  
+  3  24     190    7.92     8.00         1  
 
 Smoother (level 0) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [10000, 10000], Global nnz: 49600}
 
 Smoother (level 1) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [1700, 1700], Global nnz: 14928}
 
-Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [192, 192], Global nnz: 1682}
+Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [192, 192], Global nnz: 1674}
 
 Smoother (level 3) pre  : <Direct> solver interface
 Smoother (level 3) post : no smoother
diff --git a/packages/muelu/test/interface/kokkos/Output/operator_solve_6_np4_tpetra.gold b/packages/muelu/test/interface/kokkos/Output/operator_solve_6_np4_tpetra.gold
index 0d4663cf3256..9fb6a3101a8b 100644
--- a/packages/muelu/test/interface/kokkos/Output/operator_solve_6_np4_tpetra.gold
+++ b/packages/muelu/test/interface/kokkos/Output/operator_solve_6_np4_tpetra.gold
@@ -100,14 +100,14 @@ Cycle type          = V
 level  rows   nnz    nnz/row  c ratio  procs
   0  10000  49600  4.96                  4  
   1  1700   15318  9.01     5.88         4  
-  2  216    2158   9.99     7.87         4  
-  3  32     446    13.94    6.75         4  
+  2  216    2150   9.95     7.87         4  
+  3  32     434    13.56    6.75         4  
 
 Smoother (level 0) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [10000, 10000], Global nnz: 49600}
 
 Smoother (level 1) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [1700, 1700], Global nnz: 15318}
 
-Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [216, 216], Global nnz: 2158}
+Smoother (level 2) both : "Ifpack2::Relaxation": {Initialized: true, Computed: true, Type: Symmetric Gauss-Seidel, sweeps: 1, damping factor: 1, Global matrix dimensions: [216, 216], Global nnz: 2150}
 
 Smoother (level 3) pre  : <Direct> solver interface
 Smoother (level 3) post : no smoother

From ff0636bbe636fcd49fd7c8b84809783060c6a1fd Mon Sep 17 00:00:00 2001
From: Christian Glusa <caglusa@sandia.gov>
Date: Tue, 19 Nov 2024 17:42:32 -0700
Subject: [PATCH 3/3] MueLu Regression test: adjust deepcopy counts, fix Aggregation test name

Signed-off-by: Christian Glusa <caglusa@sandia.gov>
---
 packages/muelu/test/unit_tests_kokkos/Regression.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/packages/muelu/test/unit_tests_kokkos/Regression.cpp b/packages/muelu/test/unit_tests_kokkos/Regression.cpp
index e9ab09ea37a2..46f4f88777ce 100644
--- a/packages/muelu/test/unit_tests_kokkos/Regression.cpp
+++ b/packages/muelu/test/unit_tests_kokkos/Regression.cpp
@@ -98,12 +98,12 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Regression, H2D, Scalar, LocalOrdinal, GlobalO
   }
 #ifdef KOKKOS_HAS_SHARED_SPACE
   else {
-    size_t targetNumDeepCopies = kkNativeDeepCopies + (std::is_same_v<typename Node::memory_space, Kokkos::SharedSpace> ? 19 : 34);
+    size_t targetNumDeepCopies = kkNativeDeepCopies + (std::is_same_v<typename Node::memory_space, Kokkos::SharedSpace> ? 19 : 31);
     TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), targetNumDeepCopies);
   }
 #else
   else {
-    TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), kkNativeDeepCopies + 34);
+    TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), kkNativeDeepCopies + 31);
   }
 #endif  // KOKKOS_HAS_SHARED_SPACE
 
@@ -130,7 +130,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Regression, H2D, Scalar, LocalOrdinal, GlobalO
 
 }  // H2D
 
-TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Regression, Aggregration, Scalar, LocalOrdinal, GlobalOrdinal, Node) {
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Regression, Aggregation, Scalar, LocalOrdinal, GlobalOrdinal, Node) {
 #include <MueLu_UseShortNames.hpp>
   MUELU_TESTING_SET_OSTREAM;
   MUELU_TESTING_LIMIT_SCOPE(Scalar, GlobalOrdinal, Node);
@@ -175,12 +175,12 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Regression, Aggregration, Scalar, LocalOrdinal
   }
 #ifdef KOKKOS_HAS_SHARED_SPACE
   else {
-    size_t targetNumDeepCopies = std::is_same_v<typename Node::memory_space, Kokkos::SharedSpace> ? 17 : 23;
+    size_t targetNumDeepCopies = std::is_same_v<typename Node::memory_space, Kokkos::SharedSpace> ? 17 : 16;
     TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), targetNumDeepCopies);
   }
 #else
   else {
-    TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), 23);
+    TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), 16);
   }
 #endif
 
@@ -193,7 +193,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Regression, Aggregration, Scalar, LocalOrdinal
 
 #define MUELU_ETI_GROUP(Scalar, LO, GO, Node)                                 \
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Regression, H2D, Scalar, LO, GO, Node) \
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Regression, Aggregration, Scalar, LO, GO, Node)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Regression, Aggregation, Scalar, LO, GO, Node)
 
 #include <MueLu_ETI_4arg.hpp>