diff --git a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp index 53bd87de9a5d..78f5085479d0 100644 --- a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp @@ -1053,6 +1053,11 @@ description () const } else { out << "INVALID"; } + + // BlockCrs if we have that + if(hasBlockCrsMatrix_) + out<<", BlockCrs"; + // Print the approximate # rows per part int approx_rows_per_part = A_->getLocalNumRows()/Partitioner_->numLocalParts(); out <<", blocksize: "<(A, H); } + template + void ReuseTpetraPreconditioner(const Teuchos::RCP >& inA, + MueLu::TpetraOperator& Op) { + typedef Scalar SC; + typedef LocalOrdinal LO; + typedef GlobalOrdinal GO; + typedef Node NO; + + typedef Xpetra::Matrix Matrix; + typedef MueLu ::Hierarchy Hierarchy; + + RCP H = Op.GetHierarchy(); + RCP > temp = rcp(new Xpetra::TpetraBlockCrsMatrix(inA)); + TEUCHOS_TEST_FOR_EXCEPTION(temp==Teuchos::null, Exceptions::RuntimeError, "ReuseTpetraPreconditioner: cast from Tpetra::BlockCrsMatrix to Xpetra::TpetraBlockCrsMatrix failed."); + RCP A = rcp(new Xpetra::CrsMatrixWrap(temp)); + + MueLu::ReuseXpetraPreconditioner(A, H); + } + } //namespace diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_def.hpp index 549b00075d81..9b4f08644349 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_def.hpp @@ -75,11 +75,30 @@ namespace MueLu { RCP A = Get< RCP >(currentLevel, "A"); + /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. + fullblocksize is the number of storage blocks that must kept together during the amalgamation process. + + Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: + + numPDEs = fullblocksize * storageblocksize. + + If numPDEs==1 + Matrix is point storage (classical CRS storage). storageblocksize=1 and fullblocksize=1 + No other values makes sense. + + If numPDEs>1 + If matrix uses point storage, then storageblocksize=1 and fullblockssize=numPDEs. + If matrix uses block storage, with block size of n, then storageblocksize=n, and fullblocksize=numPDEs/n. + Thus far, only storageblocksize=numPDEs and fullblocksize=1 has been tested. + */ + + LO fullblocksize = 1; // block dim for fixed size blocks GO offset = 0; // global offset of dof gids LO blockid = -1; // block id in strided map LO nStridedOffset = 0; // DOF offset for strided block id "blockid" (default = 0) LO stridedblocksize = fullblocksize; // size of strided block id "blockid" (default = fullblocksize, only if blockid!=-1 stridedblocksize <= fullblocksize) + LO storageblocksize = A->GetStorageBlockSize(); // GO indexBase = A->getRowMap()->getIndexBase(); // index base for maps (unused) // 1) check for blocking/striding information @@ -101,6 +120,12 @@ namespace MueLu { } else { stridedblocksize = fullblocksize; } + // Correct for the storageblocksize + // NOTE: Before this point fullblocksize is actually numPDEs + TEUCHOS_TEST_FOR_EXCEPTION(fullblocksize % storageblocksize != 0,Exceptions::RuntimeError,"AmalgamationFactory: fullblocksize needs to be a multiple of A->GetStorageBlockSize()"); + fullblocksize /= storageblocksize; + stridedblocksize /= storageblocksize; + oldView = A->SwitchToView(oldView); GetOStream(Runtime1) << "AmalagamationFactory::Build():" << " found fullblocksize=" << fullblocksize << " and stridedblocksize=" << stridedblocksize << " from strided maps. offset=" << offset << std::endl; @@ -108,6 +133,7 @@ namespace MueLu { GetOStream(Warnings0) << "AmalagamationFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl; } + // build node row map (uniqueMap) and node column map (nonUniqueMap) // the arrays rowTranslation and colTranslation contain the local node id // given a local dof id. They are only necessary for the CoalesceDropFactory if @@ -166,7 +192,7 @@ namespace MueLu { container filter; GO offset = 0; - LO blkSize = A.GetFixedBlockSize(); + LO blkSize = A.GetFixedBlockSize() / A.GetStorageBlockSize(); if (A.IsView("stridedMaps") == true) { Teuchos::RCP myMap = A.getRowMap("stridedMaps"); Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_kokkos_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_kokkos_def.hpp index e945469b5ba1..cc284df2e27b 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_kokkos_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_kokkos_def.hpp @@ -80,11 +80,29 @@ namespace MueLu { RCP A = Get< RCP >(currentLevel, "A"); + /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. + fullblocksize is the number of storage blocks that must kept together during the amalgamation process. + + Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: + + numPDEs = fullblocksize * storageblocksize. + + If numPDEs==1 + Matrix is point storage (classical CRS storage). storageblocksize=1 and fullblocksize=1 + No other values makes sense. + + If numPDEs>1 + If matrix uses point storage, then storageblocksize=1 and fullblockssize=numPDEs. + If matrix uses block storage, with block size of n, then storageblocksize=n, and fullblocksize=numPDEs/n. + Thus far, only storageblocksize=numPDEs and fullblocksize=1 has been tested. + */ + LO fullblocksize = 1; // block dim for fixed size blocks GO offset = 0; // global offset of dof gids LO blockid = -1; // block id in strided map LO nStridedOffset = 0; // DOF offset for strided block id "blockid" (default = 0) LO stridedblocksize = fullblocksize; // size of strided block id "blockid" (default = fullblocksize, only if blockid!=-1 stridedblocksize <= fullblocksize) + LO storageblocksize = A->GetStorageBlockSize(); // GO indexBase = A->getRowMap()->getIndexBase(); // index base for maps (unused) // 1) check for blocking/striding information @@ -106,6 +124,12 @@ namespace MueLu { } else { stridedblocksize = fullblocksize; } + // Correct for the storageblocksize + // NOTE: Before this point fullblocksize is actually numPDEs + TEUCHOS_TEST_FOR_EXCEPTION(fullblocksize % storageblocksize != 0,Exceptions::RuntimeError,"AmalgamationFactory::Build(): fullblocksize needs to be a multiple of A->GetStorageBlockSize()"); + fullblocksize /= storageblocksize; + stridedblocksize /= storageblocksize; + oldView = A->SwitchToView(oldView); GetOStream(Runtime1) << "AmalagamationFactory::Build():" << " found fullblocksize=" << fullblocksize << " and stridedblocksize=" << stridedblocksize << " from strided maps. offset=" << offset << std::endl; @@ -172,7 +196,7 @@ namespace MueLu { container filter; // TODO: replace std::set with an object having faster lookup/insert, hashtable for instance GO offset = 0; - LO blkSize = A.GetFixedBlockSize(); + LO blkSize = A.GetFixedBlockSize() / A.GetStorageBlockSize(); if (A.IsView("stridedMaps") == true) { Teuchos::RCP myMap = A.getRowMap("stridedMaps"); Teuchos::RCP strMap = Teuchos::rcp_dynamic_cast(myMap); diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationInfo_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationInfo_def.hpp index 9bd4b73d1169..edfad670c279 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationInfo_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationInfo_def.hpp @@ -132,7 +132,7 @@ namespace MueLu { template void AmalgamationInfo::UnamalgamateAggregatesLO(const Aggregates& aggregates, - Teuchos::ArrayRCP& aggStart, Teuchos::ArrayRCP& aggToRowMap) const { + Teuchos::ArrayRCP& aggStart, Teuchos::ArrayRCP& aggToRowMap) const { int myPid = aggregates.GetMap()->getComm()->getRank(); Teuchos::ArrayView nodeGlobalElts = aggregates.GetMap()->getLocalElementList(); diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 6c2241c12b8d..4d5f035069e1 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -370,7 +370,7 @@ namespace MueLu { distanceLaplacianAlgo = scaled_cut_symmetric; else TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" << distanceLaplacianAlgoStr << "\""); - GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl; + GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize()<< std::endl; } else if (algo == "classical") { if (classicalAlgoStr == "default") classicalAlgo = defaultAlgo; @@ -396,6 +396,27 @@ namespace MueLu { GO numDropped = 0, numTotal = 0; std::string graphType = "unamalgamated"; //for description purposes only + + /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. + BlockSize is the number of storage blocks that must kept together during the amalgamation process. + + Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: + + numPDEs = BlockSize * storageblocksize. + + If numPDEs==1 + Matrix is point storage (classical CRS storage). storageblocksize=1 and BlockSize=1 + No other values makes sense. + + If numPDEs>1 + If matrix uses point storage, then storageblocksize=1 and BlockSize=numPDEs. + If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n. + Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested. + */ + TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0,Exceptions::RuntimeError,"A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); + const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); + + /************************** RS or SA-style Classical Dropping (and variants) **************************/ if (algo == "classical") { if (predrop_ == null) { @@ -417,7 +438,7 @@ namespace MueLu { // At this points we either have // (predrop_ != null) // Therefore, it is sufficient to check only threshold - if (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) { + if ( BlockSize==1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) { // Case 1: scalar problem, no dropping => just use matrix graph RCP graph = rcp(new Graph(A->getCrsGraph(), "graph of A")); // Detect and record rows that correspond to Dirichlet boundary conditions @@ -442,10 +463,10 @@ namespace MueLu { Set(currentLevel, "DofsPerNode", 1); Set(currentLevel, "Graph", graph); - } else if ( (A->GetFixedBlockSize() == 1 && threshold != STS::zero()) || - (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !A->hasCrsGraph()) || - (A->GetFixedBlockSize() == 1 && useSignedClassicalRS) || - (A->GetFixedBlockSize() == 1 && useSignedClassicalSA) ) { + } else if ( (BlockSize == 1 && threshold != STS::zero()) || + (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) || + (BlockSize == 1 && useSignedClassicalRS) || + (BlockSize == 1 && useSignedClassicalSA) ) { // Case 2: scalar problem with dropping => record the column indices of undropped entries, but still use original // graph's map information, e.g., whether index is local // OR a matrix without a CrsGraph @@ -721,7 +742,7 @@ namespace MueLu { } #endif }//end generateColoringGraph - } else if (A->GetFixedBlockSize() > 1 && threshold == STS::zero()) { + } else if (BlockSize > 1 && threshold == STS::zero()) { // Case 3: Multiple DOF/node problem without dropping const RCP rowMap = A->getRowMap(); const RCP colMap = A->getColMap(); @@ -853,7 +874,7 @@ namespace MueLu { Set(currentLevel, "Graph", graph); Set(currentLevel, "DofsPerNode", blkSize); // full block size - } else if (A->GetFixedBlockSize() > 1 && threshold != STS::zero()) { + } else if (BlockSize > 1 && threshold != STS::zero()) { // Case 4: Multiple DOF/node problem with dropping const RCP rowMap = A->getRowMap(); const RCP colMap = A->getColMap(); diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp index a3f394fe36b8..8f5a42e6d653 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp @@ -506,7 +506,27 @@ namespace MueLu { const MT zero = Teuchos::ScalarTraits::zero(); auto A = Get< RCP >(currentLevel, "A"); - LO blkSize = A->GetFixedBlockSize(); + + + /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme. + blkSize is the number of storage blocks that must kept together during the amalgamation process. + + Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold: + + numPDEs = blkSize * storageblocksize. + + If numPDEs==1 + Matrix is point storage (classical CRS storage). storageblocksize=1 and blkSize=1 + No other values makes sense. + + If numPDEs>1 + If matrix uses point storage, then storageblocksize=1 and blkSize=numPDEs. + If matrix uses block storage, with block size of n, then storageblocksize=n, and blkSize=numPDEs/n. + Thus far, only storageblocksize=numPDEs and blkSize=1 has been tested. + */ + + TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0,Exceptions::RuntimeError,"A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()"); + LO blkSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); auto amalInfo = Get< RCP >(currentLevel, "UnAmalgamationInfo"); @@ -542,7 +562,7 @@ namespace MueLu { boundaryNodes = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold); // Trivial LWGraph construction - graph = rcp(new LWGraph_kokkos(A->getLocalMatrixDevice().graph, A->getRowMap(), A->getColMap(), "graph of A")); + graph = rcp(new LWGraph_kokkos(A->getCrsGraph()->getLocalGraphDevice(), A->getRowMap(), A->getColMap(), "graph of A")); graph->getLocalLWGraph().SetBoundaryNodeMap(boundaryNodes); numTotal = A->getLocalNumEntries(); diff --git a/packages/muelu/src/Misc/MueLu_RAPFactory_def.hpp b/packages/muelu/src/Misc/MueLu_RAPFactory_def.hpp index 318da1790a21..1a95e3cd1c7f 100644 --- a/packages/muelu/src/Misc/MueLu_RAPFactory_def.hpp +++ b/packages/muelu/src/Misc/MueLu_RAPFactory_def.hpp @@ -56,6 +56,7 @@ #include #include #include +#include #include "MueLu_RAPFactory_decl.hpp" @@ -281,8 +282,7 @@ namespace MueLu { Xpetra::TripleMatrixMultiply:: MultiplyRAP(*P, doTranspose, *A, !doTranspose, *P, !doTranspose, *Ac, doFillComplete, doOptimizeStorage, labelstr+std::string("MueLu::R*A*P-implicit-")+levelstr.str(), - RAPparams); - + RAPparams); } else { RCP R = Get< RCP >(coarseLevel, "R"); Ac = MatrixFactory::Build(R->getRowMap(), Teuchos::as(0)); diff --git a/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp b/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp index 81751dab90ec..eda2f914e8a5 100644 --- a/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp +++ b/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp @@ -1245,9 +1245,10 @@ namespace MueLu { break; } - Xpetra::global_size_t nnz = Am->getGlobalNumEntries(); + LO storageblocksize=Am->GetStorageBlockSize(); + Xpetra::global_size_t nnz = Am->getGlobalNumEntries()*storageblocksize*storageblocksize; nnzPerLevel .push_back(nnz); - rowsPerLevel .push_back(Am->getGlobalNumRows()); + rowsPerLevel .push_back(Am->getGlobalNumRows()*storageblocksize); numProcsPerLevel.push_back(Am->getRowMap()->getComm()->getSize()); } @@ -1434,8 +1435,9 @@ namespace MueLu { } GetOStream(Runtime1) << "Replacing coordinate map" << std::endl; + TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "Hierarchy::ReplaceCoordinateMap: Storage block size does not evenly divide fixed block size"); - size_t blkSize = A->GetFixedBlockSize(); + size_t blkSize = A->GetFixedBlockSize() / A->GetStorageBlockSize(); RCP nodeMap = A->getRowMap(); if (blkSize > 1) { diff --git a/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp b/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp index 798209774e57..14357b2feba9 100644 --- a/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp +++ b/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -234,16 +235,21 @@ namespace MueLu { if(Acrs.is_null()) throw std::runtime_error("Ifpack2Smoother: Cannot extract CrsMatrix from matrix A."); RCP At = rcp_dynamic_cast(Acrs); - if(At.is_null()) - throw std::runtime_error("Ifpack2Smoother: Cannot extract TpetraCrsMatrix from matrix A."); - - RCP > blockCrs = Tpetra::convertToBlockCrsMatrix(*At->getTpetra_CrsMatrix(),blocksize); - RCP blockCrs_as_crs = rcp(new TpetraBlockCrsMatrix(blockCrs)); - RCP blockWrap = rcp(new CrsMatrixWrap(blockCrs_as_crs)); - A_ = blockWrap; - this->GetOStream(Statistics0) << "Ifpack2Smoother: Using BlockCrsMatrix storage with blocksize "<::isTpetraBlockCrs(matA)) + throw std::runtime_error("Ifpack2Smoother: Cannot extract CrsMatrix or BlockCrsMatrix from matrix A."); + this->GetOStream(Statistics0) << "Ifpack2Smoother: Using (native) BlockCrsMatrix storage with blocksize "< > blockCrs = Tpetra::convertToBlockCrsMatrix(*At->getTpetra_CrsMatrix(),blocksize); + RCP blockCrs_as_crs = rcp(new TpetraBlockCrsMatrix(blockCrs)); + RCP blockWrap = rcp(new CrsMatrixWrap(blockCrs_as_crs)); + A_ = blockWrap; + this->GetOStream(Statistics0) << "Ifpack2Smoother: Using BlockCrsMatrix storage with blocksize "< @@ -155,6 +157,7 @@ namespace MueLu { SubFactoryMonitor sfm(*this, "BuildCoordinates", coarseLevel); RCP coarseCoordsFineMap = Get< RCP >(fineLevel, "coarseCoordinatesFineMap"); RCP coarseCoordsMap = Get< RCP >(fineLevel, "coarseCoordinatesMap"); + fineCoordinates = Get< RCP >(fineLevel, "Coordinates"); coarseCoordinates = Xpetra::MultiVectorFactory::Build(coarseCoordsFineMap, fineCoordinates->getNumVectors()); @@ -172,6 +175,7 @@ namespace MueLu { *out << "Fine and coarse coordinates have been loaded from the fine level and set on the coarse level." << std::endl; + if(interpolationOrder == 0) { SubFactoryMonitor sfm(*this, "BuildConstantP", coarseLevel); // Compute the prolongator using piece-wise constant interpolation @@ -222,8 +226,19 @@ namespace MueLu { RCP fineNullspace = Get< RCP > (fineLevel, "Nullspace"); RCP coarseNullspace = MultiVectorFactory::Build(P->getDomainMap(), fineNullspace->getNumVectors()); - P->apply(*fineNullspace, *coarseNullspace, Teuchos::TRANS, Teuchos::ScalarTraits::one(), - Teuchos::ScalarTraits::zero()); + + using helpers=Xpetra::Helpers; + if(helpers::isTpetraBlockCrs(A)) { + // FIXME: BlockCrs doesn't currently support transpose apply, so we have to do this the hard way + RCP Ptrans = Utilities::Transpose(*P); + Ptrans->apply(*fineNullspace, *coarseNullspace, Teuchos::NO_TRANS, Teuchos::ScalarTraits::one(), + Teuchos::ScalarTraits::zero()); + } + else { + P->apply(*fineNullspace, *coarseNullspace, Teuchos::TRANS, Teuchos::ScalarTraits::one(), + Teuchos::ScalarTraits::zero()); + } + Set(coarseLevel, "Nullspace", coarseNullspace); } @@ -257,19 +272,78 @@ namespace MueLu { *out << "Call prolongator constructor" << std::endl; - // Create the prolongator matrix and its associated objects - RCP dummyList = rcp(new ParameterList()); - P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList)); - RCP PCrs = rcp_dynamic_cast(P)->getCrsMatrix(); - PCrs->setAllToScalar(1.0); - PCrs->fillComplete(); + using helpers=Xpetra::Helpers; + if(helpers::isTpetraBlockCrs(A)) { +#ifdef HAVE_MUELU_TPETRA + SC one = Teuchos::ScalarTraits::one(); + SC zero = Teuchos::ScalarTraits::zero(); + LO NSDim = A->GetStorageBlockSize(); + + // Build the exploded Map + RCP BlockMap = prolongatorGraph->getDomainMap(); + Teuchos::ArrayView block_dofs = BlockMap->getLocalElementList(); + Teuchos::Array point_dofs(block_dofs.size()*NSDim); + for(LO i=0, ct=0; i PointMap = MapFactory::Build(BlockMap->lib(), + BlockMap->getGlobalNumElements() *NSDim, + point_dofs(), + BlockMap->getIndexBase(), + BlockMap->getComm()); + strideInfo[0] = A->GetFixedBlockSize(); + RCP stridedPointMap = StridedMapFactory::Build(PointMap, strideInfo); + + RCP > P_xpetra = Xpetra::CrsMatrixFactory::BuildBlock(prolongatorGraph, PointMap, A->getRangeMap(),NSDim); + RCP > P_tpetra = rcp_dynamic_cast >(P_xpetra); + if(P_tpetra.is_null()) throw std::runtime_error("BuildConstantP Matrix factory did not return a Tpetra::BlockCrsMatrix"); + RCP P_wrap = rcp(new CrsMatrixWrap(P_xpetra)); + + // NOTE: Assumes block-diagonal prolongation + Teuchos::Array temp(1); + Teuchos::ArrayView indices; + Teuchos::Array block(NSDim*NSDim, zero); + for(LO i=0; igetLocalNumRows(); i++) { + prolongatorGraph->getLocalRowView(i,indices); + for(LO j=0; j<(LO)indices.size();j++) { + temp[0] = indices[j]; + P_tpetra->replaceLocalValues(i,temp(),block()); + } + } + + P = P_wrap; + if (A->IsView("stridedMaps") == true) { + P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedPointMap); + } + else { + P->CreateView("stridedMaps", P->getRangeMap(), PointMap); + } + +#else + throw std::runtime_error("GeometricInteroplationFactory::Build(): BlockCrs requires Tpetra"); +#endif - // set StridingInformation of P - if (A->IsView("stridedMaps") == true) { - P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedDomainMap); - } else { - P->CreateView("stridedMaps", P->getRangeMap(), stridedDomainMap); } + else { + // Create the prolongator matrix and its associated objects + RCP dummyList = rcp(new ParameterList()); + P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList)); + RCP PCrs = rcp_dynamic_cast(P)->getCrsMatrix(); + PCrs->setAllToScalar(1.0); + PCrs->fillComplete(); + + // set StridingInformation of P + if (A->IsView("stridedMaps") == true) + P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedDomainMap); + else + P->CreateView("stridedMaps", P->getRangeMap(), stridedDomainMap); + } } // BuildConstantP @@ -293,7 +367,7 @@ namespace MueLu { // Compute 2^numDimensions using bit logic to avoid round-off errors const int numInterpolationPoints = 1 << numDimensions; - const int dofsPerNode = A->GetFixedBlockSize(); + const int dofsPerNode = A->GetFixedBlockSize()/ A->GetStorageBlockSize();; RCP dummyList = rcp(new ParameterList()); P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList)); diff --git a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp index 91024e11abe2..b0e5f27a90e2 100644 --- a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp +++ b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp @@ -123,8 +123,8 @@ namespace MueLu{ //@} - private: void BuildConstantP(RCP& P, RCP& prolongatorGraph, RCP& A) const; + private: void BuildLinearP(RCP& A, RCP& prolongatorGraph, RCP& fineCoordinates, RCP& ghostCoordinates, diff --git a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp index 5cbcdd71108f..e7c94590f77d 100644 --- a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp +++ b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp @@ -53,6 +53,11 @@ #include "MueLu_Monitor.hpp" #include "MueLu_IndexManager_kokkos.hpp" +#ifdef HAVE_MUELU_TPETRA +#include "Xpetra_TpetraCrsMatrix.hpp" +#endif + + // Including this one last ensure that the short names of the above headers are defined properly #include "MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp" @@ -236,21 +241,87 @@ namespace MueLu { StridedMapFactory::Build(prolongatorGraph->getDomainMap(), strideInfo); *out << "Call prolongator constructor" << std::endl; + using helpers=Xpetra::Helpers; + if(helpers::isTpetraBlockCrs(A)) { +#ifdef HAVE_MUELU_TPETRA + LO NSDim = A->GetStorageBlockSize(); + + // Build the exploded Map + // FIXME: Should look at doing this on device + RCP BlockMap = prolongatorGraph->getDomainMap(); + Teuchos::ArrayView block_dofs = BlockMap->getLocalElementList(); + Teuchos::Array point_dofs(block_dofs.size()*NSDim); + for(LO i=0, ct=0; i PointMap = MapFactory::Build(BlockMap->lib(), + BlockMap->getGlobalNumElements() *NSDim, + point_dofs(), + BlockMap->getIndexBase(), + BlockMap->getComm()); + strideInfo[0] = A->GetFixedBlockSize(); + RCP stridedPointMap = StridedMapFactory::Build(PointMap, strideInfo); + + RCP > P_xpetra = Xpetra::CrsMatrixFactory::BuildBlock(prolongatorGraph, PointMap, A->getRangeMap(),NSDim); + RCP > P_tpetra = rcp_dynamic_cast >(P_xpetra); + if(P_tpetra.is_null()) throw std::runtime_error("BuildConstantP: Matrix factory did not return a Tpetra::BlockCrsMatrix"); + RCP P_wrap = rcp(new CrsMatrixWrap(P_xpetra)); + + const LO stride = strideInfo[0]*strideInfo[0]; + const LO in_stride = strideInfo[0]; + typename CrsMatrix::local_graph_type localGraph = prolongatorGraph->getLocalGraphDevice(); + auto rowptr = localGraph.row_map; + auto indices = localGraph.entries; + auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst(); + + using ISC = typename Tpetra::BlockCrsMatrix::impl_scalar_type; + ISC one = Teuchos::ScalarTraits::one(); + + const Kokkos::TeamPolicy policy(prolongatorGraph->getLocalNumRows(), 1); + + Kokkos::parallel_for("MueLu:GeoInterpFact::BuildConstantP::fill", policy, + KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy::member_type &thread) { + auto row = thread.league_rank(); + for(LO j = (LO)rowptr[row]; j<(LO) rowptr[row+1]; j++) { + LO block_offset = j*stride; + for(LO k=0; kIsView("stridedMaps") == true) { + P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedPointMap); + } + else { + P->CreateView("stridedMaps", P->getRangeMap(), PointMap); + } - // Create the prolongator matrix and its associated objects - RCP dummyList = rcp(new ParameterList()); - P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList)); - RCP PCrs = rcp_dynamic_cast(P)->getCrsMatrix(); - PCrs->setAllToScalar(1.0); - PCrs->fillComplete(); +#else + throw std::runtime_error("GeometricInteroplationFactory::BuildConstantP(): BlockCrs requires Tpetra"); +#endif - // set StridingInformation of P - if (A->IsView("stridedMaps") == true) { - P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedDomainMap); - } else { - P->CreateView("stridedMaps", P->getRangeMap(), stridedDomainMap); } - + else { + // Create the prolongator matrix and its associated objects + RCP dummyList = rcp(new ParameterList()); + P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList)); + RCP PCrs = rcp_dynamic_cast(P)->getCrsMatrix(); + PCrs->setAllToScalar(1.0); + PCrs->fillComplete(); + + // set StridingInformation of P + if (A->IsView("stridedMaps") == true) { + P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedDomainMap); + } else { + P->CreateView("stridedMaps", P->getRangeMap(), stridedDomainMap); + } + } + } // BuildConstantP template diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp index 24673877c7e6..2cce10e7d3e3 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp @@ -50,6 +50,7 @@ #include #include +#include #include #include #include @@ -157,6 +158,8 @@ template coarseMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const; void BuildPcoupled (RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace) const; + void BuildPuncoupledBlockCrs(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, + RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const; mutable bool bTransferCoordinates_ = false; diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp index c5d76496c439..eb9d0a49f021 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp @@ -49,7 +49,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -58,6 +60,12 @@ #include #include #include +#include + +#ifdef HAVE_MUELU_TPETRA +#include "Xpetra_TpetraBlockCrsMatrix.hpp" +//#include "Tpetra_BlockCrsMatrix.hpp" +#endif #include "MueLu_TentativePFactory_decl.hpp" @@ -71,6 +79,9 @@ #include "MueLu_PerfUtils.hpp" #include "MueLu_Utilities.hpp" + + + namespace MueLu { template @@ -157,7 +168,8 @@ namespace MueLu { Set > >(coarseLevel, "Node Comm", nodeComm); } - TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() != fineNullspace->getMap()->getLocalNumElements(), + // NOTE: We check DomainMap here rather than RowMap because those are different for BlockCrs matrices + TEUCHOS_TEST_FOR_EXCEPTION( A->getDomainMap()->getLocalNumElements() != fineNullspace->getMap()->getLocalNumElements(), Exceptions::RuntimeError,"MueLu::TentativePFactory::MakeTentative: Size mismatch between A and Nullspace"); RCP Ptentative; @@ -225,10 +237,16 @@ namespace MueLu { } } - if (!aggregates->AggregatesCrossProcessors()) - BuildPuncoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,coarseLevel.GetLevelID()); + if (!aggregates->AggregatesCrossProcessors()) { + if(Xpetra::Helpers::isTpetraBlockCrs(A)) { + BuildPuncoupledBlockCrs(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,coarseLevel.GetLevelID()); + } + else { + BuildPuncoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,coarseLevel.GetLevelID()); + } + } else - BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace); + BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace); // If available, use striding information of fine level matrix A for range // map and coarseMap as domain map; otherwise use plain range map of @@ -258,12 +276,24 @@ namespace MueLu { template void TentativePFactory:: - BuildPuncoupled(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, - RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const { - RCP rowMap = A->getRowMap(); - RCP colMap = A->getColMap(); + BuildPuncoupledBlockCrs(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, + RCP coarsePointMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const { +#ifdef HAVE_MUELU_TPETRA - const size_t numRows = rowMap->getLocalNumElements(); + /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could + be generalized later, if we ever need to do so: + 1) Null space dimension === block size of matrix: So no elasticity right now + 2) QR is not supported: Under assumption #1, this shouldn't cause problems. + 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap. + + These assumptions keep our code way simpler and still support the use cases we actually care about. + */ + + RCP rowMap = A->getRowMap(); + RCP rangeMap = A->getRangeMap(); + RCP colMap = A->getColMap(); + // const size_t numFinePointRows = rangeMap->getLocalNumElements(); + const size_t numFineBlockRows = rowMap->getLocalNumElements(); typedef Teuchos::ScalarTraits STS; typedef typename STS::magnitudeType Magnitude; @@ -275,7 +305,16 @@ namespace MueLu { const size_t NSDim = fineNullspace->getNumVectors(); ArrayRCP aggSizes = aggregates->ComputeAggregateSizes(); - + // Need to generate the coarse block map + // NOTE: We assume NSDim == block size here + // NOTE: We also assume that coarseMap has contiguous GIDs + //const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements(); + const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim; + RCP coarseBlockMap = MapFactory::Build(coarsePointMap->lib(), + Teuchos::OrdinalTraits::invalid(), + numCoarseBlockRows, + coarsePointMap->getIndexBase(), + coarsePointMap->getComm()); // Sanity checking const ParameterList& pL = GetParameterList(); const bool &doQRStep = pL.get("tentative: calculate qr"); @@ -284,6 +323,8 @@ namespace MueLu { TEUCHOS_TEST_FOR_EXCEPTION(doQRStep && constantColSums,Exceptions::RuntimeError, "MueLu::TentativePFactory::MakeTentative: cannot use 'constant column sums' and 'calculate qr' at the same time"); + // The aggregates use the amalgamated column map, which in this case is what we want + // Aggregates map is based on the amalgamated column map // We can skip global-to-local conversion if LIDs in row map are // same as LIDs in column map @@ -299,394 +340,238 @@ namespace MueLu { if (goodMap) { amalgInfo->UnamalgamateAggregatesLO(*aggregates, aggStart, aggToRowMapLO); GetOStream(Runtime1) << "Column map is consistent with the row map, good." << std::endl; - } else { - amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMapGO); - GetOStream(Warnings0) << "Column map is not consistent with the row map\n" - << "using GO->LO conversion with performance penalty" << std::endl; + throw std::runtime_error("TentativePFactory::PuncoupledBlockCrs: Inconsistent maps not currently supported"); } - - coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim); + + coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim); // Pull out the nullspace vectors so that we can have random access. ArrayRCP > fineNS (NSDim); ArrayRCP > coarseNS(NSDim); for (size_t i = 0; i < NSDim; i++) { fineNS[i] = fineNullspace->getData(i); - if (coarseMap->getLocalNumElements() > 0) + if (coarsePointMap->getLocalNumElements() > 0) coarseNS[i] = coarseNullspace->getDataNonConst(i); } - size_t nnzEstimate = numRows * NSDim; - - // Time to construct the matrix and fill in the values - Ptentative = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0)); - RCP PtentCrs = rcp_dynamic_cast(Ptentative)->getCrsMatrix(); - + // BlockCrs requires that we build the (block) graph first, so let's do that... + // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one + // block non-zero per row in the matrix; + RCP BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,0); ArrayRCP iaPtent; ArrayRCP jaPtent; - ArrayRCP valPtent; - - PtentCrs->allocateAllValues(nnzEstimate, iaPtent, jaPtent, valPtent); - + BlockGraph->allocateAllIndices(numFineBlockRows, iaPtent, jaPtent); ArrayView ia = iaPtent(); ArrayView ja = jaPtent(); - ArrayView val = valPtent(); - - ia[0] = 0; - for (size_t i = 1; i <= numRows; i++) - ia[i] = ia[i-1] + NSDim; - for (size_t j = 0; j < nnzEstimate; j++) { - ja [j] = INVALID; - val[j] = zero; + for (size_t i = 0; i < numFineBlockRows; i++) { + ia[i] = i; + ja[i] = INVALID; } + ia[numCoarseBlockRows] = numCoarseBlockRows; - if (doQRStep) { - //////////////////////////////// - // Standard aggregate-wise QR // - //////////////////////////////// - for (GO agg = 0; agg < numAggs; agg++) { - LO aggSize = aggStart[agg+1] - aggStart[agg]; + for (GO agg = 0; agg < numAggs; agg++) { + LO aggSize = aggStart[agg+1] - aggStart[agg]; + Xpetra::global_size_t offset = agg; - Xpetra::global_size_t offset = agg*NSDim; + for (LO j = 0; j < aggSize; j++) { + // FIXME: Allow for bad maps + const LO localRow = aggToRowMapLO[aggStart[agg]+j]; + const size_t rowStart = ia[localRow]; + ja[rowStart] = offset; + } + } - // Extract the piece of the nullspace corresponding to the aggregate, and - // put it in the flat array, "localQR" (in column major format) for the - // QR routine. - Teuchos::SerialDenseMatrix localQR(aggSize, NSDim); - if (goodMap) { - for (size_t j = 0; j < NSDim; j++) - for (LO k = 0; k < aggSize; k++) - localQR(k,j) = fineNS[j][aggToRowMapLO[aggStart[agg]+k]]; - } else { - for (size_t j = 0; j < NSDim; j++) - for (LO k = 0; k < aggSize; k++) - localQR(k,j) = fineNS[j][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+k])]; + // Compress storage (remove all INVALID, which happen when we skip zeros) + // We do that in-place + size_t ia_tmp = 0, nnz = 0; + for (size_t i = 0; i < numFineBlockRows; i++) { + for (size_t j = ia_tmp; j < ia[i+1]; j++) + if (ja[j] != INVALID) { + ja [nnz] = ja [j]; + nnz++; } + ia_tmp = ia[i+1]; + ia[i+1] = nnz; + } - // Test for zero columns - for (size_t j = 0; j < NSDim; j++) { - bool bIsZeroNSColumn = true; + if (rowMap->lib() == Xpetra::UseTpetra) { + // - Cannot resize for Epetra, as it checks for same pointers + // - Need to resize for Tpetra, as it check ().size() == ia[numRows] + // NOTE: these invalidate ja and val views + jaPtent .resize(nnz); + } - for (LO k = 0; k < aggSize; k++) - if (localQR(k,j) != zero) - bIsZeroNSColumn = false; + GetOStream(Runtime1) << "TentativePFactory : generating block graph" << std::endl; + BlockGraph->setAllIndices(iaPtent, jaPtent); - TEUCHOS_TEST_FOR_EXCEPTION(bIsZeroNSColumn == true, Exceptions::RuntimeError, - "MueLu::TentativePFactory::MakeTentative: fine level NS part has a zero column in NS column " << j); - } + // Managing labels & constants for ESFC + { + RCP FCparams; + if(pL.isSublist("matrixmatrix: kernel params")) + FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); + else + FCparams= rcp(new ParameterList); + // By default, we don't need global constants for TentativeP, but we do want it for the graph + // if we're printing statistics, so let's leave it on for now. + FCparams->set("compute global constants",FCparams->get("compute global constants",true)); + std::string levelIDs = toString(levelID); + FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs); + RCP dummy_e; + RCP dummy_i; + BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams); + } - // Calculate QR decomposition (standard) - // NOTE: Q is stored in localQR and R is stored in coarseNS - if (aggSize >= Teuchos::as(NSDim)) { + // Now let's make a BlockCrs Matrix + // NOTE: Assumes block size== NSDim + RCP > P_xpetra = Xpetra::CrsMatrixFactory::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim); + RCP > P_tpetra = rcp_dynamic_cast >(P_xpetra); + if(P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix"); + RCP P_wrap = rcp(new CrsMatrixWrap(P_xpetra)); + + ///////////////////////////// + // "no-QR" option // + ///////////////////////////// + // Local Q factor is just the fine nullspace support over the current aggregate. + // Local R factor is the identity. + // NOTE: We're not going to do a QR here as we're assuming that blocksize == NSDim + // NOTE: "goodMap" case only + Teuchos::Array block(NSDim*NSDim, zero); + Teuchos::Array bcol(1); + + GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl; + for (LO agg = 0; agg < numAggs; agg++) { + bcol[0] = agg; + const LO aggSize = aggStart[agg+1] - aggStart[agg]; + Xpetra::global_size_t offset = agg*NSDim; + + // Process each row in the local Q factor + // NOTE: Blocks are in row-major order + for (LO j = 0; j < aggSize; j++) { + const LO localBlockRow = aggToRowMapLO[aggStart[agg]+j]; + + for (size_t r = 0; r < NSDim; r++) { + LO localPointRow = localBlockRow*NSDim + r; + for (size_t c = 0; c < NSDim; c++) + block[r*NSDim+c] = fineNS[c][localPointRow]; + } + // NOTE: Assumes columns==aggs and are ordered sequentially + P_tpetra->replaceLocalValues(localBlockRow,bcol(),block()); + + }//end aggSize + + for (size_t j = 0; j < NSDim; j++) + coarseNS[j][offset+j] = one; + + } //for (GO agg = 0; agg < numAggs; agg++) + + Ptentative = P_wrap; +#else + throw std::runtime_error("TentativePFactory::BuildPuncoupledBlockCrs: Requires Tpetra"); +#endif + } - if (NSDim == 1) { - // Only one nullspace vector, calculate Q and R by hand - Magnitude norm = STS::magnitude(zero); - for (size_t k = 0; k < Teuchos::as(aggSize); k++) - norm += STS::magnitude(localQR(k,0)*localQR(k,0)); - norm = Teuchos::ScalarTraits::squareroot(norm); + template + void TentativePFactory:: + BuildPcoupled(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, + RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace) const { + typedef Teuchos::ScalarTraits STS; + typedef typename STS::magnitudeType Magnitude; + const SC zero = STS::zero(); + const SC one = STS::one(); - // R = norm - coarseNS[0][offset] = norm; + // number of aggregates + GO numAggs = aggregates->GetNumAggregates(); - // Q = localQR(:,0)/norm - for (LO i = 0; i < aggSize; i++) - localQR(i,0) /= norm; + // Create a lookup table to determine the rows (fine DOFs) that belong to a given aggregate. + // aggStart is a pointer into aggToRowMap + // aggStart[i]..aggStart[i+1] are indices into aggToRowMap + // aggToRowMap[aggStart[i]]..aggToRowMap[aggStart[i+1]-1] are the DOFs in aggregate i + ArrayRCP aggStart; + ArrayRCP< GO > aggToRowMap; + amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMap); - } else { - Teuchos::SerialQRDenseSolver qrSolver; - qrSolver.setMatrix(Teuchos::rcp(&localQR, false)); - qrSolver.factor(); + // find size of the largest aggregate + LO maxAggSize=0; + for (GO i=0; i maxAggSize) maxAggSize = sizeOfThisAgg; + } - // R = upper triangular part of localQR - for (size_t j = 0; j < NSDim; j++) - for (size_t k = 0; k <= j; k++) - coarseNS[j][offset+k] = localQR(k,j); //TODO is offset+k the correct local ID?! + // dimension of fine level nullspace + const size_t NSDim = fineNullspace->getNumVectors(); - // Calculate Q, the tentative prolongator. - // The Lapack GEQRF call only works for myAggsize >= NSDim - qrSolver.formQ(); - Teuchos::RCP > qFactor = qrSolver.getQ(); - for (size_t j = 0; j < NSDim; j++) - for (size_t i = 0; i < Teuchos::as(aggSize); i++) - localQR(i,j) = (*qFactor)(i,j); - } + // index base for coarse Dof map (usually 0) + GO indexBase=A->getRowMap()->getIndexBase(); - } else { - // Special handling for aggSize < NSDim (i.e. single node aggregates in structural mechanics) + const RCP nonUniqueMap = amalgInfo->ComputeUnamalgamatedImportDofMap(*aggregates); + const RCP uniqueMap = A->getDomainMap(); + RCP importer = ImportFactory::Build(uniqueMap, nonUniqueMap); + RCP fineNullspaceWithOverlap = MultiVectorFactory::Build(nonUniqueMap,NSDim); + fineNullspaceWithOverlap->doImport(*fineNullspace,*importer,Xpetra::INSERT); - // The local QR decomposition is not possible in the "overconstrained" - // case (i.e. number of columns in localQR > number of rows), which - // corresponds to #DOFs in Aggregate < NSDim. For usual problems this - // is only possible for single node aggregates in structural mechanics. - // (Similar problems may arise in discontinuous Galerkin problems...) - // We bypass the QR decomposition and use an identity block in the - // tentative prolongator for the single node aggregate and transfer the - // corresponding fine level null space information 1-to-1 to the coarse - // level null space part. + // Pull out the nullspace vectors so that we can have random access. + ArrayRCP< ArrayRCP > fineNS(NSDim); + for (size_t i=0; igetData(i); - // NOTE: The resulting tentative prolongation operator has - // (aggSize*DofsPerNode-NSDim) zero columns leading to a singular - // coarse level operator A. To deal with that one has the following - // options: - // - Use the "RepairMainDiagonal" flag in the RAPFactory (default: - // false) to add some identity block to the diagonal of the zero rows - // in the coarse level operator A, such that standard level smoothers - // can be used again. - // - Use special (projection-based) level smoothers, which can deal - // with singular matrices (very application specific) - // - Adapt the code below to avoid zero columns. However, we do not - // support a variable number of DOFs per node in MueLu/Xpetra which - // makes the implementation really hard. + //Allocate storage for the coarse nullspace. + coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim); - // R = extended (by adding identity rows) localQR - for (size_t j = 0; j < NSDim; j++) - for (size_t k = 0; k < NSDim; k++) - if (k < as(aggSize)) - coarseNS[j][offset+k] = localQR(k,j); - else - coarseNS[j][offset+k] = (k == j ? one : zero); + ArrayRCP< ArrayRCP > coarseNS(NSDim); + for (size_t i=0; igetLocalNumElements() > 0) coarseNS[i] = coarseNullspace->getDataNonConst(i); - // Q = I (rectangular) - for (size_t i = 0; i < as(aggSize); i++) - for (size_t j = 0; j < NSDim; j++) - localQR(i,j) = (j == i ? one : zero); - } + //This makes the rowmap of Ptent the same as that of A-> + //This requires moving some parts of some local Q's to other processors + //because aggregates can span processors. + RCP rowMapForPtent = A->getRowMap(); + const Map& rowMapForPtentRef = *rowMapForPtent; + // Set up storage for the rows of the local Qs that belong to other processors. + // FIXME This is inefficient and could be done within the main loop below with std::vector's. + RCP colMap = A->getColMap(); - // Process each row in the local Q factor - // FIXME: What happens if maps are blocked? - for (LO j = 0; j < aggSize; j++) { - LO localRow = (goodMap ? aggToRowMapLO[aggStart[agg]+j] : rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j])); + RCP ghostQMap; + RCP ghostQvalues; + Array > > ghostQcolumns; + RCP > ghostQrowNums; + ArrayRCP< ArrayRCP > ghostQvals; + ArrayRCP< ArrayRCP > ghostQcols; + ArrayRCP< GO > ghostQrows; - size_t rowStart = ia[localRow]; - for (size_t k = 0, lnnz = 0; k < NSDim; k++) { - // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) - if (localQR(j,k) != zero) { - ja [rowStart+lnnz] = offset + k; - val[rowStart+lnnz] = localQR(j,k); - lnnz++; - } - } + Array ghostGIDs; + for (LO j=0; j1) - GetOStream(Warnings0) << "TentativePFactory : for nontrivial nullspace, this may degrade performance" << std::endl; - ///////////////////////////// - // "no-QR" option // - ///////////////////////////// - // Local Q factor is just the fine nullspace support over the current aggregate. - // Local R factor is the identity. - // TODO I have not implemented any special handling for aggregates that are too - // TODO small to locally support the nullspace, as is done in the standard QR - // TODO case above. - if (goodMap) { - for (GO agg = 0; agg < numAggs; agg++) { - const LO aggSize = aggStart[agg+1] - aggStart[agg]; - Xpetra::global_size_t offset = agg*NSDim; - - // Process each row in the local Q factor - // FIXME: What happens if maps are blocked? - for (LO j = 0; j < aggSize; j++) { - - //TODO Here I do not check for a zero nullspace column on the aggregate. - // as is done in the standard QR case. - - const LO localRow = aggToRowMapLO[aggStart[agg]+j]; - - const size_t rowStart = ia[localRow]; - - for (size_t k = 0, lnnz = 0; k < NSDim; k++) { - // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) - SC qr_jk = fineNS[k][aggToRowMapLO[aggStart[agg]+j]]; - if(constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg]; - if (qr_jk != zero) { - ja [rowStart+lnnz] = offset + k; - val[rowStart+lnnz] = qr_jk; - lnnz++; - } - } - } - for (size_t j = 0; j < NSDim; j++) - coarseNS[j][offset+j] = one; - } //for (GO agg = 0; agg < numAggs; agg++) - - } else { - for (GO agg = 0; agg < numAggs; agg++) { - const LO aggSize = aggStart[agg+1] - aggStart[agg]; - Xpetra::global_size_t offset = agg*NSDim; - for (LO j = 0; j < aggSize; j++) { - - const LO localRow = rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j]); - - const size_t rowStart = ia[localRow]; - - for (size_t k = 0, lnnz = 0; k < NSDim; ++k) { - // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) - SC qr_jk = fineNS[k][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j])]; - if(constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg]; - if (qr_jk != zero) { - ja [rowStart+lnnz] = offset + k; - val[rowStart+lnnz] = qr_jk; - lnnz++; - } - } - } - for (size_t j = 0; j < NSDim; j++) - coarseNS[j][offset+j] = one; - } //for (GO agg = 0; agg < numAggs; agg++) - - } //if (goodmap) else ... - - } //if doQRStep ... else - - // Compress storage (remove all INVALID, which happen when we skip zeros) - // We do that in-place - size_t ia_tmp = 0, nnz = 0; - for (size_t i = 0; i < numRows; i++) { - for (size_t j = ia_tmp; j < ia[i+1]; j++) - if (ja[j] != INVALID) { - ja [nnz] = ja [j]; - val[nnz] = val[j]; - nnz++; - } - ia_tmp = ia[i+1]; - ia[i+1] = nnz; - } - if (rowMap->lib() == Xpetra::UseTpetra) { - // - Cannot resize for Epetra, as it checks for same pointers - // - Need to resize for Tpetra, as it check ().size() == ia[numRows] - // NOTE: these invalidate ja and val views - jaPtent .resize(nnz); - valPtent.resize(nnz); - } - - GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl; - - PtentCrs->setAllValues(iaPtent, jaPtent, valPtent); - - - // Managing labels & constants for ESFC - RCP FCparams; - if(pL.isSublist("matrixmatrix: kernel params")) - FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); - else - FCparams= rcp(new ParameterList); - // By default, we don't need global constants for TentativeP - FCparams->set("compute global constants",FCparams->get("compute global constants",false)); - std::string levelIDs = toString(levelID); - FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs); - RCP dummy_e; - RCP dummy_i; - - PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap(),dummy_i,dummy_e,FCparams); - } - - template - void TentativePFactory:: - BuildPcoupled(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, - RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace) const { - typedef Teuchos::ScalarTraits STS; - typedef typename STS::magnitudeType Magnitude; - const SC zero = STS::zero(); - const SC one = STS::one(); - - // number of aggregates - GO numAggs = aggregates->GetNumAggregates(); - - // Create a lookup table to determine the rows (fine DOFs) that belong to a given aggregate. - // aggStart is a pointer into aggToRowMap - // aggStart[i]..aggStart[i+1] are indices into aggToRowMap - // aggToRowMap[aggStart[i]]..aggToRowMap[aggStart[i+1]-1] are the DOFs in aggregate i - ArrayRCP aggStart; - ArrayRCP< GO > aggToRowMap; - amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMap); - - // find size of the largest aggregate - LO maxAggSize=0; - for (GO i=0; i maxAggSize) maxAggSize = sizeOfThisAgg; - } - - // dimension of fine level nullspace - const size_t NSDim = fineNullspace->getNumVectors(); - - // index base for coarse Dof map (usually 0) - GO indexBase=A->getRowMap()->getIndexBase(); - - const RCP nonUniqueMap = amalgInfo->ComputeUnamalgamatedImportDofMap(*aggregates); - const RCP uniqueMap = A->getDomainMap(); - RCP importer = ImportFactory::Build(uniqueMap, nonUniqueMap); - RCP fineNullspaceWithOverlap = MultiVectorFactory::Build(nonUniqueMap,NSDim); - fineNullspaceWithOverlap->doImport(*fineNullspace,*importer,Xpetra::INSERT); - - // Pull out the nullspace vectors so that we can have random access. - ArrayRCP< ArrayRCP > fineNS(NSDim); - for (size_t i=0; igetData(i); - - //Allocate storage for the coarse nullspace. - coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim); - - ArrayRCP< ArrayRCP > coarseNS(NSDim); - for (size_t i=0; igetLocalNumElements() > 0) coarseNS[i] = coarseNullspace->getDataNonConst(i); - - //This makes the rowmap of Ptent the same as that of A-> - //This requires moving some parts of some local Q's to other processors - //because aggregates can span processors. - RCP rowMapForPtent = A->getRowMap(); - const Map& rowMapForPtentRef = *rowMapForPtent; - - // Set up storage for the rows of the local Qs that belong to other processors. - // FIXME This is inefficient and could be done within the main loop below with std::vector's. - RCP colMap = A->getColMap(); - - RCP ghostQMap; - RCP ghostQvalues; - Array > > ghostQcolumns; - RCP > ghostQrowNums; - ArrayRCP< ArrayRCP > ghostQvals; - ArrayRCP< ArrayRCP > ghostQcols; - ArrayRCP< GO > ghostQrows; - - Array ghostGIDs; - for (LO j=0; jgetRowMap()->lib(), - Teuchos::OrdinalTraits::invalid(), - ghostGIDs, - indexBase, A->getRowMap()->getComm()); //JG:Xpetra::global_size_t>? - //Vector to hold bits of Q that go to other processors. - ghostQvalues = MultiVectorFactory::Build(ghostQMap,NSDim); - //Note that Epetra does not support MultiVectors templated on Scalar != double. - //So to work around this, we allocate an array of Vectors. This shouldn't be too - //expensive, as the number of Vectors is NSDim. - ghostQcolumns.resize(NSDim); - for (size_t i=0; i::Build(ghostQMap); - ghostQrowNums = Xpetra::VectorFactory::Build(ghostQMap); - if (ghostQvalues->getLocalLength() > 0) { - ghostQvals.resize(NSDim); - ghostQcols.resize(NSDim); - for (size_t i=0; igetDataNonConst(i); - ghostQcols[i] = ghostQcolumns[i]->getDataNonConst(0); - } - ghostQrows = ghostQrowNums->getDataNonConst(0); - } + } + ghostQMap = MapFactory::Build(A->getRowMap()->lib(), + Teuchos::OrdinalTraits::invalid(), + ghostGIDs, + indexBase, A->getRowMap()->getComm()); //JG:Xpetra::global_size_t>? + //Vector to hold bits of Q that go to other processors. + ghostQvalues = MultiVectorFactory::Build(ghostQMap,NSDim); + //Note that Epetra does not support MultiVectors templated on Scalar != double. + //So to work around this, we allocate an array of Vectors. This shouldn't be too + //expensive, as the number of Vectors is NSDim. + ghostQcolumns.resize(NSDim); + for (size_t i=0; i::Build(ghostQMap); + ghostQrowNums = Xpetra::VectorFactory::Build(ghostQMap); + if (ghostQvalues->getLocalLength() > 0) { + ghostQvals.resize(NSDim); + ghostQcols.resize(NSDim); + for (size_t i=0; igetDataNonConst(i); + ghostQcols[i] = ghostQcolumns[i]->getDataNonConst(0); + } + ghostQrows = ghostQrowNums->getDataNonConst(0); + } //importer to handle moving Q importer = ImportFactory::Build(ghostQMap, A->getRowMap()); @@ -961,6 +846,338 @@ namespace MueLu { + template + void TentativePFactory:: + BuildPuncoupled(RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, + RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const { + RCP rowMap = A->getRowMap(); + RCP colMap = A->getColMap(); + const size_t numRows = rowMap->getLocalNumElements(); + + typedef Teuchos::ScalarTraits STS; + typedef typename STS::magnitudeType Magnitude; + const SC zero = STS::zero(); + const SC one = STS::one(); + const LO INVALID = Teuchos::OrdinalTraits::invalid(); + + const GO numAggs = aggregates->GetNumAggregates(); + const size_t NSDim = fineNullspace->getNumVectors(); + ArrayRCP aggSizes = aggregates->ComputeAggregateSizes(); + + + // Sanity checking + const ParameterList& pL = GetParameterList(); + const bool &doQRStep = pL.get("tentative: calculate qr"); + const bool &constantColSums = pL.get("tentative: constant column sums"); + + TEUCHOS_TEST_FOR_EXCEPTION(doQRStep && constantColSums,Exceptions::RuntimeError, + "MueLu::TentativePFactory::MakeTentative: cannot use 'constant column sums' and 'calculate qr' at the same time"); + + // Aggregates map is based on the amalgamated column map + // We can skip global-to-local conversion if LIDs in row map are + // same as LIDs in column map + bool goodMap = MueLu::Utilities::MapsAreNested(*rowMap, *colMap); + + // Create a lookup table to determine the rows (fine DOFs) that belong to a given aggregate. + // aggStart is a pointer into aggToRowMapLO + // aggStart[i]..aggStart[i+1] are indices into aggToRowMapLO + // aggToRowMapLO[aggStart[i]]..aggToRowMapLO[aggStart[i+1]-1] are the DOFs in aggregate i + ArrayRCP aggStart; + ArrayRCP aggToRowMapLO; + ArrayRCP aggToRowMapGO; + if (goodMap) { + amalgInfo->UnamalgamateAggregatesLO(*aggregates, aggStart, aggToRowMapLO); + GetOStream(Runtime1) << "Column map is consistent with the row map, good." << std::endl; + + } else { + amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMapGO); + GetOStream(Warnings0) << "Column map is not consistent with the row map\n" + << "using GO->LO conversion with performance penalty" << std::endl; + } + coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim); + + // Pull out the nullspace vectors so that we can have random access. + ArrayRCP > fineNS (NSDim); + ArrayRCP > coarseNS(NSDim); + for (size_t i = 0; i < NSDim; i++) { + fineNS[i] = fineNullspace->getData(i); + if (coarseMap->getLocalNumElements() > 0) + coarseNS[i] = coarseNullspace->getDataNonConst(i); + } + + size_t nnzEstimate = numRows * NSDim; + + // Time to construct the matrix and fill in the values + Ptentative = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0)); + RCP PtentCrs = rcp_dynamic_cast(Ptentative)->getCrsMatrix(); + + ArrayRCP iaPtent; + ArrayRCP jaPtent; + ArrayRCP valPtent; + + PtentCrs->allocateAllValues(nnzEstimate, iaPtent, jaPtent, valPtent); + + ArrayView ia = iaPtent(); + ArrayView ja = jaPtent(); + ArrayView val = valPtent(); + + ia[0] = 0; + for (size_t i = 1; i <= numRows; i++) + ia[i] = ia[i-1] + NSDim; + + for (size_t j = 0; j < nnzEstimate; j++) { + ja [j] = INVALID; + val[j] = zero; + } + + + if (doQRStep) { + //////////////////////////////// + // Standard aggregate-wise QR // + //////////////////////////////// + for (GO agg = 0; agg < numAggs; agg++) { + LO aggSize = aggStart[agg+1] - aggStart[agg]; + + Xpetra::global_size_t offset = agg*NSDim; + + // Extract the piece of the nullspace corresponding to the aggregate, and + // put it in the flat array, "localQR" (in column major format) for the + // QR routine. + Teuchos::SerialDenseMatrix localQR(aggSize, NSDim); + if (goodMap) { + for (size_t j = 0; j < NSDim; j++) + for (LO k = 0; k < aggSize; k++) + localQR(k,j) = fineNS[j][aggToRowMapLO[aggStart[agg]+k]]; + } else { + for (size_t j = 0; j < NSDim; j++) + for (LO k = 0; k < aggSize; k++) + localQR(k,j) = fineNS[j][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+k])]; + } + + // Test for zero columns + for (size_t j = 0; j < NSDim; j++) { + bool bIsZeroNSColumn = true; + + for (LO k = 0; k < aggSize; k++) + if (localQR(k,j) != zero) + bIsZeroNSColumn = false; + + TEUCHOS_TEST_FOR_EXCEPTION(bIsZeroNSColumn == true, Exceptions::RuntimeError, + "MueLu::TentativePFactory::MakeTentative: fine level NS part has a zero column in NS column " << j); + } + + // Calculate QR decomposition (standard) + // NOTE: Q is stored in localQR and R is stored in coarseNS + if (aggSize >= Teuchos::as(NSDim)) { + + if (NSDim == 1) { + // Only one nullspace vector, calculate Q and R by hand + Magnitude norm = STS::magnitude(zero); + for (size_t k = 0; k < Teuchos::as(aggSize); k++) + norm += STS::magnitude(localQR(k,0)*localQR(k,0)); + norm = Teuchos::ScalarTraits::squareroot(norm); + + // R = norm + coarseNS[0][offset] = norm; + + // Q = localQR(:,0)/norm + for (LO i = 0; i < aggSize; i++) + localQR(i,0) /= norm; + + } else { + Teuchos::SerialQRDenseSolver qrSolver; + qrSolver.setMatrix(Teuchos::rcp(&localQR, false)); + qrSolver.factor(); + + // R = upper triangular part of localQR + for (size_t j = 0; j < NSDim; j++) + for (size_t k = 0; k <= j; k++) + coarseNS[j][offset+k] = localQR(k,j); //TODO is offset+k the correct local ID?! + + // Calculate Q, the tentative prolongator. + // The Lapack GEQRF call only works for myAggsize >= NSDim + qrSolver.formQ(); + Teuchos::RCP > qFactor = qrSolver.getQ(); + for (size_t j = 0; j < NSDim; j++) + for (size_t i = 0; i < Teuchos::as(aggSize); i++) + localQR(i,j) = (*qFactor)(i,j); + } + + } else { + // Special handling for aggSize < NSDim (i.e. single node aggregates in structural mechanics) + + // The local QR decomposition is not possible in the "overconstrained" + // case (i.e. number of columns in localQR > number of rows), which + // corresponds to #DOFs in Aggregate < NSDim. For usual problems this + // is only possible for single node aggregates in structural mechanics. + // (Similar problems may arise in discontinuous Galerkin problems...) + // We bypass the QR decomposition and use an identity block in the + // tentative prolongator for the single node aggregate and transfer the + // corresponding fine level null space information 1-to-1 to the coarse + // level null space part. + + // NOTE: The resulting tentative prolongation operator has + // (aggSize*DofsPerNode-NSDim) zero columns leading to a singular + // coarse level operator A. To deal with that one has the following + // options: + // - Use the "RepairMainDiagonal" flag in the RAPFactory (default: + // false) to add some identity block to the diagonal of the zero rows + // in the coarse level operator A, such that standard level smoothers + // can be used again. + // - Use special (projection-based) level smoothers, which can deal + // with singular matrices (very application specific) + // - Adapt the code below to avoid zero columns. However, we do not + // support a variable number of DOFs per node in MueLu/Xpetra which + // makes the implementation really hard. + + // R = extended (by adding identity rows) localQR + for (size_t j = 0; j < NSDim; j++) + for (size_t k = 0; k < NSDim; k++) + if (k < as(aggSize)) + coarseNS[j][offset+k] = localQR(k,j); + else + coarseNS[j][offset+k] = (k == j ? one : zero); + + // Q = I (rectangular) + for (size_t i = 0; i < as(aggSize); i++) + for (size_t j = 0; j < NSDim; j++) + localQR(i,j) = (j == i ? one : zero); + } + + + // Process each row in the local Q factor + // FIXME: What happens if maps are blocked? + for (LO j = 0; j < aggSize; j++) { + LO localRow = (goodMap ? aggToRowMapLO[aggStart[agg]+j] : rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j])); + + size_t rowStart = ia[localRow]; + for (size_t k = 0, lnnz = 0; k < NSDim; k++) { + // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) + if (localQR(j,k) != zero) { + ja [rowStart+lnnz] = offset + k; + val[rowStart+lnnz] = localQR(j,k); + lnnz++; + } + } + } + } + + } else { + GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl; + if (NSDim>1) + GetOStream(Warnings0) << "TentativePFactory : for nontrivial nullspace, this may degrade performance" << std::endl; + ///////////////////////////// + // "no-QR" option // + ///////////////////////////// + // Local Q factor is just the fine nullspace support over the current aggregate. + // Local R factor is the identity. + // TODO I have not implemented any special handling for aggregates that are too + // TODO small to locally support the nullspace, as is done in the standard QR + // TODO case above. + if (goodMap) { + for (GO agg = 0; agg < numAggs; agg++) { + const LO aggSize = aggStart[agg+1] - aggStart[agg]; + Xpetra::global_size_t offset = agg*NSDim; + + // Process each row in the local Q factor + // FIXME: What happens if maps are blocked? + for (LO j = 0; j < aggSize; j++) { + + //TODO Here I do not check for a zero nullspace column on the aggregate. + // as is done in the standard QR case. + + const LO localRow = aggToRowMapLO[aggStart[agg]+j]; + + const size_t rowStart = ia[localRow]; + + for (size_t k = 0, lnnz = 0; k < NSDim; k++) { + // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) + SC qr_jk = fineNS[k][aggToRowMapLO[aggStart[agg]+j]]; + if(constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg]; + if (qr_jk != zero) { + ja [rowStart+lnnz] = offset + k; + val[rowStart+lnnz] = qr_jk; + lnnz++; + } + } + } + for (size_t j = 0; j < NSDim; j++) + coarseNS[j][offset+j] = one; + } //for (GO agg = 0; agg < numAggs; agg++) + + } else { + for (GO agg = 0; agg < numAggs; agg++) { + const LO aggSize = aggStart[agg+1] - aggStart[agg]; + Xpetra::global_size_t offset = agg*NSDim; + for (LO j = 0; j < aggSize; j++) { + + const LO localRow = rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j]); + + const size_t rowStart = ia[localRow]; + + for (size_t k = 0, lnnz = 0; k < NSDim; ++k) { + // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions) + SC qr_jk = fineNS[k][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j])]; + if(constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg]; + if (qr_jk != zero) { + ja [rowStart+lnnz] = offset + k; + val[rowStart+lnnz] = qr_jk; + lnnz++; + } + } + } + for (size_t j = 0; j < NSDim; j++) + coarseNS[j][offset+j] = one; + } //for (GO agg = 0; agg < numAggs; agg++) + + } //if (goodmap) else ... + + } //if doQRStep ... else + + // Compress storage (remove all INVALID, which happen when we skip zeros) + // We do that in-place + size_t ia_tmp = 0, nnz = 0; + for (size_t i = 0; i < numRows; i++) { + for (size_t j = ia_tmp; j < ia[i+1]; j++) + if (ja[j] != INVALID) { + ja [nnz] = ja [j]; + val[nnz] = val[j]; + nnz++; + } + ia_tmp = ia[i+1]; + ia[i+1] = nnz; + } + if (rowMap->lib() == Xpetra::UseTpetra) { + // - Cannot resize for Epetra, as it checks for same pointers + // - Need to resize for Tpetra, as it check ().size() == ia[numRows] + // NOTE: these invalidate ja and val views + jaPtent .resize(nnz); + valPtent.resize(nnz); + } + + GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl; + + PtentCrs->setAllValues(iaPtent, jaPtent, valPtent); + + + // Managing labels & constants for ESFC + RCP FCparams; + if(pL.isSublist("matrixmatrix: kernel params")) + FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); + else + FCparams= rcp(new ParameterList); + // By default, we don't need global constants for TentativeP + FCparams->set("compute global constants",FCparams->get("compute global constants",false)); + std::string levelIDs = toString(levelID); + FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs); + RCP dummy_e; + RCP dummy_i; + + PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap(),dummy_i,dummy_e,FCparams); + } + + + } //namespace MueLu // TODO ReUse: If only P or Nullspace is missing, TentativePFactory can be smart and skip part of the computation. diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp index 35b3151fc4b2..e387eb1a9677 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp @@ -55,6 +55,8 @@ #include "Teuchos_ScalarTraits.hpp" +#include "Xpetra_CrsGraphFactory_fwd.hpp" + #include "MueLu_Aggregates_kokkos_fwd.hpp" #include "MueLu_AmalgamationFactory_kokkos_fwd.hpp" #include "MueLu_AmalgamationInfo_kokkos_fwd.hpp" @@ -151,27 +153,27 @@ namespace MueLu { //@} - // CUDA 7.5 and 8.0 place a restriction on the placement of __device__ lambdas: - // - // An explicit __device__ lambda cannot be defined in a member function - // that has private or protected access within its class. - // - // Therefore, we expose BuildPuncoupled and isGoodMap for now. An alternative solution - // could be writing an out of class implementation, and then calling it in - // a member function. - void BuildPuncoupled(Level& coarseLevel, RCP A, RCP aggregates, - RCP amalgInfo, RCP fineNullspace, - RCP coarseMap, RCP& Ptentative, - RCP& coarseNullspace, const int levelID) const; + + // NOTE: All of thess should really be private, but CUDA doesn't like that + + void BuildPuncoupledBlockCrs(Level& coarseLevel, RCP A, RCP aggregates, RCP amalgInfo, + RCP fineNullspace, RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace, const int levelID) const; + + bool isGoodMap(const Map& rowMap, const Map& colMap) const; - private: + void BuildPcoupled (RCP A, RCP aggregates, RCP amalgInfo, RCP fineNullspace, RCP coarseMap, RCP& Ptentative, RCP& coarseNullspace) const; + void BuildPuncoupled(Level& coarseLevel, RCP A, RCP aggregates, + RCP amalgInfo, RCP fineNullspace, + RCP coarseMap, RCP& Ptentative, + RCP& coarseNullspace, const int levelID) const; + mutable bool bTransferCoordinates_ = false; }; diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp index 08f647cbcce9..e12983cc10d1 100644 --- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp +++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp @@ -49,6 +49,7 @@ #ifdef HAVE_MUELU_KOKKOS_REFACTOR #include "Kokkos_UnorderedMap.hpp" +#include "Xpetra_CrsGraphFactory.hpp" #include "MueLu_TentativePFactory_kokkos_decl.hpp" @@ -56,12 +57,15 @@ #include "MueLu_AmalgamationFactory_kokkos.hpp" #include "MueLu_AmalgamationInfo_kokkos.hpp" #include "MueLu_CoarseMapFactory_kokkos.hpp" + #include "MueLu_MasterList.hpp" #include "MueLu_NullspaceFactory_kokkos.hpp" #include "MueLu_PerfUtils.hpp" #include "MueLu_Monitor.hpp" #include "MueLu_Utilities_kokkos.hpp" +#include "Xpetra_IO.hpp" + namespace MueLu { namespace { // anonymous @@ -531,8 +535,15 @@ namespace MueLu { } } - if (!aggregates->AggregatesCrossProcessors()) - BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID()); + if (!aggregates->AggregatesCrossProcessors()) { + if(Xpetra::Helpers::isTpetraBlockCrs(A)) { + BuildPuncoupledBlockCrs(coarseLevel,A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, + coarseLevel.GetLevelID()); + } + else { + BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID()); + } + } else BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace); @@ -966,6 +977,310 @@ namespace MueLu { } } + + template + void TentativePFactory_kokkos>:: + BuildPuncoupledBlockCrs(Level& coarseLevel, RCP A, RCP aggregates, + RCP amalgInfo, RCP fineNullspace, + RCP coarsePointMap, RCP& Ptentative, + RCP& coarseNullspace, const int levelID) const { +#ifdef HAVE_MUELU_TPETRA + /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could + be generalized later, if we ever need to do so: + 1) Null space dimension === block size of matrix: So no elasticity right now + 2) QR is not supported: Under assumption #1, this shouldn't cause problems. + 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap. + + These assumptions keep our code way simpler and still support the use cases we actually care about. + */ + + RCP rowMap = A->getRowMap(); + RCP rangeMap = A->getRangeMap(); + RCP colMap = A->getColMap(); + // const size_t numFinePointRows = rangeMap->getLocalNumElements(); + const size_t numFineBlockRows = rowMap->getLocalNumElements(); + + typedef Teuchos::ScalarTraits STS; + typedef typename STS::magnitudeType Magnitude; + // const SC zero = STS::zero(); + const SC one = STS::one(); + const LO INVALID = Teuchos::OrdinalTraits::invalid(); + + // const GO numAggs = aggregates->GetNumAggregates(); + const size_t NSDim = fineNullspace->getNumVectors(); + auto aggSizes = aggregates->ComputeAggregateSizes(); + + + typename Aggregates_kokkos::local_graph_type aggGraph; + { + SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel); + aggGraph = aggregates->GetGraph(); + } + auto aggRows = aggGraph.row_map; + auto aggCols = aggGraph.entries; + + + // Need to generate the coarse block map + // NOTE: We assume NSDim == block size here + // NOTE: We also assume that coarseMap has contiguous GIDs + //const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements(); + const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim; + RCP coarseBlockMap = MapFactory::Build(coarsePointMap->lib(), + Teuchos::OrdinalTraits::invalid(), + numCoarseBlockRows, + coarsePointMap->getIndexBase(), + coarsePointMap->getComm()); + // Sanity checking + const ParameterList& pL = GetParameterList(); + // const bool &doQRStep = pL.get("tentative: calculate qr"); + + + // The aggregates use the amalgamated column map, which in this case is what we want + + // Aggregates map is based on the amalgamated column map + // We can skip global-to-local conversion if LIDs in row map are + // same as LIDs in column map + bool goodMap = MueLu::Utilities::MapsAreNested(*rowMap, *colMap); + TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError, + "MueLu: TentativePFactory_kokkos: for now works only with good maps " + "(i.e. \"matching\" row and column maps)"); + + // STEP 1: do unamalgamation + // The non-kokkos version uses member functions from the AmalgamationInfo + // container class to unamalgamate the data. In contrast, the kokkos + // version of TentativePFactory does the unamalgamation here and only uses + // the data of the AmalgamationInfo container class + + // Extract information for unamalgamation + LO fullBlockSize, blockID, stridingOffset, stridedBlockSize; + GO indexBase; + amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase); + //GO globalOffset = amalgInfo->GlobalOffset(); + + // Extract aggregation info (already in Kokkos host views) + auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly); + auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly); + const size_t numAggregates = aggregates->GetNumAggregates(); + + int myPID = aggregates->GetMap()->getComm()->getRank(); + + // Create Kokkos::View (on the device) to store the aggreate dof sizes + // Later used to get aggregate dof offsets + // NOTE: This zeros itself on construction + typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType; + AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan + + { + SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel); + + // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value + aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1); + + Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast(1), numAggregates+1)), aggSizes); + } + + // Find maximum dof size for aggregates + // Later used to reserve enough scratch space for local QR decompositions + LO maxAggSize = 0; + ReduceMaxFunctor reduceMax(aggDofSizes); + Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize); + + // parallel_scan (exclusive) + // The aggDofSizes View then contains the aggregate dof offsets + Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1), + KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) { + update += aggDofSizes(i); + if (final_pass) + aggDofSizes(i) = update; + }); + + // Create Kokkos::View on the device to store mapping + // between (local) aggregate id and row map ids (LIDs) + Kokkos::View aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows); + { + SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel); + + AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates); + Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast(0), numAggregates))); + + Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)), + KOKKOS_LAMBDA(const LO lnode) { + if (procWinner(lnode, 0) == myPID) { + // No need for atomics, it's one-to-one + auto aggID = vertex2AggId(lnode,0); + + auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize ); + // FIXME: I think this may be wrong + // We unconditionally add the whole block here. When we calculated + // aggDofSizes, we did the isLocalElement check. Something's fishy. + for (LO k = 0; k < stridedBlockSize; k++) + aggToRowMapLO(offset + k) = lnode*stridedBlockSize + k; + } + }); + } + + // STEP 2: prepare local QR decomposition + // Reserve memory for tentative prolongation operator + coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim); + + // Pull out the nullspace vectors so that we can have random access (on the device) + auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite); + auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll); + + typedef typename Xpetra::Matrix::local_matrix_type local_matrix_type; + typedef typename local_matrix_type::row_map_type::non_const_type rows_type; + typedef typename local_matrix_type::index_type::non_const_type cols_type; + typedef typename local_matrix_type::values_type::non_const_type vals_type; + + + // Device View for status (error messages...) + typedef Kokkos::View status_type; + status_type status("status"); + + typename AppendTrait::type fineNSRandom = fineNS; + typename AppendTrait ::type statusAtomic = status; + + // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for + GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl; + + // BlockCrs requires that we build the (block) graph first, so let's do that... + + // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one + // block non-zero per row in the matrix; + rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1); + cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows); + + Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows), + KOKKOS_LAMBDA(const LO j) { + ia[j] = j; + ja[j] = INVALID; + + if(j==(LO)numFineBlockRows-1) + ia[numFineBlockRows] = numFineBlockRows; + }); + + // Fill Graph + const Kokkos::TeamPolicy policy(numAggregates, 1); + Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:fillGraph", policy, + KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy::member_type &thread) { + auto agg = thread.league_rank(); + Xpetra::global_size_t offset = agg; + + // size of the aggregate (number of DOFs in aggregate) + LO aggSize = aggRows(agg+1) - aggRows(agg); + + for (LO j = 0; j < aggSize; j++) { + // FIXME: Allow for bad maps + const LO localRow = aggToRowMapLO[aggDofSizes[agg]+j]; + const size_t rowStart = ia[localRow]; + ja[rowStart] = offset; + } + }); + + // Compress storage (remove all INVALID, which happen when we skip zeros) + // We do that in-place + { + // Stage 2: compress the arrays + SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel); + // Fill i_temp with the correct row starts + rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1); + size_t nnz=0; + Kokkos::parallel_scan("MueLu:TentativePF:BlockCrs:compress_rows", range_type(0,numFineBlockRows), + KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) { + if(final) + i_temp[i] = upd; + for (auto j = ia[i]; j < ia[i+1]; j++) + if (ja[j] != INVALID) + upd++; + if(final && i == (LO) numFineBlockRows-1) + i_temp[numFineBlockRows] = upd; + },nnz); + + cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz); + + + Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:compress_cols", range_type(0,numFineBlockRows), + KOKKOS_LAMBDA(const LO i) { + size_t rowStart = i_temp[i]; + size_t lnnz = 0; + for (auto j = ia[i]; j < ia[i+1]; j++) + if (ja[j] != INVALID) { + j_temp[rowStart+lnnz] = ja[j]; + lnnz++; + } + }); + + ia = i_temp; + ja = j_temp; + } + + RCP BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,ia,ja); + + + // Managing labels & constants for ESFC + { + RCP FCparams; + if(pL.isSublist("matrixmatrix: kernel params")) + FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); + else + FCparams= rcp(new ParameterList); + // By default, we don't need global constants for TentativeP + FCparams->set("compute global constants",FCparams->get("compute global constants",false)); + std::string levelIDs = toString(levelID); + FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs); + RCP dummy_e; + RCP dummy_i; + BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams); + } + + // We can't leave the ia/ja pointers floating around, because of host/device view counting, so + // we clear them here + ia = rows_type(); + ja = cols_type(); + + + // Now let's make a BlockCrs Matrix + // NOTE: Assumes block size== NSDim + RCP > P_xpetra = Xpetra::CrsMatrixFactory::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim); + RCP > P_tpetra = rcp_dynamic_cast >(P_xpetra); + if(P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix"); + RCP P_wrap = rcp(new CrsMatrixWrap(P_xpetra)); + + auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst(); + const LO stride = NSDim*NSDim; + + Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:main_loop_noqr", policy, + KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy::member_type &thread) { + auto agg = thread.league_rank(); + + // size of the aggregate (number of DOFs in aggregate) + LO aggSize = aggRows(agg+1) - aggRows(agg); + Xpetra::global_size_t offset = agg*NSDim; + + // Q = localQR(:,0)/norm + for (LO j = 0; j < aggSize; j++) { + LO localBlockRow = aggToRowMapLO(aggRows(agg)+j); + LO rowStart = localBlockRow * stride; + for (LO r = 0; r < (LO)NSDim; r++) { + LO localPointRow = localBlockRow*NSDim + r; + for (LO c = 0; c < (LO)NSDim; c++) { + values[rowStart + r*NSDim + c] = fineNSRandom(localPointRow,c); + } + } + } + + // R = norm + for(LO j=0; j<(LO)NSDim; j++) + coarseNS(offset+j,j) = one; + }); + + Ptentative = P_wrap; + +#else + throw std::runtime_error("TentativePFactory::BuildPuncoupledBlockCrs: Requires Tpetra"); +#endif + } + template void TentativePFactory_kokkos>:: BuildPcoupled(RCP /* A */, RCP /* aggregates */, diff --git a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp index b3f8ab3fc887..e9f08ae11e6a 100644 --- a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp +++ b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp @@ -97,6 +97,8 @@ class Epetra_Vector; #ifdef HAVE_MUELU_TPETRA #include +#include +#include #include #include #include @@ -199,6 +201,14 @@ namespace MueLu { static const Tpetra::CrsMatrix& Op2TpetraCrs(const Xpetra::Matrix& Op); static Tpetra::CrsMatrix& Op2NonConstTpetraCrs(Xpetra::Matrix& Op); + static RCP > Op2TpetraBlockCrs(RCP > Op); + static RCP< Tpetra::BlockCrsMatrix > Op2NonConstTpetraBlockCrs(RCP > Op); + + static const Tpetra::BlockCrsMatrix& Op2TpetraBlockCrs(const Xpetra::Matrix& Op); + static Tpetra::BlockCrsMatrix& Op2NonConstTpetraBlockCrs(Xpetra::Matrix& Op); + + + static RCP > Op2TpetraRow(RCP > Op); static RCP< Tpetra::RowMatrix > Op2NonConstTpetraRow(RCP > Op); @@ -532,6 +542,76 @@ namespace MueLu { #endif } + + static RCP > Op2TpetraBlockCrs(RCP Op) { +#if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \ + (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT)))) + throw Exceptions::RuntimeError("Op2TpetraBlockCrs: Tpetra has not been compiled with support for LO=GO=int."); +#else + // Get the underlying Tpetra Mtx + RCP crsOp = rcp_dynamic_cast(Op); + if (crsOp == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + const RCP > &tmp_ECrsMtx = rcp_dynamic_cast >(crsOp->getCrsMatrix()); + if (tmp_ECrsMtx == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); + return tmp_ECrsMtx->getTpetra_BlockCrsMatrix(); +#endif + } + + static RCP< Tpetra::BlockCrsMatrix > Op2NonConstTpetraBlockCrs(RCP Op){ +#if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \ + (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT)))) + throw Exceptions::RuntimeError("Op2NonConstTpetraBlockCrs: Tpetra has not been compiled with support for LO=GO=int."); +#else + RCP crsOp = rcp_dynamic_cast(Op); + if (crsOp == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + const RCP > &tmp_ECrsMtx = rcp_dynamic_cast >(crsOp->getCrsMatrix()); + if (tmp_ECrsMtx == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); + return tmp_ECrsMtx->getTpetra_BlockCrsMatrixNonConst(); +#endif + }; + + static const Tpetra::BlockCrsMatrix& Op2TpetraBlockCrs(const Matrix& Op) { +#if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \ + (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT)))) + throw Exceptions::RuntimeError("Op2TpetraBlockCrs: Tpetra has not been compiled with support for LO=GO=int."); +#else + try { + const CrsMatrixWrap& crsOp = dynamic_cast(Op); + try { + const Xpetra::TpetraBlockCrsMatrix& tmp_ECrsMtx = dynamic_cast&>(*crsOp.getCrsMatrix()); + return *tmp_ECrsMtx.getTpetra_BlockCrsMatrix(); + } catch (std::bad_cast&) { + throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); + } + } catch (std::bad_cast&) { + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + } +#endif + } + static Tpetra::BlockCrsMatrix& Op2NonConstTpetraBlockCrs(Matrix& Op) { +#if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \ + (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT)))) + throw Exceptions::RuntimeError("Op2NonConstTpetraCrs: Tpetra has not been compiled with support for LO=GO=int."); +#else + try { + CrsMatrixWrap& crsOp = dynamic_cast(Op); + try { + Xpetra::TpetraBlockCrsMatrix& tmp_ECrsMtx = dynamic_cast&>(*crsOp.getCrsMatrix()); + return *tmp_ECrsMtx.getTpetra_BlockCrsMatrixNonConst(); + } catch (std::bad_cast&) { + throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); + } + } catch (std::bad_cast&) { + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + } +#endif + } + + static RCP > Op2TpetraRow(RCP Op) { #if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \ (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT)))) @@ -799,9 +879,11 @@ namespace MueLu { (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT)))) throw Exceptions::RuntimeError("Utilities::Transpose: Tpetra is not compiled with LO=GO=int. Add TPETRA_INST_INT_INT:BOOL=ON to your configuration!"); #else - try { + using Helpers = Xpetra::Helpers; + /***************************************************************/ + if(Helpers::isTpetraCrs(Op)) { const Tpetra::CrsMatrix& tpetraOp = Utilities::Op2TpetraCrs(Op); - + // Compute the transpose A of the Tpetra matrix tpetraOp. RCP > A; Tpetra::RowMatrixTransposer transposer(rcpFromRef(tpetraOp),label); @@ -825,9 +907,43 @@ namespace MueLu { return AAAA; } - catch (std::exception& e) { - std::cout << "threw exception '" << e.what() << "'" << std::endl; - throw Exceptions::RuntimeError("Utilities::Transpose failed, perhaps because matrix is not a Crs matrix"); + /***************************************************************/ + else if(Helpers::isTpetraBlockCrs(Op)) { + using BCRS = Tpetra::BlockCrsMatrix; + using CRS = Tpetra::CrsMatrix; + const BCRS & tpetraOp = Utilities::Op2TpetraBlockCrs(Op); + + if(!Op.getRowMap()->getComm()->getRank()) + std::cout<<"WARNING: Utilities::Transpose(): Using inefficient placeholder algorithm for Transpose"< At; + RCP Acrs = Tpetra::convertToCrsMatrix(tpetraOp); + { + Tpetra::RowMatrixTransposer transposer(Acrs,label); + + using Teuchos::ParameterList; + using Teuchos::rcp; + RCP transposeParams = params.is_null () ? + rcp (new ParameterList) : + rcp (new ParameterList (*params)); + transposeParams->set ("sort", false); + RCP Atcrs = transposer.createTranspose(transposeParams); + + At = Tpetra::convertToBlockCrsMatrix(*Atcrs,Op.GetStorageBlockSize()); + } + RCP > AA = rcp(new Xpetra::TpetraBlockCrsMatrix(At)); + RCP AAA = rcp_implicit_cast(AA); + RCP AAAA = rcp( new CrsMatrixWrap(AAA)); + + if (Op.IsView("stridedMaps")) + AAAA->CreateView("stridedMaps", Teuchos::rcpFromRef(Op), true/*doTranspose*/); + + return AAAA; + + } + /***************************************************************/ + else { + throw Exceptions::RuntimeError("Utilities::Transpose failed, perhaps because matrix is not a Crs or BlockCrs matrix"); } #endif #else diff --git a/packages/muelu/src/Utils/MueLu_Utilities_def.hpp b/packages/muelu/src/Utils/MueLu_Utilities_def.hpp index 469276531197..49a674aaa91e 100644 --- a/packages/muelu/src/Utils/MueLu_Utilities_def.hpp +++ b/packages/muelu/src/Utils/MueLu_Utilities_def.hpp @@ -300,6 +300,65 @@ namespace MueLu { } } + + template + RCP > Utilities::Op2TpetraBlockCrs(RCP > Op) { + using XCrsMatrixWrap = Xpetra::CrsMatrixWrap; + // Get the underlying Tpetra Mtx + RCP crsOp = rcp_dynamic_cast(Op); + if (crsOp == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + const RCP > &tmp_ECrsMtx = rcp_dynamic_cast >(crsOp->getCrsMatrix()); + if (tmp_ECrsMtx == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); + return tmp_ECrsMtx->getTpetra_BlockCrsMatrix(); + } + + template + RCP< Tpetra::BlockCrsMatrix > Utilities::Op2NonConstTpetraBlockCrs(RCP > Op){ + using XCrsMatrixWrap = Xpetra::CrsMatrixWrap; + RCP crsOp = rcp_dynamic_cast(Op); + if (crsOp == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + const RCP > &tmp_ECrsMtx = rcp_dynamic_cast >(crsOp->getCrsMatrix()); + if (tmp_ECrsMtx == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); + return tmp_ECrsMtx->getTpetra_BlockCrsMatrixNonConst(); + } + + template + const Tpetra::BlockCrsMatrix& Utilities::Op2TpetraBlockCrs(const Xpetra::Matrix& Op) { + try { + using XCrsMatrixWrap = Xpetra::CrsMatrixWrap; + const XCrsMatrixWrap& crsOp = dynamic_cast(Op); + try { + const Xpetra::TpetraBlockCrsMatrix& tmp_ECrsMtx = dynamic_cast&>(*crsOp.getCrsMatrix()); + return *tmp_ECrsMtx.getTpetra_BlockCrsMatrix(); + } catch (std::bad_cast&) { + throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); + } + } catch (std::bad_cast&) { + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + } + } + + template + Tpetra::BlockCrsMatrix& Utilities::Op2NonConstTpetraBlockCrs(Xpetra::Matrix& Op) { + try { + using XCrsMatrixWrap = Xpetra::CrsMatrixWrap; + XCrsMatrixWrap& crsOp = dynamic_cast(Op); + try { + Xpetra::TpetraBlockCrsMatrix& tmp_ECrsMtx = dynamic_cast&>(*crsOp.getCrsMatrix()); + return *tmp_ECrsMtx.getTpetra_BlockCrsMatrixNonConst(); + } catch (std::bad_cast&) { + throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); + } + } catch (std::bad_cast&) { + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + } + } + + template RCP > Utilities::Op2TpetraRow(RCP > Op) { RCP > mat = rcp_dynamic_cast >(Op); @@ -498,12 +557,14 @@ namespace MueLu { #ifdef HAVE_MUELU_TPETRA if (TorE == "tpetra") { - try { + using Helpers = Xpetra::Helpers; + /***************************************************************/ + if(Helpers::isTpetraCrs(Op)) { const Tpetra::CrsMatrix& tpetraOp = Utilities::Op2TpetraCrs(Op); - + RCP > A; Tpetra::RowMatrixTransposer transposer(rcpFromRef(tpetraOp),label); //more than meets the eye - + { using Teuchos::ParameterList; using Teuchos::rcp; @@ -513,20 +574,53 @@ namespace MueLu { transposeParams->set ("sort", false); A = transposer.createTranspose (transposeParams); } - + RCP > AA = rcp(new Xpetra::TpetraCrsMatrix(A) ); RCP > AAA = rcp_implicit_cast >(AA); RCP > AAAA = rcp( new Xpetra::CrsMatrixWrap(AAA) ); if (!AAAA->isFillComplete()) AAAA->fillComplete(Op.getRangeMap(), Op.getDomainMap()); - + if (Op.IsView("stridedMaps")) AAAA->CreateView("stridedMaps", Teuchos::rcpFromRef(Op), true/*doTranspose*/); - + return AAAA; - - } catch (std::exception& e) { - std::cout << "threw exception '" << e.what() << "'" << std::endl; + } + else if(Helpers::isTpetraBlockCrs(Op)) { + using XMatrix = Xpetra::Matrix; + using XCrsMatrix = Xpetra::CrsMatrix; + using XCrsMatrixWrap = Xpetra::CrsMatrixWrap; + using BCRS = Tpetra::BlockCrsMatrix; + using CRS = Tpetra::CrsMatrix; + const BCRS & tpetraOp = Utilities::Op2TpetraBlockCrs(Op); + + if(!Op.getRowMap()->getComm()->getRank()) + std::cout<<"WARNING: Utilities::Transpose(): Using inefficient placeholder algorithm for Transpose"< At; + RCP Acrs = Tpetra::convertToCrsMatrix(tpetraOp); + { + Tpetra::RowMatrixTransposer transposer(Acrs,label); + + using Teuchos::ParameterList; + using Teuchos::rcp; + RCP transposeParams = params.is_null () ? + rcp (new ParameterList) : + rcp (new ParameterList (*params)); + transposeParams->set ("sort", false); + RCP Atcrs = transposer.createTranspose(transposeParams); + + At = Tpetra::convertToBlockCrsMatrix(*Atcrs,Op.GetStorageBlockSize()); + } + RCP > AA = rcp(new Xpetra::TpetraBlockCrsMatrix(At)); + RCP AAA = rcp_implicit_cast(AA); + RCP AAAA = rcp( new XCrsMatrixWrap(AAA)); + + if (Op.IsView("stridedMaps")) + AAAA->CreateView("stridedMaps", Teuchos::rcpFromRef(Op), true/*doTranspose*/); + + return AAAA; + } else { throw Exceptions::RuntimeError("Utilities::Transpose failed, perhaps because matrix is not a Crs matrix"); } } //if diff --git a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp index 90799c2cd562..7d379bc68259 100644 --- a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp +++ b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp @@ -368,47 +368,93 @@ namespace MueLu { using impl_scalar_type = typename Kokkos::ArithTraits::val_type; using ATS = Kokkos::ArithTraits; using range_type = Kokkos::RangePolicy; + using helpers = Xpetra::Helpers; - auto localMatrix = A.getLocalMatrixDevice(); - LO numRows = A.getLocalNumRows(); - Kokkos::View boundaryNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows); - if (count_twos_as_dirichlet) - Kokkos::parallel_for("MueLu:Utils::DetectDirichletRows_Twos_As_Dirichlet", range_type(0,numRows), + if(helpers::isTpetraBlockCrs(A)) { +#ifdef HAVE_MUELU_TPETRA + const Tpetra::BlockCrsMatrix & Am = helpers::Op2TpetraBlockCrs(A); + auto b_graph = Am.getCrsGraph().getLocalGraphDevice(); + auto b_rowptr = Am.getCrsGraph().getLocalRowPtrsDevice(); + auto values = Am.getValuesDevice(); + LO numBlockRows = Am.getLocalNumRows(); + const LO stride = Am.getBlockSize() * Am.getBlockSize(); + + Kokkos::View boundaryNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numBlockRows); + + if (count_twos_as_dirichlet) + throw Exceptions::RuntimeError("BlockCrs does not support counting twos as Dirichlet"); + + Kokkos::parallel_for("MueLu:Utils::DetectDirichletRowsBlockCrs", range_type(0,numBlockRows), KOKKOS_LAMBDA(const LO row) { - auto rowView = localMatrix.row(row); + auto rowView = b_graph.rowConst(row); auto length = rowView.length; + LO valstart = b_rowptr[row] * stride; boundaryNodes(row) = true; - if (length > 2) { - decltype(length) colID = 0; - for (; colID < length; colID++) - if ((rowView.colidx(colID) != row) && - (ATS::magnitude(rowView.value(colID)) > tol)) { - if (!boundaryNodes(row)) + decltype(length) colID =0; + for (; colID < length; colID++) { + if (rowView.colidx(colID) != row) { + LO current = valstart + colID*stride; + for(LO k=0; k tol) { + boundaryNodes(row) = false; break; - boundaryNodes(row) = false; + } } - if (colID == length) - boundaryNodes(row) = true; + } + if(boundaryNodes(row) == false) + break; } }); - else - Kokkos::parallel_for("MueLu:Utils::DetectDirichletRows", range_type(0,numRows), - KOKKOS_LAMBDA(const LO row) { - auto rowView = localMatrix.row(row); - auto length = rowView.length; - boundaryNodes(row) = true; - for (decltype(length) colID = 0; colID < length; colID++) - if ((rowView.colidx(colID) != row) && - (ATS::magnitude(rowView.value(colID)) > tol)) { - boundaryNodes(row) = false; - break; + return boundaryNodes; +#else + throw Exceptions::RuntimeError("BlockCrs requires Tpetra"); +#endif + } + else { + auto localMatrix = A.getLocalMatrixDevice(); + LO numRows = A.getLocalNumRows(); + Kokkos::View boundaryNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows); + + if (count_twos_as_dirichlet) + Kokkos::parallel_for("MueLu:Utils::DetectDirichletRows_Twos_As_Dirichlet", range_type(0,numRows), + KOKKOS_LAMBDA(const LO row) { + auto rowView = localMatrix.row(row); + auto length = rowView.length; + + boundaryNodes(row) = true; + if (length > 2) { + decltype(length) colID =0; + for ( ; colID < length; colID++) + if ((rowView.colidx(colID) != row) && + (ATS::magnitude(rowView.value(colID)) > tol)) { + if (!boundaryNodes(row)) + break; + boundaryNodes(row) = false; + } + if (colID == length) + boundaryNodes(row) = true; } - }); - + }); + else + Kokkos::parallel_for("MueLu:Utils::DetectDirichletRows", range_type(0,numRows), + KOKKOS_LAMBDA(const LO row) { + auto rowView = localMatrix.row(row); + auto length = rowView.length; + + boundaryNodes(row) = true; + for (decltype(length) colID = 0; colID < length; colID++) + if ((rowView.colidx(colID) != row) && + (ATS::magnitude(rowView.value(colID)) > tol)) { + boundaryNodes(row) = false; + break; + } + }); return boundaryNodes; + } + } template diff --git a/packages/muelu/test/unit_tests/CMakeLists.txt b/packages/muelu/test/unit_tests/CMakeLists.txt index 122d6bd99c6c..e1d22c25e242 100644 --- a/packages/muelu/test/unit_tests/CMakeLists.txt +++ b/packages/muelu/test/unit_tests/CMakeLists.txt @@ -445,8 +445,9 @@ ENDIF() ADD_SUBDIRECTORY(ParameterList/FactoryFactory/) +ADD_SUBDIRECTORY(ParameterList/ParameterListInterpreter/) + IF (${PACKAGE_NAME}_ENABLE_Epetra) ADD_SUBDIRECTORY(ParameterList/MLParameterListInterpreter/) - ADD_SUBDIRECTORY(ParameterList/ParameterListInterpreter/) ADD_SUBDIRECTORY(ParameterList/CreateSublists/) ENDIF() diff --git a/packages/muelu/test/unit_tests/Hierarchy.cpp b/packages/muelu/test/unit_tests/Hierarchy.cpp index 1a8c92841ed7..6132086accfb 100644 --- a/packages/muelu/test/unit_tests/Hierarchy.cpp +++ b/packages/muelu/test/unit_tests/Hierarchy.cpp @@ -1672,7 +1672,7 @@ namespace MueLuTests { } - TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Hierarchy, BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node) + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Hierarchy, BlockCrs_Mixed, Scalar, LocalOrdinal, GlobalOrdinal, Node) { # include MUELU_TESTING_SET_OSTREAM; @@ -1789,6 +1789,7 @@ namespace MueLuTests { TEST_EQUALITY(0,0); } + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Hierarchy, CheckNullspaceDimension, Scalar, LocalOrdinal, GlobalOrdinal, Node) { // Test that HierarchyManager throws if user-supplied nullspace has dimension smaller than numPDEs @@ -1835,7 +1836,7 @@ namespace MueLuTests { TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, SetupHierarchy3levelFacManagers, Scalar, LO, GO, Node) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, SetupHierarchyTestBreakCondition, Scalar, LO, GO, Node) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, Write, Scalar, LO, GO, Node) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, BlockCrs, Scalar, LO, GO, Node) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, BlockCrs_Mixed, Scalar, LO, GO, Node) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, CheckNullspaceDimension, Scalar, LO, GO, Node) \ diff --git a/packages/muelu/test/unit_tests/MueLu_TestHelpers.hpp b/packages/muelu/test/unit_tests/MueLu_TestHelpers.hpp index 4bd0c652af14..a6f3e9af16c9 100644 --- a/packages/muelu/test/unit_tests/MueLu_TestHelpers.hpp +++ b/packages/muelu/test/unit_tests/MueLu_TestHelpers.hpp @@ -881,10 +881,24 @@ namespace MueLuTests { basematrix[4] = two; basematrix[7] = three; basematrix[8] = two; + Teuchos::Array offmatrix(blocksize*blocksize, zero); + offmatrix[0]=offmatrix[4]=offmatrix[8]=-1; + Teuchos::Array lclColInds(1); for (LocalOrdinal lclRowInd = meshRowMap.getMinLocalIndex (); lclRowInd <= meshRowMap.getMaxLocalIndex(); ++lclRowInd) { lclColInds[0] = lclRowInd; bcrsmatrix->replaceLocalValues(lclRowInd, lclColInds.getRawPtr(), &basematrix[0], 1); + + // Off diagonals + if(lclRowInd > meshRowMap.getMinLocalIndex ()) { + lclColInds[0] = lclRowInd - 1; + bcrsmatrix->replaceLocalValues(lclRowInd, lclColInds.getRawPtr(), &offmatrix[0], 1); + } + if(lclRowInd < meshRowMap.getMaxLocalIndex ()) { + lclColInds[0] = lclRowInd + 1; + bcrsmatrix->replaceLocalValues(lclRowInd, lclColInds.getRawPtr(), &offmatrix[0], 1); + } + } RCP > temp = rcp(new Xpetra::TpetraBlockCrsMatrix(bcrsmatrix)); diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter.cpp b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter.cpp index a0f0ada4b9b7..90639e98e0d9 100644 --- a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter.cpp +++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter.cpp @@ -43,14 +43,23 @@ // *********************************************************************** // // @HEADER - #include +#include #include #include #include #include +#include + +#include + +#ifdef HAVE_MUELU_TPETRA +#include "Tpetra_BlockCrsMatrix_Helpers.hpp" +#include "TpetraExt_MatrixMatrix.hpp" +#endif + namespace MueLuTests { @@ -67,6 +76,14 @@ namespace MueLuTests { ArrayRCP fileList = TestHelpers::GetFileList(std::string("ParameterList/ParameterListInterpreter/"), std::string(".xml")); for(int i=0; i< fileList.size(); i++) { + // Ignore files with "BlockCrs" in their name + auto found = fileList[i].find("BlockCrs"); + if(found != std::string::npos) continue; + + // Ignore files with "Comparison" in their name + found = fileList[i].find("Comparison"); + if(found != std::string::npos) continue; + out << "Processing file: " << fileList[i] << std::endl; ParameterListInterpreter mueluFactory("ParameterList/ParameterListInterpreter/" + fileList[i],*comm); @@ -83,8 +100,178 @@ namespace MueLuTests { out << "Skipping test because some required packages are not enabled (Tpetra, Epetra, EpetraExt, Ifpack, Ifpack2, Amesos, Amesos2)." << std::endl; # endif } + + +TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ParameterListInterpreter, BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node) + { +# include + MUELU_TESTING_SET_OSTREAM; + MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node); +#if defined(HAVE_MUELU_TPETRA) + MUELU_TEST_ONLY_FOR(Xpetra::UseTpetra) { + Teuchos::ParameterList matrixParams; + matrixParams.set("matrixType","Laplace1D"); + matrixParams.set("nx",(GlobalOrdinal)300);// needs to be even + + RCP A = TestHelpers::TpetraTestFactory::BuildBlockMatrix(matrixParams,Xpetra::UseTpetra); + out<<"Matrix Size (block) = "<getGlobalNumRows()<<" (point) "<getRangeMap()->getGlobalNumElements()< > comm = TestHelpers::Parameters::getDefaultComm(); + + ArrayRCP fileList = TestHelpers::GetFileList(std::string("ParameterList/ParameterListInterpreter/"), std::string(".xml")); + + for(int i=0; i< fileList.size(); i++) { + // Only run files with "BlockCrs" in their name + auto found = fileList[i].find("BlockCrs"); + if(found == std::string::npos) continue; + + out << "Processing file: " << fileList[i] << std::endl; + ParameterListInterpreter mueluFactory("ParameterList/ParameterListInterpreter/" + fileList[i],*comm); + + RCP H = mueluFactory.CreateHierarchy(); + H->GetLevel(0)->Set("A", A); + + mueluFactory.SetupHierarchy(*H); + + // Test to make sure all of the matrices in the Hierarchy are actually Block Matrices + using helpers = Xpetra::Helpers; + for(int j=0; jGetNumLevels(); j++) { + RCP level = H->GetLevel(j); + + RCP Am = level->Get >("A"); + TEST_EQUALITY(helpers::isTpetraBlockCrs(Am),true); + if(j>0) { + RCP P = level->Get >("P"); + TEST_EQUALITY(helpers::isTpetraBlockCrs(P),true); + RCP R = level->Get >("R"); + TEST_EQUALITY(helpers::isTpetraBlockCrs(R),true); + } + } + + //TODO: check no unused parameters + //TODO: check results of Iterate() + } + } +# endif + TEST_EQUALITY(1,1); + } + + + +#if defined(HAVE_MUELU_TPETRA) +template +MT compare_matrices(RCP & Ap, RCP &Ab) { + using SC = typename Matrix::scalar_type; + using LO = typename Matrix::local_ordinal_type; + using GO = typename Matrix::global_ordinal_type; + using NO = typename Matrix::node_type; + using CRS=Tpetra::CrsMatrix; + SC one = Teuchos::ScalarTraits::one(); + SC zero = Teuchos::ScalarTraits::zero(); + + RCP Ap_t = MueLu::Utilities::Op2TpetraCrs(Ap); + auto Ab_t = MueLu::Utilities::Op2TpetraBlockCrs(Ab); + RCP Ab_as_point = Tpetra::convertToCrsMatrix(*Ab_t); + + RCP diff = rcp(new CRS(Ap_t->getCrsGraph())); + diff->setAllToScalar(zero); + diff->fillComplete(); + Tpetra::MatrixMatrix::Add(*Ap_t,false,one,*Ab_as_point,false,-one,diff); + return diff->getFrobeniusNorm(); +} +#endif + + +TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ParameterListInterpreter, PointCrs_vs_BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node) + { +# include + MUELU_TESTING_SET_OSTREAM; + MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node); +#if defined(HAVE_MUELU_TPETRA) + MUELU_TEST_ONLY_FOR(Xpetra::UseTpetra) { + Teuchos::ParameterList matrixParams; + matrixParams.set("matrixType","Laplace1D"); + matrixParams.set("nx",(GlobalOrdinal)300);// needs to be even + + RCP PointA = TestHelpers::TestFactory::BuildMatrix(matrixParams,Xpetra::UseTpetra); + RCP BlockA; + { + using XCRS = Xpetra::TpetraBlockCrsMatrix; + + auto tA = MueLu::Utilities::Op2TpetraCrs(PointA); + auto bA = Tpetra::convertToBlockCrsMatrix(*tA,1); + RCP AA = rcp(new XCRS(bA)); + BlockA = rcp(new CrsMatrixWrap(rcp_implicit_cast(AA))); + } + + out<<"Point: Matrix Size (block) = "<getGlobalNumRows()<<" (point) "<getRangeMap()->getGlobalNumElements()<getRangeMap()->getGlobalNumElements()< > comm = TestHelpers::Parameters::getDefaultComm(); + + ArrayRCP fileList = TestHelpers::GetFileList(std::string("ParameterList/ParameterListInterpreter/"), std::string(".xml")); + + for(int i=0; i< fileList.size(); i++) { + // Only run files with "Comparison" in their name + auto found = fileList[i].find("Comparison"); + if(found == std::string::npos) continue; + + out << "Processing file: " << fileList[i] << std::endl; + + // Point Hierarchy + ParameterListInterpreter mueluFactory1("ParameterList/ParameterListInterpreter/" + fileList[i],*comm); + RCP PointH = mueluFactory1.CreateHierarchy(); + PointH->GetLevel(0)->Set("A", PointA); + mueluFactory1.SetupHierarchy(*PointH); + + // Block Hierachy + ParameterListInterpreter mueluFactory2("ParameterList/ParameterListInterpreter/" + fileList[i],*comm); + RCP BlockH = mueluFactory2.CreateHierarchy(); + BlockH->GetLevel(0)->Set("A", BlockA); + mueluFactory2.SetupHierarchy(*BlockH); + + // Check to see that we get the same matrices in both hierarchies + TEST_EQUALITY(PointH->GetNumLevels(),BlockH->GetNumLevels()); + + for(int j=0; jGetNumLevels(); j++) { + using CRS=Tpetra::CrsMatrix; + using MT = typename Teuchos::ScalarTraits::magnitudeType; + MT tol = Teuchos::ScalarTraits::squareroot(Teuchos::ScalarTraits::eps()); + + RCP Plevel = PointH->GetLevel(j); + RCP Blevel = BlockH->GetLevel(j); + + // Compare A + RCP Ap = Plevel->Get >("A"); + RCP Ab = Blevel->Get >("A"); + MT norm = compare_matrices(Ap,Ab); + TEUCHOS_TEST_COMPARE(norm,<,tol,out,success); + + // Compare P, R + if(j>0) { + RCP Pp = Plevel->Get >("P"); + RCP Pb = Blevel->Get >("P"); + norm = compare_matrices(Pp,Pb); + TEUCHOS_TEST_COMPARE(norm,<,tol,out,success); + + RCP Rp = Plevel->Get >("R"); + RCP Rb = Blevel->Get >("R"); + norm = compare_matrices(Rp,Rb); + TEUCHOS_TEST_COMPARE(norm,<,tol,out,success); + } + } + + //TODO: check no unused parameters + //TODO: check results of Iterate() + } + } +# endif + TEST_EQUALITY(1,1); + } + + #define MUELU_ETI_GROUP(Scalar, LocalOrdinal, GlobalOrdinal, Node) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ParameterListInterpreter, SetParameterList, Scalar, LocalOrdinal, GlobalOrdinal, Node) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ParameterListInterpreter, SetParameterList, Scalar, LocalOrdinal, GlobalOrdinal, Node) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ParameterListInterpreter, BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ParameterListInterpreter, PointCrs_vs_BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node) #include diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml new file mode 100644 index 000000000000..d048ab0d38d8 --- /dev/null +++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml new file mode 100644 index 000000000000..5954af0d9af4 --- /dev/null +++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/CMakeLists.txt b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/CMakeLists.txt index 8e56cda05351..7023acbef07c 100644 --- a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/CMakeLists.txt +++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/CMakeLists.txt @@ -3,6 +3,8 @@ # regenerate a build system incorporating the new file. # YOU MUST ALSO TOUCH A CMAKE CONFIGURATION FILE WHEN YOU PUSH THE NEW # FILE TO FORCE THE RECONFIGURE ON OTHER PEOPLE'S BUILDS. + + FILE(GLOB xmlFiles RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.xml) TRIBITS_COPY_FILES_TO_BINARY_DIR(ParameterList_ParameterListInterpreter_cp diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/Comparison1.xml b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/Comparison1.xml new file mode 100644 index 000000000000..5432a0dbdc37 --- /dev/null +++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/Comparison1.xml @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/packages/muelu/test/unit_tests/Smoothers/Ifpack2Smoother.cpp b/packages/muelu/test/unit_tests/Smoothers/Ifpack2Smoother.cpp index eb8d11f54ba0..81bcf614654e 100644 --- a/packages/muelu/test/unit_tests/Smoothers/Ifpack2Smoother.cpp +++ b/packages/muelu/test/unit_tests/Smoothers/Ifpack2Smoother.cpp @@ -567,7 +567,7 @@ namespace MueLuTests { } // banded - TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Ifpack2Smoother, BlockCrsMatrix_Relaxation, Scalar, LocalOrdinal, GlobalOrdinal, Node) + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Ifpack2Smoother, BlockCrsMatrix_Relaxation_ViaPoint, Scalar, LocalOrdinal, GlobalOrdinal, Node) { # include MUELU_TESTING_SET_OSTREAM; @@ -592,6 +592,32 @@ namespace MueLuTests { } } + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Ifpack2Smoother, BlockCrsMatrix_Relaxation_AsBlock, Scalar, LocalOrdinal, GlobalOrdinal, Node) + { +# include + MUELU_TESTING_SET_OSTREAM; + MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node); + + MUELU_TEST_ONLY_FOR(Xpetra::UseTpetra) { + Teuchos::ParameterList matrixParams, ifpack2Params; + + matrixParams.set("matrixType","Laplace1D"); + matrixParams.set("nx",(GlobalOrdinal)20);// needs to be even + + RCP A = TestHelpers::TpetraTestFactory::BuildBlockMatrix(matrixParams,Xpetra::UseTpetra); + ifpack2Params.set("smoother: use blockcrsmatrix storage",true); + + Ifpack2Smoother smoother("RELAXATION",ifpack2Params); + + Level level; TestHelpers::TestFactory::createSingleLevelHierarchy(level); + level.Set("A", A); + smoother.Setup(level); + + TEST_EQUALITY(1,1); + } + } + + #define MUELU_ETI_GROUP(SC,LO,GO,NO) \ @@ -605,7 +631,8 @@ namespace MueLuTests { TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BandedRelaxation,SC,LO,GO,NO) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,TriDiRelaxation,SC,LO,GO,NO) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BlockRelaxation_Autosize,SC,LO,GO,NO) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BlockCrsMatrix_Relaxation,SC,LO,GO,NO) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BlockCrsMatrix_Relaxation_ViaPoint,SC,LO,GO,NO) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BlockCrsMatrix_Relaxation_AsBlock,SC,LO,GO,NO) #include diff --git a/packages/xpetra/src/BlockedCrsMatrix/Xpetra_BlockedCrsMatrix.hpp b/packages/xpetra/src/BlockedCrsMatrix/Xpetra_BlockedCrsMatrix.hpp index bb9d38caf18b..4c7893e0734b 100644 --- a/packages/xpetra/src/BlockedCrsMatrix/Xpetra_BlockedCrsMatrix.hpp +++ b/packages/xpetra/src/BlockedCrsMatrix/Xpetra_BlockedCrsMatrix.hpp @@ -1520,6 +1520,9 @@ namespace Xpetra { return thbOp; } #endif + //! Returns the block size of the storage mechanism + LocalOrdinal GetStorageBlockSize() const {return 1;} + //! Compute a residual R = B - (*this) * X void residual(const MultiVector & X, diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp index 407f1d45f31a..c0806e943ba8 100644 --- a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp +++ b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp @@ -352,6 +352,9 @@ namespace Xpetra { //! Does this have an underlying matrix virtual bool hasMatrix() const = 0; + //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix + virtual LocalOrdinal GetStorageBlockSize() const = 0; + //! Compute a residual R = B - (*this) * X virtual void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B, diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrixFactory.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrixFactory.hpp index d7ad7d410d61..9a81d816f02b 100644 --- a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrixFactory.hpp +++ b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrixFactory.hpp @@ -52,6 +52,7 @@ #ifdef HAVE_XPETRA_TPETRA #include "Xpetra_TpetraCrsMatrix.hpp" +#include "Xpetra_TpetraBlockCrsMatrix.hpp" #endif #ifdef HAVE_XPETRA_EPETRA @@ -289,6 +290,25 @@ namespace Xpetra { XPETRA_FACTORY_END; } #endif + + // Builds a BlockCrsMatrix + static RCP > BuildBlock ( + const Teuchos::RCP >& blockGraph, + const Teuchos::RCP >& domainMap, + const Teuchos::RCP >& rangeMap, + LocalOrdinal blockSize) { + + XPETRA_MONITOR("CrsMatrixFactory::BuildBlock"); + +#ifdef HAVE_XPETRA_TPETRA + if (domainMap->lib() == UseTpetra) { + return rcp(new Xpetra::TpetraBlockCrsMatrix(blockGraph,domainMap,rangeMap,blockSize) ); + } +#endif + TEUCHOS_TEST_FOR_EXCEPTION(domainMap->lib() == UseEpetra, std::logic_error, "Epetra doesn't support this matrix constructor"); + + XPETRA_FACTORY_END; + } }; @@ -318,10 +338,9 @@ namespace Xpetra { if (rowMap->lib() == UseTpetra) return rcp( new TpetraCrsMatrix(rowMap, 0) ); #endif -#ifdef HAVE_XPETRA_EPETRA if(rowMap->lib() == UseEpetra) return rcp( new EpetraCrsMatrixT(rowMap)); -#endif + XPETRA_FACTORY_END; } @@ -543,6 +562,23 @@ namespace Xpetra { } #endif + //! Build a BlockCrsMatrix + static RCP > BuildBlock ( + const Teuchos::RCP >& blockGraph, + const Teuchos::RCP >& domainMap, + const Teuchos::RCP >& rangeMap, + LocalOrdinal blockSize) { + + XPETRA_MONITOR("CrsMatrixFactory::BuildBlock"); +#ifdef HAVE_XPETRA_TPETRA + if (domainMap->lib() == UseTpetra) + return rcp(new Xpetra::TpetraBlockCrsMatrix(blockGraph,domainMap,rangeMap,blockSize) ); +#endif + TEUCHOS_TEST_FOR_EXCEPTION(domainMap->lib() == UseEpetra, std::logic_error, "Epetra doesn't support this matrix constructor"); + + XPETRA_FACTORY_END; + } + }; #endif @@ -772,6 +808,28 @@ namespace Xpetra { } #endif + + //! Build a BlockCrsMatrix + static RCP > BuildBlock ( + const Teuchos::RCP >& blockGraph, + const Teuchos::RCP >& domainMap, + const Teuchos::RCP >& rangeMap, + LocalOrdinal blockSize) { + + XPETRA_MONITOR("CrsMatrixFactory::BuildBlock"); + +#ifdef HAVE_XPETRA_TPETRA + if (domainMap->lib() == UseTpetra) { + return rcp(new Xpetra::TpetraBlockCrsMatrix(blockGraph,domainMap,rangemap,blockSize) ); + } +#endif + TEUCHOS_TEST_FOR_EXCEPTION(domainMap->lib() == UseEpetra, std::logic_error, "Epetra doesn't support this matrix constructor"); + + XPETRA_FACTORY_END; + } + + + }; #endif diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp index 9b9321049333..42dc9b4d390a 100644 --- a/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp +++ b/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp @@ -258,6 +258,9 @@ local_matrix_type getLocalMatrixDevice () const { TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError, "Xpetra::EpetraCrsMatrix only available for GO=int or GO=long long with EpetraNode (Serial or OpenMP depending on configuration)"); } + + LocalOrdinal GetStorageBlockSize() const {return 1;} + #else #ifdef __GNUC__ #warning "Xpetra Kokkos interface for CrsMatrix is enabled (HAVE_XPETRA_KOKKOS_REFACTOR) but Tpetra is disabled. The Kokkos interface needs Tpetra to be enabled, too." @@ -265,6 +268,7 @@ local_matrix_type getLocalMatrixDevice () const { #endif #endif + void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & R) const{ @@ -1305,6 +1309,7 @@ typename local_matrix_type::HostMirror getLocalMatrixHost () const { } + LocalOrdinal GetStorageBlockSize() const {return 1;} private: #else @@ -1315,6 +1320,8 @@ typename local_matrix_type::HostMirror getLocalMatrixHost () const { #endif //@} + + void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & R) const { @@ -2348,6 +2355,8 @@ class EpetraCrsMatrixT } + + LocalOrdinal GetStorageBlockSize() const {return 1;} private: #else @@ -2356,6 +2365,8 @@ class EpetraCrsMatrixT #endif #endif #endif + + //@} void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp index 1db4e9d5bd3f..2dc8bccf794b 100644 --- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp +++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp @@ -113,6 +113,12 @@ namespace Xpetra { TpetraBlockCrsMatrix(const Teuchos::RCP< const CrsGraph< LocalOrdinal, GlobalOrdinal, Node> > &graph, const LocalOrdinal blockSize); + //! Constructor specifying a previously constructed graph, point maps & blocksize + TpetraBlockCrsMatrix(const Teuchos::RCP< const CrsGraph< LocalOrdinal, GlobalOrdinal, Node> > &graph, + const Teuchos::RCP >& pointDomainMap, + const Teuchos::RCP >& pointRangeMap, + const LocalOrdinal blockSize); + //! Constructor for a fused import ( not implemented ) TpetraBlockCrsMatrix(const Teuchos::RCP >& sourceMatrix, @@ -410,6 +416,9 @@ namespace Xpetra { #endif // HAVE_XPETRA_TPETRA #endif // HAVE_XPETRA_KOKKOS_REFACTOR + //! Returns the block size of the storage mechanism + LocalOrdinal GetStorageBlockSize() const {return mtx_->getBlockSize();} + //! Compute a residual R = B - (*this) * X void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B, diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp index 410dbddfab76..fd069dd54356 100644 --- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp +++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp @@ -46,7 +46,9 @@ #ifndef XPETRA_TPETRABLOCKCRSMATRIX_DEF_HPP #define XPETRA_TPETRABLOCKCRSMATRIX_DEF_HPP + #include "Xpetra_TpetraBlockCrsMatrix_decl.hpp" +#include "Xpetra_TpetraCrsGraph.hpp" namespace Xpetra { @@ -118,6 +120,17 @@ namespace Xpetra { { } + //! Constructor specifying a previously constructed graph, point maps & blocksize + template + TpetraBlockCrsMatrix:: + TpetraBlockCrsMatrix(const Teuchos::RCP< const CrsGraph< LocalOrdinal, GlobalOrdinal, Node> > &graph, + const Teuchos::RCP >& pointDomainMap, + const Teuchos::RCP >& pointRangeMap, + const LocalOrdinal blockSize) + : mtx_(Teuchos::rcp(new Tpetra::BlockCrsMatrix(*toTpetra(graph), *toTpetra(pointDomainMap), *toTpetra(pointRangeMap),blockSize))) + { } + + //! Constructor for a fused import ( not implemented ) template TpetraBlockCrsMatrix:: @@ -377,7 +390,12 @@ namespace Xpetra { TpetraBlockCrsMatrix:: getCrsGraph() const { - throw std::runtime_error("Xpetra::TpetraBlockCrsMatrix function not implemented in "+std::string(__FILE__)+":"+std::to_string(__LINE__)); + XPETRA_MONITOR("TpetraBlockCrsMatrix::getCrsGraph"); + using G_t = Tpetra::CrsGraph; + using G_x = TpetraCrsGraph; + RCP t_graph = Teuchos::rcp_const_cast(Teuchos::rcpFromRef(mtx_->getCrsGraph())); + RCP x_graph = rcp(new G_x(t_graph)); + return x_graph; } diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp index d5e94fee5ade..b6fa11b68eae 100644 --- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp +++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp @@ -453,6 +453,9 @@ namespace Xpetra { #endif #endif + //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix + LocalOrdinal GetStorageBlockSize() const {return 1;} + //! Compute a residual R = B - (*this) * X void residual(const MultiVector & X, const MultiVector & B, @@ -860,6 +863,9 @@ namespace Xpetra { #endif #endif + //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix + LocalOrdinal GetStorageBlockSize() const {return 1;} + //! Compute a residual R = B - (*this) * X void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B, @@ -1263,6 +1269,9 @@ namespace Xpetra { #endif #endif + //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix + LocalOrdinal GetStorageBlockSize() const {return 1;} + //! Compute a residual R = B - (*this) * X void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B, diff --git a/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_decl.hpp b/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_decl.hpp index c30d1a00991f..415acfa6c940 100644 --- a/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_decl.hpp +++ b/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_decl.hpp @@ -480,12 +480,18 @@ class CrsMatrixWrap : RCP getCrsMatrix() const; + //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix + LocalOrdinal GetStorageBlockSize() const; + //! Compute a residual R = B - (*this) * X void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & R) const; + //! Expert only + void replaceCrsMatrix(RCP & M); + //@} private: diff --git a/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_def.hpp b/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_def.hpp index c3914921aa83..8a1d48d3b181 100644 --- a/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_def.hpp +++ b/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_def.hpp @@ -497,6 +497,28 @@ namespace Xpetra { } + // Expert only + template + void CrsMatrixWrap::replaceCrsMatrix(RCP & M) { + // Clear the old view table + Teuchos::Hashtable > dummy_table; + Matrix::operatorViewTable_ = dummy_table; + + finalDefaultView_ = M->isFillComplete(); + // Set matrix data + matrixData_ = M; + + + // Default view + CreateDefaultView(); + } + + + template + LocalOrdinal CrsMatrixWrap::GetStorageBlockSize() const { + return matrixData_->GetStorageBlockSize(); + } + template void CrsMatrixWrap::residual( const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, diff --git a/packages/xpetra/sup/Matrix/Xpetra_Matrix.hpp b/packages/xpetra/sup/Matrix/Xpetra_Matrix.hpp index b7f9432f054f..69c11f74ccec 100644 --- a/packages/xpetra/sup/Matrix/Xpetra_Matrix.hpp +++ b/packages/xpetra/sup/Matrix/Xpetra_Matrix.hpp @@ -559,11 +559,17 @@ namespace Xpetra { return 1; }; //TODO: why LocalOrdinal? + //! Returns true, if `SetFixedBlockSize` has been called before. bool IsFixedBlockSizeSet() const { return IsView("stridedMaps"); }; + + //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix + virtual LocalOrdinal GetStorageBlockSize() const = 0; + + // ---------------------------------------------------------------------------------- virtual void SetMaxEigenvalueEstimate(Scalar const &sigma) { diff --git a/packages/xpetra/sup/Utils/Xpetra_IO.hpp b/packages/xpetra/sup/Utils/Xpetra_IO.hpp index 364b6743fdb4..da863a390f23 100644 --- a/packages/xpetra/sup/Utils/Xpetra_IO.hpp +++ b/packages/xpetra/sup/Utils/Xpetra_IO.hpp @@ -313,6 +313,15 @@ namespace Xpetra { Tpetra::MatrixMarket::Writer >::writeSparseFile(fileName, A); return; } + const RCP >& tmp_BlockCrs = + Teuchos::rcp_dynamic_cast >(tmp_CrsMtx); + if(tmp_BlockCrs != Teuchos::null) { + std::ofstream outstream (fileName,std::ofstream::out); + Teuchos::FancyOStream ofs(Teuchos::rcpFromRef(outstream)); + tmp_BlockCrs->getTpetra_BlockCrsMatrix()->describe(ofs,Teuchos::VERB_EXTREME); + return; + } + #endif // HAVE_XPETRA_TPETRA throw Exceptions::BadCast("Could not cast to EpetraCrsMatrix or TpetraCrsMatrix in matrix writing"); @@ -1037,6 +1046,15 @@ namespace Xpetra { Tpetra::MatrixMarket::Writer >::writeSparseFile(fileName, A); return; } + const RCP >& tmp_BlockCrs = + Teuchos::rcp_dynamic_cast >(tmp_CrsMtx); + if(tmp_BlockCrs != Teuchos::null) { + std::ofstream outstream (fileName,std::ofstream::out); + Teuchos::FancyOStream ofs(Teuchos::rcpFromRef(outstream)); + tmp_BlockCrs->getTpetra_BlockCrsMatrix()->describe(ofs,Teuchos::VERB_EXTREME); + return; + } + # endif #endif // HAVE_XPETRA_TPETRA diff --git a/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp b/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp index 5fad0246aafe..217c590bf912 100644 --- a/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp +++ b/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #endif // HAVE_XPETRA_TPETRA @@ -254,25 +255,25 @@ Note: this class is not in the Xpetra_UseShortNames.hpp return tmp_BlockCrs->getTpetra_BlockCrsMatrixNonConst(); } - static RCP > Op2TpetraBlockCrs(const Matrix& Op) { + static const Tpetra::BlockCrsMatrix & Op2TpetraBlockCrs(const Matrix& Op) { try { const CrsMatrixWrap& crsOp = dynamic_cast(Op); RCP tmp_CrsMtx = crsOp.getCrsMatrix(); RCP tmp_BlockCrs= Teuchos::rcp_dynamic_cast(tmp_CrsMtx); TEUCHOS_TEST_FOR_EXCEPTION(tmp_BlockCrs == Teuchos::null, Xpetra::Exceptions::BadCast, "Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); - return tmp_BlockCrs->getTpetra_BlockCrsMatrix(); + return *tmp_BlockCrs->getTpetra_BlockCrsMatrix(); } catch(...) { throw(Xpetra::Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed")); } } - static RCP > Op2NonTpetraBlockCrs(const Matrix& Op) { + static Tpetra::BlockCrsMatrix & Op2NonConstTpetraBlockCrs(const Matrix& Op) { try { const CrsMatrixWrap& crsOp = dynamic_cast(Op); RCP tmp_CrsMtx = crsOp.getCrsMatrix(); RCP tmp_BlockCrs= Teuchos::rcp_dynamic_cast(tmp_CrsMtx); TEUCHOS_TEST_FOR_EXCEPTION(tmp_BlockCrs == Teuchos::null, Xpetra::Exceptions::BadCast, "Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed"); - return tmp_BlockCrs->getTpetra_BlockCrsMatrixNonConst(); + return *tmp_BlockCrs->getTpetra_BlockCrsMatrixNonConst(); } catch(...) { throw(Xpetra::Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed")); } @@ -298,7 +299,14 @@ Note: this class is not in the Xpetra_UseShortNames.hpp return false; } } +#else // HAVE_XPETRA_TPETRA + static bool isTpetraCrs(const Matrix& Op) { + return false; + } + static bool isTpetraBlockCrs(const Matrix& Op) { + return false; + } #endif // HAVE_XPETRA_TPETRA @@ -491,9 +499,38 @@ Note: this class is not in the Xpetra_UseShortNames.hpp // Previously, Tpetra's matrix matrix multiply did not support fillComplete. Tpetra::MatrixMatrix::Multiply(tpA, transposeA, tpB, transposeB, tpC, haveMultiplyDoFillComplete, label, params); } - else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B) && helpers::isTpetraBlockCrs(C)) { - // All matrices are BlockCrs - TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "BlockCrs Multiply not currently supported"); + else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B)) { + // All matrices are BlockCrs (except maybe Ac) + // FIXME: For the moment we're just going to clobber the innards of Ac, so no reuse. Once we have a reuse kernel, + // we'll need to think about refactoring BlockCrs so we can do something smarter here. + if(!A.getRowMap()->getComm()->getRank()) + std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"< & tpA = Xpetra::Helpers::Op2TpetraBlockCrs(A); + const Tpetra::BlockCrsMatrix & tpB = Xpetra::Helpers::Op2TpetraBlockCrs(B); + using CRS=Tpetra::CrsMatrix; + RCP Acrs = Tpetra::convertToCrsMatrix(tpA); + RCP Bcrs = Tpetra::convertToCrsMatrix(tpB); + + // We need the global constants to do the copy back to BlockCrs + RCP new_params; + if(!params.is_null()) { + new_params = rcp(new Teuchos::ParameterList(*params)); + new_params->set("compute global constants",true); + } + + // FIXME: The lines below only works because we're assuming Ac is Point + RCP tempAc = Teuchos::rcp(new CRS(Acrs->getRowMap(),0)); + Tpetra::MatrixMatrix::Multiply(*Acrs, transposeA, *Bcrs, transposeB, *tempAc, haveMultiplyDoFillComplete, label, new_params); + + // Temporary output matrix + RCP > Ac_t = Tpetra::convertToBlockCrsMatrix(*tempAc,A.GetStorageBlockSize()); + RCP > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix(Ac_t)); + RCP > Ac_p = Ac_x; + + // We can now cheat and replace the innards of Ac + RCP > Ac_w = Teuchos::rcp_dynamic_cast >(Teuchos::rcpFromRef(C)); + Ac_w->replaceCrsMatrix(Ac_p); } else { // Mix and match @@ -1022,9 +1059,39 @@ Note: this class is not in the Xpetra_UseShortNames.hpp // Previously, Tpetra's matrix matrix multiply did not support fillComplete. Tpetra::MatrixMatrix::Multiply(tpA, transposeA, tpB, transposeB, tpC, haveMultiplyDoFillComplete, label, params); } - else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B) && helpers::isTpetraBlockCrs(C)) { - // All matrices are BlockCrs - TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "BlockCrs Multiply not currently supported"); + else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B)) { + // All matrices are BlockCrs (except maybe Ac) + // FIXME: For the moment we're just going to clobber the innards of Ac, so no reuse. Once we have a reuse kernel, + // we'll need to think about refactoring BlockCrs so we can do something smarter here. + + if(!A.getRowMap()->getComm()->getRank()) + std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"< & tpA = Xpetra::Helpers::Op2TpetraBlockCrs(A); + const Tpetra::BlockCrsMatrix & tpB = Xpetra::Helpers::Op2TpetraBlockCrs(B); + using CRS=Tpetra::CrsMatrix; + RCP Acrs = Tpetra::convertToCrsMatrix(tpA); + RCP Bcrs = Tpetra::convertToCrsMatrix(tpB); + + // We need the global constants to do the copy back to BlockCrs + RCP new_params; + if(!params.is_null()) { + new_params = rcp(new Teuchos::ParameterList(*params)); + new_params->set("compute global constants",true); + } + + // FIXME: The lines below only works because we're assuming Ac is Point + RCP tempAc = Teuchos::rcp(new CRS(Acrs->getRowMap(),0)); + Tpetra::MatrixMatrix::Multiply(*Acrs, transposeA, *Bcrs, transposeB, *tempAc, haveMultiplyDoFillComplete, label, new_params); + + // Temporary output matrix + RCP > Ac_t = Tpetra::convertToBlockCrsMatrix(*tempAc,A.GetStorageBlockSize()); + RCP > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix(Ac_t)); + RCP > Ac_p = Ac_x; + + // We can now cheat and replace the innards of Ac + RCP > Ac_w = Teuchos::rcp_dynamic_cast >(Teuchos::rcpFromRef(C)); + Ac_w->replaceCrsMatrix(Ac_p); } else { // Mix and match @@ -1787,9 +1854,39 @@ Note: this class is not in the Xpetra_UseShortNames.hpp // Previously, Tpetra's matrix matrix multiply did not support fillComplete. Tpetra::MatrixMatrix::Multiply(tpA, transposeA, tpB, transposeB, tpC, haveMultiplyDoFillComplete, label, params); } - else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B) && helpers::isTpetraBlockCrs(C)) { - // All matrices are BlockCrs - TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "BlockCrs Multiply not currently supported"); + else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B)) { + // All matrices are BlockCrs (except maybe Ac) + // FIXME: For the moment we're just going to clobber the innards of Ac, so no reuse. Once we have a reuse kernel, + // we'll need to think about refactoring BlockCrs so we can do something smarter here. + + if(!A.getRowMap()->getComm()->getRank()) + std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"< & tpA = Xpetra::Helpers::Op2TpetraBlockCrs(A); + const Tpetra::BlockCrsMatrix & tpB = Xpetra::Helpers::Op2TpetraBlockCrs(B); + using CRS=Tpetra::CrsMatrix; + RCP Acrs = Tpetra::convertToCrsMatrix(tpA); + RCP Bcrs = Tpetra::convertToCrsMatrix(tpB); + + // We need the global constants to do the copy back to BlockCrs + RCP new_params; + if(!params.is_null()) { + new_params = rcp(new Teuchos::ParameterList(*params)); + new_params->set("compute global constants",true); + } + + // FIXME: The lines below only works because we're assuming Ac is Point + RCP tempAc = Teuchos::rcp(new CRS(Acrs->getRowMap(),0)); + Tpetra::MatrixMatrix::Multiply(*Acrs, transposeA, *Bcrs, transposeB, *tempAc, haveMultiplyDoFillComplete, label, new_params); + + // Temporary output matrix + RCP > Ac_t = Tpetra::convertToBlockCrsMatrix(*tempAc,A.GetStorageBlockSize()); + RCP > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix(Ac_t)); + RCP > Ac_p = Ac_x; + + // We can now cheat and replace the innards of Ac + RCP > Ac_w = Teuchos::rcp_dynamic_cast >(Teuchos::rcpFromRef(C)); + Ac_w->replaceCrsMatrix(Ac_p); } else { // Mix and match diff --git a/packages/xpetra/sup/Utils/Xpetra_TripleMatrixMultiply.hpp b/packages/xpetra/sup/Utils/Xpetra_TripleMatrixMultiply.hpp index 3b5798fc62ce..d930088f3908 100644 --- a/packages/xpetra/sup/Utils/Xpetra_TripleMatrixMultiply.hpp +++ b/packages/xpetra/sup/Utils/Xpetra_TripleMatrixMultiply.hpp @@ -58,10 +58,13 @@ #include "Xpetra_Matrix.hpp" #include "Xpetra_StridedMapFactory.hpp" #include "Xpetra_StridedMap.hpp" +#include "Xpetra_IO.hpp" #ifdef HAVE_XPETRA_TPETRA #include #include +#include +#include // #include // #include #endif // HAVE_XPETRA_TPETRA @@ -126,14 +129,55 @@ namespace Xpetra { throw(Xpetra::Exceptions::RuntimeError("Xpetra::TripleMatrixMultiply::MultiplyRAP is only implemented for Tpetra")); } else if (Ac.getRowMap()->lib() == Xpetra::UseTpetra) { #ifdef HAVE_XPETRA_TPETRA - const Tpetra::CrsMatrix & tpR = Xpetra::Helpers::Op2TpetraCrs(R); - const Tpetra::CrsMatrix & tpA = Xpetra::Helpers::Op2TpetraCrs(A); - const Tpetra::CrsMatrix & tpP = Xpetra::Helpers::Op2TpetraCrs(P); - Tpetra::CrsMatrix & tpAc = Xpetra::Helpers::Op2NonConstTpetraCrs(Ac); - - // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete. - // Previously, Tpetra's matrix matrix multiply did not support fillComplete. - Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params); + using helpers = Xpetra::Helpers; + if(helpers::isTpetraCrs(R) && helpers::isTpetraCrs(A) && helpers::isTpetraCrs(P)) { + // All matrices are Crs + const Tpetra::CrsMatrix & tpR = Xpetra::Helpers::Op2TpetraCrs(R); + const Tpetra::CrsMatrix & tpA = Xpetra::Helpers::Op2TpetraCrs(A); + const Tpetra::CrsMatrix & tpP = Xpetra::Helpers::Op2TpetraCrs(P); + Tpetra::CrsMatrix & tpAc = Xpetra::Helpers::Op2NonConstTpetraCrs(Ac); + + // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete. + // Previously, Tpetra's matrix matrix multiply did not support fillComplete. + Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params); + } + else if (helpers::isTpetraBlockCrs(R) && helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(P)) { + // All matrices are BlockCrs (except maybe Ac) + // FIXME: For the moment we're just going to clobber the innards of Ac, so no reuse. Once we have a reuse kernel, + // we'll need to think about refactoring BlockCrs so we can do something smarter here. + + if(!A.getRowMap()->getComm()->getRank()) + std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"< & tpR = Xpetra::Helpers::Op2TpetraBlockCrs(R); + const Tpetra::BlockCrsMatrix & tpA = Xpetra::Helpers::Op2TpetraBlockCrs(A); + const Tpetra::BlockCrsMatrix & tpP = Xpetra::Helpers::Op2TpetraBlockCrs(P); + // Tpetra::BlockCrsMatrix & tpAc = Xpetra::Helpers::Op2NonConstTpetraBlockCrs(Ac); + + using CRS=Tpetra::CrsMatrix; + RCP Rcrs = Tpetra::convertToCrsMatrix(tpR); + RCP Acrs = Tpetra::convertToCrsMatrix(tpA); + RCP Pcrs = Tpetra::convertToCrsMatrix(tpP); + // RCP Accrs = Tpetra::convertToCrsMatrix(tpAc); + + // FIXME: The lines below only works because we're assuming Ac is Point + RCP Accrs = Teuchos::rcp(new CRS(Rcrs->getRowMap(),0)); + const bool do_fill_complete=true; + Tpetra::TripleMatrixMultiply::MultiplyRAP(*Rcrs, transposeR, *Acrs, transposeA, *Pcrs, transposeP, *Accrs, do_fill_complete, label, params); + + // Temporary output matrix + RCP > Ac_t = Tpetra::convertToBlockCrsMatrix(*Accrs,A.GetStorageBlockSize()); + RCP > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix(Ac_t)); + RCP > Ac_p = Ac_x; + + // We can now cheat and replace the innards of Ac + RCP > Ac_w = Teuchos::rcp_dynamic_cast >(Teuchos::rcpFromRef(Ac)); + Ac_w->replaceCrsMatrix(Ac_p); + } + else { + // Mix and match + TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "Mix-and-match Crs/BlockCrs Multiply not currently supported"); + } #else throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra.")); #endif @@ -215,14 +259,55 @@ namespace Xpetra { (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT)))) throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra ETI enabled.")); # else - const Tpetra::CrsMatrix & tpR = Xpetra::Helpers::Op2TpetraCrs(R); - const Tpetra::CrsMatrix & tpA = Xpetra::Helpers::Op2TpetraCrs(A); - const Tpetra::CrsMatrix & tpP = Xpetra::Helpers::Op2TpetraCrs(P); - Tpetra::CrsMatrix & tpAc = Xpetra::Helpers::Op2NonConstTpetraCrs(Ac); - - // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete. - // Previously, Tpetra's matrix matrix multiply did not support fillComplete. - Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params); + using helpers = Xpetra::Helpers; + if(helpers::isTpetraCrs(R) && helpers::isTpetraCrs(A) && helpers::isTpetraCrs(P) && helpers::isTpetraCrs(Ac)) { + // All matrices are Crs + const Tpetra::CrsMatrix & tpR = Xpetra::Helpers::Op2TpetraCrs(R); + const Tpetra::CrsMatrix & tpA = Xpetra::Helpers::Op2TpetraCrs(A); + const Tpetra::CrsMatrix & tpP = Xpetra::Helpers::Op2TpetraCrs(P); + Tpetra::CrsMatrix & tpAc = Xpetra::Helpers::Op2NonConstTpetraCrs(Ac); + + // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete. + // Previously, Tpetra's matrix matrix multiply did not support fillComplete. + Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params); + } + else if (helpers::isTpetraBlockCrs(R) && helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(P)) { + // All matrices are BlockCrs (except maybe Ac) + // FIXME: For the moment we're just going to clobber the innards of AC, so no reuse. Once we have a reuse kernel, + // we'll need to think about refactoring BlockCrs so we can do something smarter here. + if(!A.getRowMap()->getComm()->getRank()) + std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"< & tpR = Xpetra::Helpers::Op2TpetraBlockCrs(R); + const Tpetra::BlockCrsMatrix & tpA = Xpetra::Helpers::Op2TpetraBlockCrs(A); + const Tpetra::BlockCrsMatrix & tpP = Xpetra::Helpers::Op2TpetraBlockCrs(P); + // Tpetra::BlockCrsMatrix & tpAc = Xpetra::Helpers::Op2NonConstTpetraBlockCrs(Ac); + + using CRS=Tpetra::CrsMatrix; + RCP Rcrs = Tpetra::convertToCrsMatrix(tpR); + RCP Acrs = Tpetra::convertToCrsMatrix(tpA); + RCP Pcrs = Tpetra::convertToCrsMatrix(tpP); + // RCP Accrs = Tpetra::convertToCrsMatrix(tpAc); + + // FIXME: The lines below only works because we're assuming Ac is Point + RCP Accrs = Teuchos::rcp(new CRS(Rcrs->getRowMap(),0)); + const bool do_fill_complete=true; + Tpetra::TripleMatrixMultiply::MultiplyRAP(*Rcrs, transposeR, *Acrs, transposeA, *Pcrs, transposeP, *Accrs, do_fill_complete, label, params); + + // Temporary output matrix + RCP > Ac_t = Tpetra::convertToBlockCrsMatrix(*Accrs,A.GetStorageBlockSize()); + RCP > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix(Ac_t)); + RCP > Ac_p = Ac_x; + + // We can now cheat and replace the innards of Ac + RCP > Ac_w = Teuchos::rcp_dynamic_cast >(Teuchos::rcpFromRef(Ac)); + Ac_w->replaceCrsMatrix(Ac_p); + + } + else { + // Mix and match (not supported) + TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "Mix-and-match Crs/BlockCrs Multiply not currently supported"); + } # endif #else throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra.")); @@ -303,14 +388,55 @@ namespace Xpetra { (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_LONG_LONG)))) throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra ETI enabled.")); # else - const Tpetra::CrsMatrix & tpR = Xpetra::Helpers::Op2TpetraCrs(R); - const Tpetra::CrsMatrix & tpA = Xpetra::Helpers::Op2TpetraCrs(A); - const Tpetra::CrsMatrix & tpP = Xpetra::Helpers::Op2TpetraCrs(P); - Tpetra::CrsMatrix & tpAc = Xpetra::Helpers::Op2NonConstTpetraCrs(Ac); - - // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete. - // Previously, Tpetra's matrix matrix multiply did not support fillComplete. - Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params); + using helpers = Xpetra::Helpers; + if(helpers::isTpetraCrs(R) && helpers::isTpetraCrs(A) && helpers::isTpetraCrs(P)) { + // All matrices are Crs + const Tpetra::CrsMatrix & tpR = Xpetra::Helpers::Op2TpetraCrs(R); + const Tpetra::CrsMatrix & tpA = Xpetra::Helpers::Op2TpetraCrs(A); + const Tpetra::CrsMatrix & tpP = Xpetra::Helpers::Op2TpetraCrs(P); + Tpetra::CrsMatrix & tpAc = Xpetra::Helpers::Op2NonConstTpetraCrs(Ac); + + // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete. + // Previously, Tpetra's matrix matrix multiply did not support fillComplete. + Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params); + } + else if (helpers::isTpetraBlockCrs(R) && helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(P)) { + // All matrices are BlockCrs (except maybe Ac) + // FIXME: For the moment we're just going to clobber the innards of AC, so no reuse. Once we have a reuse kernel, + // we'll need to think about refactoring BlockCrs so we can do something smarter here. + if(!A.getRowMap()->getComm()->getRank()) + std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"< & tpR = Xpetra::Helpers::Op2TpetraBlockCrs(R); + const Tpetra::BlockCrsMatrix & tpA = Xpetra::Helpers::Op2TpetraBlockCrs(A); + const Tpetra::BlockCrsMatrix & tpP = Xpetra::Helpers::Op2TpetraBlockCrs(P); + // Tpetra::BlockCrsMatrix & tpAc = Xpetra::Helpers::Op2NonConstTpetraBlockCrs(Ac); + + using CRS=Tpetra::CrsMatrix; + RCP Rcrs = Tpetra::convertToCrsMatrix(tpR); + RCP Acrs = Tpetra::convertToCrsMatrix(tpA); + RCP Pcrs = Tpetra::convertToCrsMatrix(tpP); + // RCP Accrs = Tpetra::convertToCrsMatrix(tpAc); + + // FIXME: The lines below only works because we're assuming Ac is Point + RCP Accrs = Teuchos::rcp(new CRS(Rcrs->getRowMap(),0)); + const bool do_fill_complete=true; + Tpetra::TripleMatrixMultiply::MultiplyRAP(*Rcrs, transposeR, *Acrs, transposeA, *Pcrs, transposeP, *Accrs, do_fill_complete, label, params); + + // Temporary output matrix + RCP > Ac_t = Tpetra::convertToBlockCrsMatrix(*Accrs,A.GetStorageBlockSize()); + RCP > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix(Ac_t)); + RCP > Ac_p = Ac_x; + + // We can now cheat and replace the innards of Ac + RCP > Ac_w = Teuchos::rcp_dynamic_cast >(Teuchos::rcpFromRef(Ac)); + Ac_w->replaceCrsMatrix(Ac_p); + } + else { + // Mix and match + TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "Mix-and-match Crs/BlockCrs Multiply not currently supported"); + } + # endif #else throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra."));