diff --git a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp
index 53bd87de9a5d..78f5085479d0 100644
--- a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp
@@ -1053,6 +1053,11 @@ description () const
   } else {
     out << "INVALID";
   }
+
+  // BlockCrs if we have that
+  if(hasBlockCrsMatrix_)
+    out<<", BlockCrs";
+
   // Print the approximate # rows per part
   int approx_rows_per_part = A_->getLocalNumRows()/Partitioner_->numLocalParts();
   out <<", blocksize: "<<approx_rows_per_part;
diff --git a/packages/muelu/adapters/tpetra/MueLu_CreateTpetraPreconditioner.hpp b/packages/muelu/adapters/tpetra/MueLu_CreateTpetraPreconditioner.hpp
index db1c7777acb2..e41be8434d6a 100644
--- a/packages/muelu/adapters/tpetra/MueLu_CreateTpetraPreconditioner.hpp
+++ b/packages/muelu/adapters/tpetra/MueLu_CreateTpetraPreconditioner.hpp
@@ -171,6 +171,25 @@ namespace MueLu {
     MueLu::ReuseXpetraPreconditioner<SC,LO,GO,NO>(A, H);
   }
 
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  void ReuseTpetraPreconditioner(const Teuchos::RCP<Tpetra::BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& inA,
+                                 MueLu::TpetraOperator<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op) {
+    typedef Scalar          SC;
+    typedef LocalOrdinal    LO;
+    typedef GlobalOrdinal   GO;
+    typedef Node            NO;
+
+    typedef Xpetra::Matrix<SC,LO,GO,NO>     Matrix;
+    typedef MueLu ::Hierarchy<SC,LO,GO,NO>  Hierarchy;
+
+    RCP<Hierarchy> H = Op.GetHierarchy();
+    RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > temp = rcp(new Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO>(inA));
+    TEUCHOS_TEST_FOR_EXCEPTION(temp==Teuchos::null, Exceptions::RuntimeError, "ReuseTpetraPreconditioner: cast from Tpetra::BlockCrsMatrix to Xpetra::TpetraBlockCrsMatrix failed.");
+    RCP<Matrix> A = rcp(new Xpetra::CrsMatrixWrap<SC,LO,GO,NO>(temp));
+
+    MueLu::ReuseXpetraPreconditioner<SC,LO,GO,NO>(A, H);
+  }
+
 
 
 } //namespace
diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_def.hpp
index 549b00075d81..9b4f08644349 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_def.hpp
@@ -75,11 +75,30 @@ namespace MueLu {
 
     RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");
 
+    /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
+       fullblocksize is the number of storage blocks that must kept together during the amalgamation process.
+
+       Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
+
+       numPDEs = fullblocksize * storageblocksize.
+       
+       If numPDEs==1
+         Matrix is point storage (classical CRS storage).  storageblocksize=1 and fullblocksize=1
+         No other values makes sense.
+
+       If numPDEs>1
+         If matrix uses point storage, then storageblocksize=1  and fullblockssize=numPDEs.
+         If matrix uses block storage, with block size of n, then storageblocksize=n, and fullblocksize=numPDEs/n.  
+         Thus far, only storageblocksize=numPDEs and fullblocksize=1 has been tested.
+    */
+
+
     LO fullblocksize    = 1;   // block dim for fixed size blocks
     GO offset           = 0;   // global offset of dof gids
     LO blockid          = -1;  // block id in strided map
     LO nStridedOffset   = 0;   // DOF offset for strided block id "blockid" (default = 0)
     LO stridedblocksize = fullblocksize; // size of strided block id "blockid" (default = fullblocksize, only if blockid!=-1 stridedblocksize <= fullblocksize)
+    LO storageblocksize = A->GetStorageBlockSize();
     // GO indexBase        = A->getRowMap()->getIndexBase();  // index base for maps (unused)
 
     // 1) check for blocking/striding information
@@ -101,6 +120,12 @@ namespace MueLu {
       } else {
         stridedblocksize = fullblocksize;
       }
+      // Correct for the storageblocksize
+      // NOTE:  Before this point fullblocksize is actually numPDEs
+      TEUCHOS_TEST_FOR_EXCEPTION(fullblocksize % storageblocksize != 0,Exceptions::RuntimeError,"AmalgamationFactory: fullblocksize needs to be a multiple of A->GetStorageBlockSize()");
+      fullblocksize /= storageblocksize;
+      stridedblocksize /= storageblocksize;
+
       oldView = A->SwitchToView(oldView);
       GetOStream(Runtime1) << "AmalagamationFactory::Build():" << " found fullblocksize=" << fullblocksize << " and stridedblocksize=" << stridedblocksize << " from strided maps. offset=" << offset << std::endl;
 
@@ -108,6 +133,7 @@ namespace MueLu {
       GetOStream(Warnings0) << "AmalagamationFactory::Build(): no striding information available. Use blockdim=1 with offset=0" << std::endl;
     }
 
+
     // build node row map (uniqueMap) and node column map (nonUniqueMap)
     // the arrays rowTranslation and colTranslation contain the local node id
     // given a local dof id. They are only necessary for the CoalesceDropFactory if
@@ -166,7 +192,7 @@ namespace MueLu {
     container               filter;
 
     GO offset = 0;
-    LO blkSize = A.GetFixedBlockSize();
+    LO blkSize = A.GetFixedBlockSize() / A.GetStorageBlockSize();
     if (A.IsView("stridedMaps") == true) {
       Teuchos::RCP<const Map> myMap = A.getRowMap("stridedMaps");
       Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_kokkos_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_kokkos_def.hpp
index e945469b5ba1..cc284df2e27b 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_kokkos_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationFactory_kokkos_def.hpp
@@ -80,11 +80,29 @@ namespace MueLu {
 
     RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");
 
+   /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
+       fullblocksize is the number of storage blocks that must kept together during the amalgamation process.
+
+       Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
+
+       numPDEs = fullblocksize * storageblocksize.
+       
+       If numPDEs==1
+         Matrix is point storage (classical CRS storage).  storageblocksize=1 and fullblocksize=1
+         No other values makes sense.
+
+       If numPDEs>1
+         If matrix uses point storage, then storageblocksize=1  and fullblockssize=numPDEs.
+         If matrix uses block storage, with block size of n, then storageblocksize=n, and fullblocksize=numPDEs/n.  
+         Thus far, only storageblocksize=numPDEs and fullblocksize=1 has been tested.
+    */
+
     LO fullblocksize    = 1;   // block dim for fixed size blocks
     GO offset           = 0;   // global offset of dof gids
     LO blockid          = -1;  // block id in strided map
     LO nStridedOffset   = 0;   // DOF offset for strided block id "blockid" (default = 0)
     LO stridedblocksize = fullblocksize; // size of strided block id "blockid" (default = fullblocksize, only if blockid!=-1 stridedblocksize <= fullblocksize)
+    LO storageblocksize = A->GetStorageBlockSize();
     // GO indexBase        = A->getRowMap()->getIndexBase();  // index base for maps (unused)
 
     // 1) check for blocking/striding information
@@ -106,6 +124,12 @@ namespace MueLu {
       } else {
         stridedblocksize = fullblocksize;
       }
+      // Correct for the storageblocksize
+      // NOTE:  Before this point fullblocksize is actually numPDEs
+      TEUCHOS_TEST_FOR_EXCEPTION(fullblocksize % storageblocksize != 0,Exceptions::RuntimeError,"AmalgamationFactory::Build(): fullblocksize needs to be a multiple of A->GetStorageBlockSize()");
+      fullblocksize /= storageblocksize;
+      stridedblocksize /= storageblocksize;
+
       oldView = A->SwitchToView(oldView);
       GetOStream(Runtime1) << "AmalagamationFactory::Build():" << " found fullblocksize=" << fullblocksize << " and stridedblocksize=" << stridedblocksize << " from strided maps. offset=" << offset << std::endl;
 
@@ -172,7 +196,7 @@ namespace MueLu {
     container               filter; // TODO:  replace std::set with an object having faster lookup/insert, hashtable for instance
 
     GO offset = 0;
-    LO blkSize = A.GetFixedBlockSize();
+    LO blkSize = A.GetFixedBlockSize() / A.GetStorageBlockSize();
     if (A.IsView("stridedMaps") == true) {
       Teuchos::RCP<const Map> myMap = A.getRowMap("stridedMaps");
       Teuchos::RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationInfo_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationInfo_def.hpp
index 9bd4b73d1169..edfad670c279 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationInfo_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_AmalgamationInfo_def.hpp
@@ -132,7 +132,7 @@ namespace MueLu {
 
   template <class LocalOrdinal, class GlobalOrdinal, class Node>
   void AmalgamationInfo<LocalOrdinal, GlobalOrdinal, Node>::UnamalgamateAggregatesLO(const Aggregates& aggregates,
-        Teuchos::ArrayRCP<LO>& aggStart, Teuchos::ArrayRCP<LO>& aggToRowMap) const {
+                      Teuchos::ArrayRCP<LO>& aggStart, Teuchos::ArrayRCP<LO>& aggToRowMap) const {
 
     int myPid = aggregates.GetMap()->getComm()->getRank();
     Teuchos::ArrayView<const GO> nodeGlobalElts = aggregates.GetMap()->getLocalElementList();
diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
index 6c2241c12b8d..4d5f035069e1 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp
@@ -370,7 +370,7 @@ namespace MueLu {
           distanceLaplacianAlgo = scaled_cut_symmetric;
         else
           TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "\"aggregation: distance laplacian algo\" must be one of (default|unscaled cut|scaled cut), not \"" << distanceLaplacianAlgoStr << "\"");
-        GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
+        GetOStream(Runtime0) << "algorithm = \"" << algo << "\" distance laplacian algorithm = \"" << distanceLaplacianAlgoStr << "\": threshold = " << threshold << ", blocksize = " << A->GetFixedBlockSize()<< std::endl;
       } else if (algo == "classical") {
         if (classicalAlgoStr == "default")
           classicalAlgo = defaultAlgo;
@@ -396,6 +396,27 @@ namespace MueLu {
       GO numDropped = 0, numTotal = 0;
       std::string graphType = "unamalgamated"; //for description purposes only
 
+      
+      /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
+       BlockSize is the number of storage blocks that must kept together during the amalgamation process.
+
+       Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
+
+       numPDEs = BlockSize * storageblocksize.
+       
+       If numPDEs==1
+         Matrix is point storage (classical CRS storage).  storageblocksize=1 and BlockSize=1
+         No other values makes sense.
+
+       If numPDEs>1
+         If matrix uses point storage, then storageblocksize=1  and BlockSize=numPDEs.
+         If matrix uses block storage, with block size of n, then storageblocksize=n, and BlockSize=numPDEs/n.  
+         Thus far, only storageblocksize=numPDEs and BlockSize=1 has been tested.
+      */      
+      TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0,Exceptions::RuntimeError,"A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
+      const LO BlockSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
+
+
       /************************** RS or SA-style Classical Dropping (and variants) **************************/
       if (algo == "classical") {
         if (predrop_ == null) {
@@ -417,7 +438,7 @@ namespace MueLu {
         // At this points we either have
         //     (predrop_ != null)
         // Therefore, it is sufficient to check only threshold
-        if (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) {
+        if ( BlockSize==1 && threshold == STS::zero() && !useSignedClassicalRS && !useSignedClassicalSA && A->hasCrsGraph()) {
           // Case 1:  scalar problem, no dropping => just use matrix graph
           RCP<GraphBase> graph = rcp(new Graph(A->getCrsGraph(), "graph of A"));
           // Detect and record rows that correspond to Dirichlet boundary conditions
@@ -442,10 +463,10 @@ namespace MueLu {
           Set(currentLevel, "DofsPerNode", 1);
           Set(currentLevel, "Graph", graph);
 
-        } else if ( (A->GetFixedBlockSize() == 1 && threshold != STS::zero()) ||
-                    (A->GetFixedBlockSize() == 1 && threshold == STS::zero() && !A->hasCrsGraph()) ||
-                    (A->GetFixedBlockSize() == 1 && useSignedClassicalRS) ||
-                    (A->GetFixedBlockSize() == 1 && useSignedClassicalSA) ) {	  
+        } else if ( (BlockSize == 1 && threshold != STS::zero()) ||
+                    (BlockSize == 1 && threshold == STS::zero() && !A->hasCrsGraph()) ||
+                    (BlockSize == 1 && useSignedClassicalRS) ||
+                    (BlockSize == 1 && useSignedClassicalSA) ) {	  
           // Case 2:  scalar problem with dropping => record the column indices of undropped entries, but still use original
           //                                          graph's map information, e.g., whether index is local
           // OR a matrix without a CrsGraph
@@ -721,7 +742,7 @@ namespace MueLu {
            }
 #endif
           }//end generateColoringGraph
-        } else if (A->GetFixedBlockSize() > 1 && threshold == STS::zero()) {
+        } else if (BlockSize > 1 && threshold == STS::zero()) {
           // Case 3:  Multiple DOF/node problem without dropping
           const RCP<const Map> rowMap = A->getRowMap();
           const RCP<const Map> colMap = A->getColMap();
@@ -853,7 +874,7 @@ namespace MueLu {
           Set(currentLevel, "Graph",       graph);
           Set(currentLevel, "DofsPerNode", blkSize); // full block size
 
-        } else if (A->GetFixedBlockSize() > 1 && threshold != STS::zero()) {
+        } else if (BlockSize > 1 && threshold != STS::zero()) {
           // Case 4:  Multiple DOF/node problem with dropping
           const RCP<const Map> rowMap = A->getRowMap();
           const RCP<const Map> colMap = A->getColMap();
diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp
index a3f394fe36b8..8f5a42e6d653 100644
--- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp
+++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp
@@ -506,7 +506,27 @@ namespace MueLu {
     const MT zero = Teuchos::ScalarTraits<MT>::zero();
 
     auto A         = Get< RCP<Matrix> >(currentLevel, "A");
-    LO   blkSize   = A->GetFixedBlockSize();
+
+
+    /* NOTE: storageblocksize (from GetStorageBlockSize()) is the size of a block in the chosen storage scheme.
+       blkSize is the number of storage blocks that must kept together during the amalgamation process.
+
+       Both of these quantities may be different than numPDEs (from GetFixedBlockSize()), but the following must always hold:
+
+       numPDEs = blkSize * storageblocksize.
+       
+       If numPDEs==1
+         Matrix is point storage (classical CRS storage).  storageblocksize=1 and  blkSize=1
+         No other values makes sense.
+
+       If numPDEs>1
+         If matrix uses point storage, then storageblocksize=1  and blkSize=numPDEs.
+         If matrix uses block storage, with block size of n, then storageblocksize=n, and blkSize=numPDEs/n.  
+         Thus far, only storageblocksize=numPDEs and blkSize=1 has been tested.
+      */      
+ 
+    TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0,Exceptions::RuntimeError,"A->GetFixedBlockSize() needs to be a multiple of A->GetStorageBlockSize()");
+    LO   blkSize   = A->GetFixedBlockSize() / A->GetStorageBlockSize();
 
     auto amalInfo = Get< RCP<AmalgamationInfo_kokkos> >(currentLevel, "UnAmalgamationInfo");
 
@@ -542,7 +562,7 @@ namespace MueLu {
       boundaryNodes = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold);
 
       // Trivial LWGraph construction
-      graph = rcp(new LWGraph_kokkos(A->getLocalMatrixDevice().graph, A->getRowMap(), A->getColMap(), "graph of A"));
+      graph = rcp(new LWGraph_kokkos(A->getCrsGraph()->getLocalGraphDevice(), A->getRowMap(), A->getColMap(), "graph of A"));
       graph->getLocalLWGraph().SetBoundaryNodeMap(boundaryNodes);
 
       numTotal = A->getLocalNumEntries();
diff --git a/packages/muelu/src/Misc/MueLu_RAPFactory_def.hpp b/packages/muelu/src/Misc/MueLu_RAPFactory_def.hpp
index 318da1790a21..1a95e3cd1c7f 100644
--- a/packages/muelu/src/Misc/MueLu_RAPFactory_def.hpp
+++ b/packages/muelu/src/Misc/MueLu_RAPFactory_def.hpp
@@ -56,6 +56,7 @@
 #include <Xpetra_TripleMatrixMultiply.hpp>
 #include <Xpetra_Vector.hpp>
 #include <Xpetra_VectorFactory.hpp>
+#include <Xpetra_IO.hpp>
 
 #include "MueLu_RAPFactory_decl.hpp"
 
@@ -281,8 +282,7 @@ namespace MueLu {
           Xpetra::TripleMatrixMultiply<SC,LO,GO,NO>::
             MultiplyRAP(*P, doTranspose, *A, !doTranspose, *P, !doTranspose, *Ac, doFillComplete,
                         doOptimizeStorage, labelstr+std::string("MueLu::R*A*P-implicit-")+levelstr.str(),
-                        RAPparams);
-
+                        RAPparams);         
         } else {
           RCP<Matrix> R = Get< RCP<Matrix> >(coarseLevel, "R");
           Ac = MatrixFactory::Build(R->getRowMap(), Teuchos::as<LO>(0));
diff --git a/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp b/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp
index 81751dab90ec..eda2f914e8a5 100644
--- a/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp
+++ b/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp
@@ -1245,9 +1245,10 @@ namespace MueLu {
           break;
         }
 
-        Xpetra::global_size_t nnz = Am->getGlobalNumEntries();
+        LO storageblocksize=Am->GetStorageBlockSize();
+        Xpetra::global_size_t nnz = Am->getGlobalNumEntries()*storageblocksize*storageblocksize;
         nnzPerLevel     .push_back(nnz);
-        rowsPerLevel    .push_back(Am->getGlobalNumRows());
+        rowsPerLevel    .push_back(Am->getGlobalNumRows()*storageblocksize);
         numProcsPerLevel.push_back(Am->getRowMap()->getComm()->getSize());
       }
 
@@ -1434,8 +1435,9 @@ namespace MueLu {
     }
 
     GetOStream(Runtime1) << "Replacing coordinate map" << std::endl;
+    TEUCHOS_TEST_FOR_EXCEPTION(A->GetFixedBlockSize() % A->GetStorageBlockSize() != 0, Exceptions::RuntimeError, "Hierarchy::ReplaceCoordinateMap: Storage block size does not evenly divide fixed block size");
 
-    size_t blkSize = A->GetFixedBlockSize();
+    size_t blkSize = A->GetFixedBlockSize() / A->GetStorageBlockSize();
 
     RCP<const Map> nodeMap = A->getRowMap();
     if (blkSize > 1) {
diff --git a/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp b/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp
index 798209774e57..14357b2feba9 100644
--- a/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp
+++ b/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp
@@ -67,6 +67,7 @@
 #include <Xpetra_CrsMatrixWrap.hpp>
 #include <Xpetra_TpetraBlockCrsMatrix.hpp>
 #include <Xpetra_Matrix.hpp>
+#include <Xpetra_MatrixMatrix.hpp>
 #include <Xpetra_MultiVectorFactory.hpp>
 #include <Xpetra_TpetraMultiVector.hpp>
 
@@ -234,16 +235,21 @@ namespace MueLu {
         if(Acrs.is_null())
           throw std::runtime_error("Ifpack2Smoother: Cannot extract CrsMatrix from matrix A.");
         RCP<TpetraCrsMatrix> At = rcp_dynamic_cast<TpetraCrsMatrix>(Acrs);
-        if(At.is_null())
-          throw std::runtime_error("Ifpack2Smoother: Cannot extract TpetraCrsMatrix from matrix A.");
-
-        RCP<Tpetra::BlockCrsMatrix<Scalar, LO, GO, Node> > blockCrs = Tpetra::convertToBlockCrsMatrix(*At->getTpetra_CrsMatrix(),blocksize);
-        RCP<CrsMatrix> blockCrs_as_crs  = rcp(new TpetraBlockCrsMatrix(blockCrs));
-        RCP<CrsMatrixWrap> blockWrap = rcp(new CrsMatrixWrap(blockCrs_as_crs));
-        A_ = blockWrap;
-        this->GetOStream(Statistics0) << "Ifpack2Smoother: Using BlockCrsMatrix storage with blocksize "<<blocksize<<std::endl;
-
-        paramList.remove("smoother: use blockcrsmatrix storage");
+        if(At.is_null()) {
+          if(!Xpetra::Helpers<Scalar,LO,GO,Node>::isTpetraBlockCrs(matA))
+            throw std::runtime_error("Ifpack2Smoother: Cannot extract CrsMatrix or BlockCrsMatrix from matrix A.");          
+          this->GetOStream(Statistics0) << "Ifpack2Smoother: Using (native) BlockCrsMatrix storage with blocksize "<<blocksize<<std::endl;
+          paramList.remove("smoother: use blockcrsmatrix storage");
+        }
+        else {
+          RCP<Tpetra::BlockCrsMatrix<Scalar, LO, GO, Node> > blockCrs = Tpetra::convertToBlockCrsMatrix(*At->getTpetra_CrsMatrix(),blocksize);
+          RCP<CrsMatrix> blockCrs_as_crs  = rcp(new TpetraBlockCrsMatrix(blockCrs));
+          RCP<CrsMatrixWrap> blockWrap = rcp(new CrsMatrixWrap(blockCrs_as_crs));
+          A_ = blockWrap;
+          this->GetOStream(Statistics0) << "Ifpack2Smoother: Using BlockCrsMatrix storage with blocksize "<<blocksize<<std::endl;
+          
+          paramList.remove("smoother: use blockcrsmatrix storage");
+        }
       }
     }
 
diff --git a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp
index ec727d2155a3..ef90a62e6814 100644
--- a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp
+++ b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_def.hpp
@@ -52,6 +52,9 @@
 #include "MueLu_MasterList.hpp"
 #include "MueLu_Monitor.hpp"
 #include "MueLu_Aggregates.hpp"
+#ifdef HAVE_MUELU_TPETRA
+#include "Xpetra_TpetraCrsMatrix.hpp"
+#endif
 
 // Including this one last ensure that the short names of the above headers are defined properly
 #include "MueLu_GeometricInterpolationPFactory_decl.hpp"
@@ -110,8 +113,7 @@ namespace MueLu {
       Input(fineLevel, "Coordinates");
       Input(fineLevel, "coarseCoordinatesFineMap");
       Input(fineLevel, "coarseCoordinatesMap");
-    }
-
+    }     
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -155,6 +157,7 @@ namespace MueLu {
       SubFactoryMonitor sfm(*this, "BuildCoordinates", coarseLevel);
       RCP<const Map> coarseCoordsFineMap = Get< RCP<const Map> >(fineLevel, "coarseCoordinatesFineMap");
       RCP<const Map> coarseCoordsMap = Get< RCP<const Map> >(fineLevel, "coarseCoordinatesMap");
+
       fineCoordinates   = Get< RCP<realvaluedmultivector_type> >(fineLevel, "Coordinates");
       coarseCoordinates = Xpetra::MultiVectorFactory<real_type,LO,GO,Node>::Build(coarseCoordsFineMap,
                                                                                   fineCoordinates->getNumVectors());
@@ -172,6 +175,7 @@ namespace MueLu {
 
     *out << "Fine and coarse coordinates have been loaded from the fine level and set on the coarse level." << std::endl;
 
+
     if(interpolationOrder == 0) {
       SubFactoryMonitor sfm(*this, "BuildConstantP", coarseLevel);
       // Compute the prolongator using piece-wise constant interpolation
@@ -222,8 +226,19 @@ namespace MueLu {
       RCP<MultiVector> fineNullspace   = Get< RCP<MultiVector> > (fineLevel, "Nullspace");
       RCP<MultiVector> coarseNullspace = MultiVectorFactory::Build(P->getDomainMap(),
                                                                    fineNullspace->getNumVectors());
-      P->apply(*fineNullspace, *coarseNullspace, Teuchos::TRANS, Teuchos::ScalarTraits<SC>::one(),
-               Teuchos::ScalarTraits<SC>::zero());
+      
+      using helpers=Xpetra::Helpers<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+      if(helpers::isTpetraBlockCrs(A)) {
+        // FIXME: BlockCrs doesn't currently support transpose apply, so we have to do this the hard way
+        RCP<Matrix> Ptrans = Utilities::Transpose(*P);
+        Ptrans->apply(*fineNullspace, *coarseNullspace, Teuchos::NO_TRANS, Teuchos::ScalarTraits<SC>::one(),
+                 Teuchos::ScalarTraits<SC>::zero());
+      }
+      else {
+        P->apply(*fineNullspace, *coarseNullspace, Teuchos::TRANS, Teuchos::ScalarTraits<SC>::one(),
+                 Teuchos::ScalarTraits<SC>::zero());
+      }
+
       Set(coarseLevel, "Nullspace", coarseNullspace);
     }
 
@@ -257,19 +272,78 @@ namespace MueLu {
 
     *out << "Call prolongator constructor" << std::endl;
 
-    // Create the prolongator matrix and its associated objects
-    RCP<ParameterList> dummyList = rcp(new ParameterList());
-    P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList));
-    RCP<CrsMatrix> PCrs = rcp_dynamic_cast<CrsMatrixWrap>(P)->getCrsMatrix();
-    PCrs->setAllToScalar(1.0);
-    PCrs->fillComplete();
+    using helpers=Xpetra::Helpers<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+    if(helpers::isTpetraBlockCrs(A)) {
+#ifdef HAVE_MUELU_TPETRA
+      SC one  = Teuchos::ScalarTraits<SC>::one();
+      SC zero = Teuchos::ScalarTraits<SC>::zero();
+      LO NSDim = A->GetStorageBlockSize();
+
+      // Build the exploded Map
+      RCP<const Map> BlockMap = prolongatorGraph->getDomainMap();
+      Teuchos::ArrayView<const GO> block_dofs = BlockMap->getLocalElementList();
+      Teuchos::Array<GO> point_dofs(block_dofs.size()*NSDim);
+      for(LO i=0, ct=0; i<block_dofs.size(); i++) {
+        for(LO j=0; j<NSDim; j++) {
+          point_dofs[ct] = block_dofs[i]*NSDim + j;
+          ct++;
+        }
+      }
+      
+      RCP<const Map> PointMap = MapFactory::Build(BlockMap->lib(),
+                                                  BlockMap->getGlobalNumElements() *NSDim,
+                                                  point_dofs(),
+                                                  BlockMap->getIndexBase(),
+                                                  BlockMap->getComm());
+      strideInfo[0]    = A->GetFixedBlockSize();
+      RCP<const StridedMap> stridedPointMap =  StridedMapFactory::Build(PointMap, strideInfo);
+
+      RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(prolongatorGraph, PointMap, A->getRangeMap(),NSDim);
+      RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
+      if(P_tpetra.is_null()) throw std::runtime_error("BuildConstantP Matrix factory did not return a Tpetra::BlockCrsMatrix");
+      RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
+
+      // NOTE: Assumes block-diagonal prolongation
+      Teuchos::Array<LO> temp(1);
+      Teuchos::ArrayView<const LO> indices;
+      Teuchos::Array<Scalar> block(NSDim*NSDim, zero);
+      for(LO i=0; i<NSDim; i++)
+        block[i*NSDim+i] = one;
+      for(LO i=0; i<(int)prolongatorGraph->getLocalNumRows(); i++) {
+        prolongatorGraph->getLocalRowView(i,indices);
+        for(LO j=0; j<(LO)indices.size();j++) {
+          temp[0] = indices[j];
+          P_tpetra->replaceLocalValues(i,temp(),block());
+        }
+      }
+
+      P = P_wrap;
+      if (A->IsView("stridedMaps") == true) {
+        P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedPointMap);
+      }
+      else {
+        P->CreateView("stridedMaps", P->getRangeMap(),   PointMap);
+      }
+
+#else
+      throw std::runtime_error("GeometricInteroplationFactory::Build(): BlockCrs requires Tpetra");
+#endif
 
-    // set StridingInformation of P
-    if (A->IsView("stridedMaps") == true) {
-      P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedDomainMap);
-    } else {
-      P->CreateView("stridedMaps", P->getRangeMap(), stridedDomainMap);
     }
+    else {
+      // Create the prolongator matrix and its associated objects
+      RCP<ParameterList> dummyList = rcp(new ParameterList());
+      P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList));
+      RCP<CrsMatrix> PCrs = rcp_dynamic_cast<CrsMatrixWrap>(P)->getCrsMatrix();
+      PCrs->setAllToScalar(1.0);
+      PCrs->fillComplete();          
+
+      // set StridingInformation of P    
+      if (A->IsView("stridedMaps") == true)
+        P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedDomainMap);
+      else 
+        P->CreateView("stridedMaps", P->getRangeMap(), stridedDomainMap);
+    }      
 
   } // BuildConstantP
 
@@ -293,7 +367,7 @@ namespace MueLu {
 
     // Compute 2^numDimensions using bit logic to avoid round-off errors
     const int numInterpolationPoints = 1 << numDimensions;
-    const int dofsPerNode = A->GetFixedBlockSize();
+    const int dofsPerNode = A->GetFixedBlockSize()/ A->GetStorageBlockSize();;
 
     RCP<ParameterList> dummyList = rcp(new ParameterList());
     P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList));
diff --git a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp
index 91024e11abe2..b0e5f27a90e2 100644
--- a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp
+++ b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp
@@ -123,8 +123,8 @@ namespace MueLu{
 
     //@}
 
-  private:
     void BuildConstantP(RCP<Matrix>& P, RCP<const CrsGraph>& prolongatorGraph, RCP<Matrix>& A) const;
+  private:
     void BuildLinearP(RCP<Matrix>& A, RCP<const CrsGraph>& prolongatorGraph,
                       RCP<realvaluedmultivector_type>& fineCoordinates,
                       RCP<realvaluedmultivector_type>& ghostCoordinates,
diff --git a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp
index 5cbcdd71108f..e7c94590f77d 100644
--- a/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp
+++ b/packages/muelu/src/Transfers/GeneralGeometric/MueLu_GeometricInterpolationPFactory_kokkos_def.hpp
@@ -53,6 +53,11 @@
 #include "MueLu_Monitor.hpp"
 #include "MueLu_IndexManager_kokkos.hpp"
 
+#ifdef HAVE_MUELU_TPETRA
+#include "Xpetra_TpetraCrsMatrix.hpp"
+#endif
+
+
 // Including this one last ensure that the short names of the above headers are defined properly
 #include "MueLu_GeometricInterpolationPFactory_kokkos_decl.hpp"
 
@@ -236,21 +241,87 @@ namespace MueLu {
       StridedMapFactory::Build(prolongatorGraph->getDomainMap(), strideInfo);
 
     *out << "Call prolongator constructor" << std::endl;
+    using helpers=Xpetra::Helpers<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+    if(helpers::isTpetraBlockCrs(A)) {
+#ifdef HAVE_MUELU_TPETRA
+      LO NSDim = A->GetStorageBlockSize();
+
+      // Build the exploded Map
+      // FIXME: Should look at doing this on device
+      RCP<const Map> BlockMap = prolongatorGraph->getDomainMap();
+      Teuchos::ArrayView<const GO> block_dofs = BlockMap->getLocalElementList();
+      Teuchos::Array<GO> point_dofs(block_dofs.size()*NSDim);
+      for(LO i=0, ct=0; i<block_dofs.size(); i++) {
+        for(LO j=0; j<NSDim; j++) {
+          point_dofs[ct] = block_dofs[i]*NSDim + j;
+          ct++;
+        }
+      }
+      
+      RCP<const Map> PointMap = MapFactory::Build(BlockMap->lib(),
+                                                  BlockMap->getGlobalNumElements() *NSDim,
+                                                  point_dofs(),
+                                                  BlockMap->getIndexBase(),
+                                                  BlockMap->getComm());
+      strideInfo[0]    = A->GetFixedBlockSize();
+      RCP<const StridedMap> stridedPointMap =  StridedMapFactory::Build(PointMap, strideInfo);
+
+     RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(prolongatorGraph, PointMap, A->getRangeMap(),NSDim);
+      RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
+      if(P_tpetra.is_null()) throw std::runtime_error("BuildConstantP: Matrix factory did not return a Tpetra::BlockCrsMatrix");
+      RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
+      
+      const LO stride = strideInfo[0]*strideInfo[0];
+      const LO in_stride = strideInfo[0];
+      typename CrsMatrix::local_graph_type localGraph = prolongatorGraph->getLocalGraphDevice();
+      auto rowptr  = localGraph.row_map;
+      auto indices = localGraph.entries;
+      auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
+
+      using ISC = typename Tpetra::BlockCrsMatrix<SC,LO,GO,NO>::impl_scalar_type;
+      ISC one = Teuchos::ScalarTraits<ISC>::one();
+
+      const Kokkos::TeamPolicy<execution_space> policy(prolongatorGraph->getLocalNumRows(), 1);
+      
+      Kokkos::parallel_for("MueLu:GeoInterpFact::BuildConstantP::fill", policy,
+                           KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
+                           auto row = thread.league_rank();
+                           for(LO j = (LO)rowptr[row]; j<(LO) rowptr[row+1]; j++) {
+                             LO block_offset = j*stride;
+                             for(LO k=0; k<in_stride; k++)
+                               values[block_offset + k*(in_stride+1) ] = one;
+                           }
+                         });
+
+      P = P_wrap;
+      if (A->IsView("stridedMaps") == true) {
+        P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedPointMap);
+      }
+      else {
+        P->CreateView("stridedMaps", P->getRangeMap(),   PointMap);
+      }
 
-    // Create the prolongator matrix and its associated objects
-    RCP<ParameterList> dummyList = rcp(new ParameterList());
-    P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList));
-    RCP<CrsMatrix> PCrs = rcp_dynamic_cast<CrsMatrixWrap>(P)->getCrsMatrix();
-    PCrs->setAllToScalar(1.0);
-    PCrs->fillComplete();
+#else
+      throw std::runtime_error("GeometricInteroplationFactory::BuildConstantP(): BlockCrs requires Tpetra");
+#endif
 
-    // set StridingInformation of P
-    if (A->IsView("stridedMaps") == true) {
-      P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedDomainMap);
-    } else {
-      P->CreateView("stridedMaps", P->getRangeMap(), stridedDomainMap);
     }
-
+    else {
+      // Create the prolongator matrix and its associated objects
+      RCP<ParameterList> dummyList = rcp(new ParameterList());
+      P = rcp(new CrsMatrixWrap(prolongatorGraph, dummyList));
+      RCP<CrsMatrix> PCrs = rcp_dynamic_cast<CrsMatrixWrap>(P)->getCrsMatrix();
+      PCrs->setAllToScalar(1.0);
+      PCrs->fillComplete();
+      
+      // set StridingInformation of P
+      if (A->IsView("stridedMaps") == true) {
+        P->CreateView("stridedMaps", A->getRowMap("stridedMaps"), stridedDomainMap);
+      } else {
+        P->CreateView("stridedMaps", P->getRangeMap(), stridedDomainMap);
+      }
+    }
+      
   } // BuildConstantP
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp
index 24673877c7e6..2cce10e7d3e3 100644
--- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp
+++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_decl.hpp
@@ -50,6 +50,7 @@
 #include <Teuchos_SerialDenseMatrix.hpp>
 #include <Teuchos_SerialQRDenseSolver.hpp>
 
+#include <Xpetra_CrsGraphFactory_fwd.hpp>
 #include <Xpetra_CrsMatrix_fwd.hpp>
 #include <Xpetra_Matrix_fwd.hpp>
 #include <Xpetra_MultiVector_fwd.hpp>
@@ -157,6 +158,8 @@ template <class Scalar = DefaultScalar,
                          RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace, const int levelID) const;
     void BuildPcoupled  (RCP<Matrix> A, RCP<Aggregates> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
                          RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace) const;
+    void BuildPuncoupledBlockCrs(RCP<Matrix> A, RCP<Aggregates> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
+                         RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace, const int levelID) const;
 
     mutable bool bTransferCoordinates_ = false;
 
diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp
index c5d76496c439..eb9d0a49f021 100644
--- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp
+++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_def.hpp
@@ -49,7 +49,9 @@
 #include <Xpetra_MapFactory.hpp>
 #include <Xpetra_Map.hpp>
 #include <Xpetra_CrsMatrix.hpp>
+#include <Xpetra_CrsGraphFactory.hpp>
 #include <Xpetra_Matrix.hpp>
+#include <Xpetra_MatrixMatrix.hpp>
 #include <Xpetra_MultiVector.hpp>
 #include <Xpetra_MultiVectorFactory.hpp>
 #include <Xpetra_VectorFactory.hpp>
@@ -58,6 +60,12 @@
 #include <Xpetra_CrsMatrixWrap.hpp>
 #include <Xpetra_StridedMap.hpp>
 #include <Xpetra_StridedMapFactory.hpp>
+#include <Xpetra_IO.hpp>
+
+#ifdef HAVE_MUELU_TPETRA
+#include "Xpetra_TpetraBlockCrsMatrix.hpp"
+//#include "Tpetra_BlockCrsMatrix.hpp"
+#endif
 
 #include "MueLu_TentativePFactory_decl.hpp"
 
@@ -71,6 +79,9 @@
 #include "MueLu_PerfUtils.hpp"
 #include "MueLu_Utilities.hpp"
 
+
+
+
 namespace MueLu {
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
@@ -157,7 +168,8 @@ namespace MueLu {
       Set<RCP<const Teuchos::Comm<int> > >(coarseLevel, "Node Comm", nodeComm);
     }
 
-    TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getLocalNumElements() != fineNullspace->getMap()->getLocalNumElements(),
+    // NOTE:  We check DomainMap here rather than RowMap because those are different for BlockCrs matrices
+    TEUCHOS_TEST_FOR_EXCEPTION( A->getDomainMap()->getLocalNumElements() != fineNullspace->getMap()->getLocalNumElements(),
 			       Exceptions::RuntimeError,"MueLu::TentativePFactory::MakeTentative: Size mismatch between A and Nullspace");
 
     RCP<Matrix>                Ptentative;
@@ -225,10 +237,16 @@ namespace MueLu {
       }
     }
 
-    if (!aggregates->AggregatesCrossProcessors())
-      BuildPuncoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,coarseLevel.GetLevelID());
+    if (!aggregates->AggregatesCrossProcessors()) {
+      if(Xpetra::Helpers<SC,LO,GO,NO>::isTpetraBlockCrs(A)) {
+        BuildPuncoupledBlockCrs(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,coarseLevel.GetLevelID());
+      }
+      else {
+        BuildPuncoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,coarseLevel.GetLevelID());
+      }
+    }
     else
-      BuildPcoupled  (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
+      BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
 
     // If available, use striding information of fine level matrix A for range
     // map and coarseMap as domain map; otherwise use plain range map of
@@ -258,12 +276,24 @@ namespace MueLu {
 
   template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
   void TentativePFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  BuildPuncoupled(RCP<Matrix> A, RCP<Aggregates> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
-                RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace, const int levelID) const {
-    RCP<const Map> rowMap = A->getRowMap();
-    RCP<const Map> colMap = A->getColMap();
+  BuildPuncoupledBlockCrs(RCP<Matrix> A, RCP<Aggregates> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
+                          RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace, const int levelID) const {
+#ifdef HAVE_MUELU_TPETRA
 
-    const size_t numRows   = rowMap->getLocalNumElements();
+    /* This routine generates a BlockCrs P for a BlockCrs A.  There are a few assumptions here, which meet the use cases we care about, but could 
+       be generalized later, if we ever need to do so:
+       1) Null space dimension === block size of matrix:  So no elasticity right now
+       2) QR is not supported:  Under assumption #1, this shouldn't cause problems.
+       3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
+
+       These assumptions keep our code way simpler and still support the use cases we actually care about.
+     */
+
+    RCP<const Map> rowMap     = A->getRowMap();
+    RCP<const Map> rangeMap   = A->getRangeMap();
+    RCP<const Map> colMap     = A->getColMap();
+    //    const size_t numFinePointRows = rangeMap->getLocalNumElements();
+    const size_t numFineBlockRows = rowMap->getLocalNumElements();
 
     typedef Teuchos::ScalarTraits<SC> STS;
     typedef typename STS::magnitudeType Magnitude;
@@ -275,7 +305,16 @@ namespace MueLu {
     const size_t NSDim     = fineNullspace->getNumVectors();
     ArrayRCP<LO> aggSizes  = aggregates->ComputeAggregateSizes();
 
-
+    // Need to generate the coarse block map
+    // NOTE: We assume NSDim == block size here
+    // NOTE: We also assume that coarseMap has contiguous GIDs
+    //const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
+    const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
+    RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
+                                                      Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
+                                                      numCoarseBlockRows,
+                                                      coarsePointMap->getIndexBase(),
+                                                      coarsePointMap->getComm());    
     // Sanity checking
     const ParameterList& pL = GetParameterList();
     const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
@@ -284,6 +323,8 @@ namespace MueLu {
     TEUCHOS_TEST_FOR_EXCEPTION(doQRStep && constantColSums,Exceptions::RuntimeError,
                                "MueLu::TentativePFactory::MakeTentative: cannot use 'constant column sums' and 'calculate qr' at the same time");
 
+    // The aggregates use the amalgamated column map, which in this case is what we want
+
     // Aggregates map is based on the amalgamated column map
     // We can skip global-to-local conversion if LIDs in row map are
     // same as LIDs in column map
@@ -299,394 +340,238 @@ namespace MueLu {
     if (goodMap) {
       amalgInfo->UnamalgamateAggregatesLO(*aggregates, aggStart, aggToRowMapLO);
       GetOStream(Runtime1) << "Column map is consistent with the row map, good." << std::endl;
-
     } else {
-      amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMapGO);
-      GetOStream(Warnings0) << "Column map is not consistent with the row map\n"
-                            << "using GO->LO conversion with performance penalty" << std::endl;
+      throw std::runtime_error("TentativePFactory::PuncoupledBlockCrs: Inconsistent maps not currently supported");
     }
-
-    coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
+  
+    coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
 
     // Pull out the nullspace vectors so that we can have random access.
     ArrayRCP<ArrayRCP<const SC> > fineNS  (NSDim);
     ArrayRCP<ArrayRCP<SC> >       coarseNS(NSDim);
     for (size_t i = 0; i < NSDim; i++) {
       fineNS[i] = fineNullspace->getData(i);
-      if (coarseMap->getLocalNumElements() > 0)
+      if (coarsePointMap->getLocalNumElements() > 0)
         coarseNS[i] = coarseNullspace->getDataNonConst(i);
     }
 
-    size_t nnzEstimate = numRows * NSDim;
-
-    // Time to construct the matrix and fill in the values
-    Ptentative = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0));
-    RCP<CrsMatrix> PtentCrs   = rcp_dynamic_cast<CrsMatrixWrap>(Ptentative)->getCrsMatrix();
-
+    // BlockCrs requires that we build the (block) graph first, so let's do that...
+    // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one 
+    // block non-zero per row in the matrix;
+    RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,0);
     ArrayRCP<size_t>  iaPtent;
     ArrayRCP<LO>      jaPtent;
-    ArrayRCP<SC>     valPtent;
-
-    PtentCrs->allocateAllValues(nnzEstimate, iaPtent, jaPtent, valPtent);
-
+    BlockGraph->allocateAllIndices(numFineBlockRows, iaPtent, jaPtent);
     ArrayView<size_t> ia  = iaPtent();
     ArrayView<LO>     ja  = jaPtent();
-    ArrayView<SC>     val = valPtent();
-
-    ia[0] = 0;
-    for (size_t i = 1; i <= numRows; i++)
-      ia[i] = ia[i-1] + NSDim;
 
-    for (size_t j = 0; j < nnzEstimate; j++) {
-      ja [j] = INVALID;
-      val[j] = zero;
+    for (size_t i = 0; i < numFineBlockRows; i++) {
+      ia[i] = i;
+      ja[i] = INVALID;
     }
+    ia[numCoarseBlockRows] = numCoarseBlockRows;
 
 
-    if (doQRStep) {
-      ////////////////////////////////
-      // Standard aggregate-wise QR //
-      ////////////////////////////////
-      for (GO agg = 0; agg < numAggs; agg++) {
-        LO aggSize = aggStart[agg+1] - aggStart[agg];
+    for (GO agg = 0; agg < numAggs; agg++) {
+      LO aggSize = aggStart[agg+1] - aggStart[agg];
+      Xpetra::global_size_t offset = agg;
 
-        Xpetra::global_size_t offset = agg*NSDim;
+      for (LO j = 0; j < aggSize; j++) {
+        // FIXME: Allow for bad maps
+        const LO localRow = aggToRowMapLO[aggStart[agg]+j];
+        const size_t rowStart = ia[localRow];
+        ja[rowStart] = offset;
+      }      
+    }
 
-        // Extract the piece of the nullspace corresponding to the aggregate, and
-        // put it in the flat array, "localQR" (in column major format) for the
-        // QR routine.
-        Teuchos::SerialDenseMatrix<LO,SC> localQR(aggSize, NSDim);
-        if (goodMap) {
-          for (size_t j = 0; j < NSDim; j++)
-            for (LO k = 0; k < aggSize; k++)
-              localQR(k,j) = fineNS[j][aggToRowMapLO[aggStart[agg]+k]];
-        } else {
-          for (size_t j = 0; j < NSDim; j++)
-            for (LO k = 0; k < aggSize; k++)
-              localQR(k,j) = fineNS[j][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+k])];
+    // Compress storage (remove all INVALID, which happen when we skip zeros)
+    // We do that in-place
+    size_t ia_tmp = 0, nnz = 0;
+    for (size_t i = 0; i < numFineBlockRows; i++) {
+      for (size_t j = ia_tmp; j < ia[i+1]; j++)
+        if (ja[j] != INVALID) {
+          ja [nnz] = ja [j];
+          nnz++;
         }
+      ia_tmp  = ia[i+1];
+      ia[i+1] = nnz;
+    }
 
-        // Test for zero columns
-        for (size_t j = 0; j < NSDim; j++) {
-          bool bIsZeroNSColumn = true;
+    if (rowMap->lib() == Xpetra::UseTpetra) {
+      // - Cannot resize for Epetra, as it checks for same pointers
+      // - Need to resize for Tpetra, as it check ().size() == ia[numRows]
+      // NOTE: these invalidate ja and val views
+      jaPtent .resize(nnz);
+    }
 
-          for (LO k = 0; k < aggSize; k++)
-            if (localQR(k,j) != zero)
-              bIsZeroNSColumn = false;
+    GetOStream(Runtime1) << "TentativePFactory : generating block graph" << std::endl;
+    BlockGraph->setAllIndices(iaPtent, jaPtent);
 
-          TEUCHOS_TEST_FOR_EXCEPTION(bIsZeroNSColumn == true, Exceptions::RuntimeError,
-                                     "MueLu::TentativePFactory::MakeTentative: fine level NS part has a zero column in NS column " << j);
-        }
+    // Managing labels & constants for ESFC
+    {
+      RCP<ParameterList> FCparams;
+      if(pL.isSublist("matrixmatrix: kernel params"))
+        FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
+      else
+        FCparams= rcp(new ParameterList);
+      // By default, we don't need global constants for TentativeP, but we do want it for the graph
+      // if we're printing statistics, so let's leave it on for now.
+      FCparams->set("compute global constants",FCparams->get("compute global constants",true));
+      std::string levelIDs = toString(levelID);
+      FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs);
+      RCP<const Export> dummy_e;
+      RCP<const Import> dummy_i;
+      BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams);
+    }
 
-        // Calculate QR decomposition (standard)
-        // NOTE: Q is stored in localQR and R is stored in coarseNS
-        if (aggSize >= Teuchos::as<LO>(NSDim)) {
+    // Now let's make a BlockCrs Matrix
+    // NOTE: Assumes block size== NSDim
+    RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim);
+    RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
+    if(P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
+    RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
+
+    /////////////////////////////
+    //      "no-QR" option     //
+    /////////////////////////////
+    // Local Q factor is just the fine nullspace support over the current aggregate.
+    // Local R factor is the identity.
+    // NOTE: We're not going to do a QR here as we're assuming that blocksize == NSDim
+    // NOTE: "goodMap" case only
+    Teuchos::Array<Scalar> block(NSDim*NSDim, zero);
+    Teuchos::Array<LO> bcol(1);
+
+    GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
+    for (LO agg = 0; agg < numAggs; agg++) {
+      bcol[0] = agg;
+      const LO aggSize = aggStart[agg+1] - aggStart[agg];
+      Xpetra::global_size_t offset = agg*NSDim;
+
+      // Process each row in the local Q factor
+      // NOTE: Blocks are in row-major order
+      for (LO j = 0; j < aggSize; j++) {
+        const LO localBlockRow = aggToRowMapLO[aggStart[agg]+j];
+
+        for (size_t r = 0; r < NSDim; r++) {
+          LO localPointRow = localBlockRow*NSDim + r;
+          for (size_t c = 0; c < NSDim; c++) 
+            block[r*NSDim+c] = fineNS[c][localPointRow];
+        }
+        // NOTE: Assumes columns==aggs and are ordered sequentially
+        P_tpetra->replaceLocalValues(localBlockRow,bcol(),block());
+
+      }//end aggSize
+        
+      for (size_t j = 0; j < NSDim; j++)
+        coarseNS[j][offset+j] = one;
+      
+    } //for (GO agg = 0; agg < numAggs; agg++)
+
+    Ptentative = P_wrap;
+#else
+    throw std::runtime_error("TentativePFactory::BuildPuncoupledBlockCrs: Requires Tpetra");
+#endif
+  }
 
-          if (NSDim == 1) {
-            // Only one nullspace vector, calculate Q and R by hand
-            Magnitude norm = STS::magnitude(zero);
-            for (size_t k = 0; k < Teuchos::as<size_t>(aggSize); k++)
-              norm += STS::magnitude(localQR(k,0)*localQR(k,0));
-            norm = Teuchos::ScalarTraits<Magnitude>::squareroot(norm);
+  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+  void TentativePFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  BuildPcoupled(RCP<Matrix> A, RCP<Aggregates> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
+                RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace) const {
+    typedef Teuchos::ScalarTraits<SC> STS;
+    typedef typename STS::magnitudeType Magnitude;
+    const SC     zero      = STS::zero();
+    const SC     one       = STS::one();
 
-            // R = norm
-            coarseNS[0][offset] = norm;
+    // number of aggregates
+    GO numAggs = aggregates->GetNumAggregates();
 
-            // Q = localQR(:,0)/norm
-            for (LO i = 0; i < aggSize; i++)
-              localQR(i,0) /= norm;
+    // Create a lookup table to determine the rows (fine DOFs) that belong to a given aggregate.
+    // aggStart is a pointer into aggToRowMap
+    // aggStart[i]..aggStart[i+1] are indices into aggToRowMap
+    // aggToRowMap[aggStart[i]]..aggToRowMap[aggStart[i+1]-1] are the DOFs in aggregate i
+    ArrayRCP<LO> aggStart;
+    ArrayRCP< GO > aggToRowMap;
+    amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMap);
 
-          } else {
-            Teuchos::SerialQRDenseSolver<LO,SC> qrSolver;
-            qrSolver.setMatrix(Teuchos::rcp(&localQR, false));
-            qrSolver.factor();
+    // find size of the largest aggregate
+    LO maxAggSize=0;
+    for (GO i=0; i<numAggs; ++i) {
+      LO sizeOfThisAgg = aggStart[i+1] - aggStart[i];
+      if (sizeOfThisAgg > maxAggSize) maxAggSize = sizeOfThisAgg;
+    }
 
-            // R = upper triangular part of localQR
-            for (size_t j = 0; j < NSDim; j++)
-              for (size_t k = 0; k <= j; k++)
-                coarseNS[j][offset+k] = localQR(k,j); //TODO is offset+k the correct local ID?!
+    // dimension of fine level nullspace
+    const size_t NSDim = fineNullspace->getNumVectors();
 
-            // Calculate Q, the tentative prolongator.
-            // The Lapack GEQRF call only works for myAggsize >= NSDim
-            qrSolver.formQ();
-            Teuchos::RCP<Teuchos::SerialDenseMatrix<LO,SC> > qFactor = qrSolver.getQ();
-            for (size_t j = 0; j < NSDim; j++)
-              for (size_t i = 0; i < Teuchos::as<size_t>(aggSize); i++)
-                localQR(i,j) = (*qFactor)(i,j);
-          }
+    // index base for coarse Dof map (usually 0)
+    GO indexBase=A->getRowMap()->getIndexBase();
 
-        } else {
-          // Special handling for aggSize < NSDim (i.e. single node aggregates in structural mechanics)
+    const RCP<const Map> nonUniqueMap = amalgInfo->ComputeUnamalgamatedImportDofMap(*aggregates);
+    const RCP<const Map> uniqueMap    = A->getDomainMap();
+    RCP<const Import> importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
+    RCP<MultiVector> fineNullspaceWithOverlap = MultiVectorFactory::Build(nonUniqueMap,NSDim);
+    fineNullspaceWithOverlap->doImport(*fineNullspace,*importer,Xpetra::INSERT);
 
-          // The local QR decomposition is not possible in the "overconstrained"
-          // case (i.e. number of columns in localQR > number of rows), which
-          // corresponds to #DOFs in Aggregate < NSDim. For usual problems this
-          // is only possible for single node aggregates in structural mechanics.
-          // (Similar problems may arise in discontinuous Galerkin problems...)
-          // We bypass the QR decomposition and use an identity block in the
-          // tentative prolongator for the single node aggregate and transfer the
-          // corresponding fine level null space information 1-to-1 to the coarse
-          // level null space part.
+    // Pull out the nullspace vectors so that we can have random access.
+    ArrayRCP< ArrayRCP<const SC> > fineNS(NSDim);
+    for (size_t i=0; i<NSDim; ++i)
+      fineNS[i] = fineNullspaceWithOverlap->getData(i);
 
-          // NOTE: The resulting tentative prolongation operator has
-          // (aggSize*DofsPerNode-NSDim) zero columns leading to a singular
-          // coarse level operator A.  To deal with that one has the following
-          // options:
-          // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
-          //   false) to add some identity block to the diagonal of the zero rows
-          //   in the coarse level operator A, such that standard level smoothers
-          //   can be used again.
-          // - Use special (projection-based) level smoothers, which can deal
-          //   with singular matrices (very application specific)
-          // - Adapt the code below to avoid zero columns. However, we do not
-          //   support a variable number of DOFs per node in MueLu/Xpetra which
-          //   makes the implementation really hard.
+    //Allocate storage for the coarse nullspace.
+    coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
 
-          // R = extended (by adding identity rows) localQR
-          for (size_t j = 0; j < NSDim; j++)
-            for (size_t k = 0; k < NSDim; k++)
-              if (k < as<size_t>(aggSize))
-                coarseNS[j][offset+k] = localQR(k,j);
-              else
-                coarseNS[j][offset+k] = (k == j ? one : zero);
+    ArrayRCP< ArrayRCP<SC> > coarseNS(NSDim);
+    for (size_t i=0; i<NSDim; ++i)
+      if (coarseMap->getLocalNumElements() > 0) coarseNS[i] = coarseNullspace->getDataNonConst(i);
 
-          // Q = I (rectangular)
-          for (size_t i = 0; i < as<size_t>(aggSize); i++)
-            for (size_t j = 0; j < NSDim; j++)
-              localQR(i,j) = (j == i ? one : zero);
-        }
+    //This makes the rowmap of Ptent the same as that of A->
+    //This requires moving some parts of some local Q's to other processors
+    //because aggregates can span processors.
+    RCP<const Map > rowMapForPtent = A->getRowMap();
+    const Map& rowMapForPtentRef = *rowMapForPtent;
 
+    // Set up storage for the rows of the local Qs that belong to other processors.
+    // FIXME This is inefficient and could be done within the main loop below with std::vector's.
+    RCP<const Map> colMap = A->getColMap();
 
-        // Process each row in the local Q factor
-        // FIXME: What happens if maps are blocked?
-        for (LO j = 0; j < aggSize; j++) {
-          LO localRow = (goodMap ? aggToRowMapLO[aggStart[agg]+j] : rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j]));
+    RCP<const Map > ghostQMap;
+    RCP<MultiVector> ghostQvalues;
+    Array<RCP<Xpetra::Vector<GO,LO,GO,Node> > > ghostQcolumns;
+    RCP<Xpetra::Vector<GO,LO,GO,Node> > ghostQrowNums;
+    ArrayRCP< ArrayRCP<SC> > ghostQvals;
+    ArrayRCP< ArrayRCP<GO> > ghostQcols;
+    ArrayRCP< GO > ghostQrows;
 
-          size_t rowStart = ia[localRow];
-          for (size_t k = 0, lnnz = 0; k < NSDim; k++) {
-            // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions)
-            if (localQR(j,k) != zero) {
-              ja [rowStart+lnnz] = offset + k;
-              val[rowStart+lnnz] = localQR(j,k);
-              lnnz++;
-            }
-          }
+    Array<GO> ghostGIDs;
+    for (LO j=0; j<numAggs; ++j) {
+      for (LO k=aggStart[j]; k<aggStart[j+1]; ++k) {
+        if (rowMapForPtentRef.isNodeGlobalElement(aggToRowMap[k]) == false) {
+          ghostGIDs.push_back(aggToRowMap[k]);
         }
       }
-
-    } else {
-      GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
-      if (NSDim>1)
-        GetOStream(Warnings0) << "TentativePFactory : for nontrivial nullspace, this may degrade performance" << std::endl;
-      /////////////////////////////
-      //      "no-QR" option     //
-      /////////////////////////////
-      // Local Q factor is just the fine nullspace support over the current aggregate.
-      // Local R factor is the identity.
-      // TODO I have not implemented any special handling for aggregates that are too
-      // TODO small to locally support the nullspace, as is done in the standard QR
-      // TODO case above.
-      if (goodMap) {
-        for (GO agg = 0; agg < numAggs; agg++) {
-          const LO aggSize = aggStart[agg+1] - aggStart[agg];
-          Xpetra::global_size_t offset = agg*NSDim;
-
-          // Process each row in the local Q factor
-          // FIXME: What happens if maps are blocked?
-          for (LO j = 0; j < aggSize; j++) {
-
-            //TODO Here I do not check for a zero nullspace column on the aggregate.
-            //     as is done in the standard QR case.
-
-            const LO localRow = aggToRowMapLO[aggStart[agg]+j];
-
-            const size_t rowStart = ia[localRow];
-
-            for (size_t k = 0, lnnz = 0; k < NSDim; k++) {
-              // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions)
-              SC qr_jk = fineNS[k][aggToRowMapLO[aggStart[agg]+j]];
-              if(constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg];
-              if (qr_jk != zero) {
-                ja [rowStart+lnnz] = offset + k;
-                val[rowStart+lnnz] = qr_jk;
-                lnnz++;
-              }
-            }
-          }
-          for (size_t j = 0; j < NSDim; j++)
-            coarseNS[j][offset+j] = one;
-        } //for (GO agg = 0; agg < numAggs; agg++)
-
-      } else {
-        for (GO agg = 0; agg < numAggs; agg++) {
-          const LO aggSize = aggStart[agg+1] - aggStart[agg];
-          Xpetra::global_size_t offset = agg*NSDim;
-          for (LO j = 0; j < aggSize; j++) {
-
-            const LO localRow = rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j]);
-
-            const size_t rowStart = ia[localRow];
-
-            for (size_t k = 0, lnnz = 0; k < NSDim; ++k) {
-              // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions)
-              SC qr_jk = fineNS[k][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j])];
-              if(constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg];
-              if (qr_jk != zero) {
-                ja [rowStart+lnnz] = offset + k;
-                val[rowStart+lnnz] = qr_jk;
-                lnnz++;
-              }
-            }
-          }
-          for (size_t j = 0; j < NSDim; j++)
-            coarseNS[j][offset+j] = one;
-        } //for (GO agg = 0; agg < numAggs; agg++)
-
-      } //if (goodmap) else ...
-
-    } //if doQRStep ... else
-
-    // Compress storage (remove all INVALID, which happen when we skip zeros)
-    // We do that in-place
-    size_t ia_tmp = 0, nnz = 0;
-    for (size_t i = 0; i < numRows; i++) {
-      for (size_t j = ia_tmp; j < ia[i+1]; j++)
-        if (ja[j] != INVALID) {
-          ja [nnz] = ja [j];
-          val[nnz] = val[j];
-          nnz++;
-        }
-      ia_tmp  = ia[i+1];
-      ia[i+1] = nnz;
-    }
-    if (rowMap->lib() == Xpetra::UseTpetra) {
-      // - Cannot resize for Epetra, as it checks for same pointers
-      // - Need to resize for Tpetra, as it check ().size() == ia[numRows]
-      // NOTE: these invalidate ja and val views
-      jaPtent .resize(nnz);
-      valPtent.resize(nnz);
-    }
-
-    GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
-
-    PtentCrs->setAllValues(iaPtent, jaPtent, valPtent);
-
-
-    // Managing labels & constants for ESFC
-    RCP<ParameterList> FCparams;
-    if(pL.isSublist("matrixmatrix: kernel params"))
-      FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
-    else
-      FCparams= rcp(new ParameterList);
-    // By default, we don't need global constants for TentativeP
-    FCparams->set("compute global constants",FCparams->get("compute global constants",false));
-    std::string levelIDs = toString(levelID);
-    FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs);
-    RCP<const Export> dummy_e;
-    RCP<const Import> dummy_i;
-
-    PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap(),dummy_i,dummy_e,FCparams);
-  }
-
-  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
-  void TentativePFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
-  BuildPcoupled(RCP<Matrix> A, RCP<Aggregates> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
-                RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace) const {
-    typedef Teuchos::ScalarTraits<SC> STS;
-    typedef typename STS::magnitudeType Magnitude;
-    const SC     zero      = STS::zero();
-    const SC     one       = STS::one();
-
-    // number of aggregates
-    GO numAggs = aggregates->GetNumAggregates();
-
-    // Create a lookup table to determine the rows (fine DOFs) that belong to a given aggregate.
-    // aggStart is a pointer into aggToRowMap
-    // aggStart[i]..aggStart[i+1] are indices into aggToRowMap
-    // aggToRowMap[aggStart[i]]..aggToRowMap[aggStart[i+1]-1] are the DOFs in aggregate i
-    ArrayRCP<LO> aggStart;
-    ArrayRCP< GO > aggToRowMap;
-    amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMap);
-
-    // find size of the largest aggregate
-    LO maxAggSize=0;
-    for (GO i=0; i<numAggs; ++i) {
-      LO sizeOfThisAgg = aggStart[i+1] - aggStart[i];
-      if (sizeOfThisAgg > maxAggSize) maxAggSize = sizeOfThisAgg;
-    }
-
-    // dimension of fine level nullspace
-    const size_t NSDim = fineNullspace->getNumVectors();
-
-    // index base for coarse Dof map (usually 0)
-    GO indexBase=A->getRowMap()->getIndexBase();
-
-    const RCP<const Map> nonUniqueMap = amalgInfo->ComputeUnamalgamatedImportDofMap(*aggregates);
-    const RCP<const Map> uniqueMap    = A->getDomainMap();
-    RCP<const Import> importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
-    RCP<MultiVector> fineNullspaceWithOverlap = MultiVectorFactory::Build(nonUniqueMap,NSDim);
-    fineNullspaceWithOverlap->doImport(*fineNullspace,*importer,Xpetra::INSERT);
-
-    // Pull out the nullspace vectors so that we can have random access.
-    ArrayRCP< ArrayRCP<const SC> > fineNS(NSDim);
-    for (size_t i=0; i<NSDim; ++i)
-      fineNS[i] = fineNullspaceWithOverlap->getData(i);
-
-    //Allocate storage for the coarse nullspace.
-    coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
-
-    ArrayRCP< ArrayRCP<SC> > coarseNS(NSDim);
-    for (size_t i=0; i<NSDim; ++i)
-      if (coarseMap->getLocalNumElements() > 0) coarseNS[i] = coarseNullspace->getDataNonConst(i);
-
-    //This makes the rowmap of Ptent the same as that of A->
-    //This requires moving some parts of some local Q's to other processors
-    //because aggregates can span processors.
-    RCP<const Map > rowMapForPtent = A->getRowMap();
-    const Map& rowMapForPtentRef = *rowMapForPtent;
-
-    // Set up storage for the rows of the local Qs that belong to other processors.
-    // FIXME This is inefficient and could be done within the main loop below with std::vector's.
-    RCP<const Map> colMap = A->getColMap();
-
-    RCP<const Map > ghostQMap;
-    RCP<MultiVector> ghostQvalues;
-    Array<RCP<Xpetra::Vector<GO,LO,GO,Node> > > ghostQcolumns;
-    RCP<Xpetra::Vector<GO,LO,GO,Node> > ghostQrowNums;
-    ArrayRCP< ArrayRCP<SC> > ghostQvals;
-    ArrayRCP< ArrayRCP<GO> > ghostQcols;
-    ArrayRCP< GO > ghostQrows;
-
-    Array<GO> ghostGIDs;
-    for (LO j=0; j<numAggs; ++j) {
-      for (LO k=aggStart[j]; k<aggStart[j+1]; ++k) {
-        if (rowMapForPtentRef.isNodeGlobalElement(aggToRowMap[k]) == false) {
-          ghostGIDs.push_back(aggToRowMap[k]);
-        }
-      }
-    }
-    ghostQMap = MapFactory::Build(A->getRowMap()->lib(),
-                                  Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
-                                  ghostGIDs,
-                                  indexBase, A->getRowMap()->getComm()); //JG:Xpetra::global_size_t>?
-    //Vector to hold bits of Q that go to other processors.
-    ghostQvalues = MultiVectorFactory::Build(ghostQMap,NSDim);
-    //Note that Epetra does not support MultiVectors templated on Scalar != double.
-    //So to work around this, we allocate an array of Vectors.  This shouldn't be too
-    //expensive, as the number of Vectors is NSDim.
-    ghostQcolumns.resize(NSDim);
-    for (size_t i=0; i<NSDim; ++i)
-      ghostQcolumns[i] = Xpetra::VectorFactory<GO,LO,GO,Node>::Build(ghostQMap);
-    ghostQrowNums = Xpetra::VectorFactory<GO,LO,GO,Node>::Build(ghostQMap);
-    if (ghostQvalues->getLocalLength() > 0) {
-      ghostQvals.resize(NSDim);
-      ghostQcols.resize(NSDim);
-      for (size_t i=0; i<NSDim; ++i) {
-        ghostQvals[i] = ghostQvalues->getDataNonConst(i);
-        ghostQcols[i] = ghostQcolumns[i]->getDataNonConst(0);
-      }
-      ghostQrows = ghostQrowNums->getDataNonConst(0);
-    }
+    }
+    ghostQMap = MapFactory::Build(A->getRowMap()->lib(),
+                                  Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
+                                  ghostGIDs,
+                                  indexBase, A->getRowMap()->getComm()); //JG:Xpetra::global_size_t>?
+    //Vector to hold bits of Q that go to other processors.
+    ghostQvalues = MultiVectorFactory::Build(ghostQMap,NSDim);
+    //Note that Epetra does not support MultiVectors templated on Scalar != double.
+    //So to work around this, we allocate an array of Vectors.  This shouldn't be too
+    //expensive, as the number of Vectors is NSDim.
+    ghostQcolumns.resize(NSDim);
+    for (size_t i=0; i<NSDim; ++i)
+      ghostQcolumns[i] = Xpetra::VectorFactory<GO,LO,GO,Node>::Build(ghostQMap);
+    ghostQrowNums = Xpetra::VectorFactory<GO,LO,GO,Node>::Build(ghostQMap);
+    if (ghostQvalues->getLocalLength() > 0) {
+      ghostQvals.resize(NSDim);
+      ghostQcols.resize(NSDim);
+      for (size_t i=0; i<NSDim; ++i) {
+        ghostQvals[i] = ghostQvalues->getDataNonConst(i);
+        ghostQcols[i] = ghostQcolumns[i]->getDataNonConst(0);
+      }
+      ghostQrows = ghostQrowNums->getDataNonConst(0);
+    }
 
     //importer to handle moving Q
     importer = ImportFactory::Build(ghostQMap, A->getRowMap());
@@ -961,6 +846,338 @@ namespace MueLu {
 
 
 
+  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class Node>
+  void TentativePFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+  BuildPuncoupled(RCP<Matrix> A, RCP<Aggregates> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
+                RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace, const int levelID) const {
+    RCP<const Map> rowMap = A->getRowMap();
+    RCP<const Map> colMap = A->getColMap();
+    const size_t numRows   = rowMap->getLocalNumElements();
+
+    typedef Teuchos::ScalarTraits<SC> STS;
+    typedef typename STS::magnitudeType Magnitude;
+    const SC     zero      = STS::zero();
+    const SC     one       = STS::one();
+    const LO     INVALID   = Teuchos::OrdinalTraits<LO>::invalid();
+
+    const GO     numAggs   = aggregates->GetNumAggregates();
+    const size_t NSDim     = fineNullspace->getNumVectors();
+    ArrayRCP<LO> aggSizes  = aggregates->ComputeAggregateSizes();
+
+
+    // Sanity checking
+    const ParameterList& pL = GetParameterList();
+    const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
+    const bool &constantColSums = pL.get<bool>("tentative: constant column sums");
+
+    TEUCHOS_TEST_FOR_EXCEPTION(doQRStep && constantColSums,Exceptions::RuntimeError,
+                               "MueLu::TentativePFactory::MakeTentative: cannot use 'constant column sums' and 'calculate qr' at the same time");
+
+    // Aggregates map is based on the amalgamated column map
+    // We can skip global-to-local conversion if LIDs in row map are
+    // same as LIDs in column map
+    bool goodMap = MueLu::Utilities<SC,LO,GO,NO>::MapsAreNested(*rowMap, *colMap);
+
+    // Create a lookup table to determine the rows (fine DOFs) that belong to a given aggregate.
+    // aggStart is a pointer into aggToRowMapLO
+    // aggStart[i]..aggStart[i+1] are indices into aggToRowMapLO
+    // aggToRowMapLO[aggStart[i]]..aggToRowMapLO[aggStart[i+1]-1] are the DOFs in aggregate i
+    ArrayRCP<LO> aggStart;
+    ArrayRCP<LO> aggToRowMapLO;
+    ArrayRCP<GO> aggToRowMapGO;
+    if (goodMap) {
+      amalgInfo->UnamalgamateAggregatesLO(*aggregates, aggStart, aggToRowMapLO);
+      GetOStream(Runtime1) << "Column map is consistent with the row map, good." << std::endl;
+
+    } else {
+      amalgInfo->UnamalgamateAggregates(*aggregates, aggStart, aggToRowMapGO);
+      GetOStream(Warnings0) << "Column map is not consistent with the row map\n"
+                            << "using GO->LO conversion with performance penalty" << std::endl;
+    }
+    coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
+
+    // Pull out the nullspace vectors so that we can have random access.
+    ArrayRCP<ArrayRCP<const SC> > fineNS  (NSDim);
+    ArrayRCP<ArrayRCP<SC> >       coarseNS(NSDim);
+    for (size_t i = 0; i < NSDim; i++) {
+      fineNS[i] = fineNullspace->getData(i);
+      if (coarseMap->getLocalNumElements() > 0)
+        coarseNS[i] = coarseNullspace->getDataNonConst(i);
+    }
+
+    size_t nnzEstimate = numRows * NSDim;
+
+    // Time to construct the matrix and fill in the values
+    Ptentative = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0));
+    RCP<CrsMatrix> PtentCrs   = rcp_dynamic_cast<CrsMatrixWrap>(Ptentative)->getCrsMatrix();
+
+    ArrayRCP<size_t>  iaPtent;
+    ArrayRCP<LO>      jaPtent;
+    ArrayRCP<SC>     valPtent;
+
+    PtentCrs->allocateAllValues(nnzEstimate, iaPtent, jaPtent, valPtent);
+
+    ArrayView<size_t> ia  = iaPtent();
+    ArrayView<LO>     ja  = jaPtent();
+    ArrayView<SC>     val = valPtent();
+
+    ia[0] = 0;
+    for (size_t i = 1; i <= numRows; i++)
+      ia[i] = ia[i-1] + NSDim;
+
+    for (size_t j = 0; j < nnzEstimate; j++) {
+      ja [j] = INVALID;
+      val[j] = zero;
+    }
+
+
+    if (doQRStep) {
+      ////////////////////////////////
+      // Standard aggregate-wise QR //
+      ////////////////////////////////
+      for (GO agg = 0; agg < numAggs; agg++) {
+        LO aggSize = aggStart[agg+1] - aggStart[agg];
+
+        Xpetra::global_size_t offset = agg*NSDim;
+
+        // Extract the piece of the nullspace corresponding to the aggregate, and
+        // put it in the flat array, "localQR" (in column major format) for the
+        // QR routine.
+        Teuchos::SerialDenseMatrix<LO,SC> localQR(aggSize, NSDim);
+        if (goodMap) {
+          for (size_t j = 0; j < NSDim; j++)
+            for (LO k = 0; k < aggSize; k++)
+              localQR(k,j) = fineNS[j][aggToRowMapLO[aggStart[agg]+k]];
+        } else {
+          for (size_t j = 0; j < NSDim; j++)
+            for (LO k = 0; k < aggSize; k++)
+              localQR(k,j) = fineNS[j][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+k])];
+        }
+
+        // Test for zero columns
+        for (size_t j = 0; j < NSDim; j++) {
+          bool bIsZeroNSColumn = true;
+
+          for (LO k = 0; k < aggSize; k++)
+            if (localQR(k,j) != zero)
+              bIsZeroNSColumn = false;
+
+          TEUCHOS_TEST_FOR_EXCEPTION(bIsZeroNSColumn == true, Exceptions::RuntimeError,
+                                     "MueLu::TentativePFactory::MakeTentative: fine level NS part has a zero column in NS column " << j);
+        }
+
+        // Calculate QR decomposition (standard)
+        // NOTE: Q is stored in localQR and R is stored in coarseNS
+        if (aggSize >= Teuchos::as<LO>(NSDim)) {
+
+          if (NSDim == 1) {
+            // Only one nullspace vector, calculate Q and R by hand
+            Magnitude norm = STS::magnitude(zero);
+            for (size_t k = 0; k < Teuchos::as<size_t>(aggSize); k++)
+              norm += STS::magnitude(localQR(k,0)*localQR(k,0));
+            norm = Teuchos::ScalarTraits<Magnitude>::squareroot(norm);
+
+            // R = norm
+            coarseNS[0][offset] = norm;
+
+            // Q = localQR(:,0)/norm
+            for (LO i = 0; i < aggSize; i++)
+              localQR(i,0) /= norm;
+
+          } else {
+            Teuchos::SerialQRDenseSolver<LO,SC> qrSolver;
+            qrSolver.setMatrix(Teuchos::rcp(&localQR, false));
+            qrSolver.factor();
+
+            // R = upper triangular part of localQR
+            for (size_t j = 0; j < NSDim; j++)
+              for (size_t k = 0; k <= j; k++)
+                coarseNS[j][offset+k] = localQR(k,j); //TODO is offset+k the correct local ID?!
+
+            // Calculate Q, the tentative prolongator.
+            // The Lapack GEQRF call only works for myAggsize >= NSDim
+            qrSolver.formQ();
+            Teuchos::RCP<Teuchos::SerialDenseMatrix<LO,SC> > qFactor = qrSolver.getQ();
+            for (size_t j = 0; j < NSDim; j++)
+              for (size_t i = 0; i < Teuchos::as<size_t>(aggSize); i++)
+                localQR(i,j) = (*qFactor)(i,j);
+          }
+
+        } else {
+          // Special handling for aggSize < NSDim (i.e. single node aggregates in structural mechanics)
+
+          // The local QR decomposition is not possible in the "overconstrained"
+          // case (i.e. number of columns in localQR > number of rows), which
+          // corresponds to #DOFs in Aggregate < NSDim. For usual problems this
+          // is only possible for single node aggregates in structural mechanics.
+          // (Similar problems may arise in discontinuous Galerkin problems...)
+          // We bypass the QR decomposition and use an identity block in the
+          // tentative prolongator for the single node aggregate and transfer the
+          // corresponding fine level null space information 1-to-1 to the coarse
+          // level null space part.
+
+          // NOTE: The resulting tentative prolongation operator has
+          // (aggSize*DofsPerNode-NSDim) zero columns leading to a singular
+          // coarse level operator A.  To deal with that one has the following
+          // options:
+          // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
+          //   false) to add some identity block to the diagonal of the zero rows
+          //   in the coarse level operator A, such that standard level smoothers
+          //   can be used again.
+          // - Use special (projection-based) level smoothers, which can deal
+          //   with singular matrices (very application specific)
+          // - Adapt the code below to avoid zero columns. However, we do not
+          //   support a variable number of DOFs per node in MueLu/Xpetra which
+          //   makes the implementation really hard.
+
+          // R = extended (by adding identity rows) localQR
+          for (size_t j = 0; j < NSDim; j++)
+            for (size_t k = 0; k < NSDim; k++)
+              if (k < as<size_t>(aggSize))
+                coarseNS[j][offset+k] = localQR(k,j);
+              else
+                coarseNS[j][offset+k] = (k == j ? one : zero);
+
+          // Q = I (rectangular)
+          for (size_t i = 0; i < as<size_t>(aggSize); i++)
+            for (size_t j = 0; j < NSDim; j++)
+              localQR(i,j) = (j == i ? one : zero);
+        }
+
+
+        // Process each row in the local Q factor
+        // FIXME: What happens if maps are blocked?
+        for (LO j = 0; j < aggSize; j++) {
+          LO localRow = (goodMap ? aggToRowMapLO[aggStart[agg]+j] : rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j]));
+
+          size_t rowStart = ia[localRow];
+          for (size_t k = 0, lnnz = 0; k < NSDim; k++) {
+            // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions)
+            if (localQR(j,k) != zero) {
+              ja [rowStart+lnnz] = offset + k;
+              val[rowStart+lnnz] = localQR(j,k);
+              lnnz++;
+            }
+          }
+        }
+      }
+
+    } else {
+      GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
+      if (NSDim>1)
+        GetOStream(Warnings0) << "TentativePFactory : for nontrivial nullspace, this may degrade performance" << std::endl;
+      /////////////////////////////
+      //      "no-QR" option     //
+      /////////////////////////////
+      // Local Q factor is just the fine nullspace support over the current aggregate.
+      // Local R factor is the identity.
+      // TODO I have not implemented any special handling for aggregates that are too
+      // TODO small to locally support the nullspace, as is done in the standard QR
+      // TODO case above.
+      if (goodMap) {
+        for (GO agg = 0; agg < numAggs; agg++) {
+          const LO aggSize = aggStart[agg+1] - aggStart[agg];
+          Xpetra::global_size_t offset = agg*NSDim;
+
+          // Process each row in the local Q factor
+          // FIXME: What happens if maps are blocked?
+          for (LO j = 0; j < aggSize; j++) {
+
+            //TODO Here I do not check for a zero nullspace column on the aggregate.
+            //     as is done in the standard QR case.
+
+            const LO localRow = aggToRowMapLO[aggStart[agg]+j];
+
+            const size_t rowStart = ia[localRow];
+
+            for (size_t k = 0, lnnz = 0; k < NSDim; k++) {
+              // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions)
+              SC qr_jk = fineNS[k][aggToRowMapLO[aggStart[agg]+j]];
+              if(constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg];
+              if (qr_jk != zero) {
+                ja [rowStart+lnnz] = offset + k;
+                val[rowStart+lnnz] = qr_jk;
+                lnnz++;
+              }
+            }
+          }
+          for (size_t j = 0; j < NSDim; j++)
+            coarseNS[j][offset+j] = one;
+        } //for (GO agg = 0; agg < numAggs; agg++)
+
+      } else {
+        for (GO agg = 0; agg < numAggs; agg++) {
+          const LO aggSize = aggStart[agg+1] - aggStart[agg];
+          Xpetra::global_size_t offset = agg*NSDim;
+          for (LO j = 0; j < aggSize; j++) {
+
+            const LO localRow = rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j]);
+
+            const size_t rowStart = ia[localRow];
+
+            for (size_t k = 0, lnnz = 0; k < NSDim; ++k) {
+              // Skip zeros (there may be plenty of them, i.e., NSDim > 1 or boundary conditions)
+              SC qr_jk = fineNS[k][rowMap->getLocalElement(aggToRowMapGO[aggStart[agg]+j])];
+              if(constantColSums) qr_jk = qr_jk / (Magnitude)aggSizes[agg];
+              if (qr_jk != zero) {
+                ja [rowStart+lnnz] = offset + k;
+                val[rowStart+lnnz] = qr_jk;
+                lnnz++;
+              }
+            }
+          }
+          for (size_t j = 0; j < NSDim; j++)
+            coarseNS[j][offset+j] = one;
+        } //for (GO agg = 0; agg < numAggs; agg++)
+
+      } //if (goodmap) else ...
+
+    } //if doQRStep ... else
+
+    // Compress storage (remove all INVALID, which happen when we skip zeros)
+    // We do that in-place
+    size_t ia_tmp = 0, nnz = 0;
+    for (size_t i = 0; i < numRows; i++) {
+      for (size_t j = ia_tmp; j < ia[i+1]; j++)
+        if (ja[j] != INVALID) {
+          ja [nnz] = ja [j];
+          val[nnz] = val[j];
+          nnz++;
+        }
+      ia_tmp  = ia[i+1];
+      ia[i+1] = nnz;
+    }
+    if (rowMap->lib() == Xpetra::UseTpetra) {
+      // - Cannot resize for Epetra, as it checks for same pointers
+      // - Need to resize for Tpetra, as it check ().size() == ia[numRows]
+      // NOTE: these invalidate ja and val views
+      jaPtent .resize(nnz);
+      valPtent.resize(nnz);
+    }
+
+    GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
+
+    PtentCrs->setAllValues(iaPtent, jaPtent, valPtent);
+
+
+    // Managing labels & constants for ESFC
+    RCP<ParameterList> FCparams;
+    if(pL.isSublist("matrixmatrix: kernel params"))
+      FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
+    else
+      FCparams= rcp(new ParameterList);
+    // By default, we don't need global constants for TentativeP
+    FCparams->set("compute global constants",FCparams->get("compute global constants",false));
+    std::string levelIDs = toString(levelID);
+    FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs);
+    RCP<const Export> dummy_e;
+    RCP<const Import> dummy_i;
+
+    PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap(),dummy_i,dummy_e,FCparams);
+  }
+
+
+
 } //namespace MueLu
 
 // TODO ReUse: If only P or Nullspace is missing, TentativePFactory can be smart and skip part of the computation.
diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp
index 35b3151fc4b2..e387eb1a9677 100644
--- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp
+++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_decl.hpp
@@ -55,6 +55,8 @@
 
 #include "Teuchos_ScalarTraits.hpp"
 
+#include "Xpetra_CrsGraphFactory_fwd.hpp"
+
 #include "MueLu_Aggregates_kokkos_fwd.hpp"
 #include "MueLu_AmalgamationFactory_kokkos_fwd.hpp"
 #include "MueLu_AmalgamationInfo_kokkos_fwd.hpp"
@@ -151,27 +153,27 @@ namespace MueLu {
 
     //@}
 
-    // CUDA 7.5 and 8.0 place a restriction on the placement of __device__ lambdas:
-    //
-    //     An explicit __device__ lambda cannot be defined in a member function
-    //     that has private or protected access within its class.
-    //
-    // Therefore, we expose BuildPuncoupled and isGoodMap for now. An alternative solution
-    // could be writing an out of class implementation, and then calling it in
-    // a member function.
-    void BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
-                         RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
-                         RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
-                         RCP<MultiVector>& coarseNullspace, const int levelID) const;
+
+    // NOTE: All of thess should really be private, but CUDA doesn't like that
+    
+    void BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo_kokkos> amalgInfo, 
+                                 RCP<MultiVector> fineNullspace, RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace, const int levelID) const;
+
+
     bool isGoodMap(const Map& rowMap, const Map& colMap) const;
 
-  private:
+
 
     void BuildPcoupled  (RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
                          RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
                          RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
                          RCP<MultiVector>& coarseNullspace) const;
 
+    void BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
+                         RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
+                         RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
+                         RCP<MultiVector>& coarseNullspace, const int levelID) const;
+
     mutable bool bTransferCoordinates_ = false;
 
   };
diff --git a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp
index 08f647cbcce9..e12983cc10d1 100644
--- a/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp
+++ b/packages/muelu/src/Transfers/Smoothed-Aggregation/MueLu_TentativePFactory_kokkos_def.hpp
@@ -49,6 +49,7 @@
 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
 
 #include "Kokkos_UnorderedMap.hpp"
+#include "Xpetra_CrsGraphFactory.hpp"
 
 #include "MueLu_TentativePFactory_kokkos_decl.hpp"
 
@@ -56,12 +57,15 @@
 #include "MueLu_AmalgamationFactory_kokkos.hpp"
 #include "MueLu_AmalgamationInfo_kokkos.hpp"
 #include "MueLu_CoarseMapFactory_kokkos.hpp"
+
 #include "MueLu_MasterList.hpp"
 #include "MueLu_NullspaceFactory_kokkos.hpp"
 #include "MueLu_PerfUtils.hpp"
 #include "MueLu_Monitor.hpp"
 #include "MueLu_Utilities_kokkos.hpp"
 
+#include "Xpetra_IO.hpp"
+
 namespace MueLu {
 
   namespace { // anonymous
@@ -531,8 +535,15 @@ namespace MueLu {
       }
     }
 
-    if (!aggregates->AggregatesCrossProcessors())
-      BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
+    if (!aggregates->AggregatesCrossProcessors()) {
+      if(Xpetra::Helpers<SC,LO,GO,NO>::isTpetraBlockCrs(A)) {
+        BuildPuncoupledBlockCrs(coarseLevel,A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
+                                coarseLevel.GetLevelID());
+      }
+      else {
+        BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
+      }
+    }
     else
       BuildPcoupled  (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
 
@@ -966,6 +977,310 @@ namespace MueLu {
     }
   }
 
+
+  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
+  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
+    BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
+                  RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
+                  RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
+                  RCP<MultiVector>& coarseNullspace, const int levelID) const {
+#ifdef HAVE_MUELU_TPETRA
+  /* This routine generates a BlockCrs P for a BlockCrs A.  There are a few assumptions here, which meet the use cases we care about, but could 
+       be generalized later, if we ever need to do so:
+       1) Null space dimension === block size of matrix:  So no elasticity right now
+       2) QR is not supported:  Under assumption #1, this shouldn't cause problems.
+       3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
+
+       These assumptions keep our code way simpler and still support the use cases we actually care about.
+     */
+
+    RCP<const Map> rowMap     = A->getRowMap();
+    RCP<const Map> rangeMap   = A->getRangeMap();
+    RCP<const Map> colMap     = A->getColMap();
+    //    const size_t numFinePointRows = rangeMap->getLocalNumElements();
+    const size_t numFineBlockRows = rowMap->getLocalNumElements();
+
+    typedef Teuchos::ScalarTraits<SC> STS;
+    typedef typename STS::magnitudeType Magnitude;
+    //    const SC     zero      = STS::zero();
+    const SC     one       = STS::one();
+    const LO     INVALID   = Teuchos::OrdinalTraits<LO>::invalid();
+
+    //    const GO     numAggs   = aggregates->GetNumAggregates();
+    const size_t NSDim     = fineNullspace->getNumVectors();
+    auto aggSizes          = aggregates->ComputeAggregateSizes();
+
+
+    typename Aggregates_kokkos::local_graph_type aggGraph;
+    {
+      SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
+      aggGraph = aggregates->GetGraph();
+    }
+    auto aggRows  = aggGraph.row_map;
+    auto aggCols  = aggGraph.entries;
+
+
+    // Need to generate the coarse block map
+    // NOTE: We assume NSDim == block size here
+    // NOTE: We also assume that coarseMap has contiguous GIDs
+    //const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
+    const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
+    RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
+                                                      Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
+                                                      numCoarseBlockRows,
+                                                      coarsePointMap->getIndexBase(),
+                                                      coarsePointMap->getComm());    
+    // Sanity checking
+    const ParameterList& pL = GetParameterList();
+    //    const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
+
+
+    // The aggregates use the amalgamated column map, which in this case is what we want
+
+    // Aggregates map is based on the amalgamated column map
+    // We can skip global-to-local conversion if LIDs in row map are
+    // same as LIDs in column map
+    bool goodMap = MueLu::Utilities<SC,LO,GO,NO>::MapsAreNested(*rowMap, *colMap);
+    TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
+                               "MueLu: TentativePFactory_kokkos: for now works only with good maps "
+                               "(i.e. \"matching\" row and column maps)");
+
+    // STEP 1: do unamalgamation
+    // The non-kokkos version uses member functions from the AmalgamationInfo
+    // container class to unamalgamate the data. In contrast, the kokkos
+    // version of TentativePFactory does the unamalgamation here and only uses
+    // the data of the AmalgamationInfo container class
+
+    // Extract information for unamalgamation
+    LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
+    GO indexBase;
+    amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
+    //GO globalOffset = amalgInfo->GlobalOffset();
+
+    // Extract aggregation info (already in Kokkos host views)
+    auto         procWinner    = aggregates->GetProcWinner()  ->getDeviceLocalView(Xpetra::Access::ReadOnly);
+    auto         vertex2AggId  = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
+    const size_t numAggregates = aggregates->GetNumAggregates();
+
+    int myPID = aggregates->GetMap()->getComm()->getRank();
+
+    // Create Kokkos::View (on the device) to store the aggreate dof sizes
+    // Later used to get aggregate dof offsets
+    // NOTE: This zeros itself on construction
+    typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
+    AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
+
+    {
+      SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
+
+      // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
+      aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
+
+      Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), aggSizes);
+    } 
+
+    // Find maximum dof size for aggregates
+    // Later used to reserve enough scratch space for local QR decompositions
+    LO maxAggSize = 0;
+    ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
+    Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
+
+    // parallel_scan (exclusive)
+    // The aggDofSizes View then contains the aggregate dof offsets
+    Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
+      KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
+        update += aggDofSizes(i);
+        if (final_pass)
+          aggDofSizes(i) = update;
+      });
+
+    // Create Kokkos::View on the device to store mapping
+    // between (local) aggregate id and row map ids (LIDs)
+    Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
+    {
+      SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
+
+      AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
+      Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
+
+      Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
+        KOKKOS_LAMBDA(const LO lnode) {
+          if (procWinner(lnode, 0) == myPID) {
+            // No need for atomics, it's one-to-one
+            auto aggID = vertex2AggId(lnode,0);
+
+            auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
+            // FIXME: I think this may be wrong
+            // We unconditionally add the whole block here. When we calculated
+            // aggDofSizes, we did the isLocalElement check. Something's fishy.
+            for (LO k = 0; k < stridedBlockSize; k++)
+              aggToRowMapLO(offset + k) = lnode*stridedBlockSize + k;
+          }
+        });
+    }
+
+    // STEP 2: prepare local QR decomposition
+    // Reserve memory for tentative prolongation operator
+    coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
+
+    // Pull out the nullspace vectors so that we can have random access (on the device)
+    auto fineNS   = fineNullspace  ->getDeviceLocalView(Xpetra::Access::ReadWrite);
+    auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
+
+    typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type    local_matrix_type;
+    typedef typename local_matrix_type::row_map_type::non_const_type   rows_type;
+    typedef typename local_matrix_type::index_type::non_const_type     cols_type;
+    typedef typename local_matrix_type::values_type::non_const_type    vals_type;
+
+
+    // Device View for status (error messages...)
+    typedef Kokkos::View<int[10], DeviceType> status_type;
+    status_type status("status");
+
+    typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
+    typename AppendTrait<status_type,      Kokkos::Atomic>      ::type statusAtomic = status;
+
+    // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
+    GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
+
+    // BlockCrs requires that we build the (block) graph first, so let's do that...
+
+    // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one 
+    // block non-zero per row in the matrix;
+    rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
+    cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
+
+    Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
+                         KOKKOS_LAMBDA(const LO j) {
+                           ia[j] = j;
+                           ja[j] = INVALID;
+
+                           if(j==(LO)numFineBlockRows-1)
+                             ia[numFineBlockRows] = numFineBlockRows;
+                         });
+
+    // Fill Graph
+    const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
+    Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:fillGraph", policy,
+                         KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
+                           auto agg = thread.league_rank();
+                           Xpetra::global_size_t offset = agg;
+                           
+                           // size of the aggregate (number of DOFs in aggregate)
+                           LO aggSize = aggRows(agg+1) - aggRows(agg);
+                           
+                           for (LO j = 0; j < aggSize; j++) {
+                             // FIXME: Allow for bad maps
+                             const LO localRow = aggToRowMapLO[aggDofSizes[agg]+j];
+                             const size_t rowStart = ia[localRow];
+                             ja[rowStart] = offset;
+                           }
+                         });
+
+    // Compress storage (remove all INVALID, which happen when we skip zeros)
+    // We do that in-place
+    {
+      // Stage 2: compress the arrays
+      SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
+      // Fill i_temp with the correct row starts
+      rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows+1);
+      size_t nnz=0;
+      Kokkos::parallel_scan("MueLu:TentativePF:BlockCrs:compress_rows", range_type(0,numFineBlockRows),
+                            KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
+                              if(final) 
+                                i_temp[i] = upd;
+                              for (auto j = ia[i]; j < ia[i+1]; j++)
+                                if (ja[j] != INVALID)
+                                  upd++;
+                              if(final && i == (LO) numFineBlockRows-1)
+                                i_temp[numFineBlockRows] = upd;
+                            },nnz);
+
+      cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
+      
+      
+      Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:compress_cols", range_type(0,numFineBlockRows),
+                           KOKKOS_LAMBDA(const LO i) {
+                             size_t rowStart = i_temp[i];
+                             size_t lnnz = 0;
+                             for (auto j = ia[i]; j < ia[i+1]; j++)
+                               if (ja[j] != INVALID) {
+                                 j_temp[rowStart+lnnz] = ja[j];
+                                 lnnz++;
+                               }
+                           });     
+      
+      ia = i_temp;
+      ja = j_temp;
+    }
+    
+    RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,ia,ja);
+ 
+
+    // Managing labels & constants for ESFC
+    {
+      RCP<ParameterList> FCparams;
+      if(pL.isSublist("matrixmatrix: kernel params"))
+        FCparams=rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
+      else
+        FCparams= rcp(new ParameterList);
+      // By default, we don't need global constants for TentativeP
+      FCparams->set("compute global constants",FCparams->get("compute global constants",false));
+      std::string levelIDs = toString(levelID);
+      FCparams->set("Timer Label",std::string("MueLu::TentativeP-")+levelIDs);
+      RCP<const Export> dummy_e;
+      RCP<const Import> dummy_i;
+      BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams);
+    }
+
+    // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
+    // we clear them here
+    ia = rows_type();
+    ja = cols_type();
+
+    
+    // Now let's make a BlockCrs Matrix
+    // NOTE: Assumes block size== NSDim
+    RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim);
+    RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
+    if(P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
+    RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
+
+    auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
+    const LO stride = NSDim*NSDim;
+
+    Kokkos::parallel_for("MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
+                         KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
+                           auto agg = thread.league_rank();
+                           
+                           // size of the aggregate (number of DOFs in aggregate)
+                           LO aggSize = aggRows(agg+1) - aggRows(agg);
+                           Xpetra::global_size_t offset = agg*NSDim;
+
+                           // Q = localQR(:,0)/norm
+                           for (LO j = 0; j < aggSize; j++) {
+                             LO localBlockRow = aggToRowMapLO(aggRows(agg)+j);
+                             LO rowStart = localBlockRow * stride;
+                             for (LO r = 0; r < (LO)NSDim; r++) {
+                               LO localPointRow = localBlockRow*NSDim + r;
+                               for (LO c = 0; c < (LO)NSDim; c++) {
+                                 values[rowStart + r*NSDim + c] = fineNSRandom(localPointRow,c);
+                               }
+                             }
+                           }
+
+                           // R = norm
+                           for(LO j=0; j<(LO)NSDim; j++)
+                             coarseNS(offset+j,j) = one;                                                    
+                         });
+
+  Ptentative = P_wrap;
+
+#else
+    throw std::runtime_error("TentativePFactory::BuildPuncoupledBlockCrs: Requires Tpetra");
+#endif
+  }
+
   template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
   void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
   BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates_kokkos> /* aggregates */,
diff --git a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp
index b3f8ab3fc887..e9f08ae11e6a 100644
--- a/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp
+++ b/packages/muelu/src/Utils/MueLu_Utilities_decl.hpp
@@ -97,6 +97,8 @@ class Epetra_Vector;
 
 #ifdef HAVE_MUELU_TPETRA
 #include <Tpetra_CrsMatrix.hpp>
+#include <Tpetra_BlockCrsMatrix.hpp>
+#include <Tpetra_BlockCrsMatrix_Helpers.hpp>
 #include <Tpetra_FECrsMatrix.hpp>
 #include <Tpetra_RowMatrixTransposer.hpp>
 #include <Tpetra_Map.hpp>
@@ -199,6 +201,14 @@ namespace MueLu {
     static const Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&        Op2TpetraCrs(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op);
     static       Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&        Op2NonConstTpetraCrs(Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op);
 
+    static RCP<const Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >   Op2TpetraBlockCrs(RCP<const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > Op);
+   static RCP<      Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >   Op2NonConstTpetraBlockCrs(RCP<Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > Op);
+
+   static const Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&        Op2TpetraBlockCrs(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op);
+    static       Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&        Op2NonConstTpetraBlockCrs(Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op);
+
+
+
     static RCP<const Tpetra::RowMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >   Op2TpetraRow(RCP<const Xpetra::Operator<Scalar,LocalOrdinal,GlobalOrdinal,Node> > Op);
     static RCP<      Tpetra::RowMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >   Op2NonConstTpetraRow(RCP<Xpetra::Operator<Scalar,LocalOrdinal,GlobalOrdinal,Node> > Op);
 
@@ -532,6 +542,76 @@ namespace MueLu {
 #endif
     }
 
+
+    static RCP<const Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >   Op2TpetraBlockCrs(RCP<const Matrix> Op)  {
+#if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \
+    (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT))))
+      throw Exceptions::RuntimeError("Op2TpetraBlockCrs: Tpetra has not been compiled with support for LO=GO=int.");
+#else
+      // Get the underlying Tpetra Mtx
+      RCP<const CrsMatrixWrap> crsOp = rcp_dynamic_cast<const CrsMatrixWrap>(Op);
+      if (crsOp == Teuchos::null)
+        throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");
+      const RCP<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > &tmp_ECrsMtx = rcp_dynamic_cast<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >(crsOp->getCrsMatrix());
+      if (tmp_ECrsMtx == Teuchos::null)
+        throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
+      return tmp_ECrsMtx->getTpetra_BlockCrsMatrix();
+#endif
+    }
+
+    static RCP<      Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >   Op2NonConstTpetraBlockCrs(RCP<Matrix> Op){
+#if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \
+    (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT))))
+      throw Exceptions::RuntimeError("Op2NonConstTpetraBlockCrs: Tpetra has not been compiled with support for LO=GO=int.");
+#else
+      RCP<const CrsMatrixWrap> crsOp = rcp_dynamic_cast<const CrsMatrixWrap>(Op);
+      if (crsOp == Teuchos::null)
+        throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");
+      const RCP<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > &tmp_ECrsMtx = rcp_dynamic_cast<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >(crsOp->getCrsMatrix());
+      if (tmp_ECrsMtx == Teuchos::null)
+        throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
+      return tmp_ECrsMtx->getTpetra_BlockCrsMatrixNonConst();
+#endif
+    };
+
+    static const Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&        Op2TpetraBlockCrs(const Matrix& Op)   {
+#if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \
+    (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT))))
+      throw Exceptions::RuntimeError("Op2TpetraBlockCrs: Tpetra has not been compiled with support for LO=GO=int.");
+#else
+      try {
+        const CrsMatrixWrap& crsOp = dynamic_cast<const CrsMatrixWrap&>(Op);
+        try {
+          const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& tmp_ECrsMtx = dynamic_cast<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&>(*crsOp.getCrsMatrix());
+          return *tmp_ECrsMtx.getTpetra_BlockCrsMatrix();
+        } catch (std::bad_cast&) {
+          throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
+        }
+      } catch (std::bad_cast&) {
+        throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");
+      }
+#endif
+    }
+    static       Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&        Op2NonConstTpetraBlockCrs(Matrix& Op) {
+#if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \
+    (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT))))
+      throw Exceptions::RuntimeError("Op2NonConstTpetraCrs: Tpetra has not been compiled with support for LO=GO=int.");
+#else
+      try {
+        CrsMatrixWrap& crsOp = dynamic_cast<CrsMatrixWrap&>(Op);
+        try {
+          Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& tmp_ECrsMtx = dynamic_cast<Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&>(*crsOp.getCrsMatrix());
+          return *tmp_ECrsMtx.getTpetra_BlockCrsMatrixNonConst();
+        } catch (std::bad_cast&) {
+          throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
+        }
+      } catch (std::bad_cast&) {
+        throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");
+      }
+#endif
+    }
+
+
     static RCP<const Tpetra::RowMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >   Op2TpetraRow(RCP<const Operator> Op)   {
 #if ((defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_OPENMP) || !defined(HAVE_TPETRA_INST_INT_INT))) || \
     (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT))))
@@ -799,9 +879,11 @@ namespace MueLu {
     (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT))))
             throw Exceptions::RuntimeError("Utilities::Transpose: Tpetra is not compiled with LO=GO=int. Add TPETRA_INST_INT_INT:BOOL=ON to your configuration!");
 #else
-            try {
+            using Helpers = Xpetra::Helpers<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+            /***************************************************************/
+            if(Helpers::isTpetraCrs(Op)) {
               const Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& tpetraOp = Utilities<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Op2TpetraCrs(Op);
-
+              
               // Compute the transpose A of the Tpetra matrix tpetraOp.
               RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > A;
               Tpetra::RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer(rcpFromRef(tpetraOp),label);
@@ -825,9 +907,43 @@ namespace MueLu {
 
               return AAAA;
             }
-            catch (std::exception& e) {
-              std::cout << "threw exception '" << e.what() << "'" << std::endl;
-              throw Exceptions::RuntimeError("Utilities::Transpose failed, perhaps because matrix is not a Crs matrix");
+              /***************************************************************/
+            else if(Helpers::isTpetraBlockCrs(Op)) {
+              using BCRS = Tpetra::BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
+              using CRS  = Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
+              const BCRS & tpetraOp = Utilities<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Op2TpetraBlockCrs(Op);
+
+              if(!Op.getRowMap()->getComm()->getRank())
+                std::cout<<"WARNING: Utilities::Transpose(): Using inefficient placeholder algorithm for Transpose"<<std::endl;
+
+              RCP<BCRS> At;
+              RCP<const CRS> Acrs = Tpetra::convertToCrsMatrix(tpetraOp);
+              {
+                Tpetra::RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer(Acrs,label);
+
+                using Teuchos::ParameterList;
+                using Teuchos::rcp;
+                RCP<ParameterList> transposeParams = params.is_null () ?
+                  rcp (new ParameterList) :
+                  rcp (new ParameterList (*params));
+                transposeParams->set ("sort", false);
+                RCP<CRS> Atcrs = transposer.createTranspose(transposeParams);
+                
+                At = Tpetra::convertToBlockCrsMatrix(*Atcrs,Op.GetStorageBlockSize());
+              }
+              RCP<Xpetra::TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > AA   = rcp(new Xpetra::TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(At));
+              RCP<CrsMatrix>                                                           AAA  = rcp_implicit_cast<CrsMatrix>(AA);
+              RCP<Matrix>                                                              AAAA = rcp( new CrsMatrixWrap(AAA));
+              
+              if (Op.IsView("stridedMaps"))
+                AAAA->CreateView("stridedMaps", Teuchos::rcpFromRef(Op), true/*doTranspose*/);
+
+              return AAAA;
+
+            }
+            /***************************************************************/
+            else {
+              throw Exceptions::RuntimeError("Utilities::Transpose failed, perhaps because matrix is not a Crs or BlockCrs matrix");
             }
 #endif
 #else
diff --git a/packages/muelu/src/Utils/MueLu_Utilities_def.hpp b/packages/muelu/src/Utils/MueLu_Utilities_def.hpp
index 469276531197..49a674aaa91e 100644
--- a/packages/muelu/src/Utils/MueLu_Utilities_def.hpp
+++ b/packages/muelu/src/Utils/MueLu_Utilities_def.hpp
@@ -300,6 +300,65 @@ namespace MueLu {
     }
   }
 
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  RCP<const Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >   Utilities<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Op2TpetraBlockCrs(RCP<const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > Op)  {
+    using XCrsMatrixWrap = Xpetra::CrsMatrixWrap<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+    // Get the underlying Tpetra Mtx
+    RCP<const XCrsMatrixWrap> crsOp = rcp_dynamic_cast<const XCrsMatrixWrap>(Op);
+    if (crsOp == Teuchos::null)
+      throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");
+    const RCP<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > &tmp_ECrsMtx = rcp_dynamic_cast<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >(crsOp->getCrsMatrix());
+    if (tmp_ECrsMtx == Teuchos::null)
+        throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
+    return tmp_ECrsMtx->getTpetra_BlockCrsMatrix();
+  }
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  RCP<      Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >   Utilities<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Op2NonConstTpetraBlockCrs(RCP<Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > Op){
+    using XCrsMatrixWrap = Xpetra::CrsMatrixWrap<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+    RCP<const XCrsMatrixWrap> crsOp = rcp_dynamic_cast<const XCrsMatrixWrap>(Op);
+    if (crsOp == Teuchos::null)
+      throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");
+    const RCP<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > &tmp_ECrsMtx = rcp_dynamic_cast<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >(crsOp->getCrsMatrix());
+    if (tmp_ECrsMtx == Teuchos::null)
+      throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
+    return tmp_ECrsMtx->getTpetra_BlockCrsMatrixNonConst();
+  }
+  
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  const Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&        Utilities<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Op2TpetraBlockCrs(const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op)   {
+    try {
+      using XCrsMatrixWrap = Xpetra::CrsMatrixWrap<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+      const XCrsMatrixWrap& crsOp = dynamic_cast<const XCrsMatrixWrap&>(Op);
+      try {
+        const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& tmp_ECrsMtx = dynamic_cast<const Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&>(*crsOp.getCrsMatrix());
+        return *tmp_ECrsMtx.getTpetra_BlockCrsMatrix();
+      } catch (std::bad_cast&) {
+        throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
+      }
+    } catch (std::bad_cast&) {
+      throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");
+    }
+  }
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&        Utilities<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Op2NonConstTpetraBlockCrs(Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Op) {
+    try {
+      using XCrsMatrixWrap = Xpetra::CrsMatrixWrap<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+      XCrsMatrixWrap& crsOp = dynamic_cast<XCrsMatrixWrap&>(Op);
+      try {
+        Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>& tmp_ECrsMtx = dynamic_cast<Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>&>(*crsOp.getCrsMatrix());
+        return *tmp_ECrsMtx.getTpetra_BlockCrsMatrixNonConst();
+      } catch (std::bad_cast&) {
+        throw Exceptions::BadCast("Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
+      }
+    } catch (std::bad_cast&) {
+        throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed");
+    }    
+  }
+
+
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   RCP<const Tpetra::RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Utilities<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Op2TpetraRow(RCP<const Xpetra::Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Op) {
     RCP<const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> > mat = rcp_dynamic_cast<const Xpetra::Matrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >(Op);
@@ -498,12 +557,14 @@ namespace MueLu {
 
 #ifdef HAVE_MUELU_TPETRA
     if (TorE == "tpetra") {
-      try {
+      using Helpers = Xpetra::Helpers<Scalar,LocalOrdinal,GlobalOrdinal,Node>;
+      /***************************************************************/
+      if(Helpers::isTpetraCrs(Op)) {
         const Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& tpetraOp = Utilities<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Op2TpetraCrs(Op);
-
+        
         RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > A;
         Tpetra::RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer(rcpFromRef(tpetraOp),label); //more than meets the eye
-
+        
         {
           using Teuchos::ParameterList;
           using Teuchos::rcp;
@@ -513,20 +574,53 @@ namespace MueLu {
           transposeParams->set ("sort", false);
           A = transposer.createTranspose (transposeParams);
         }
-
+        
         RCP<Xpetra::TpetraCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > AA   = rcp(new Xpetra::TpetraCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(A) );
         RCP<Xpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >       AAA  = rcp_implicit_cast<Xpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >(AA);
         RCP<Xpetra::Matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >          AAAA = rcp( new Xpetra::CrsMatrixWrap<Scalar, LocalOrdinal, GlobalOrdinal, Node>(AAA) );
         if (!AAAA->isFillComplete())
           AAAA->fillComplete(Op.getRangeMap(), Op.getDomainMap());
-
+        
         if (Op.IsView("stridedMaps"))
           AAAA->CreateView("stridedMaps", Teuchos::rcpFromRef(Op), true/*doTranspose*/);
-
+        
         return AAAA;
-
-      } catch (std::exception& e) {
-        std::cout << "threw exception '" << e.what() << "'" << std::endl;
+      }
+      else if(Helpers::isTpetraBlockCrs(Op)) {
+        using XMatrix = Xpetra::Matrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
+        using XCrsMatrix = Xpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
+        using XCrsMatrixWrap = Xpetra::CrsMatrixWrap<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
+        using BCRS = Tpetra::BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
+        using CRS  = Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
+        const BCRS & tpetraOp = Utilities<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Op2TpetraBlockCrs(Op);
+        
+        if(!Op.getRowMap()->getComm()->getRank())
+          std::cout<<"WARNING: Utilities::Transpose(): Using inefficient placeholder algorithm for Transpose"<<std::endl;
+
+        RCP<BCRS> At;
+        RCP<const CRS> Acrs = Tpetra::convertToCrsMatrix(tpetraOp);
+        {
+          Tpetra::RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer(Acrs,label);
+          
+          using Teuchos::ParameterList;
+          using Teuchos::rcp;
+          RCP<ParameterList> transposeParams = params.is_null () ?
+            rcp (new ParameterList) :
+            rcp (new ParameterList (*params));
+          transposeParams->set ("sort", false);
+          RCP<CRS> Atcrs = transposer.createTranspose(transposeParams);
+          
+          At = Tpetra::convertToBlockCrsMatrix(*Atcrs,Op.GetStorageBlockSize());
+        }
+        RCP<Xpetra::TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > AA   = rcp(new Xpetra::TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(At));
+        RCP<XCrsMatrix>                                                           AAA  = rcp_implicit_cast<XCrsMatrix>(AA);
+        RCP<XMatrix>                                                              AAAA = rcp( new XCrsMatrixWrap(AAA));
+        
+        if (Op.IsView("stridedMaps"))
+          AAAA->CreateView("stridedMaps", Teuchos::rcpFromRef(Op), true/*doTranspose*/);
+        
+        return AAAA;
+      } else {
         throw Exceptions::RuntimeError("Utilities::Transpose failed, perhaps because matrix is not a Crs matrix");
       }
     } //if
diff --git a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp
index 90799c2cd562..7d379bc68259 100644
--- a/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp
+++ b/packages/muelu/src/Utils/MueLu_Utilities_kokkos_def.hpp
@@ -368,47 +368,93 @@ namespace MueLu {
     using impl_scalar_type = typename Kokkos::ArithTraits<SC>::val_type;
     using ATS        = Kokkos::ArithTraits<impl_scalar_type>;
     using range_type = Kokkos::RangePolicy<LO, typename NO::execution_space>;
+    using helpers    = Xpetra::Helpers<SC,LO,GO,NO>;
 
-    auto localMatrix = A.getLocalMatrixDevice();
-    LO   numRows     = A.getLocalNumRows();
 
-    Kokkos::View<bool*, typename NO::device_type> boundaryNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows);
-    if (count_twos_as_dirichlet)
-      Kokkos::parallel_for("MueLu:Utils::DetectDirichletRows_Twos_As_Dirichlet", range_type(0,numRows),
+    if(helpers::isTpetraBlockCrs(A)) {
+#ifdef HAVE_MUELU_TPETRA
+      const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & Am = helpers::Op2TpetraBlockCrs(A);
+      auto b_graph      = Am.getCrsGraph().getLocalGraphDevice();
+      auto b_rowptr     = Am.getCrsGraph().getLocalRowPtrsDevice();
+      auto values       = Am.getValuesDevice();
+      LO   numBlockRows = Am.getLocalNumRows();      
+      const LO stride   = Am.getBlockSize() * Am.getBlockSize();
+
+      Kokkos::View<bool*, typename NO::device_type> boundaryNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numBlockRows);      
+
+      if (count_twos_as_dirichlet)
+        throw Exceptions::RuntimeError("BlockCrs does not support counting twos as Dirichlet");
+
+      Kokkos::parallel_for("MueLu:Utils::DetectDirichletRowsBlockCrs", range_type(0,numBlockRows),
                            KOKKOS_LAMBDA(const LO row) {
-                             auto rowView = localMatrix.row(row);
+                             auto rowView = b_graph.rowConst(row);
                              auto length  = rowView.length;
+                             LO valstart  = b_rowptr[row] * stride;
 
                              boundaryNodes(row) = true;
-                             if (length > 2) {
-                               decltype(length) colID = 0;
-                               for (; colID < length; colID++)
-                                 if ((rowView.colidx(colID) != row) &&
-                                     (ATS::magnitude(rowView.value(colID)) > tol)) {
-                                   if (!boundaryNodes(row))
+                             decltype(length) colID =0;
+                             for (; colID < length; colID++) {
+                               if (rowView.colidx(colID) != row) {
+                                 LO current = valstart + colID*stride;
+                                 for(LO k=0; k<stride; k++) {
+                                   if (ATS::magnitude(values[current+ k]) > tol) {
+                                     boundaryNodes(row) = false;
                                      break;
-                                   boundaryNodes(row) = false;
+                                   }
                                  }
-                               if (colID == length)
-                                 boundaryNodes(row) = true;
+                               }
+                               if(boundaryNodes(row) == false)
+                                 break;
                              }
                            });
-    else
-      Kokkos::parallel_for("MueLu:Utils::DetectDirichletRows", range_type(0,numRows),
-                           KOKKOS_LAMBDA(const LO row) {
-                             auto rowView = localMatrix.row(row);
-                             auto length  = rowView.length;
 
-                             boundaryNodes(row) = true;
-                             for (decltype(length) colID = 0; colID < length; colID++)
-                               if ((rowView.colidx(colID) != row) &&
-                                   (ATS::magnitude(rowView.value(colID)) > tol)) {
-                                 boundaryNodes(row) = false;
-                                 break;
+      return boundaryNodes;
+#else 
+      throw Exceptions::RuntimeError("BlockCrs requires Tpetra");
+#endif
+    } 
+    else {
+      auto localMatrix = A.getLocalMatrixDevice();
+      LO   numRows     = A.getLocalNumRows();
+      Kokkos::View<bool*, typename NO::device_type> boundaryNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows);      
+
+      if (count_twos_as_dirichlet)
+        Kokkos::parallel_for("MueLu:Utils::DetectDirichletRows_Twos_As_Dirichlet", range_type(0,numRows),
+                             KOKKOS_LAMBDA(const LO row) {
+                               auto rowView = localMatrix.row(row);
+                               auto length  = rowView.length;
+                               
+                               boundaryNodes(row) = true;
+                               if (length > 2) {
+                                 decltype(length) colID =0;
+                                 for ( ; colID < length; colID++)
+                                   if ((rowView.colidx(colID) != row) &&
+                                       (ATS::magnitude(rowView.value(colID)) > tol)) {
+                                     if (!boundaryNodes(row))
+                                       break;
+                                     boundaryNodes(row) = false;
+                                   }
+                                 if (colID == length)
+                                   boundaryNodes(row) = true;
                                }
-                           });
-
+                             });
+      else
+        Kokkos::parallel_for("MueLu:Utils::DetectDirichletRows", range_type(0,numRows),
+                             KOKKOS_LAMBDA(const LO row) {
+                               auto rowView = localMatrix.row(row);
+                               auto length  = rowView.length;
+                               
+                               boundaryNodes(row) = true;
+                               for (decltype(length) colID = 0; colID < length; colID++)
+                                 if ((rowView.colidx(colID) != row) &&
+                                     (ATS::magnitude(rowView.value(colID)) > tol)) {
+                                   boundaryNodes(row) = false;
+                                   break;
+                                 }
+                             });
     return boundaryNodes;
+    }
+
   }
 
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
diff --git a/packages/muelu/test/unit_tests/CMakeLists.txt b/packages/muelu/test/unit_tests/CMakeLists.txt
index 122d6bd99c6c..e1d22c25e242 100644
--- a/packages/muelu/test/unit_tests/CMakeLists.txt
+++ b/packages/muelu/test/unit_tests/CMakeLists.txt
@@ -445,8 +445,9 @@ ENDIF()
 
 
 ADD_SUBDIRECTORY(ParameterList/FactoryFactory/)
+ADD_SUBDIRECTORY(ParameterList/ParameterListInterpreter/)
+
 IF (${PACKAGE_NAME}_ENABLE_Epetra)
   ADD_SUBDIRECTORY(ParameterList/MLParameterListInterpreter/)
-  ADD_SUBDIRECTORY(ParameterList/ParameterListInterpreter/)
   ADD_SUBDIRECTORY(ParameterList/CreateSublists/)
 ENDIF()
diff --git a/packages/muelu/test/unit_tests/Hierarchy.cpp b/packages/muelu/test/unit_tests/Hierarchy.cpp
index 1a8c92841ed7..6132086accfb 100644
--- a/packages/muelu/test/unit_tests/Hierarchy.cpp
+++ b/packages/muelu/test/unit_tests/Hierarchy.cpp
@@ -1672,7 +1672,7 @@ namespace MueLuTests {
 
   }
 
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Hierarchy, BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Hierarchy, BlockCrs_Mixed, Scalar, LocalOrdinal, GlobalOrdinal, Node)
   {
 #   include <MueLu_UseShortNames.hpp>
     MUELU_TESTING_SET_OSTREAM;
@@ -1789,6 +1789,7 @@ namespace MueLuTests {
     TEST_EQUALITY(0,0);
   }
 
+
   TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Hierarchy, CheckNullspaceDimension, Scalar, LocalOrdinal, GlobalOrdinal, Node)
   {
     // Test that HierarchyManager throws if user-supplied nullspace has dimension smaller than numPDEs
@@ -1835,7 +1836,7 @@ namespace MueLuTests {
     TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, SetupHierarchy3levelFacManagers, Scalar, LO, GO, Node) \
     TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, SetupHierarchyTestBreakCondition, Scalar, LO, GO, Node) \
     TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, Write, Scalar, LO, GO, Node) \
-    TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, BlockCrs, Scalar, LO, GO, Node) \
+    TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, BlockCrs_Mixed, Scalar, LO, GO, Node) \
     TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Hierarchy, CheckNullspaceDimension, Scalar, LO, GO, Node) \
     
 
diff --git a/packages/muelu/test/unit_tests/MueLu_TestHelpers.hpp b/packages/muelu/test/unit_tests/MueLu_TestHelpers.hpp
index 4bd0c652af14..a6f3e9af16c9 100644
--- a/packages/muelu/test/unit_tests/MueLu_TestHelpers.hpp
+++ b/packages/muelu/test/unit_tests/MueLu_TestHelpers.hpp
@@ -881,10 +881,24 @@ namespace MueLuTests {
          basematrix[4] = two;
          basematrix[7] = three;
          basematrix[8] = two;
+         Teuchos::Array<Scalar> offmatrix(blocksize*blocksize, zero);
+         offmatrix[0]=offmatrix[4]=offmatrix[8]=-1;
+
          Teuchos::Array<LocalOrdinal> lclColInds(1);
          for (LocalOrdinal lclRowInd = meshRowMap.getMinLocalIndex (); lclRowInd <= meshRowMap.getMaxLocalIndex(); ++lclRowInd) {
            lclColInds[0] = lclRowInd;
            bcrsmatrix->replaceLocalValues(lclRowInd, lclColInds.getRawPtr(), &basematrix[0], 1);
+           
+           // Off diagonals
+           if(lclRowInd > meshRowMap.getMinLocalIndex ()) {
+             lclColInds[0] = lclRowInd - 1;
+             bcrsmatrix->replaceLocalValues(lclRowInd, lclColInds.getRawPtr(), &offmatrix[0], 1);
+           }
+           if(lclRowInd < meshRowMap.getMaxLocalIndex ()) {
+             lclColInds[0] = lclRowInd + 1;
+             bcrsmatrix->replaceLocalValues(lclRowInd, lclColInds.getRawPtr(), &offmatrix[0], 1);
+           }
+
          }
 
          RCP<Xpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > temp = rcp(new Xpetra::TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(bcrsmatrix));
diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter.cpp b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter.cpp
index a0f0ada4b9b7..90639e98e0d9 100644
--- a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter.cpp
+++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter.cpp
@@ -43,14 +43,23 @@
 // ***********************************************************************
 //
 // @HEADER
-
 #include <Teuchos_UnitTestHarness.hpp>
+#include <Teuchos_ParameterList.hpp>
 #include <Teuchos_XMLParameterListHelpers.hpp>
 
 #include <MueLu_TestHelpers.hpp>
 
 #include <MueLu_ParameterListInterpreter.hpp>
 #include <MueLu_Exceptions.hpp>
+#include <MueLu_Utilities.hpp>
+
+#include <Xpetra_MatrixMatrix.hpp>
+
+#ifdef HAVE_MUELU_TPETRA
+#include "Tpetra_BlockCrsMatrix_Helpers.hpp"
+#include "TpetraExt_MatrixMatrix.hpp"
+#endif
+
 
 namespace MueLuTests {
 
@@ -67,6 +76,14 @@ namespace MueLuTests {
     ArrayRCP<std::string> fileList = TestHelpers::GetFileList(std::string("ParameterList/ParameterListInterpreter/"), std::string(".xml"));
 
     for(int i=0; i< fileList.size(); i++) {
+      // Ignore files with "BlockCrs" in their name
+      auto found = fileList[i].find("BlockCrs");
+      if(found != std::string::npos) continue;
+
+      // Ignore files with "Comparison" in their name
+      found = fileList[i].find("Comparison");
+      if(found != std::string::npos) continue;
+
       out << "Processing file: " << fileList[i] << std::endl;
       ParameterListInterpreter mueluFactory("ParameterList/ParameterListInterpreter/" + fileList[i],*comm);
 
@@ -83,8 +100,178 @@ namespace MueLuTests {
     out << "Skipping test because some required packages are not enabled (Tpetra, Epetra, EpetraExt, Ifpack, Ifpack2, Amesos, Amesos2)." << std::endl;
 #   endif
   }
+
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ParameterListInterpreter, BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+  {
+#   include <MueLu_UseShortNames.hpp>
+    MUELU_TESTING_SET_OSTREAM;
+    MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node);
+#if defined(HAVE_MUELU_TPETRA)
+    MUELU_TEST_ONLY_FOR(Xpetra::UseTpetra) {
+      Teuchos::ParameterList matrixParams;
+      matrixParams.set("matrixType","Laplace1D");
+      matrixParams.set("nx",(GlobalOrdinal)300);// needs to be even
+
+      RCP<Matrix> A = TestHelpers::TpetraTestFactory<SC, LO, GO, NO>::BuildBlockMatrix(matrixParams,Xpetra::UseTpetra);  
+      out<<"Matrix Size (block) = "<<A->getGlobalNumRows()<<" (point) "<<A->getRangeMap()->getGlobalNumElements()<<std::endl;
+      RCP<const Teuchos::Comm<int> > comm = TestHelpers::Parameters::getDefaultComm();
+      
+      ArrayRCP<std::string> fileList = TestHelpers::GetFileList(std::string("ParameterList/ParameterListInterpreter/"), std::string(".xml"));
+      
+      for(int i=0; i< fileList.size(); i++) {
+        // Only run files with "BlockCrs" in their name
+        auto found = fileList[i].find("BlockCrs");
+        if(found == std::string::npos) continue;
+
+        out << "Processing file: " << fileList[i] << std::endl;
+        ParameterListInterpreter mueluFactory("ParameterList/ParameterListInterpreter/" + fileList[i],*comm);
+        
+        RCP<Hierarchy> H = mueluFactory.CreateHierarchy();
+        H->GetLevel(0)->Set("A", A);
+        
+        mueluFactory.SetupHierarchy(*H);
+
+        // Test to make sure all of the matrices in the Hierarchy are actually Block Matrices
+        using helpers = Xpetra::Helpers<Scalar,LocalOrdinal,GlobalOrdinal,Node>;        
+        for(int j=0; j<H->GetNumLevels(); j++) {
+          RCP<Level> level = H->GetLevel(j);
+
+          RCP<Matrix> Am = level->Get<RCP<Matrix> >("A");
+          TEST_EQUALITY(helpers::isTpetraBlockCrs(Am),true);
+          if(j>0) {
+            RCP<Matrix> P = level->Get<RCP<Matrix> >("P");
+            TEST_EQUALITY(helpers::isTpetraBlockCrs(P),true);
+            RCP<Matrix> R = level->Get<RCP<Matrix> >("R");
+            TEST_EQUALITY(helpers::isTpetraBlockCrs(R),true);          
+          }
+        }
+        
+        //TODO: check no unused parameters
+        //TODO: check results of Iterate()
+      }
+    }
+#   endif
+    TEST_EQUALITY(1,1);
+  }
+
+
+
+#if defined(HAVE_MUELU_TPETRA)
+template<class Matrix,class MT>
+MT compare_matrices(RCP<Matrix> & Ap, RCP<Matrix> &Ab) {
+  using SC = typename Matrix::scalar_type;
+  using LO = typename Matrix::local_ordinal_type;
+  using GO = typename Matrix::global_ordinal_type;
+  using NO = typename Matrix::node_type;
+  using CRS=Tpetra::CrsMatrix<SC,LO,GO,NO>;
+  SC one   = Teuchos::ScalarTraits<SC>::one();
+  SC zero  = Teuchos::ScalarTraits<SC>::zero();
+
+  RCP<const CRS> Ap_t = MueLu::Utilities<SC,LO,GO,NO>::Op2TpetraCrs(Ap);
+  auto Ab_t = MueLu::Utilities<SC,LO,GO,NO>::Op2TpetraBlockCrs(Ab);
+  RCP<CRS> Ab_as_point = Tpetra::convertToCrsMatrix<SC,LO,GO,NO>(*Ab_t);  
+
+  RCP<CRS> diff = rcp(new CRS(Ap_t->getCrsGraph()));
+  diff->setAllToScalar(zero);
+  diff->fillComplete();
+  Tpetra::MatrixMatrix::Add<SC,LO,GO,NO>(*Ap_t,false,one,*Ab_as_point,false,-one,diff);
+  return diff->getFrobeniusNorm();
+}
+#endif
+
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(ParameterListInterpreter, PointCrs_vs_BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+  {
+#   include <MueLu_UseShortNames.hpp>
+    MUELU_TESTING_SET_OSTREAM;
+    MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node);
+#if defined(HAVE_MUELU_TPETRA)
+    MUELU_TEST_ONLY_FOR(Xpetra::UseTpetra) {
+      Teuchos::ParameterList matrixParams;
+      matrixParams.set("matrixType","Laplace1D");
+      matrixParams.set("nx",(GlobalOrdinal)300);// needs to be even
+      
+      RCP<Matrix> PointA = TestHelpers::TestFactory<SC, LO, GO, NO>::BuildMatrix(matrixParams,Xpetra::UseTpetra);  
+      RCP<Matrix> BlockA;
+      {
+        using XCRS = Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO>;
+
+        auto tA = MueLu::Utilities<SC,LO,GO,NO>::Op2TpetraCrs(PointA);
+        auto bA = Tpetra::convertToBlockCrsMatrix<SC,LO,GO,NO>(*tA,1);      
+        RCP<XCRS> AA   = rcp(new XCRS(bA));
+        BlockA = rcp(new CrsMatrixWrap(rcp_implicit_cast<CrsMatrix>(AA)));
+      }
+
+      out<<"Point: Matrix Size (block) = "<<PointA->getGlobalNumRows()<<" (point) "<<PointA->getRangeMap()->getGlobalNumElements()<<std::endl;
+      out<<"Block: Matrix Size (block) = "<<BlockA->getGlobalNumRows()<<" (point) "<<BlockA->getRangeMap()->getGlobalNumElements()<<std::endl;
+      RCP<const Teuchos::Comm<int> > comm = TestHelpers::Parameters::getDefaultComm();
+      
+      ArrayRCP<std::string> fileList = TestHelpers::GetFileList(std::string("ParameterList/ParameterListInterpreter/"), std::string(".xml"));
+      
+      for(int i=0; i< fileList.size(); i++) {
+        // Only run files with "Comparison" in their name
+        auto found = fileList[i].find("Comparison");
+        if(found == std::string::npos) continue;
+        
+        out << "Processing file: " << fileList[i] << std::endl;
+        
+        // Point Hierarchy
+        ParameterListInterpreter mueluFactory1("ParameterList/ParameterListInterpreter/" + fileList[i],*comm);
+        RCP<Hierarchy> PointH = mueluFactory1.CreateHierarchy();
+        PointH->GetLevel(0)->Set("A", PointA);       
+        mueluFactory1.SetupHierarchy(*PointH);
+
+        // Block Hierachy
+        ParameterListInterpreter mueluFactory2("ParameterList/ParameterListInterpreter/" + fileList[i],*comm);
+        RCP<Hierarchy> BlockH = mueluFactory2.CreateHierarchy();
+        BlockH->GetLevel(0)->Set("A", BlockA);       
+        mueluFactory2.SetupHierarchy(*BlockH);
+
+        // Check to see that we get the same matrices in both hierarchies
+        TEST_EQUALITY(PointH->GetNumLevels(),BlockH->GetNumLevels());
+
+        for(int j=0; j<PointH->GetNumLevels(); j++) {
+          using CRS=Tpetra::CrsMatrix<SC,LO,GO,NO>;
+          using MT  = typename Teuchos::ScalarTraits<SC>::magnitudeType;
+          MT tol = Teuchos::ScalarTraits<MT>::squareroot(Teuchos::ScalarTraits<MT>::eps());
+         
+          RCP<Level> Plevel = PointH->GetLevel(j);
+          RCP<Level> Blevel = BlockH->GetLevel(j);
+
+          // Compare A
+          RCP<Matrix> Ap = Plevel->Get<RCP<Matrix> >("A");
+          RCP<Matrix> Ab = Blevel->Get<RCP<Matrix> >("A");
+          MT norm = compare_matrices<Matrix,MT>(Ap,Ab);
+          TEUCHOS_TEST_COMPARE(norm,<,tol,out,success);         
+
+          // Compare P, R
+          if(j>0) {
+            RCP<Matrix> Pp = Plevel->Get<RCP<Matrix> >("P");
+            RCP<Matrix> Pb = Blevel->Get<RCP<Matrix> >("P");
+            norm = compare_matrices<Matrix,MT>(Pp,Pb);
+            TEUCHOS_TEST_COMPARE(norm,<,tol,out,success);         
+
+            RCP<Matrix> Rp = Plevel->Get<RCP<Matrix> >("R");
+            RCP<Matrix> Rb = Blevel->Get<RCP<Matrix> >("R");
+            norm = compare_matrices<Matrix,MT>(Rp,Rb);
+            TEUCHOS_TEST_COMPARE(norm,<,tol,out,success);         
+          }
+        }
+
+        //TODO: check no unused parameters
+        //TODO: check results of Iterate()
+      }
+    }
+#   endif
+    TEST_EQUALITY(1,1);
+  }
+
+
 #define MUELU_ETI_GROUP(Scalar, LocalOrdinal, GlobalOrdinal, Node) \
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ParameterListInterpreter, SetParameterList, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ParameterListInterpreter, SetParameterList, Scalar, LocalOrdinal, GlobalOrdinal, Node) \
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ParameterListInterpreter, BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node) \
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(ParameterListInterpreter, PointCrs_vs_BlockCrs, Scalar, LocalOrdinal, GlobalOrdinal, Node)
 
 #include <MueLu_ETI_4arg.hpp>
 
diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml
new file mode 100644
index 000000000000..d048ab0d38d8
--- /dev/null
+++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml
@@ -0,0 +1,28 @@
+<ParameterList name="MueLu">
+
+  <!--
+    For a generic symmetric scalar problem, these are the recommended settings for MueLu.
+  -->
+
+  <!-- ===========  GENERAL ================ -->
+    <Parameter        name="verbosity"                            type="string"   value="high"/>
+
+    <Parameter        name="coarse: max size"                     type="int"      value="50"/>
+
+    <Parameter        name="multigrid algorithm"                  type="string"   value="unsmoothed"/>
+
+    <Parameter        name="max levels"                	          type="int"      value="3"/>
+
+    <!-- This is hardwired in the test generator -->
+    <Parameter        name="number of equations"                  type="int"      value="3"/>
+
+    <Parameter        name="smoother: type"                       type="string"  value="RELAXATION"/>
+    <Parameter        name="coarse: type"                         type="string"  value="RELAXATION"/>
+
+    <!-- end of default values -->
+
+    <!-- ===========  REPARTITIONING  =========== -->
+    <Parameter        name="repartition: enable"                  type="bool"     value="false"/>
+    <!-- end of default values -->
+
+</ParameterList>
diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml
new file mode 100644
index 000000000000..5954af0d9af4
--- /dev/null
+++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml
@@ -0,0 +1,32 @@
+<ParameterList name="MueLu">
+
+  <!--
+    For a generic symmetric scalar problem, these are the recommended settings for MueLu.
+  -->
+
+  <!-- ===========  GENERAL ================ -->
+    <Parameter        name="verbosity"                            type="string"   value="high"/>
+
+    <Parameter        name="coarse: max size"                     type="int"      value="50"/>
+
+    <Parameter        name="multigrid algorithm"                  type="string"   value="unsmoothed"/>
+
+    <Parameter        name="max levels"                	          type="int"      value="3"/>
+
+    <!-- This is hardwired in the test generator -->
+    <Parameter        name="number of equations"                  type="int"      value="3"/>
+
+    <Parameter        name="smoother: type"                       type="string"  value="RELAXATION"/>
+    <Parameter        name="coarse: type"                         type="string"  value="RELAXATION"/>
+
+
+    <!-- What makes this different from BlockCrs1.xml -->
+    <Parameter        name="rap: triple product"                  type="bool"      value="false"/>
+
+    <!-- end of default values -->
+
+    <!-- ===========  REPARTITIONING  =========== -->
+    <Parameter        name="repartition: enable"                  type="bool"     value="false"/>
+    <!-- end of default values -->
+
+</ParameterList>
diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/CMakeLists.txt b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/CMakeLists.txt
index 8e56cda05351..7023acbef07c 100644
--- a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/CMakeLists.txt
+++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/CMakeLists.txt
@@ -3,6 +3,8 @@
 # regenerate a build system incorporating the new file.
 # YOU MUST ALSO TOUCH A CMAKE CONFIGURATION FILE WHEN YOU PUSH THE NEW
 # FILE TO FORCE THE RECONFIGURE ON OTHER PEOPLE'S BUILDS.
+
+
 FILE(GLOB xmlFiles RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.xml)
 
 TRIBITS_COPY_FILES_TO_BINARY_DIR(ParameterList_ParameterListInterpreter_cp
diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/Comparison1.xml b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/Comparison1.xml
new file mode 100644
index 000000000000..5432a0dbdc37
--- /dev/null
+++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/Comparison1.xml
@@ -0,0 +1,27 @@
+<ParameterList name="MueLu">
+
+  <!--
+    For a generic symmetric scalar problem, these are the recommended settings for MueLu.
+  -->
+
+  <!-- ===========  GENERAL ================ -->
+    <Parameter        name="verbosity"                            type="string"   value="high"/>
+
+    <Parameter        name="coarse: max size"                     type="int"      value="50"/>
+
+    <Parameter        name="multigrid algorithm"                  type="string"   value="unsmoothed"/>
+
+    <Parameter        name="max levels"                	          type="int"      value="3"/>
+
+    <Parameter        name="number of equations"                  type="int"      value="1"/>
+
+    <Parameter        name="smoother: type"                       type="string"  value="RELAXATION"/>
+    <Parameter        name="coarse: type"                         type="string"  value="RELAXATION"/>
+
+    <!-- end of default values -->
+
+    <!-- ===========  REPARTITIONING  =========== -->
+    <Parameter        name="repartition: enable"                  type="bool"     value="false"/>
+    <!-- end of default values -->
+
+</ParameterList>
diff --git a/packages/muelu/test/unit_tests/Smoothers/Ifpack2Smoother.cpp b/packages/muelu/test/unit_tests/Smoothers/Ifpack2Smoother.cpp
index eb8d11f54ba0..81bcf614654e 100644
--- a/packages/muelu/test/unit_tests/Smoothers/Ifpack2Smoother.cpp
+++ b/packages/muelu/test/unit_tests/Smoothers/Ifpack2Smoother.cpp
@@ -567,7 +567,7 @@ namespace MueLuTests {
   } // banded
 
 
- TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Ifpack2Smoother, BlockCrsMatrix_Relaxation, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Ifpack2Smoother, BlockCrsMatrix_Relaxation_ViaPoint, Scalar, LocalOrdinal, GlobalOrdinal, Node)
   {
 #   include <MueLu_UseShortNames.hpp>
     MUELU_TESTING_SET_OSTREAM;
@@ -592,6 +592,32 @@ namespace MueLuTests {
     }
   }
 
+ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Ifpack2Smoother, BlockCrsMatrix_Relaxation_AsBlock, Scalar, LocalOrdinal, GlobalOrdinal, Node)
+  {
+#   include <MueLu_UseShortNames.hpp>
+    MUELU_TESTING_SET_OSTREAM;
+    MUELU_TESTING_LIMIT_SCOPE(Scalar,GlobalOrdinal,Node);
+
+    MUELU_TEST_ONLY_FOR(Xpetra::UseTpetra) {
+      Teuchos::ParameterList matrixParams, ifpack2Params;
+
+      matrixParams.set("matrixType","Laplace1D");
+      matrixParams.set("nx",(GlobalOrdinal)20);// needs to be even
+
+      RCP<Matrix> A = TestHelpers::TpetraTestFactory<SC, LO, GO, NO>::BuildBlockMatrix(matrixParams,Xpetra::UseTpetra);     
+      ifpack2Params.set("smoother: use blockcrsmatrix storage",true);
+      
+      Ifpack2Smoother smoother("RELAXATION",ifpack2Params);
+      
+      Level level; TestHelpers::TestFactory<SC,LO,GO,NO>::createSingleLevelHierarchy(level);
+      level.Set("A", A);
+      smoother.Setup(level);
+
+      TEST_EQUALITY(1,1);
+    }
+  }
+
+
 
 
 #define MUELU_ETI_GROUP(SC,LO,GO,NO) \
@@ -605,7 +631,8 @@ namespace MueLuTests {
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BandedRelaxation,SC,LO,GO,NO) \
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,TriDiRelaxation,SC,LO,GO,NO) \
   TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BlockRelaxation_Autosize,SC,LO,GO,NO) \
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BlockCrsMatrix_Relaxation,SC,LO,GO,NO)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BlockCrsMatrix_Relaxation_ViaPoint,SC,LO,GO,NO) \
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(Ifpack2Smoother,BlockCrsMatrix_Relaxation_AsBlock,SC,LO,GO,NO)
 
 #include <MueLu_ETI_4arg.hpp>
 
diff --git a/packages/xpetra/src/BlockedCrsMatrix/Xpetra_BlockedCrsMatrix.hpp b/packages/xpetra/src/BlockedCrsMatrix/Xpetra_BlockedCrsMatrix.hpp
index bb9d38caf18b..4c7893e0734b 100644
--- a/packages/xpetra/src/BlockedCrsMatrix/Xpetra_BlockedCrsMatrix.hpp
+++ b/packages/xpetra/src/BlockedCrsMatrix/Xpetra_BlockedCrsMatrix.hpp
@@ -1520,6 +1520,9 @@ namespace Xpetra {
       return thbOp;
     }
 #endif
+    //! Returns the block size of the storage mechanism
+    LocalOrdinal GetStorageBlockSize() const {return 1;}
+
 
     //! Compute a residual R = B - (*this) * X
     void residual(const MultiVector & X,
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp
index 407f1d45f31a..c0806e943ba8 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrix.hpp
@@ -352,6 +352,9 @@ namespace Xpetra {
     //! Does this have an underlying matrix
     virtual bool hasMatrix() const = 0;
 
+    //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix
+    virtual LocalOrdinal GetStorageBlockSize() const = 0;
+
     //! Compute a residual R = B - (*this) * X
     virtual void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X,
                           const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B,
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrixFactory.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrixFactory.hpp
index d7ad7d410d61..9a81d816f02b 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrixFactory.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_CrsMatrixFactory.hpp
@@ -52,6 +52,7 @@
 
 #ifdef HAVE_XPETRA_TPETRA
 #include "Xpetra_TpetraCrsMatrix.hpp"
+#include "Xpetra_TpetraBlockCrsMatrix.hpp"
 #endif
 
 #ifdef HAVE_XPETRA_EPETRA
@@ -289,6 +290,25 @@ namespace Xpetra {
       XPETRA_FACTORY_END;
     }
 #endif
+    
+    // Builds a BlockCrsMatrix
+    static RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > BuildBlock (
+        const Teuchos::RCP<const Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >& blockGraph,
+        const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMap,
+        const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& rangeMap,
+        LocalOrdinal blockSize) {
+  
+      XPETRA_MONITOR("CrsMatrixFactory::BuildBlock");
+
+#ifdef HAVE_XPETRA_TPETRA
+      if (domainMap->lib() == UseTpetra) {
+        return rcp(new Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>(blockGraph,domainMap,rangeMap,blockSize) );
+      }
+#endif
+      TEUCHOS_TEST_FOR_EXCEPTION(domainMap->lib() == UseEpetra, std::logic_error, "Epetra doesn't support this matrix constructor");
+
+      XPETRA_FACTORY_END;
+    }
 
   };
 
@@ -318,10 +338,9 @@ namespace Xpetra {
       if (rowMap->lib() == UseTpetra)
         return rcp( new TpetraCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(rowMap, 0) );
 #endif
-#ifdef HAVE_XPETRA_EPETRA
       if(rowMap->lib() == UseEpetra)
         return rcp( new EpetraCrsMatrixT<int,Node>(rowMap));
-#endif
+
       XPETRA_FACTORY_END;
     }
 
@@ -543,6 +562,23 @@ namespace Xpetra {
     }
 #endif
 
+    //! Build a BlockCrsMatrix
+    static RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > BuildBlock (
+        const Teuchos::RCP<const Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >& blockGraph,
+        const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMap,
+        const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& rangeMap,
+        LocalOrdinal blockSize) {
+  
+      XPETRA_MONITOR("CrsMatrixFactory::BuildBlock");
+#ifdef HAVE_XPETRA_TPETRA
+      if (domainMap->lib() == UseTpetra)
+        return rcp(new Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>(blockGraph,domainMap,rangeMap,blockSize) );
+#endif
+      TEUCHOS_TEST_FOR_EXCEPTION(domainMap->lib() == UseEpetra, std::logic_error, "Epetra doesn't support this matrix constructor");
+
+      XPETRA_FACTORY_END;
+    }
+
   };
 #endif
 
@@ -772,6 +808,28 @@ namespace Xpetra {
     }
 #endif
 
+
+    //! Build a BlockCrsMatrix
+    static RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > BuildBlock (
+        const Teuchos::RCP<const Xpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >& blockGraph,
+        const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMap,
+        const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& rangeMap,
+        LocalOrdinal blockSize) {
+  
+      XPETRA_MONITOR("CrsMatrixFactory::BuildBlock");
+
+#ifdef HAVE_XPETRA_TPETRA
+      if (domainMap->lib() == UseTpetra) {
+        return rcp(new Xpetra::TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>(blockGraph,domainMap,rangemap,blockSize) );
+      }
+#endif
+      TEUCHOS_TEST_FOR_EXCEPTION(domainMap->lib() == UseEpetra, std::logic_error, "Epetra doesn't support this matrix constructor");
+
+      XPETRA_FACTORY_END;
+    }
+
+
+
   };
 #endif
 
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp
index 9b9321049333..42dc9b4d390a 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_EpetraCrsMatrix.hpp
@@ -258,6 +258,9 @@ local_matrix_type getLocalMatrixDevice () const {
     TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError,
                                "Xpetra::EpetraCrsMatrix only available for GO=int or GO=long long with EpetraNode (Serial or OpenMP depending on configuration)");
   }
+
+  LocalOrdinal GetStorageBlockSize() const {return 1;}
+
 #else
 #ifdef __GNUC__
 #warning "Xpetra Kokkos interface for CrsMatrix is enabled (HAVE_XPETRA_KOKKOS_REFACTOR) but Tpetra is disabled. The Kokkos interface needs Tpetra to be enabled, too."
@@ -265,6 +268,7 @@ local_matrix_type getLocalMatrixDevice () const {
 #endif
 #endif
 
+
   void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X,
                 const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B,
                 MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & R) const{ 
@@ -1305,6 +1309,7 @@ typename local_matrix_type::HostMirror getLocalMatrixHost () const {
   }
 
 
+  LocalOrdinal GetStorageBlockSize() const {return 1;}
 
 private:
 #else
@@ -1315,6 +1320,8 @@ typename local_matrix_type::HostMirror getLocalMatrixHost () const {
 #endif
   //@}
 
+
+
   void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X,
                 const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B,
                 MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & R) const {
@@ -2348,6 +2355,8 @@ class EpetraCrsMatrixT <long long, EpetraNode>
 
   }
 
+
+  LocalOrdinal GetStorageBlockSize() const {return 1;}
  
 private:
 #else
@@ -2356,6 +2365,8 @@ class EpetraCrsMatrixT <long long, EpetraNode>
 #endif
 #endif
 #endif
+
+
   //@}
 
  void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X,
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp
index 1db4e9d5bd3f..2dc8bccf794b 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_decl.hpp
@@ -113,6 +113,12 @@ namespace Xpetra {
     TpetraBlockCrsMatrix(const Teuchos::RCP< const CrsGraph< LocalOrdinal, GlobalOrdinal, Node> > &graph, 
                          const LocalOrdinal blockSize);
 
+    //! Constructor specifying a previously constructed graph, point maps & blocksize
+    TpetraBlockCrsMatrix(const Teuchos::RCP< const CrsGraph< LocalOrdinal, GlobalOrdinal, Node> > &graph, 
+                         const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& pointDomainMap,
+                         const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& pointRangeMap,
+                         const LocalOrdinal blockSize);
+
 
     //! Constructor for a fused import ( not implemented )
     TpetraBlockCrsMatrix(const Teuchos::RCP<const Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node> >& sourceMatrix,
@@ -410,6 +416,9 @@ namespace Xpetra {
 #endif  // HAVE_XPETRA_TPETRA
 #endif  // HAVE_XPETRA_KOKKOS_REFACTOR
 
+    //! Returns the block size of the storage mechanism
+    LocalOrdinal GetStorageBlockSize() const {return mtx_->getBlockSize();}
+
     //! Compute a residual R = B - (*this) * X
     void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X,
                   const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B,
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp
index 410dbddfab76..fd069dd54356 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraBlockCrsMatrix_def.hpp
@@ -46,7 +46,9 @@
 #ifndef XPETRA_TPETRABLOCKCRSMATRIX_DEF_HPP
 #define XPETRA_TPETRABLOCKCRSMATRIX_DEF_HPP
 
+
 #include "Xpetra_TpetraBlockCrsMatrix_decl.hpp"
+#include "Xpetra_TpetraCrsGraph.hpp"
 
 namespace Xpetra {
 
@@ -118,6 +120,17 @@ namespace Xpetra {
     { }
 
 
+    //! Constructor specifying a previously constructed graph, point maps & blocksize
+    template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+    TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
+    TpetraBlockCrsMatrix(const Teuchos::RCP< const CrsGraph< LocalOrdinal, GlobalOrdinal, Node> > &graph, 
+                         const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& pointDomainMap,
+                         const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& pointRangeMap,
+                         const LocalOrdinal blockSize)
+      : mtx_(Teuchos::rcp(new Tpetra::BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(*toTpetra(graph), *toTpetra(pointDomainMap), *toTpetra(pointRangeMap),blockSize)))
+    { }
+
+
     //! Constructor for a fused import ( not implemented )
     template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
     TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
@@ -377,7 +390,12 @@ namespace Xpetra {
     TpetraBlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
     getCrsGraph() const
     {
-      throw std::runtime_error("Xpetra::TpetraBlockCrsMatrix function not implemented in "+std::string(__FILE__)+":"+std::to_string(__LINE__));
+      XPETRA_MONITOR("TpetraBlockCrsMatrix::getCrsGraph"); 
+      using G_t = Tpetra::CrsGraph<LocalOrdinal,GlobalOrdinal,Node>;
+      using G_x = TpetraCrsGraph<LocalOrdinal,GlobalOrdinal,Node>;
+      RCP<G_t> t_graph = Teuchos::rcp_const_cast<G_t>(Teuchos::rcpFromRef(mtx_->getCrsGraph()));
+      RCP<const G_x> x_graph = rcp(new G_x(t_graph));
+      return x_graph;
     }
     
 
diff --git a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp
index d5e94fee5ade..b6fa11b68eae 100644
--- a/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp
+++ b/packages/xpetra/src/CrsMatrix/Xpetra_TpetraCrsMatrix_decl.hpp
@@ -453,6 +453,9 @@ namespace Xpetra {
 #endif
 #endif
 
+    //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix
+    LocalOrdinal GetStorageBlockSize() const {return 1;}
+
     //! Compute a residual R = B - (*this) * X
     void residual(const MultiVector & X,
                   const MultiVector & B,
@@ -860,6 +863,9 @@ namespace Xpetra {
 #endif
 #endif
 
+    //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix
+    LocalOrdinal GetStorageBlockSize() const {return 1;}
+
     //! Compute a residual R = B - (*this) * X
     void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X,
                   const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B,
@@ -1263,6 +1269,9 @@ namespace Xpetra {
 #endif
 #endif
 
+    //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix
+    LocalOrdinal GetStorageBlockSize() const {return 1;}
+
     //! Compute a residual R = B - (*this) * X
     void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X,
                   const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B,
diff --git a/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_decl.hpp b/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_decl.hpp
index c30d1a00991f..415acfa6c940 100644
--- a/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_decl.hpp
+++ b/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_decl.hpp
@@ -480,12 +480,18 @@ class CrsMatrixWrap :
   RCP<CrsMatrix> getCrsMatrix() const;
 
 
+  //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix
+  LocalOrdinal GetStorageBlockSize() const;
+
   //! Compute a residual R = B - (*this) * X
   void residual(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X,
                 const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & B,
                 MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & R) const;
   
 
+  //! Expert only
+  void replaceCrsMatrix(RCP<CrsMatrix> & M);
+
   //@}
 private:
 
diff --git a/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_def.hpp b/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_def.hpp
index c3914921aa83..8a1d48d3b181 100644
--- a/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_def.hpp
+++ b/packages/xpetra/sup/Matrix/Xpetra_CrsMatrixWrap_def.hpp
@@ -497,6 +497,28 @@ namespace Xpetra {
   }
 
 
+  // Expert only
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  void CrsMatrixWrap<Scalar,LocalOrdinal,GlobalOrdinal,Node>::replaceCrsMatrix(RCP<CrsMatrix> & M) {
+    // Clear the old view table
+    Teuchos::Hashtable<viewLabel_t, RCP<MatrixView> > dummy_table;
+    Matrix::operatorViewTable_ = dummy_table;
+
+    finalDefaultView_ = M->isFillComplete();
+    // Set matrix data
+    matrixData_ = M;
+    
+
+    // Default view
+    CreateDefaultView();
+  }
+
+
+  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
+  LocalOrdinal CrsMatrixWrap<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GetStorageBlockSize() const {
+    return matrixData_->GetStorageBlockSize();
+  }
+
   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
   void CrsMatrixWrap<Scalar,LocalOrdinal,GlobalOrdinal,Node>::residual(
             const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > & X, 
diff --git a/packages/xpetra/sup/Matrix/Xpetra_Matrix.hpp b/packages/xpetra/sup/Matrix/Xpetra_Matrix.hpp
index b7f9432f054f..69c11f74ccec 100644
--- a/packages/xpetra/sup/Matrix/Xpetra_Matrix.hpp
+++ b/packages/xpetra/sup/Matrix/Xpetra_Matrix.hpp
@@ -559,11 +559,17 @@ namespace Xpetra {
         return 1;
     }; //TODO: why LocalOrdinal?
 
+
     //! Returns true, if `SetFixedBlockSize` has been called before.
     bool IsFixedBlockSizeSet() const {
       return IsView("stridedMaps");
     };
 
+
+    //! Returns the block size of the storage mechanism, which is usually 1, except for Tpetra::BlockCrsMatrix
+    virtual LocalOrdinal GetStorageBlockSize() const = 0;
+
+
     // ----------------------------------------------------------------------------------
 
     virtual void SetMaxEigenvalueEstimate(Scalar const &sigma) {
diff --git a/packages/xpetra/sup/Utils/Xpetra_IO.hpp b/packages/xpetra/sup/Utils/Xpetra_IO.hpp
index 364b6743fdb4..da863a390f23 100644
--- a/packages/xpetra/sup/Utils/Xpetra_IO.hpp
+++ b/packages/xpetra/sup/Utils/Xpetra_IO.hpp
@@ -313,6 +313,15 @@ namespace Xpetra {
         Tpetra::MatrixMarket::Writer<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >::writeSparseFile(fileName, A);
         return;
       }
+      const RCP<const Xpetra::TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& tmp_BlockCrs = 
+        Teuchos::rcp_dynamic_cast<const Xpetra::TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >(tmp_CrsMtx);
+      if(tmp_BlockCrs != Teuchos::null) {
+        std::ofstream outstream (fileName,std::ofstream::out);
+        Teuchos::FancyOStream ofs(Teuchos::rcpFromRef(outstream));
+        tmp_BlockCrs->getTpetra_BlockCrsMatrix()->describe(ofs,Teuchos::VERB_EXTREME);        
+        return;
+      }
+
 #endif // HAVE_XPETRA_TPETRA
 
       throw Exceptions::BadCast("Could not cast to EpetraCrsMatrix or TpetraCrsMatrix in matrix writing");
@@ -1037,6 +1046,15 @@ namespace Xpetra {
         Tpetra::MatrixMarket::Writer<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >::writeSparseFile(fileName, A);
         return;
       }
+      const RCP<const Xpetra::TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& tmp_BlockCrs = 
+        Teuchos::rcp_dynamic_cast<const Xpetra::TpetraBlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >(tmp_CrsMtx);
+      if(tmp_BlockCrs != Teuchos::null) {
+        std::ofstream outstream (fileName,std::ofstream::out);
+        Teuchos::FancyOStream ofs(Teuchos::rcpFromRef(outstream));
+        tmp_BlockCrs->getTpetra_BlockCrsMatrix()->describe(ofs,Teuchos::VERB_EXTREME);
+        return;
+      }
+
 # endif
 #endif // HAVE_XPETRA_TPETRA
 
diff --git a/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp b/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp
index 5fad0246aafe..217c590bf912 100644
--- a/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp
+++ b/packages/xpetra/sup/Utils/Xpetra_MatrixMatrix.hpp
@@ -75,6 +75,7 @@
 #include <MatrixMarket_Tpetra.hpp>
 #include <Xpetra_TpetraCrsMatrix.hpp>
 #include <Xpetra_TpetraBlockCrsMatrix.hpp>
+#include <Tpetra_BlockCrsMatrix_Helpers.hpp>
 #include <Xpetra_TpetraMultiVector.hpp>
 #include <Xpetra_TpetraVector.hpp>
 #endif // HAVE_XPETRA_TPETRA
@@ -254,25 +255,25 @@ Note: this class is not in the Xpetra_UseShortNames.hpp
       return tmp_BlockCrs->getTpetra_BlockCrsMatrixNonConst();
     }
 
-    static RCP<const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> > Op2TpetraBlockCrs(const Matrix& Op) {
+    static const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & Op2TpetraBlockCrs(const Matrix& Op) {
       try {
         const CrsMatrixWrap& crsOp = dynamic_cast<const CrsMatrixWrap&>(Op);
         RCP<const CrsMatrix> tmp_CrsMtx = crsOp.getCrsMatrix();
         RCP<const TpetraBlockCrsMatrix> tmp_BlockCrs= Teuchos::rcp_dynamic_cast<const TpetraBlockCrsMatrix>(tmp_CrsMtx);
         TEUCHOS_TEST_FOR_EXCEPTION(tmp_BlockCrs == Teuchos::null, Xpetra::Exceptions::BadCast, "Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
-        return tmp_BlockCrs->getTpetra_BlockCrsMatrix();
+        return *tmp_BlockCrs->getTpetra_BlockCrsMatrix();
       } catch(...) {
         throw(Xpetra::Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"));
       }
     }
 
-    static RCP<Tpetra::BlockCrsMatrix<SC,LO,GO,NO> > Op2NonTpetraBlockCrs(const Matrix& Op) {
+    static Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & Op2NonConstTpetraBlockCrs(const Matrix& Op) {
       try {
         const CrsMatrixWrap& crsOp = dynamic_cast<const CrsMatrixWrap&>(Op);
         RCP<const CrsMatrix> tmp_CrsMtx = crsOp.getCrsMatrix();
         RCP<const TpetraBlockCrsMatrix> tmp_BlockCrs= Teuchos::rcp_dynamic_cast<const TpetraBlockCrsMatrix>(tmp_CrsMtx);     
         TEUCHOS_TEST_FOR_EXCEPTION(tmp_BlockCrs == Teuchos::null, Xpetra::Exceptions::BadCast, "Cast from Xpetra::CrsMatrix to Xpetra::TpetraBlockCrsMatrix failed");
-        return tmp_BlockCrs->getTpetra_BlockCrsMatrixNonConst();
+        return *tmp_BlockCrs->getTpetra_BlockCrsMatrixNonConst();
       } catch(...) {
         throw(Xpetra::Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"));
       }
@@ -298,7 +299,14 @@ Note: this class is not in the Xpetra_UseShortNames.hpp
        return false;
      }
     }
+#else // HAVE_XPETRA_TPETRA
+    static bool isTpetraCrs(const Matrix& Op) {
+      return false;
+    }
 
+    static bool isTpetraBlockCrs(const Matrix&  Op) {
+      return false;
+    }
 
 #endif // HAVE_XPETRA_TPETRA
 
@@ -491,9 +499,38 @@ Note: this class is not in the Xpetra_UseShortNames.hpp
           // Previously, Tpetra's matrix matrix multiply did not support fillComplete.
           Tpetra::MatrixMatrix::Multiply(tpA, transposeA, tpB, transposeB, tpC, haveMultiplyDoFillComplete, label, params);
         }
-        else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B) && helpers::isTpetraBlockCrs(C)) {
-          // All matrices are BlockCrs
-          TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "BlockCrs Multiply not currently supported");          
+        else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B)) {
+          // All matrices are BlockCrs (except maybe Ac)
+          // FIXME: For the moment we're just going to clobber the innards of Ac, so no reuse. Once we have a reuse kernel,
+          // we'll need to think about refactoring BlockCrs so we can do something smarter here.
+          if(!A.getRowMap()->getComm()->getRank())
+            std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"<<std::endl;          
+
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpA  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(A);
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpB  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(B);
+          using CRS=Tpetra::CrsMatrix<SC,LO,GO,NO>;
+          RCP<const CRS> Acrs = Tpetra::convertToCrsMatrix(tpA);
+          RCP<const CRS> Bcrs = Tpetra::convertToCrsMatrix(tpB);
+
+          // We need the global constants to do the copy back to BlockCrs
+          RCP<ParameterList> new_params;
+          if(!params.is_null()) {
+            new_params = rcp(new Teuchos::ParameterList(*params));
+            new_params->set("compute global constants",true);
+          }
+
+          // FIXME: The lines below only works because we're assuming Ac is Point
+          RCP<CRS> tempAc = Teuchos::rcp(new CRS(Acrs->getRowMap(),0));
+          Tpetra::MatrixMatrix::Multiply(*Acrs, transposeA, *Bcrs, transposeB, *tempAc, haveMultiplyDoFillComplete, label, new_params);
+
+          // Temporary output matrix
+          RCP<Tpetra::BlockCrsMatrix<SC,LO,GO,NO> > Ac_t = Tpetra::convertToBlockCrsMatrix(*tempAc,A.GetStorageBlockSize());          
+          RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO>(Ac_t));
+          RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > Ac_p = Ac_x;
+
+          // We can now cheat and replace the innards of Ac
+          RCP<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> > Ac_w = Teuchos::rcp_dynamic_cast<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> >(Teuchos::rcpFromRef(C));
+          Ac_w->replaceCrsMatrix(Ac_p);         
         }
         else {
           // Mix and match
@@ -1022,9 +1059,39 @@ Note: this class is not in the Xpetra_UseShortNames.hpp
           // Previously, Tpetra's matrix matrix multiply did not support fillComplete.
           Tpetra::MatrixMatrix::Multiply(tpA, transposeA, tpB, transposeB, tpC, haveMultiplyDoFillComplete, label, params);
         }
-        else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B) && helpers::isTpetraBlockCrs(C)) {
-          // All matrices are BlockCrs
-          TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "BlockCrs Multiply not currently supported");          
+        else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B)) {
+          // All matrices are BlockCrs (except maybe Ac)
+          // FIXME: For the moment we're just going to clobber the innards of Ac, so no reuse. Once we have a reuse kernel,
+          // we'll need to think about refactoring BlockCrs so we can do something smarter here.
+
+          if(!A.getRowMap()->getComm()->getRank())
+            std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"<<std::endl;          
+
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpA  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(A);
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpB  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(B);
+          using CRS=Tpetra::CrsMatrix<SC,LO,GO,NO>;
+          RCP<const CRS> Acrs = Tpetra::convertToCrsMatrix(tpA);
+          RCP<const CRS> Bcrs = Tpetra::convertToCrsMatrix(tpB);
+
+          // We need the global constants to do the copy back to BlockCrs
+          RCP<ParameterList> new_params;
+          if(!params.is_null()) {
+            new_params = rcp(new Teuchos::ParameterList(*params));
+            new_params->set("compute global constants",true);
+          }
+
+          // FIXME: The lines below only works because we're assuming Ac is Point
+          RCP<CRS> tempAc = Teuchos::rcp(new CRS(Acrs->getRowMap(),0));
+          Tpetra::MatrixMatrix::Multiply(*Acrs, transposeA, *Bcrs, transposeB, *tempAc, haveMultiplyDoFillComplete, label, new_params);
+
+          // Temporary output matrix
+          RCP<Tpetra::BlockCrsMatrix<SC,LO,GO,NO> > Ac_t = Tpetra::convertToBlockCrsMatrix(*tempAc,A.GetStorageBlockSize());          
+          RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO>(Ac_t));
+          RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > Ac_p = Ac_x;
+
+          // We can now cheat and replace the innards of Ac
+          RCP<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> > Ac_w = Teuchos::rcp_dynamic_cast<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> >(Teuchos::rcpFromRef(C));
+          Ac_w->replaceCrsMatrix(Ac_p);  
         }
         else {
           // Mix and match
@@ -1787,9 +1854,39 @@ Note: this class is not in the Xpetra_UseShortNames.hpp
           // Previously, Tpetra's matrix matrix multiply did not support fillComplete.
           Tpetra::MatrixMatrix::Multiply(tpA, transposeA, tpB, transposeB, tpC, haveMultiplyDoFillComplete, label, params);
         }
-        else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B) && helpers::isTpetraBlockCrs(C)) {
-          // All matrices are BlockCrs
-          TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "BlockCrs Multiply not currently supported");          
+        else if (helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(B)) {
+          // All matrices are BlockCrs (except maybe Ac)
+          // FIXME: For the moment we're just going to clobber the innards of Ac, so no reuse. Once we have a reuse kernel,
+          // we'll need to think about refactoring BlockCrs so we can do something smarter here.
+
+          if(!A.getRowMap()->getComm()->getRank())
+            std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"<<std::endl;          
+
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpA  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(A);
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpB  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(B);
+          using CRS=Tpetra::CrsMatrix<SC,LO,GO,NO>;
+          RCP<const CRS> Acrs = Tpetra::convertToCrsMatrix(tpA);
+          RCP<const CRS> Bcrs = Tpetra::convertToCrsMatrix(tpB);
+
+          // We need the global constants to do the copy back to BlockCrs
+          RCP<ParameterList> new_params;
+          if(!params.is_null()) {
+            new_params = rcp(new Teuchos::ParameterList(*params));
+            new_params->set("compute global constants",true);
+          }
+
+          // FIXME: The lines below only works because we're assuming Ac is Point
+          RCP<CRS> tempAc = Teuchos::rcp(new CRS(Acrs->getRowMap(),0));
+          Tpetra::MatrixMatrix::Multiply(*Acrs, transposeA, *Bcrs, transposeB, *tempAc, haveMultiplyDoFillComplete, label, new_params);
+
+          // Temporary output matrix
+          RCP<Tpetra::BlockCrsMatrix<SC,LO,GO,NO> > Ac_t = Tpetra::convertToBlockCrsMatrix(*tempAc,A.GetStorageBlockSize());          
+          RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO>(Ac_t));
+          RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > Ac_p = Ac_x;
+
+          // We can now cheat and replace the innards of Ac
+          RCP<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> > Ac_w = Teuchos::rcp_dynamic_cast<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> >(Teuchos::rcpFromRef(C));
+          Ac_w->replaceCrsMatrix(Ac_p);         
         }
         else {
           // Mix and match
diff --git a/packages/xpetra/sup/Utils/Xpetra_TripleMatrixMultiply.hpp b/packages/xpetra/sup/Utils/Xpetra_TripleMatrixMultiply.hpp
index 3b5798fc62ce..d930088f3908 100644
--- a/packages/xpetra/sup/Utils/Xpetra_TripleMatrixMultiply.hpp
+++ b/packages/xpetra/sup/Utils/Xpetra_TripleMatrixMultiply.hpp
@@ -58,10 +58,13 @@
 #include "Xpetra_Matrix.hpp"
 #include "Xpetra_StridedMapFactory.hpp"
 #include "Xpetra_StridedMap.hpp"
+#include "Xpetra_IO.hpp"
 
 #ifdef HAVE_XPETRA_TPETRA
 #include <TpetraExt_TripleMatrixMultiply.hpp>
 #include <Xpetra_TpetraCrsMatrix.hpp>
+#include <Tpetra_BlockCrsMatrix.hpp>
+#include <Tpetra_BlockCrsMatrix_Helpers.hpp>
 // #include <Xpetra_TpetraMultiVector.hpp>
 // #include <Xpetra_TpetraVector.hpp>
 #endif // HAVE_XPETRA_TPETRA
@@ -126,14 +129,55 @@ namespace Xpetra {
         throw(Xpetra::Exceptions::RuntimeError("Xpetra::TripleMatrixMultiply::MultiplyRAP is only implemented for Tpetra"));
       } else if (Ac.getRowMap()->lib() == Xpetra::UseTpetra) {
 #ifdef HAVE_XPETRA_TPETRA
-        const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpR = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(R);
-        const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpA = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(A);
-        const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpP = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(P);
-        Tpetra::CrsMatrix<SC,LO,GO,NO> &       tpAc = Xpetra::Helpers<SC,LO,GO,NO>::Op2NonConstTpetraCrs(Ac);
-
-        // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete.
-        // Previously, Tpetra's matrix matrix multiply did not support fillComplete.
-        Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params);
+        using helpers = Xpetra::Helpers<SC,LO,GO,NO>;
+        if(helpers::isTpetraCrs(R) && helpers::isTpetraCrs(A) && helpers::isTpetraCrs(P)) {
+          // All matrices are Crs
+          const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpR = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(R);
+          const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpA = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(A);
+          const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpP = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(P);
+          Tpetra::CrsMatrix<SC,LO,GO,NO> &       tpAc = Xpetra::Helpers<SC,LO,GO,NO>::Op2NonConstTpetraCrs(Ac);
+          
+          // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete.
+          // Previously, Tpetra's matrix matrix multiply did not support fillComplete.
+          Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params);
+        }
+        else if (helpers::isTpetraBlockCrs(R) && helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(P)) {
+          // All matrices are BlockCrs (except maybe Ac)
+          // FIXME: For the moment we're just going to clobber the innards of Ac, so no reuse. Once we have a reuse kernel,
+          // we'll need to think about refactoring BlockCrs so we can do something smarter here.
+
+          if(!A.getRowMap()->getComm()->getRank())
+            std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"<<std::endl;          
+
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpR  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(R);
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpA  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(A);
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpP  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(P);
+          //          Tpetra::BlockCrsMatrix<SC,LO,GO,NO> &       tpAc = Xpetra::Helpers<SC,LO,GO,NO>::Op2NonConstTpetraBlockCrs(Ac);
+
+          using CRS=Tpetra::CrsMatrix<SC,LO,GO,NO>;
+          RCP<const CRS> Rcrs = Tpetra::convertToCrsMatrix(tpR);
+          RCP<const CRS> Acrs = Tpetra::convertToCrsMatrix(tpA);
+          RCP<const CRS> Pcrs = Tpetra::convertToCrsMatrix(tpP);
+          //          RCP<CRS> Accrs = Tpetra::convertToCrsMatrix(tpAc);
+          
+          // FIXME: The lines below only works because we're assuming Ac is Point
+          RCP<CRS> Accrs = Teuchos::rcp(new CRS(Rcrs->getRowMap(),0));
+          const bool do_fill_complete=true;
+          Tpetra::TripleMatrixMultiply::MultiplyRAP(*Rcrs, transposeR, *Acrs, transposeA, *Pcrs, transposeP, *Accrs, do_fill_complete, label, params);
+
+          // Temporary output matrix
+          RCP<Tpetra::BlockCrsMatrix<SC,LO,GO,NO> > Ac_t = Tpetra::convertToBlockCrsMatrix(*Accrs,A.GetStorageBlockSize());          
+          RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO>(Ac_t));
+          RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > Ac_p = Ac_x;
+
+          // We can now cheat and replace the innards of Ac
+          RCP<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> > Ac_w = Teuchos::rcp_dynamic_cast<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> >(Teuchos::rcpFromRef(Ac));
+          Ac_w->replaceCrsMatrix(Ac_p);         
+        }
+        else {
+          // Mix and match
+          TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "Mix-and-match Crs/BlockCrs Multiply not currently supported");      
+        }            
 #else
         throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra."));
 #endif
@@ -215,14 +259,55 @@ namespace Xpetra {
       (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_INT))))
         throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra <double,int,int> ETI enabled."));
 # else
-        const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpR = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(R);
-        const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpA = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(A);
-        const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpP = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(P);
-        Tpetra::CrsMatrix<SC,LO,GO,NO> &       tpAc = Xpetra::Helpers<SC,LO,GO,NO>::Op2NonConstTpetraCrs(Ac);
-
-        // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete.
-        // Previously, Tpetra's matrix matrix multiply did not support fillComplete.
-        Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params);
+        using helpers = Xpetra::Helpers<SC,LO,GO,NO>;
+        if(helpers::isTpetraCrs(R) && helpers::isTpetraCrs(A) && helpers::isTpetraCrs(P) && helpers::isTpetraCrs(Ac)) {
+          // All matrices are Crs
+          const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpR  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(R);
+          const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpA  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(A);
+          const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpP  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(P);
+          Tpetra::CrsMatrix<SC,LO,GO,NO> &       tpAc = Xpetra::Helpers<SC,LO,GO,NO>::Op2NonConstTpetraCrs(Ac);
+          
+          // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete.
+          // Previously, Tpetra's matrix matrix multiply did not support fillComplete.
+          Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params);
+        }
+        else if (helpers::isTpetraBlockCrs(R) && helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(P)) {
+          // All matrices are BlockCrs (except maybe Ac)
+          // FIXME: For the moment we're just going to clobber the innards of AC, so no reuse. Once we have a reuse kernel,
+          // we'll need to think about refactoring BlockCrs so we can do something smarter here.
+          if(!A.getRowMap()->getComm()->getRank())
+            std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"<<std::endl;          
+
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpR  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(R);
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpA  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(A);
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpP  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(P);
+          //          Tpetra::BlockCrsMatrix<SC,LO,GO,NO> &       tpAc = Xpetra::Helpers<SC,LO,GO,NO>::Op2NonConstTpetraBlockCrs(Ac);
+
+          using CRS=Tpetra::CrsMatrix<SC,LO,GO,NO>;
+          RCP<const CRS> Rcrs = Tpetra::convertToCrsMatrix(tpR);
+          RCP<const CRS> Acrs = Tpetra::convertToCrsMatrix(tpA);
+          RCP<const CRS> Pcrs = Tpetra::convertToCrsMatrix(tpP);
+          //          RCP<CRS> Accrs = Tpetra::convertToCrsMatrix(tpAc);
+          
+          // FIXME: The lines below only works because we're assuming Ac is Point
+          RCP<CRS> Accrs = Teuchos::rcp(new CRS(Rcrs->getRowMap(),0));
+          const bool do_fill_complete=true;
+          Tpetra::TripleMatrixMultiply::MultiplyRAP(*Rcrs, transposeR, *Acrs, transposeA, *Pcrs, transposeP, *Accrs, do_fill_complete, label, params);
+
+          // Temporary output matrix
+          RCP<Tpetra::BlockCrsMatrix<SC,LO,GO,NO> > Ac_t = Tpetra::convertToBlockCrsMatrix(*Accrs,A.GetStorageBlockSize());          
+          RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO>(Ac_t));
+          RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > Ac_p = Ac_x;
+
+          // We can now cheat and replace the innards of Ac
+          RCP<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> > Ac_w = Teuchos::rcp_dynamic_cast<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> >(Teuchos::rcpFromRef(Ac));
+          Ac_w->replaceCrsMatrix(Ac_p);
+
+        }
+        else {
+          // Mix and match (not supported)
+          TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "Mix-and-match Crs/BlockCrs Multiply not currently supported");      
+        }            
 # endif
 #else
         throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra."));
@@ -303,14 +388,55 @@ namespace Xpetra {
       (!defined(EPETRA_HAVE_OMP) && (!defined(HAVE_TPETRA_INST_SERIAL) || !defined(HAVE_TPETRA_INST_INT_LONG_LONG))))
         throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra <double,int,long long,EpetraNode> ETI enabled."));
 # else
-        const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpR = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(R);
-        const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpA = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(A);
-        const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpP = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(P);
-        Tpetra::CrsMatrix<SC,LO,GO,NO> &       tpAc = Xpetra::Helpers<SC,LO,GO,NO>::Op2NonConstTpetraCrs(Ac);
-
-        // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete.
-        // Previously, Tpetra's matrix matrix multiply did not support fillComplete.
-        Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params);
+        using helpers = Xpetra::Helpers<SC,LO,GO,NO>;
+        if(helpers::isTpetraCrs(R) && helpers::isTpetraCrs(A) && helpers::isTpetraCrs(P)) {
+          // All matrices are Crs
+          const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpR = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(R);
+          const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpA = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(A);
+          const Tpetra::CrsMatrix<SC,LO,GO,NO> & tpP = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraCrs(P);
+          Tpetra::CrsMatrix<SC,LO,GO,NO> &       tpAc = Xpetra::Helpers<SC,LO,GO,NO>::Op2NonConstTpetraCrs(Ac);
+          
+          // 18Feb2013 JJH I'm reenabling the code that allows the matrix matrix multiply to do the fillComplete.
+          // Previously, Tpetra's matrix matrix multiply did not support fillComplete.
+          Tpetra::TripleMatrixMultiply::MultiplyRAP(tpR, transposeR, tpA, transposeA, tpP, transposeP, tpAc, haveMultiplyDoFillComplete, label, params);
+        }
+        else if (helpers::isTpetraBlockCrs(R) && helpers::isTpetraBlockCrs(A) && helpers::isTpetraBlockCrs(P)) {
+          // All matrices are BlockCrs (except maybe Ac)
+          // FIXME: For the moment we're just going to clobber the innards of AC, so no reuse. Once we have a reuse kernel,
+          // we'll need to think about refactoring BlockCrs so we can do something smarter here.
+          if(!A.getRowMap()->getComm()->getRank())
+            std::cout<<"WARNING: Using inefficient BlockCrs Multiply Placeholder"<<std::endl;          
+
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpR  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(R);
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpA  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(A);
+          const Tpetra::BlockCrsMatrix<SC,LO,GO,NO> & tpP  = Xpetra::Helpers<SC,LO,GO,NO>::Op2TpetraBlockCrs(P);
+          //          Tpetra::BlockCrsMatrix<SC,LO,GO,NO> &       tpAc = Xpetra::Helpers<SC,LO,GO,NO>::Op2NonConstTpetraBlockCrs(Ac);
+
+          using CRS=Tpetra::CrsMatrix<SC,LO,GO,NO>;
+          RCP<const CRS> Rcrs = Tpetra::convertToCrsMatrix(tpR);
+          RCP<const CRS> Acrs = Tpetra::convertToCrsMatrix(tpA);
+          RCP<const CRS> Pcrs = Tpetra::convertToCrsMatrix(tpP);
+          //          RCP<CRS> Accrs = Tpetra::convertToCrsMatrix(tpAc);
+          
+          // FIXME: The lines below only works because we're assuming Ac is Point
+          RCP<CRS> Accrs = Teuchos::rcp(new CRS(Rcrs->getRowMap(),0));
+          const bool do_fill_complete=true;
+          Tpetra::TripleMatrixMultiply::MultiplyRAP(*Rcrs, transposeR, *Acrs, transposeA, *Pcrs, transposeP, *Accrs, do_fill_complete, label, params);
+
+          // Temporary output matrix
+          RCP<Tpetra::BlockCrsMatrix<SC,LO,GO,NO> > Ac_t = Tpetra::convertToBlockCrsMatrix(*Accrs,A.GetStorageBlockSize());         
+          RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > Ac_x = Teuchos::rcp(new Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO>(Ac_t));
+          RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > Ac_p = Ac_x;
+
+          // We can now cheat and replace the innards of Ac
+          RCP<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> > Ac_w = Teuchos::rcp_dynamic_cast<Xpetra::CrsMatrixWrap<SC,LO,GO,NO> >(Teuchos::rcpFromRef(Ac));
+          Ac_w->replaceCrsMatrix(Ac_p);   
+        }
+        else {
+          // Mix and match
+          TEUCHOS_TEST_FOR_EXCEPTION(1, Exceptions::RuntimeError, "Mix-and-match Crs/BlockCrs Multiply not currently supported");      
+        }    
+
 # endif
 #else
         throw(Xpetra::Exceptions::RuntimeError("Xpetra must be compiled with Tpetra."));