diff --git a/cmake/dca_config.cmake b/cmake/dca_config.cmake
index bcf8ff05f..702d8af7f 100644
--- a/cmake/dca_config.cmake
+++ b/cmake/dca_config.cmake
@@ -145,6 +145,8 @@ elseif (DCA_LATTICE STREQUAL "Kagome")
     "dca/phys/models/analytic_hamiltonians/Kagome_hubbard.hpp")
 elseif (DCA_LATTICE STREQUAL "hund")
   set(DCA_LATTICE_TYPE dca::phys::models::HundLattice<PointGroup>)
+  set(DCA_LATTICE_INCLUDE
+    "dca/phys/models/analytic_hamiltonians/hund_lattice.hpp")
 elseif (DCA_LATTICE STREQUAL "threeband")
   set(DCA_LATTICE_TYPE dca::phys::models::ThreebandHubbard<PointGroup>)
   set(DCA_LATTICE_INCLUDE
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp
index 5aef9dd73..6cc5d55df 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_cluster_solver.hpp
@@ -117,6 +117,8 @@ class CtauxClusterSolver {
     return g0_;
   };
 
+  Walker::Resource& getResource() { return dummy_walker_resource_; };
+
 protected:
   void warmUp(Walker& walker);
 
@@ -161,6 +163,7 @@ class CtauxClusterSolver {
 
   G0Interpolation<device, Scalar> g0_;
 
+  Walker::Resource walker_dummy_resource_;
 private:
   Rng rng_;
 
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
index 1d60d0522..e822f4025 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
@@ -66,6 +66,9 @@ class CtauxWalker : public WalkerBIT<Parameters, Data>, public CtauxWalkerData<d
   constexpr static dca::linalg::DeviceType device = device_t;
   static constexpr bool is_complex = dca::util::IsComplex<Scalar>::value;
 
+  struct DummyResource {};
+  using Resource = DummyResource;
+
 public:
   /** constructor
    *  param[in] id    thread id
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp
index a68e8f26f..b73a9faed 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/ctint_cluster_solver.hpp
@@ -35,6 +35,7 @@
 #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/domains/common_domains.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_choice.hpp"
+#include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp"
 #include "dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation.hpp"
 //#include "dca/phys/dca_step/cluster_solver/shared_tools/accumulation/time_correlator.hpp"
 #include "dca/phys/dca_data/dca_data.hpp"
@@ -74,6 +75,8 @@ class CtintClusterSolver {
   using Data = DcaData<Parameters, DIST>;
   static constexpr linalg::DeviceType device = device_t;
 
+  using DMatrixBuilder = ctint::DMatrixBuilder<device_t, Scalar>;
+
   CtintClusterSolver(Parameters& parameters_ref, Data& Data_ref,
                      std::shared_ptr<io::Writer<Concurrency>> writer);
 
@@ -108,7 +111,8 @@ class CtintClusterSolver {
   // Returns the function G(k,w) without averaging across MPI ranks.
   auto local_G_k_w() const;
 
-protected:  // thread jacket interface.
+  DMatrixBuilder& getResource() { return *d_matrix_builder_; };
+protected:  // thread jacket interface.  
   using ParametersType = Parameters;
   using DataType = Data;
   using Rng = typename Parameters::random_number_generator;
@@ -166,6 +170,7 @@ class CtintClusterSolver {
   std::unique_ptr<Walker> walker_;
   // Walker input.
   Rng rng_;
+  std::unique_ptr<DMatrixBuilder> d_matrix_builder_;
 };
 
 template <dca::linalg::DeviceType DEV, class PARAM, bool use_submatrix, DistType DIST>
@@ -174,11 +179,10 @@ CtintClusterSolver<DEV, PARAM, use_submatrix, DIST>::CtintClusterSolver(
     : parameters_(parameters_ref),
       concurrency_(parameters_.get_concurrency()),
       data_(data_ref),
-
       accumulator_(parameters_, data_),
       writer_(writer),
       rng_(concurrency_.id(), concurrency_.number_of_processors(), parameters_.get_seed()) {
-  Walker::setDMatrixBuilder(g0_);
+  d_matrix_builder_ = std::make_unique<DMatrixBuilder>(g0_, PARAM::bands, RDmn());
   Walker::setInteractionVertices(data_, parameters_);
 
   if (concurrency_.id() == concurrency_.first())
@@ -191,7 +195,7 @@ void CtintClusterSolver<device_t, Parameters, use_submatrix, DIST>::initialize(i
   
   g0_.initializeShrinked(data_.G0_r_t_cluster_excluded);
 
-  Walker::setDMatrixAlpha(parameters_.getAlphas(), parameters_.adjustAlphaDd());
+  d_matrix_builder_->setAlphas(parameters_.getAlphas(), parameters_.adjustAlphaDd());
 
   // It is a waiting to happen bug for this to be here and in CtintAccumulator
   accumulator_.initialize(dca_iteration_);
@@ -203,7 +207,7 @@ void CtintClusterSolver<device_t, Parameters, use_submatrix, DIST>::initialize(i
 
 template <dca::linalg::DeviceType device_t, class Parameters, bool use_submatrix, DistType DIST>
 void CtintClusterSolver<device_t, Parameters, use_submatrix, DIST>::integrate() {
-  walker_ = std::make_unique<Walker>(parameters_, data_, rng_, 0);
+  walker_ = std::make_unique<Walker>(parameters_, data_, rng_, d_matrix_builder_, 0);
   walker_->initialize(dca_iteration_);
 
   dca::profiling::WallTime start_time;
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp
index 44015c6a7..70cfb39c4 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp
@@ -80,15 +80,22 @@ class CtintWalkerBase {
   void computeM(MatrixPair& m_accum) const;
 
   // Reset the counters and recompute the configuration sign and weight.
-  void markThermalized();
+  virtual void markThermalized() = 0;
 
   /** Recompute the matrix M from the configuration in O(expansion_order^3) time.
    *  Postcondition: sign_ and mc_log_weight_ are recomputed.
    *  mc_log_weight is the negative sum of the log det of both sectors of M
    *  + log of the abs of each vertices interaction strength.
    */
-  virtual void setMFromConfig();
+  virtual void setMFromConfig() = 0;
 
+  virtual void doStep() = 0;
+
+  virtual void doSweep() = 0;
+  
+  template<linalg::DeviceType DEVICE>
+  void setMFromConfigImpl(DMatrixBuilder<DEVICE, Scalar>& d_matrix_builder);
+  
   bool is_thermalized() const {
     return thermalized_;
   }
@@ -152,14 +159,6 @@ class CtintWalkerBase {
     buff >> configuration_;
   }
 
-  // Initialize the builder object shared by all walkers.
-  template <linalg::DeviceType device_type>
-  static void setDMatrixBuilder(const G0Interpolation<device_type, Scalar>& g0);
-
-  static void setDMatrixAlpha(const std::array<double, 3>& alphas, bool adjust_dd);
-
-  static void setInteractionVertices(const Data& data, const Parameters& parameters);
-
   double stealFLOPs() {
     auto flop = flop_;
     flop_ = 0.;
@@ -173,12 +172,11 @@ class CtintWalkerBase {
 
   static void sumConcurrency(const Concurrency&) {}
 
-  const auto& get_dmatrix_builder() const {
-    return *d_builder_ptr_;
-  }
-
   void writeAlphas() const;
 
+  static void setInteractionVertices(const Data& data,
+			      const Parameters& parameters);
+
 protected:
   // typedefs
   using RDmn = typename Parameters::RClusterDmn;
@@ -187,7 +185,6 @@ class CtintWalkerBase {
   void updateSweepAverages();
 
 protected:  // Members.
-  static inline std::unique_ptr<DMatrixBuilder<linalg::CPU, Scalar>> d_builder_ptr_;
   static inline InteractionVertices vertices_;
 
   const Parameters& parameters_;
@@ -271,41 +268,6 @@ void CtintWalkerBase<Parameters,DIST>::initialize(int iteration) {
   setMFromConfig();
 }
 
-template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::setMFromConfig() {
-  mc_log_weight_ = 0.;
-  phase_.reset();
-
-  for (int s = 0; s < 2; ++s) {
-    // compute Mij = g0(t_i,t_j) - I* alpha(s_i)
-
-    const auto& sector = configuration_.getSector(s);
-    auto& M = M_[s];
-    const int n = sector.size();
-    M.resize(n);
-    if (!n)
-      continue;
-    for (int j = 0; j < n; ++j)
-      for (int i = 0; i < n; ++i)
-        M(i, j) = d_builder_ptr_->computeD(i, j, sector);
-
-    if (M.nrRows()) {
-      const auto [log_det, phase] = linalg::matrixop::inverseAndLogDeterminant(M);
-
-      mc_log_weight_ -= log_det;  // Weight proportional to det(M^{-1})
-      phase_.divide(phase);
-    }
-  }
-
-  // So what is going on here.
-  for (int i = 0; i < configuration_.size(); ++i) {
-    // This is actual interaction strength of the vertex i.e H_int(nu1, nu2, delta_r)
-    const Real term = -configuration_.getStrength(i);
-    mc_log_weight_ += std::log(std::abs(term));
-    phase_.multiply(term);
-  }
-}
-
 template <class Parameters, DistType DIST>
 void CtintWalkerBase<Parameters,DIST>::updateSweepAverages() {
   order_avg_.addSample(order());
@@ -315,39 +277,22 @@ void CtintWalkerBase<Parameters,DIST>::updateSweepAverages() {
     partial_order_avg_.addSample(order());
 }
 
-template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::writeAlphas() const {
-  std::cout << "For initial configuration integration:\n";
-  for (int isec = 0; isec < 2; ++isec) {
-    std::cout << "Sector: " << isec << '\n';
-    for (int ic = 0; ic < order(); ++ic) {
-      auto aux_spin = configuration_.getSector(isec).getAuxFieldType(ic);
-      auto left_b = configuration_.getSector(isec).getLeftB(ic);
-      auto alpha_left = get_dmatrix_builder().computeAlpha(aux_spin, left_b);
-      std::cout << "vertex: " << std::setw(6) << ic;
-      std::cout << " | aux spin: " << aux_spin << " | left B: " << left_b
-                << " | alpha left = " << alpha_left << '\n';
-    }
-  }
-}
-
-template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::markThermalized() {
-  thermalized_ = true;
-
-  nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean()));
-  thermalization_steps_ = n_steps_;
-
-  order_avg_.reset();
-  sign_avg_.reset();
-  n_accepted_ = 0;
+// template <class Parameters, DistType DIST>
+// void CtintWalkerBase<Parameters,DIST>::writeAlphas() const {
+//   std::cout << "For initial configuration integration:\n";
+//   for (int isec = 0; isec < 2; ++isec) {
+//     std::cout << "Sector: " << isec << '\n';
+//     for (int ic = 0; ic < order(); ++ic) {
+//       auto aux_spin = configuration_.getSector(isec).getAuxFieldType(ic);
+//       auto left_b = configuration_.getSector(isec).getLeftB(ic);
+//       auto alpha_left = get_dmatrix_builder().computeAlpha(aux_spin, left_b);
+//       std::cout << "vertex: " << std::setw(6) << ic;
+//       std::cout << " | aux spin: " << aux_spin << " | left B: " << left_b
+//                 << " | alpha left = " << alpha_left << '\n';
+//     }
+//   }
+// }
 
-  // Recompute the Monte Carlo weight.
-  setMFromConfig();
-#ifndef NDEBUG
-  //writeAlphas();
-#endif
-}
 
 template <class Parameters, DistType DIST>
 void CtintWalkerBase<Parameters,DIST>::updateShell(int meas_id, int meas_to_do) const {
@@ -383,25 +328,6 @@ void CtintWalkerBase<Parameters,DIST>::printSummary() const {
   std::cout << std::endl;
 }
 
-template <class Parameters, DistType DIST>
-template <linalg::DeviceType device_type>
-void CtintWalkerBase<Parameters,DIST>::setDMatrixBuilder(
-							 const dca::phys::solver::G0Interpolation<device_type, Scalar>& g0) {
-  using RDmn = typename Parameters::RClusterDmn;
-
-  if (d_builder_ptr_)
-    std::cerr << "Warning: DMatrixBuilder already set." << std::endl;
-
-  d_builder_ptr_ = std::make_unique<DMatrixBuilder<device_type, Scalar>>(g0, n_bands_, RDmn());
-}
-
-template <class Parameters, DistType DIST>
-void CtintWalkerBase<Parameters,DIST>::setDMatrixAlpha(const std::array<double, 3>& alphas,
-                                                              bool adjust_dd) {
-  assert(d_builder_ptr_);
-  d_builder_ptr_->setAlphas(alphas, adjust_dd);
-}
-
 template <class Parameters, DistType DIST>
 void CtintWalkerBase<Parameters,DIST>::setInteractionVertices(const Data& data,
                                                                      const Parameters& parameters) {
@@ -419,6 +345,78 @@ void CtintWalkerBase<Parameters,DIST>::computeM(MatrixPair& m_accum) const {
   m_accum = M_;
 }
 
+// template<class WALKER, linalg::DeviceType DEVICE>  
+// void setMFromConfigHelper(WALKER& walker, DMatrixBuilder<DEVICE, Scalar>& d_matrix_builder) {
+//   walker.mc_log_weight_ = 0.;
+//   walker.phase_.reset();
+
+//   for (int s = 0; s < 2; ++s) {
+//     // compute Mij = g0(t_i,t_j) - I* alpha(s_i)
+
+//     const auto& sector = walker.configuration_.getSector(s);
+//     auto& M = walker.M_[s];
+//     const int n = sector.size();
+//     M.resize(n);
+//     if (!n)
+//       continue;
+//     for (int j = 0; j < n; ++j)
+//       for (int i = 0; i < n; ++i)
+//         M(i, j) = d_matrix_builder.computeD(i, j, sector);
+
+//     if (M.nrRows()) {
+//       const auto [log_det, phase] = linalg::matrixop::inverseAndLogDeterminant(M);
+
+//       walker.mc_log_weight_ -= log_det;  // Weight proportional to det(M^{-1})
+//       walker.phase_.divide(phase);
+//     }
+//   }
+
+//   // So what is going on here.
+//   for (int i = 0; i < walker.configuration_.size(); ++i) {
+//     // This is actual interaction strength of the vertex i.e H_int(nu1, nu2, delta_r)
+//     const typename decltype(walker)::Real term = -walker.configuration_.getStrength(i);
+//     walker.mc_log_weight_ += std::log(std::abs(term));
+//     walker.phase_.multiply(term);
+//   }
+// }
+  
+     
+template <class Parameters, DistType DIST>
+template <linalg::DeviceType DEVICE>
+void CtintWalkerBase<Parameters, DIST>::setMFromConfigImpl(DMatrixBuilder<DEVICE, Scalar>& d_matrix_builder) {
+  mc_log_weight_ = 0.;
+  phase_.reset();
+
+  for (int s = 0; s < 2; ++s) {
+    // compute Mij = g0(t_i,t_j) - I* alpha(s_i)
+
+    const auto& sector = configuration_.getSector(s);
+    auto& M = M_[s];
+    const int n = sector.size();
+    M.resize(n);
+    if (!n)
+      continue;
+    for (int j = 0; j < n; ++j)
+      for (int i = 0; i < n; ++i)
+        M(i, j) = d_matrix_builder.computeD(i, j, sector);
+
+    if (M.nrRows()) {
+      const auto [log_det, phase] = linalg::matrixop::inverseAndLogDeterminant(M);
+
+      mc_log_weight_ -= log_det;  // Weight proportional to det(M^{-1})
+      phase_.divide(phase);
+    }
+  }
+
+  // So what is going on here.
+  for (int i = 0; i < configuration_.size(); ++i) {
+    // This is actual interaction strength of the vertex i.e H_int(nu1, nu2, delta_r)
+    const Real term = -configuration_.getStrength(i);
+    mc_log_weight_ += std::log(std::abs(term));
+    phase_.multiply(term);
+  }
+}
+  
 }  // namespace ctint
 }  // namespace solver
 }  // namespace phys
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu.hpp
index b6c3fe687..695ee4961 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu.hpp
@@ -49,13 +49,16 @@ class CtintWalker<linalg::CPU, Parameters, DIST> : public CtintWalkerBase<Parame
   using MatrixView = typename linalg::MatrixView<Scalar, linalg::CPU>;
   using ConstView = typename linalg::MatrixView<const Scalar, linalg::CPU>;
 
+  using Resource = DMatrixBuilder<linalg::CPU, Scalar>;
+
 public:
-  CtintWalker(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, int id = 0);
+  CtintWalker(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id = 0);
 
-  void doSweep();
+  void doSweep() override;
 
+  void markThermalized() override;
 protected:
-  void doStep();
+  void doStep() override;
   /** try to insertVertex, if accepted do it.
    *  on accept applyInsertion is called.
    */
@@ -72,6 +75,7 @@ class CtintWalker<linalg::CPU, Parameters, DIST> : public CtintWalkerBase<Parame
 
   void initializeStep();
 
+  void setMFromConfig() override;
 private:
   auto insertionProbability(int delta_vertices);
 
@@ -87,7 +91,7 @@ class CtintWalker<linalg::CPU, Parameters, DIST> : public CtintWalkerBase<Parame
   using BaseClass::parameters_;
   using BaseClass::configuration_;
   using BaseClass::rng_;
-  using BaseClass::d_builder_ptr_;
+  DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder_;
   using BaseClass::total_interaction_;
   using BaseClass::beta_;
   using BaseClass::phase_;
@@ -98,11 +102,16 @@ class CtintWalker<linalg::CPU, Parameters, DIST> : public CtintWalkerBase<Parame
   using BaseClass::n_accepted_;
 
   using BaseClass::M_;
-
   // For testing purposes.
   using BaseClass::acceptance_prob_;
   std::array<Scalar, 2> det_ratio_;
-
+  using BaseClass::sweeps_per_meas_;
+  using BaseClass::partial_order_avg_;
+  using BaseClass::thermalization_steps_;
+  using BaseClass::order_avg_;
+  using BaseClass::sign_avg_;
+  using BaseClass::n_steps_;
+  using BaseClass::mc_log_weight_;
 private:
   std::array<linalg::Matrix<Scalar, linalg::CPU>, 2> S_, Q_, R_;
   // work spaces
@@ -121,14 +130,33 @@ class CtintWalker<linalg::CPU, Parameters, DIST> : public CtintWalkerBase<Parame
 
 template <class Parameters, DistType DIST>
 CtintWalker<linalg::CPU, Parameters,DIST>::CtintWalker(const Parameters& parameters_ref,
-                                                        const Data& /*data*/, Rng& rng_ref, int id)
+						       const Data& /*data*/, Rng& rng_ref, DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id)
     : BaseClass(parameters_ref, rng_ref, id),
       det_ratio_{1, 1},
+      d_matrix_builder_(d_matrix_builder),
       // If we perform double updates, we need at most 3 rng values for: selecting the first vertex,
       // deciding if we select a second one, select the second vertex. Otherwise only the first is
       // needed.
       n_removal_rngs_(configuration_.getDoubleUpdateProb() ? 3 : 1) {}
 
+template <class Parameters, DistType DIST>
+void CtintWalker<linalg::CPU, Parameters,DIST>::markThermalized() {
+  thermalized_ = true;
+
+  nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean()));
+  thermalization_steps_ = n_steps_;
+
+  order_avg_.reset();
+  sign_avg_.reset();
+  n_accepted_ = 0;
+
+  // Recompute the Monte Carlo weight.
+  setMFromConfig();
+#ifndef NDEBUG
+  //writeAlphas();
+#endif
+}
+  
 template <class Parameters, DistType DIST>
 void CtintWalker<linalg::CPU, Parameters,DIST>::doSweep() {
   int nb_of_steps;
@@ -172,7 +200,7 @@ bool CtintWalker<linalg::CPU, Parameters,DIST>::tryVertexInsert() {
   const int delta_vertices = configuration_.lastInsertionSize();
 
   // Compute the new pieces of the D(= M^-1) matrix.
-  d_builder_ptr_->buildSQR(S_, Q_, R_, configuration_);
+  d_matrix_builder_.buildSQR(S_, Q_, R_, configuration_);
 
   acceptance_prob_ = insertionProbability(delta_vertices);
 
@@ -434,6 +462,12 @@ void CtintWalker<linalg::CPU, Parameters,DIST>::initializeStep() {
   }
 }
 
+template <class Parameters, DistType DIST>
+void CtintWalker<linalg::CPU, Parameters, DIST>::setMFromConfig() {
+  BaseClass::setMFromConfigImpl(d_matrix_builder_);
+}
+
+  
 }  // namespace ctint
 }  // namespace solver
 }  // namespace phys
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp
index 1763ebaaf..861df5cb6 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_cpu_submatrix.hpp
@@ -31,6 +31,7 @@
 #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp"
 #include "dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/walker_methods.hpp"
+#include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp"
 #include "dca/phys/parameters/parameters.hpp"
 #include "dca/util/integer_division.hpp"
 
@@ -40,9 +41,10 @@ namespace solver {
 namespace ctint {
 
 template <class Parameters, DistType DIST>
-class CtintWalkerSubmatrixCpu : public CtintWalkerBase<Parameters, DIST> {
+class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST> {
 public:
   using this_type = CtintWalkerSubmatrixCpu;
+  using SubmatrixBase = CtintWalkerSubmatrixBase<Parameters, DIST>;
   using BaseClass = CtintWalkerBase<Parameters, DIST>;
   using typename BaseClass::Rng;
   using typename BaseClass::Data;
@@ -51,21 +53,19 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerBase<Parameters, DIST> {
   using typename BaseClass::Real;
   using typename BaseClass::Scalar;
 
-  CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, int id = 0);
+  using Resource = DMatrixBuilder<linalg::CPU, Scalar>;
 
-  virtual ~CtintWalkerSubmatrixCpu() = default;
+  CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id = 0);
 
-  virtual void doSweep();
+  virtual ~CtintWalkerSubmatrixCpu() = default;
 
   void computeM(typename BaseClass::MatrixPair& m_accum);
 
   using BaseClass::order;
 
+  void setMFromConfig() override;
 protected:
-  virtual void setMFromConfig() override;
-
   void doSteps();
-
   void generateDelayedMoves(int nbr_of_movesto_delay);
 
   /** This does a bunch of things, this is the majority of a step
@@ -84,15 +84,14 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerBase<Parameters, DIST> {
    */
   void mainSubmatrixProcess();
 
-  void updateM();
+  void markThermalized() override;
 
-  void transformM();
+  void updateM() override;
 
-  // For testing purposes.
-  void doStep(const int nbr_of_movesto_delay);
+  void transformM();
 
+  DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder_;
 private:
-  virtual void doStep();
 
   void doSubmatrixUpdate();
 
@@ -104,10 +103,10 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerBase<Parameters, DIST> {
 
   void removeRowAndColOfGammaInv();
 
-  void computeMInit();
+  void computeMInit() override;
 
   //  void computeG0Init();
-  void computeGInit();
+  void computeGInit() override;
 
   Move generateMoveType();
 
@@ -121,11 +120,6 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerBase<Parameters, DIST> {
 
   void recomputeGammaInv();
 
-  bool recentlyAdded(int move_idx, int s) const {
-    assert(move_idx < sector_indices_[s].size());
-    return sector_indices_[s][move_idx] >= n_init_[s];
-  }
-
 protected:
   using MatrixView = linalg::MatrixView<Scalar, linalg::CPU>;
   using Matrix = linalg::Matrix<Scalar, linalg::CPU>;
@@ -134,7 +128,6 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerBase<Parameters, DIST> {
   using BaseClass::configuration_;
   using BaseClass::rng_;
   using BaseClass::thread_id_;
-  using BaseClass::d_builder_ptr_;
   using BaseClass::total_interaction_;
   using BaseClass::phase_;
   using BaseClass::M_;
@@ -142,438 +135,98 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerBase<Parameters, DIST> {
   using BaseClass::beta_;
 
   using BaseClass::thermalized_;
+  using BaseClass::sweeps_per_meas_;
+  using BaseClass::partial_order_avg_;
+  using BaseClass::thermalization_steps_;
+  using BaseClass::order_avg_;
+  using BaseClass::sign_avg_;
+  using BaseClass::n_steps_;
+  using BaseClass::mc_log_weight_;
+  using BaseClass::n_accepted_;
 
 protected:
-  int max_submatrix_size_;
-
-  struct DelayedMoveType {
-    Move move_type;
-    std::array<double, 3> removal_rng{1., 1., 1.};
-    Real acceptance_rng;
-    std::array<int, 2> indices{-1, -1};
-  };
-
+  using SubmatrixBase::max_submatrix_size_;
+
+  using typename SubmatrixBase::DelayedMoveType;
+  using SubmatrixBase::f_;
+  using SubmatrixBase::gamma_values_;
+  using SubmatrixBase::prob_const_;
+  using SubmatrixBase::nb_steps_per_sweep_;
+  using SubmatrixBase::n_max_;
+  using SubmatrixBase::n_init_;
+  using SubmatrixBase::D_;
+  using SubmatrixBase::G_;
+  using SubmatrixBase::M_;
+  using SubmatrixBase::s_;
+  using SubmatrixBase::Gamma_indices_;
+  using SubmatrixBase::Gamma_inv_;
+  using SubmatrixBase::Gamma_inv_cpy_;
+  using SubmatrixBase::Gamma_q_;
+  using SubmatrixBase::workspace_;
+  using SubmatrixBase::r_;
+  using SubmatrixBase::move_indices_;
+  using SubmatrixBase::gamma_;
+  using SubmatrixBase::source_list_;
+  using SubmatrixBase::removal_list_;
+  using SubmatrixBase::conf_removal_list_;
+  using SubmatrixBase::sector_indices_;
+  using SubmatrixBase::index_;
+  using SubmatrixBase::nbr_of_indices_;
+  using SubmatrixBase::q_;
+  using SubmatrixBase::det_ratio_;
 protected:
   using BaseClass::acceptance_prob_;
 
 protected:
-  static constexpr Scalar the_one_ = dca::util::TheOne<Scalar>::value;
-
-  std::vector<DelayedMoveType> delayed_moves_;
-
-  using MatrixPair = std::array<linalg::Matrix<Scalar, linalg::CPU>, 2>;
-  MatrixPair G_;
-  MatrixPair G0_;
-  MatrixPair Gamma_inv_;      // TODO: don't pin
-  MatrixPair Gamma_inv_cpy_;  // TODO: don't pin
-  MatrixPair q_;              // TODO: don't pin or store in Gamma inv
-  MatrixPair r_;
-  MatrixPair s_;
-  std::array<std::vector<Real>, 2> gamma_;
-
-  Scalar det_ratio_;
-  std::map<int, std::array<Real, n_bands_>> f_;
-  std::map<int, std::array<Real, n_bands_>> prob_const_;
-  std::map<std::pair<int, int>, std::array<Real, n_bands_>> gamma_values_;
-
-  using BaseClass::nb_steps_per_sweep_;
-  int nbr_of_steps_;
-  int nbr_of_moves_to_delay_;
-  int max_nbr_of_moves;
-
-  std::array<int, 2> Gamma_size_;
-  std::array<std::vector<int>, 2> Gamma_indices_;
-  std::array<std::vector<int>, 2> sector_indices_;
-
-  const DelayedMoveType* current_move_;
-
-  std::array<unsigned, 2> nbr_of_indices_;
-
-  std::vector<int> index_;
-
-  // Initial configuration size.
-  unsigned config_size_init_;
-
-  // Initial and current sector sizes.
-  std::array<unsigned, 2> n_init_;
-  unsigned n_;
-
-  // Maximal sector size after submatrix update.
-
-  std::array<unsigned, 2> n_max_;
-
-  std::array<linalg::util::HostVector<int>, 2> move_indices_;
-  std::array<linalg::util::HostVector<int>, 2> removal_list_;
-  std::array<linalg::util::HostVector<int>, 2> source_list_;
-  std::array<std::vector<int>, 2> insertion_list_;
-  std::array<std::vector<int>, 2> insertion_Gamma_indices_;
-
-  std::vector<int> conf_removal_list_;
-
-  std::vector<int>::iterator insertion_list_it_;
-
-  std::array<Matrix, 2> Gamma_q_;
-  Matrix workspace_;
-  Matrix D_;
 
   using BaseClass::flop_;
 };
 
 template <class Parameters, DistType DIST>
-CtintWalkerSubmatrixCpu<Parameters, DIST>::CtintWalkerSubmatrixCpu(const Parameters& parameters_ref,
-                                                                   const Data& /*data*/,
-                                                                   Rng& rng_ref, int id)
-    : BaseClass(parameters_ref, rng_ref, id) {
-  for (int b = 0; b < n_bands_; ++b) {
+CtintWalkerSubmatrixCpu<Parameters, DIST>::CtintWalkerSubmatrixCpu(
+    const Parameters& parameters_ref, const Data& data, Rng& rng_ref,
+    DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id)
+    : SubmatrixBase(parameters_ref, data, rng_ref, id), d_matrix_builder_(d_matrix_builder) {
+
+    for (int b = 0; b < n_bands_; ++b) {
     for (int i = 1; i <= 3; ++i) {
-      f_[i][b] = d_builder_ptr_->computeF(i, b);
-      f_[-i][b] = d_builder_ptr_->computeF(-i, b);
+      f_[i][b] = d_matrix_builder_.computeF(i, b);
+      f_[-i][b] = d_matrix_builder_.computeF(-i, b);
 
-      gamma_values_[std::make_pair(0, i)][b] = d_builder_ptr_->computeGamma(0, i, b);
-      gamma_values_[std::make_pair(0, -i)][b] = d_builder_ptr_->computeGamma(0, -i, b);
-      gamma_values_[std::make_pair(i, 0)][b] = d_builder_ptr_->computeGamma(i, 0, b);
-      gamma_values_[std::make_pair(-i, 0)][b] = d_builder_ptr_->computeGamma(-i, 0, b);
+      gamma_values_[std::make_pair(0, i)][b] = d_matrix_builder_.computeGamma(0, i, b);
+      gamma_values_[std::make_pair(0, -i)][b] = d_matrix_builder_.computeGamma(0, -i, b);
+      gamma_values_[std::make_pair(i, 0)][b] = d_matrix_builder_.computeGamma(i, 0, b);
+      gamma_values_[std::make_pair(-i, 0)][b] = d_matrix_builder_.computeGamma(-i, 0, b);
 
       prob_const_[i][b] = prob_const_[-i][b] = -1. / (f_[i][b] - 1) / (f_[-i][b] - 1);
     }
     f_[0][b] = 1;
   }
 
-  if (BaseClass::concurrency_.id() == BaseClass::concurrency_.first() && thread_id_ == 0)
-    std::cout << "\nCT-INT submatrix walker created." << std::endl;
 }
 
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixCpu<Parameters, DIST>::setMFromConfig() {
-  BaseClass::setMFromConfig();
+  BaseClass::setMFromConfigImpl(d_matrix_builder_);
   transformM();
 }
 
 template <class Parameters, DistType DIST>
-void CtintWalkerSubmatrixCpu<Parameters, DIST>::doSweep() {
-  Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_);
-  doSteps();
-}
-
-template <class Parameters, DistType DIST>
-void CtintWalkerSubmatrixCpu<Parameters, DIST>::doSteps() {
-  // Here nbr_of_steps is the number of single steps/moves during the current sweep,
-  // while nbr_of_submatrix_steps is the number of times the entire submatrix algorithm  is run.
-
-  if (nb_steps_per_sweep_ < 0)  // Not thermalized or fixed.
-    nbr_of_steps_ = BaseClass::avgOrder() + 1;
-  else
-    nbr_of_steps_ = nb_steps_per_sweep_;
-
-  // Get the maximum of Monte Carlo steps/moves that can be performed during one submatrix step.
-  max_nbr_of_moves = parameters_.getMaxSubmatrixSize();
-
-  BaseClass::n_steps_ += nbr_of_steps_;
-  while (nbr_of_steps_ > 0) {
-    nbr_of_moves_to_delay_ = std::min(nbr_of_steps_, max_nbr_of_moves);
-    nbr_of_steps_ -= nbr_of_moves_to_delay_;
-
-    doStep();
-  }
-
-  BaseClass::updateSweepAverages();
-}
-
-template <class Parameters, DistType DIST>
-void CtintWalkerSubmatrixCpu<Parameters, DIST>::doStep() {
-  generateDelayedMoves(nbr_of_moves_to_delay_);
-  doSubmatrixUpdate();
-}
-
-// Do one step with arbitrary number of moves. For testing.
-template <class Parameters, DistType DIST>
-void CtintWalkerSubmatrixCpu<Parameters, DIST>::doStep(const int nbr_of_movesto_delay) {
-  std::cout << "\nStarted doStep() function for testing." << std::endl;
-
-  generateDelayedMoves(nbr_of_movesto_delay);
-
-  std::cout << "\nGenerated " << nbr_of_movesto_delay << " moves for testing.\n" << std::endl;
-
-  doSubmatrixUpdate();
-}
-
-template <class Parameters, DistType DIST>
-void CtintWalkerSubmatrixCpu<Parameters, DIST>::generateDelayedMoves(const int nbr_of_movesto_delay) {
-  assert(nbr_of_movesto_delay > 0);
-
-  delayed_moves_.clear();
-
-  for (int s = 0; s < 2; ++s) {
-    n_init_[s] = configuration_.getSector(s).size();
-  }
-  n_ = config_size_init_ = configuration_.size();
-
-  // Generate delayed moves.
-  for (int i = 0; i < nbr_of_movesto_delay; ++i) {
-    DelayedMoveType delayed_move;
-
-    delayed_move.move_type = generateMoveType();
-
-    switch (delayed_move.move_type) {
-      case REMOVAL:
-        if (configuration_.getDoubleUpdateProb())
-          delayed_move.removal_rng = {rng_(), rng_(), rng_()};
-        else
-          delayed_move.removal_rng[0] = rng_();
-        break;
-
-      case INSERTION:
-        configuration_.insertRandom(rng_);
-        for (int i = 0; i < configuration_.lastInsertionSize(); ++i)
-          delayed_move.indices[i] = configuration_.size() - configuration_.lastInsertionSize() + i;
-        break;
-
-      default:
-        throw(std::logic_error("Unknown move type encountered."));
-    }
-
-    delayed_move.acceptance_rng = rng_();
-    delayed_moves_.push_back(delayed_move);
-  }
-
-  for (int s = 0; s < 2; ++s) {
-    n_max_[s] = configuration_.getSector(s).size();
-  }
-
-  const int config_size_final = configuration_.size();
-  conf_removal_list_.resize(config_size_final - config_size_init_);
-  std::iota(conf_removal_list_.begin(), conf_removal_list_.end(), config_size_init_);
-}
-
-template <class Parameters, DistType DIST>
-void CtintWalkerSubmatrixCpu<Parameters, DIST>::doSubmatrixUpdate() {
-  computeMInit();
-  computeGInit();
-  mainSubmatrixProcess();
-  updateM();
-}
-
-template <class Parameters, DistType DIST>
-void CtintWalkerSubmatrixCpu<Parameters, DIST>::mainSubmatrixProcess() {
-  Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_);
-
-  for (int s = 0; s < 2; ++s) {
-    Gamma_inv_[s].resizeNoCopy(0);
-    gamma_[s].clear();
-
-    move_indices_[s].clear();
-    insertion_list_[s].clear();
-    insertion_Gamma_indices_[s].clear();
-  }
-
-  std::vector<int> aux_spin_type, new_aux_spin_type, move_band;
-
-  acceptance_prob_ = 1.0;
-  for (int delay_ind = 0; delay_ind < delayed_moves_.size(); ++delay_ind) {
-    current_move_ = &delayed_moves_[delay_ind];
-    const auto move_type = current_move_->move_type;
-
-    det_ratio_ = 1.;
-    std::array<int, 2> indices_array;
-
-    if (move_type == INSERTION) {
-      indices_array = current_move_->indices;
-    }
-    else {  // move_type == REMOVAL
-      indices_array = configuration_.randomRemovalCandidate(delayed_moves_[delay_ind].removal_rng);
-    }
-
-    index_.clear();
-    for (auto idx : indices_array) {
-      if (idx >= 0)
-        index_.push_back(idx);
-    }
-
-    // This leads to a bunch of complex branchy and premature looking optimization
-    // The branch predictor must be weeping
-    bool at_least_one_recently_added = false;
-    bool all_recently_added = false;
-    if (move_type == REMOVAL) {
-      // Check if the vertex to remove was inserted during the current submatrix update.
-      const auto recently_added = [=](int id) { return id >= config_size_init_; };
-      all_recently_added = math::util::all(index_, recently_added);
-      at_least_one_recently_added = math::util::any(index_, recently_added);
-    }
+void CtintWalkerSubmatrixCpu<Parameters,DIST>::markThermalized() {
+  thermalized_ = true;
 
-    for (int s = 0; s < 2; ++s) {
-      Gamma_indices_[s].clear();
-      new_aux_spin_type.clear();
-      aux_spin_type.clear();
-      move_band.clear();
+  nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean()));
+  thermalization_steps_ = n_steps_;
 
-      findSectorIndices(s);
+  order_avg_.reset();
+  sign_avg_.reset();
+  n_accepted_ = 0;
 
-      if (move_type == INSERTION) {
-        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
-          new_aux_spin_type.push_back(
-              configuration_.getSector(s).getAuxFieldType(sector_indices_[s][ind]));
-          aux_spin_type.push_back(0);
-        }
-      }
-      else if (move_type == REMOVAL) {
-        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
-          new_aux_spin_type.push_back(0);
-          aux_spin_type.push_back(
-              configuration_.getSector(s).getAuxFieldType(sector_indices_[s][ind]));
-
-          if (recentlyAdded(ind, s)) {
-            // Find pre-existing Gamma_indices_.
-            insertion_list_it_ = std::find(insertion_list_[s].begin(), insertion_list_[s].end(),
-                                           sector_indices_[s][ind]);
-            Gamma_indices_[s].push_back(insertion_Gamma_indices_[s][std::distance(
-                insertion_list_[s].begin(), insertion_list_it_)]);
-          }
-        }
-      }  // endif removal
-      for (int index : sector_indices_[s])
-        move_band.push_back(configuration_.getSector(s).getLeftB(index));
-
-      for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
-        if (move_type == INSERTION || !recentlyAdded(ind, s))
-          gamma_[s].push_back(
-              gamma_values_[std::make_pair(aux_spin_type[ind], new_aux_spin_type[ind])][move_band[ind]]);
-        else {
-          // TODO: find instead of adding.
-          gamma_[s].push_back(
-              gamma_values_[std::make_pair(new_aux_spin_type[ind], aux_spin_type[ind])][move_band[ind]]);
-        }
-      }
-
-      if (!at_least_one_recently_added)
-        computeInsertionMatrices(sector_indices_[s], s);
-      else {
-        if (all_recently_added)
-          computeRemovalMatrix(s);
-        else
-          computeMixedInsertionAndRemoval(s);
-      }
-
-      det_ratio_ *= details::smallDeterminant(s_[s]);
-    }  // s loop.
-
-    if (n_ == 0 && move_type == REMOVAL)
-      continue;
-
-    // Compute acceptance probability.
-    auto [acceptance_prob, mc_weight_ratio] = computeAcceptanceProbability();
-    acceptance_prob_ = acceptance_prob;
-
-    // Note: acceptance and rejection can be forced for testing with the appropriate "acceptance_rng".
-    const bool accepted =
-        delayed_moves_[delay_ind].acceptance_rng < std::min(std::abs(acceptance_prob_), Real(1.));
-
-    // NB: recomputeGammaInv is just a inefficient alternative to updateGammaInv. Only for testing
-    // or debbuging.
-    // recomputeGammaInv();
-
-    // Update
-    if (accepted) {
-      ++BaseClass::n_accepted_;
-
-      BaseClass::mc_log_weight_ += std::log(std::abs(mc_weight_ratio));
-
-      // Are we capturing the avg sign properly wrt multiple delayed moves
-      phase_.multiply(acceptance_prob_);
-
-      // Update GammaInv if necessary.
-      if (!at_least_one_recently_added)
-        for (int s = 0; s < 2; ++s)
-          updateGammaInv(s);
-      else {
-        removeRowAndColOfGammaInv();
-        for (int s = 0; s < 2; ++s) {
-          for (int ind = sector_indices_[s].size() - 1; ind >= 0; --ind) {
-            if (!recentlyAdded(ind, s))
-              continue;
-            insertion_list_[s].erase(std::remove(insertion_list_[s].begin(),
-                                                 insertion_list_[s].end(), sector_indices_[s][ind]),
-                                     insertion_list_[s].end());
-          }
-          for (int ind = Gamma_indices_[s].size() - 1; ind >= 0; --ind) {
-            insertion_Gamma_indices_[s].erase(std::remove(insertion_Gamma_indices_[s].begin(),
-                                                          insertion_Gamma_indices_[s].end(),
-                                                          Gamma_indices_[s][ind]),
-                                              insertion_Gamma_indices_[s].end());
-            gamma_[s].erase(gamma_[s].begin() + Gamma_indices_[s][ind]);
-
-            for (int i = 0; i < insertion_Gamma_indices_[s].size(); ++i) {
-              if (insertion_Gamma_indices_[s][i] > Gamma_indices_[s][ind])
-                --insertion_Gamma_indices_[s][i];
-            }
-          }
-        }
-      }
-
-      for (int s = 0; s < 2; ++s) {
-        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
-          if (move_type == INSERTION || !recentlyAdded(ind, s)) {
-            move_indices_[s].push_back(sector_indices_[s][ind]);
-          }
-          else {
-            gamma_[s].pop_back();
-            move_indices_[s].erase(std::remove(move_indices_[s].begin(), move_indices_[s].end(),
-                                               sector_indices_[s][ind]),
-                                   move_indices_[s].end());
-          }
-        }
-      }
-
-      if (move_type == INSERTION) {
-        n_ += index_.size();
-        for (auto idx : index_)
-          configuration_.commitInsertion(idx);
-
-        for (int s = 0; s < 2; ++s) {
-          for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
-            insertion_list_[s].push_back(sector_indices_[s][ind]);
-            insertion_Gamma_indices_[s].push_back(Gamma_inv_[s].nrRows() - nbr_of_indices_[s] + ind);
-          }
-        }
-
-        // TODO: cleanup
-        for (int idx : index_)
-          conf_removal_list_.erase(
-              std::remove(conf_removal_list_.begin(), conf_removal_list_.end(), idx),
-              conf_removal_list_.end());
-      }
-      else if (move_type == REMOVAL) {
-        n_ -= index_.size();
-        for (auto idx : index_)
-          configuration_.markForRemoval(idx);
-
-        // TODO: cleanup.
-        for (int idx : index_)
-          conf_removal_list_.push_back(idx);
-      }
-    }
-    else {  // The move is rejected:
-      for (int s = 0; s < 2; ++s) {
-        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind)
-          gamma_[s].pop_back();
-      }
-      if (at_least_one_recently_added && !all_recently_added)
-        for (int s = 0; s < 2; ++s) {
-          Gamma_inv_[s].swap(Gamma_inv_cpy_[s]);
-        }
-    }
-  }
-
-  // This seems correct in that we count all steps
-  // just as is done for the non submatrix cpu version
-  BaseClass::n_steps_ += delayed_moves_.size();
-}
-
-template <class Parameters, DistType DIST>
-Move CtintWalkerSubmatrixCpu<Parameters, DIST>::generateMoveType() {
-  if (rng_() <= 0.5)
-    return INSERTION;
-  else
-    return REMOVAL;
+  // Recompute the Monte Carlo weight.
+  setMFromConfig();
+#ifndef NDEBUG
+  //writeAlphas();
+#endif
 }
 
 // Extend M by adding non-interacting vertices.
@@ -589,7 +242,7 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeMInit() {
       Scalar f_j;
       D_.resize(std::make_pair(delta, n_init_[s]));
 
-      d_builder_ptr_->computeG0(D_, configuration_.getSector(s), n_init_[s], n_max_[s], 0);
+      d_matrix_builder_.computeG0(D_, configuration_.getSector(s), n_init_[s], n_max_[s], 0);
 
       for (int j = 0; j < n_init_[s]; ++j) {
         const auto field_type = configuration_.getSector(s).getAuxFieldType(j);
@@ -646,7 +299,7 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeGInit() {
     }
 
     if (delta > 0) {
-      d_builder_ptr_->computeG0(G0, configuration_.getSector(s), n_init_[s], n_max_[s], 1);
+      d_matrix_builder_.computeG0(G0, configuration_.getSector(s), n_init_[s], n_max_[s], 1);
 
       MatrixView G(G_[s], 0, n_init_[s], n_max_[s], delta);
 
@@ -656,76 +309,6 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeGInit() {
   }
 }
 
-template <class Parameters, DistType DIST>
-auto CtintWalkerSubmatrixCpu<Parameters, DIST>::computeAcceptanceProbability() {
-  Scalar acceptance_probability = det_ratio_;
-
-  Scalar gamma_factor = the_one_;
-  const auto move_type = current_move_->move_type;
-  for (int s = 0; s < 2; ++s) {
-    if (!sector_indices_[s].size())
-      continue;
-
-    for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
-      if (move_type == INSERTION || !recentlyAdded(ind, s))
-        gamma_factor *= gamma_[s].at(gamma_[s].size() - nbr_of_indices_[s] + ind);
-      else
-        gamma_factor /= gamma_[s].at(gamma_[s].size() - nbr_of_indices_[s] + ind);
-    }
-  }
-  acceptance_probability *= gamma_factor;
-
-  const int delta_vertices = index_.size();
-  const int interaction_sign = configuration_.getSign(index_[0]);
-  const int non_empty_sector = sector_indices_[0].size() ? 0 : 1;
-
-  Scalar mc_weight_ratio = acceptance_probability;
-  Real K = total_interaction_;
-
-  for (int v_id = 0; v_id < delta_vertices; ++v_id) {
-    const auto field_type = configuration_.getSector(non_empty_sector)
-                                .getAuxFieldType(sector_indices_[non_empty_sector][v_id]);
-    const auto b =
-        configuration_.getSector(non_empty_sector).getLeftB(sector_indices_[non_empty_sector][v_id]);
-    K *= beta_ * prob_const_[field_type][b] * interaction_sign;
-
-    const Scalar weight_term = prob_const_[field_type][b] * configuration_.getStrength(index_[v_id]);
-    if (move_type == INSERTION)
-      mc_weight_ratio *= weight_term;
-    else
-      mc_weight_ratio /= weight_term;
-  }
-
-  // Account for combinatorial factor and update acceptance probability.
-  if (move_type == INSERTION) {
-    if (delta_vertices == 1) {
-      acceptance_probability *= K / (n_ + 1);
-    }
-    else if (delta_vertices == 2) {
-      const Real possible_partners = configuration_.possiblePartners(index_[0]);
-      const Real combinatorial_factor =
-          (n_ + 2) * (configuration_.nPartners(index_[0]) + 1) / possible_partners;
-      acceptance_probability *= configuration_.getStrength(index_[1]) * K / combinatorial_factor;
-    }
-    else
-      throw(std::logic_error("Not implemented"));
-  }
-  else if (move_type == REMOVAL) {
-    if (delta_vertices == 1) {
-      acceptance_probability *= n_ / K;
-    }
-    else if (delta_vertices == 2) {
-      const Real possible_partners = configuration_.possiblePartners(index_[0]);
-      const Real combinatorial_factor = n_ * configuration_.nPartners(index_[0]) / possible_partners;
-      acceptance_probability *= combinatorial_factor / (configuration_.getStrength(index_[1]) * K);
-    }
-    else
-      throw(std::logic_error("Not implemented"));
-  }
-
-  return std::make_tuple(acceptance_probability, mc_weight_ratio);
-}
-
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixCpu<Parameters, DIST>::updateGammaInv(int s) {
   Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_);
@@ -997,7 +580,7 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeMixedInsertionAndRemoval(
   // TODO: avoid reallocation.
   std::vector<int> insertion_indices;
   for (int i = 0; i < nbr_of_indices_[s]; ++i)
-    if (!recentlyAdded(i, s))
+    if (!SubmatrixBase::recentlyAdded(i, s))
       insertion_indices.push_back(sector_indices_[s][i]);
   assert(Gamma_indices_[s].size() + insertion_indices.size() == sector_indices_.size());
 
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp
index 885bcebe4..e098599e6 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_gpu_submatrix.hpp
@@ -25,6 +25,7 @@
 #include <vector>
 
 #include "dca/linalg/util/gpu_event.hpp"
+#include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/structs/device_configuration_manager.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/walker/kernels_interface.hpp"
@@ -35,11 +36,11 @@ namespace solver {
 namespace ctint {
 
 template <class Parameters, DistType DIST>
-class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixCpu<Parameters, DIST> {
+class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixBase<Parameters, DIST> {
 public:
   using this_type = CtintWalkerSubmatrixGpu<Parameters, DIST>;
-  using BaseClass = CtintWalkerSubmatrixCpu<Parameters, DIST>;
-  using RootClass = CtintWalkerBase<Parameters,  DIST>;
+  using SubmatrixBase = CtintWalkerSubmatrixBase<Parameters, DIST>;
+  using BaseClass = CtintWalkerBase<Parameters, DIST>;
 
   using typename BaseClass::Data;
   using typename BaseClass::Profiler;
@@ -48,6 +49,8 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixCpu<Parameters, DIST>
   using typename BaseClass::Real;
   using typename BaseClass::Scalar;
 
+  using Resource = DMatrixBuilder<linalg::GPU, Scalar>;
+
   template <linalg::DeviceType dev>
   using MatrixView = linalg::MatrixView<Scalar, dev>;
   template <linalg::DeviceType dev>
@@ -58,7 +61,7 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixCpu<Parameters, DIST>
   constexpr static linalg::DeviceType device = linalg::GPU;
 
   CtintWalkerSubmatrixGpu(const Parameters& pars_ref, const Data& /*data_ref*/, Rng& rng_ref,
-                          int id = 0);
+                          DMatrixBuilder<linalg::GPU, Scalar>& d_matrix_builder, int id = 0);
 
   void computeM(MatrixPair<linalg::GPU>& m_accum);
 
@@ -67,16 +70,15 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixCpu<Parameters, DIST>
   void synchronize() const override;
 
   using BaseClass::order;
-  using RootClass::get_stream;
+  using BaseClass::get_stream;
 
   std::size_t deviceFingerprint() const override;
 
+  void setMFromConfig() override;
 protected:
   // For testing purposes:
   void doStep(int n_moves_to_delay);
 
-  void setMFromConfig() override;
-
 private:
   void doStep() override;
 
@@ -89,27 +91,38 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixCpu<Parameters, DIST>
 protected:
   using BaseClass::configuration_;
   using BaseClass::M_;
+  using BaseClass::sweeps_per_meas_;
+  using BaseClass::partial_order_avg_;
+  using BaseClass::thermalization_steps_;
+  using BaseClass::order_avg_;
+  using BaseClass::sign_avg_;
+  using BaseClass::n_steps_;
+  using BaseClass::mc_log_weight_;
+  using BaseClass::n_accepted_;
+
 
+  
 private:
   using BaseClass::concurrency_;
   using BaseClass::thread_id_;
-  using BaseClass::removal_list_;
-  using BaseClass::source_list_;
-  using BaseClass::conf_removal_list_;
-  using RootClass::d_builder_ptr_;
-  using BaseClass::parameters_;
-  using BaseClass::rng_;
-  using BaseClass::Gamma_inv_;
-  using BaseClass::f_;
+  using SubmatrixBase::removal_list_;
+  using SubmatrixBase::source_list_;
+  using SubmatrixBase::conf_removal_list_;
+  using SubmatrixBase::parameters_;
+  using SubmatrixBase::rng_;
+  using SubmatrixBase::Gamma_inv_;
+  using SubmatrixBase::gamma_values_;
+  using SubmatrixBase::f_;
   // Initial and current sector sizes.
-  using BaseClass::n_init_;
+  using SubmatrixBase::n_init_;
   // Maximal sector size after submatrix update.
-  using BaseClass::n_max_;
-  using BaseClass::gamma_;
-  using BaseClass::G_;
-  using BaseClass::move_indices_;
-  using BaseClass::flop_;
-
+  using SubmatrixBase::n_max_;
+  using SubmatrixBase::n_bands_;
+  using SubmatrixBase::gamma_;
+  using SubmatrixBase::G_;
+  using SubmatrixBase::move_indices_;
+  using SubmatrixBase::flop_;
+  using SubmatrixBase::prob_const_;
   DeviceConfigurationManager device_config_;
 
   std::array<linalg::Vector<int, linalg::GPU>, 2> removal_list_dev_;
@@ -129,19 +142,37 @@ class CtintWalkerSubmatrixGpu : public CtintWalkerSubmatrixCpu<Parameters, DIST>
   std::array<linalg::Vector<std::pair<int, Real>, linalg::GPU>, 2> gamma_index_dev_;
 
   std::array<linalg::util::GpuEvent, 2> config_copied_;
+
+  DMatrixBuilder<linalg::GPU, Scalar>& d_matrix_builder_;
 };
 
 template <class Parameters, DistType DIST>
 CtintWalkerSubmatrixGpu<Parameters, DIST>::CtintWalkerSubmatrixGpu(
-    const Parameters& pars_ref, const Data& data, Rng& rng_ref, int id)
-    : BaseClass(pars_ref, data, rng_ref, id) {
+    const Parameters& pars_ref, const Data& data, Rng& rng_ref,
+    DMatrixBuilder<linalg::GPU, Scalar>& d_matrix_builder, int id)
+    : SubmatrixBase(pars_ref, data, rng_ref, id), d_matrix_builder_(d_matrix_builder) {
+  for (int b = 0; b < n_bands_; ++b) {
+    for (int i = 1; i <= 3; ++i) {
+      f_[i][b] = d_matrix_builder_.computeF(i, b);
+      f_[-i][b] = d_matrix_builder_.computeF(-i, b);
+
+      gamma_values_[std::make_pair(0, i)][b] = d_matrix_builder_.computeGamma(0, i, b);
+      gamma_values_[std::make_pair(0, -i)][b] = d_matrix_builder_.computeGamma(0, -i, b);
+      gamma_values_[std::make_pair(i, 0)][b] = d_matrix_builder_.computeGamma(i, 0, b);
+      gamma_values_[std::make_pair(-i, 0)][b] = d_matrix_builder_.computeGamma(-i, 0, b);
+
+      prob_const_[i][b] = prob_const_[-i][b] = -1. / (f_[i][b] - 1) / (f_[-i][b] - 1);
+    }
+    f_[0][b] = 1;
+  }
+
   if (concurrency_.id() == concurrency_.first() && thread_id_ == 0)
     std::cout << "\nCT-INT submatrix walker extended to GPU." << std::endl;
 }
-
+  
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::setMFromConfig() {
-  BaseClass::setMFromConfig();
+  BaseClass::setMFromConfigImpl(d_matrix_builder_);
   for (int s = 0; s < 2; ++s) {
     M_dev_[s].setAsync(M_[s], get_stream(s));
   }
@@ -159,26 +190,26 @@ template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::doSweep() {
   Profiler profiler(__FUNCTION__, "CT-INT GPU walker", __LINE__, thread_id_);
 
-  BaseClass::doSteps();
+  SubmatrixBase::doSteps();
   uploadConfiguration();
 }
 
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::doStep(const int n_moves_to_delay) {
-  BaseClass::nbr_of_moves_to_delay_ = n_moves_to_delay;
+  SubmatrixBase::nbr_of_moves_to_delay_ = n_moves_to_delay;
   doStep();
   uploadConfiguration();
 }
 
 template <class Parameters, DistType DIST>
 void CtintWalkerSubmatrixGpu<Parameters, DIST>::doStep() {
-  BaseClass::generateDelayedMoves(BaseClass::nbr_of_moves_to_delay_);
+  SubmatrixBase::generateDelayedMoves(SubmatrixBase::nbr_of_moves_to_delay_);
   uploadConfiguration();
 
   computeMInit();
   computeGInit();
   synchronize();
-  BaseClass::mainSubmatrixProcess();
+  SubmatrixBase::mainSubmatrixProcess();
   updateM();
 }
 
@@ -216,8 +247,8 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::computeMInit() {
     const int delta = n_max_[s] - n_init_[s];
     if (delta > 0) {
       D_dev_[s].resizeNoCopy(std::make_pair(delta, n_init_[s]));
-      d_builder_ptr_->computeG0(D_dev_[s], device_config_.getDeviceData(s), n_init_[s], false,
-                                get_stream(s));
+      d_matrix_builder_.computeG0(D_dev_[s], device_config_.getDeviceData(s), n_init_[s], false,
+                                 get_stream(s));
       flop_ += D_dev_[s].nrCols() * D_dev_[s].nrRows() * 10;
 
       MatrixView<linalg::GPU> D_view(D_dev_[s]);
@@ -249,12 +280,12 @@ void CtintWalkerSubmatrixGpu<Parameters, DIST>::computeGInit() {
     const MatrixView<linalg::GPU> M(M_dev_[s]);
     details::computeGLeft(G, M, f_dev.ptr(), n_init_[s], get_stream(s));
     flop_ += n_init_[s] * n_max_[s] * 2;
-    
+
     if (delta > 0) {
       G0_dev_[s].resizeNoCopy(std::make_pair(n_max_[s], delta));
       // This doesn't do flops but the g0 interp data it uses does somewhere.
-      d_builder_ptr_->computeG0(G0_dev_[s], device_config_.getDeviceData(s), n_init_[s], true,
-                                get_stream(s));
+      d_matrix_builder_.computeG0(G0_dev_[s], device_config_.getDeviceData(s), n_init_[s], true,
+                                 get_stream(s));
       flop_ += G0_dev_[s].nrCols() * G0_dev_[s].nrRows() * 10;
       MatrixView<linalg::GPU> G(G_dev_[s], 0, n_init_[s], n_max_[s], delta);
       // compute G right.
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp
new file mode 100644
index 000000000..3b2901aa6
--- /dev/null
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_submatrix_base.hpp
@@ -0,0 +1,868 @@
+#ifndef DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_CTINT_WALKER_CTINT_WALKER_SUBMATRIX_BASE_HPP
+#define DCA_PHYS_DCA_STEP_CLUSTER_SOLVER_CTINT_WALKER_CTINT_WALKER_SUBMATRIX_BASE_HPP
+
+#include "dca/phys/dca_step/cluster_solver/ctint/walker/ctint_walker_base.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <map>
+#include <numeric>
+#include <stdexcept>
+#include <vector>
+
+#include <ctime>
+
+#include "dca/linalg/linalg.hpp"
+#include "dca/math/util/vector_operations.hpp"
+#include "dca/phys/dca_step/cluster_solver/ctint/structs/solver_configuration.hpp"
+#include "dca/phys/dca_step/cluster_solver/ctint/walker/move.hpp"
+#include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp"
+#include "dca/phys/dca_step/cluster_solver/shared_tools/interpolation/g0_interpolation.hpp"
+#include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/walker_methods.hpp"
+#include "dca/phys/parameters/parameters.hpp"
+#include "dca/util/integer_division.hpp"
+
+namespace dca {
+namespace phys {
+namespace solver {
+namespace ctint {
+
+template <class Parameters, DistType DIST>
+class CtintWalkerSubmatrixBase : public CtintWalkerBase<Parameters, DIST> {
+public:
+  using this_type = CtintWalkerSubmatrixBase;
+  using BaseClass = CtintWalkerBase<Parameters, DIST>;
+  using typename BaseClass::Rng;
+  using typename BaseClass::Data;
+  using typename BaseClass::Profiler;
+  using typename BaseClass::GpuStream;
+  using typename BaseClass::Real;
+  using typename BaseClass::Scalar;
+
+  CtintWalkerSubmatrixBase(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, int id = 0);
+
+  virtual ~CtintWalkerSubmatrixBase() = default;
+
+  void doSweep() override;
+
+  void computeM(typename BaseClass::MatrixPair& m_accum);
+
+  using BaseClass::order;
+
+  virtual void setMFromConfig() = 0;
+protected:
+  virtual void doStep() override;
+  void doSteps();
+  void generateDelayedMoves(int nbr_of_movesto_delay);
+
+  /** This does a bunch of things, this is the majority of a step
+   *  For each delayed_move it:
+   *  * computes the acceptance prob
+   *  * compares std::abs(acceptance prob) and random value
+   *    * Does an update of the log_weight for a configuration
+   *        mc_weight_ratio = acceptance_probability;
+   *      For a new vertex
+   *        mc_weight ratio *= weight_term
+   *      For a removed vertes
+   *        mc_weight_ratio /= weight_term
+   *      with
+   *        weight_term = prob_const_[field_type][b] * the vertex interaction strength;
+   *        BaseClass::mc_log_weight_ += std::log(std::abs(mc_weight_ratio));
+   */
+  void mainSubmatrixProcess();
+
+  void markThermalized() override;
+
+  void transformM();
+
+  // For testing purposes.
+  virtual void doStep(const int nbr_of_movesto_delay);
+
+  virtual void updateM() = 0;
+
+private:
+
+  void doSubmatrixUpdate();
+
+  /** returns [acceptance_probability , mc_weight_ratio ]
+   */
+  auto computeAcceptanceProbability();
+
+  void updateGammaInv(int s);
+
+  void removeRowAndColOfGammaInv();
+
+  virtual void computeMInit() = 0;
+
+  //  void computeG0Init();
+  virtual void computeGInit() = 0;
+
+  Move generateMoveType();
+
+  void computeInsertionMatrices(const std::vector<int>& indices, const int s);
+
+  void computeRemovalMatrix(int s);
+
+  void computeMixedInsertionAndRemoval(int s);
+
+  void findSectorIndices(const int s);
+
+  void recomputeGammaInv();
+
+  bool recentlyAdded(int move_idx, int s) const {
+    assert(move_idx < sector_indices_[s].size());
+    return sector_indices_[s][move_idx] >= n_init_[s];
+  }
+
+protected:
+  using MatrixView = linalg::MatrixView<Scalar, linalg::CPU>;
+  using Matrix = linalg::Matrix<Scalar, linalg::CPU>;
+
+  using BaseClass::parameters_;
+  using BaseClass::configuration_;
+  using BaseClass::rng_;
+  using BaseClass::thread_id_;
+  using BaseClass::total_interaction_;
+  using BaseClass::phase_;
+  using BaseClass::M_;
+  using BaseClass::n_bands_;
+  using BaseClass::beta_;
+
+  using BaseClass::thermalized_;
+  using BaseClass::sweeps_per_meas_;
+  using BaseClass::partial_order_avg_;
+  using BaseClass::thermalization_steps_;
+  using BaseClass::order_avg_;
+  using BaseClass::sign_avg_;
+  using BaseClass::n_steps_;
+  using BaseClass::mc_log_weight_;
+  using BaseClass::n_accepted_;
+
+protected:
+  int max_submatrix_size_;
+
+  struct DelayedMoveType {
+    Move move_type;
+    std::array<double, 3> removal_rng{1., 1., 1.};
+    Real acceptance_rng;
+    std::array<int, 2> indices{-1, -1};
+  };
+
+protected:
+  using BaseClass::acceptance_prob_;
+
+protected:
+  static constexpr Scalar the_one_ = dca::util::TheOne<Scalar>::value;
+
+  std::vector<DelayedMoveType> delayed_moves_;
+
+  using MatrixPair = std::array<linalg::Matrix<Scalar, linalg::CPU>, 2>;
+  MatrixPair G_;
+  MatrixPair G0_;
+  MatrixPair Gamma_inv_;      // TODO: don't pin
+  MatrixPair Gamma_inv_cpy_;  // TODO: don't pin
+  MatrixPair q_;              // TODO: don't pin or store in Gamma inv
+  MatrixPair r_;
+  MatrixPair s_;
+  std::array<std::vector<Real>, 2> gamma_;
+
+  Scalar det_ratio_;
+  std::map<int, std::array<Real, n_bands_>> f_;
+  std::map<int, std::array<Real, n_bands_>> prob_const_;
+  std::map<std::pair<int, int>, std::array<Real, n_bands_>> gamma_values_;
+
+  using BaseClass::nb_steps_per_sweep_;
+  int nbr_of_steps_;
+  int nbr_of_moves_to_delay_;
+  int max_nbr_of_moves;
+
+  std::array<int, 2> Gamma_size_;
+  std::array<std::vector<int>, 2> Gamma_indices_;
+  std::array<std::vector<int>, 2> sector_indices_;
+
+  const DelayedMoveType* current_move_;
+
+  std::array<unsigned, 2> nbr_of_indices_;
+
+  std::vector<int> index_;
+
+  // Initial configuration size.
+  unsigned config_size_init_;
+
+  // Initial and current sector sizes.
+  std::array<unsigned, 2> n_init_;
+  unsigned n_;
+
+  // Maximal sector size after submatrix update.
+
+  std::array<unsigned, 2> n_max_;
+
+  std::array<linalg::util::HostVector<int>, 2> move_indices_;
+  std::array<linalg::util::HostVector<int>, 2> removal_list_;
+  std::array<linalg::util::HostVector<int>, 2> source_list_;
+  std::array<std::vector<int>, 2> insertion_list_;
+  std::array<std::vector<int>, 2> insertion_Gamma_indices_;
+
+  std::vector<int> conf_removal_list_;
+
+  std::vector<int>::iterator insertion_list_it_;
+
+  std::array<Matrix, 2> Gamma_q_;
+  Matrix workspace_;
+  Matrix D_;
+
+  using BaseClass::flop_;
+};
+
+template <class Parameters, DistType DIST>
+CtintWalkerSubmatrixBase<Parameters, DIST>::CtintWalkerSubmatrixBase(const Parameters& parameters_ref,
+                                                                   const Data& /*data*/,
+                                                                   Rng& rng_ref,
+								   int id)
+  : BaseClass(parameters_ref, rng_ref, id) {
+  if (BaseClass::concurrency_.id() == BaseClass::concurrency_.first() && thread_id_ == 0)
+    std::cout << "\nCT-INT submatrix walker created." << std::endl;
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters,DIST>::markThermalized() {
+  thermalized_ = true;
+
+  nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean()));
+  thermalization_steps_ = n_steps_;
+
+  order_avg_.reset();
+  sign_avg_.reset();
+  n_accepted_ = 0;
+
+  // Recompute the Monte Carlo weight.
+  setMFromConfig();
+#ifndef NDEBUG
+  //writeAlphas();
+#endif
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::doSweep() {
+  Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_);
+  doSteps();
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::doSteps() {
+  // Here nbr_of_steps is the number of single steps/moves during the current sweep,
+  // while nbr_of_submatrix_steps is the number of times the entire submatrix algorithm  is run.
+
+  if (nb_steps_per_sweep_ < 0)  // Not thermalized or fixed.
+    nbr_of_steps_ = BaseClass::avgOrder() + 1;
+  else
+    nbr_of_steps_ = nb_steps_per_sweep_;
+
+  // Get the maximum of Monte Carlo steps/moves that can be performed during one submatrix step.
+  max_nbr_of_moves = parameters_.getMaxSubmatrixSize();
+
+  BaseClass::n_steps_ += nbr_of_steps_;
+  while (nbr_of_steps_ > 0) {
+    nbr_of_moves_to_delay_ = std::min(nbr_of_steps_, max_nbr_of_moves);
+    nbr_of_steps_ -= nbr_of_moves_to_delay_;
+
+    doStep();
+  }
+
+  BaseClass::updateSweepAverages();
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::doStep() {
+  generateDelayedMoves(nbr_of_moves_to_delay_);
+  doSubmatrixUpdate();
+}
+
+// Do one step with arbitrary number of moves. For testing.
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::doStep(const int nbr_of_movesto_delay) {
+  std::cout << "\nStarted doStep() function for testing." << std::endl;
+
+  generateDelayedMoves(nbr_of_movesto_delay);
+
+  std::cout << "\nGenerated " << nbr_of_movesto_delay << " moves for testing.\n" << std::endl;
+
+  doSubmatrixUpdate();
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::generateDelayedMoves(const int nbr_of_movesto_delay) {
+  assert(nbr_of_movesto_delay > 0);
+
+  delayed_moves_.clear();
+
+  for (int s = 0; s < 2; ++s) {
+    n_init_[s] = configuration_.getSector(s).size();
+  }
+  n_ = config_size_init_ = configuration_.size();
+
+  // Generate delayed moves.
+  for (int i = 0; i < nbr_of_movesto_delay; ++i) {
+    DelayedMoveType delayed_move;
+
+    delayed_move.move_type = generateMoveType();
+
+    switch (delayed_move.move_type) {
+      case REMOVAL:
+        if (configuration_.getDoubleUpdateProb())
+          delayed_move.removal_rng = {rng_(), rng_(), rng_()};
+        else
+          delayed_move.removal_rng[0] = rng_();
+        break;
+
+      case INSERTION:
+        configuration_.insertRandom(rng_);
+        for (int i = 0; i < configuration_.lastInsertionSize(); ++i)
+          delayed_move.indices[i] = configuration_.size() - configuration_.lastInsertionSize() + i;
+        break;
+
+      default:
+        throw(std::logic_error("Unknown move type encountered."));
+    }
+
+    delayed_move.acceptance_rng = rng_();
+    delayed_moves_.push_back(delayed_move);
+  }
+
+  for (int s = 0; s < 2; ++s) {
+    n_max_[s] = configuration_.getSector(s).size();
+  }
+
+  const int config_size_final = configuration_.size();
+  conf_removal_list_.resize(config_size_final - config_size_init_);
+  std::iota(conf_removal_list_.begin(), conf_removal_list_.end(), config_size_init_);
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::doSubmatrixUpdate() {
+  computeMInit();
+  computeGInit();
+  mainSubmatrixProcess();
+  updateM();
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::mainSubmatrixProcess() {
+  Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_);
+
+  for (int s = 0; s < 2; ++s) {
+    Gamma_inv_[s].resizeNoCopy(0);
+    gamma_[s].clear();
+
+    move_indices_[s].clear();
+    insertion_list_[s].clear();
+    insertion_Gamma_indices_[s].clear();
+  }
+
+  std::vector<int> aux_spin_type, new_aux_spin_type, move_band;
+
+  acceptance_prob_ = 1.0;
+  for (int delay_ind = 0; delay_ind < delayed_moves_.size(); ++delay_ind) {
+    current_move_ = &delayed_moves_[delay_ind];
+    const auto move_type = current_move_->move_type;
+
+    det_ratio_ = 1.;
+    std::array<int, 2> indices_array;
+
+    if (move_type == INSERTION) {
+      indices_array = current_move_->indices;
+    }
+    else {  // move_type == REMOVAL
+      indices_array = configuration_.randomRemovalCandidate(delayed_moves_[delay_ind].removal_rng);
+    }
+
+    index_.clear();
+    for (auto idx : indices_array) {
+      if (idx >= 0)
+        index_.push_back(idx);
+    }
+
+    // This leads to a bunch of complex branchy and premature looking optimization
+    // The branch predictor must be weeping
+    bool at_least_one_recently_added = false;
+    bool all_recently_added = false;
+    if (move_type == REMOVAL) {
+      // Check if the vertex to remove was inserted during the current submatrix update.
+      const auto recently_added = [=](int id) { return id >= config_size_init_; };
+      all_recently_added = math::util::all(index_, recently_added);
+      at_least_one_recently_added = math::util::any(index_, recently_added);
+    }
+
+    for (int s = 0; s < 2; ++s) {
+      Gamma_indices_[s].clear();
+      new_aux_spin_type.clear();
+      aux_spin_type.clear();
+      move_band.clear();
+
+      findSectorIndices(s);
+
+      if (move_type == INSERTION) {
+        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+          new_aux_spin_type.push_back(
+              configuration_.getSector(s).getAuxFieldType(sector_indices_[s][ind]));
+          aux_spin_type.push_back(0);
+        }
+      }
+      else if (move_type == REMOVAL) {
+        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+          new_aux_spin_type.push_back(0);
+          aux_spin_type.push_back(
+              configuration_.getSector(s).getAuxFieldType(sector_indices_[s][ind]));
+
+          if (recentlyAdded(ind, s)) {
+            // Find pre-existing Gamma_indices_.
+            insertion_list_it_ = std::find(insertion_list_[s].begin(), insertion_list_[s].end(),
+                                           sector_indices_[s][ind]);
+            Gamma_indices_[s].push_back(insertion_Gamma_indices_[s][std::distance(
+                insertion_list_[s].begin(), insertion_list_it_)]);
+          }
+        }
+      }  // endif removal
+      for (int index : sector_indices_[s])
+        move_band.push_back(configuration_.getSector(s).getLeftB(index));
+
+      for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+        if (move_type == INSERTION || !recentlyAdded(ind, s))
+          gamma_[s].push_back(
+              gamma_values_[std::make_pair(aux_spin_type[ind], new_aux_spin_type[ind])][move_band[ind]]);
+        else {
+          // TODO: find instead of adding.
+          gamma_[s].push_back(
+              gamma_values_[std::make_pair(new_aux_spin_type[ind], aux_spin_type[ind])][move_band[ind]]);
+        }
+      }
+
+      if (!at_least_one_recently_added)
+        computeInsertionMatrices(sector_indices_[s], s);
+      else {
+        if (all_recently_added)
+          computeRemovalMatrix(s);
+        else
+          computeMixedInsertionAndRemoval(s);
+      }
+
+      det_ratio_ *= details::smallDeterminant(s_[s]);
+    }  // s loop.
+
+    if (n_ == 0 && move_type == REMOVAL)
+      continue;
+
+    // Compute acceptance probability.
+    auto [acceptance_prob, mc_weight_ratio] = computeAcceptanceProbability();
+    acceptance_prob_ = acceptance_prob;
+
+    // Note: acceptance and rejection can be forced for testing with the appropriate "acceptance_rng".
+    const bool accepted =
+        delayed_moves_[delay_ind].acceptance_rng < std::min(std::abs(acceptance_prob_), Real(1.));
+
+    // NB: recomputeGammaInv is just a inefficient alternative to updateGammaInv. Only for testing
+    // or debbuging.
+    // recomputeGammaInv();
+
+    // Update
+    if (accepted) {
+      ++BaseClass::n_accepted_;
+
+      BaseClass::mc_log_weight_ += std::log(std::abs(mc_weight_ratio));
+
+      // Are we capturing the avg sign properly wrt multiple delayed moves
+      phase_.multiply(acceptance_prob_);
+
+      // Update GammaInv if necessary.
+      if (!at_least_one_recently_added)
+        for (int s = 0; s < 2; ++s)
+          updateGammaInv(s);
+      else {
+        removeRowAndColOfGammaInv();
+        for (int s = 0; s < 2; ++s) {
+          for (int ind = sector_indices_[s].size() - 1; ind >= 0; --ind) {
+            if (!recentlyAdded(ind, s))
+              continue;
+            insertion_list_[s].erase(std::remove(insertion_list_[s].begin(),
+                                                 insertion_list_[s].end(), sector_indices_[s][ind]),
+                                     insertion_list_[s].end());
+          }
+          for (int ind = Gamma_indices_[s].size() - 1; ind >= 0; --ind) {
+            insertion_Gamma_indices_[s].erase(std::remove(insertion_Gamma_indices_[s].begin(),
+                                                          insertion_Gamma_indices_[s].end(),
+                                                          Gamma_indices_[s][ind]),
+                                              insertion_Gamma_indices_[s].end());
+            gamma_[s].erase(gamma_[s].begin() + Gamma_indices_[s][ind]);
+
+            for (int i = 0; i < insertion_Gamma_indices_[s].size(); ++i) {
+              if (insertion_Gamma_indices_[s][i] > Gamma_indices_[s][ind])
+                --insertion_Gamma_indices_[s][i];
+            }
+          }
+        }
+      }
+
+      for (int s = 0; s < 2; ++s) {
+        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+          if (move_type == INSERTION || !recentlyAdded(ind, s)) {
+            move_indices_[s].push_back(sector_indices_[s][ind]);
+          }
+          else {
+            gamma_[s].pop_back();
+            move_indices_[s].erase(std::remove(move_indices_[s].begin(), move_indices_[s].end(),
+                                               sector_indices_[s][ind]),
+                                   move_indices_[s].end());
+          }
+        }
+      }
+
+      if (move_type == INSERTION) {
+        n_ += index_.size();
+        for (auto idx : index_)
+          configuration_.commitInsertion(idx);
+
+        for (int s = 0; s < 2; ++s) {
+          for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+            insertion_list_[s].push_back(sector_indices_[s][ind]);
+            insertion_Gamma_indices_[s].push_back(Gamma_inv_[s].nrRows() - nbr_of_indices_[s] + ind);
+          }
+        }
+
+        // TODO: cleanup
+        for (int idx : index_)
+          conf_removal_list_.erase(
+              std::remove(conf_removal_list_.begin(), conf_removal_list_.end(), idx),
+              conf_removal_list_.end());
+      }
+      else if (move_type == REMOVAL) {
+        n_ -= index_.size();
+        for (auto idx : index_)
+          configuration_.markForRemoval(idx);
+
+        // TODO: cleanup.
+        for (int idx : index_)
+          conf_removal_list_.push_back(idx);
+      }
+    }
+    else {  // The move is rejected:
+      for (int s = 0; s < 2; ++s) {
+        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind)
+          gamma_[s].pop_back();
+      }
+      if (at_least_one_recently_added && !all_recently_added)
+        for (int s = 0; s < 2; ++s) {
+          Gamma_inv_[s].swap(Gamma_inv_cpy_[s]);
+        }
+    }
+  }
+
+  // This seems correct in that we count all steps
+  // just as is done for the non submatrix cpu version
+  BaseClass::n_steps_ += delayed_moves_.size();
+}
+
+template <class Parameters, DistType DIST>
+Move CtintWalkerSubmatrixBase<Parameters, DIST>::generateMoveType() {
+  if (rng_() <= 0.5)
+    return INSERTION;
+  else
+    return REMOVAL;
+}
+
+template <class Parameters, DistType DIST>
+auto CtintWalkerSubmatrixBase<Parameters, DIST>::computeAcceptanceProbability() {
+  Scalar acceptance_probability = det_ratio_;
+
+  Scalar gamma_factor = the_one_;
+  const auto move_type = current_move_->move_type;
+  for (int s = 0; s < 2; ++s) {
+    if (!sector_indices_[s].size())
+      continue;
+
+    for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+      if (move_type == INSERTION || !recentlyAdded(ind, s))
+        gamma_factor *= gamma_[s].at(gamma_[s].size() - nbr_of_indices_[s] + ind);
+      else
+        gamma_factor /= gamma_[s].at(gamma_[s].size() - nbr_of_indices_[s] + ind);
+    }
+  }
+  acceptance_probability *= gamma_factor;
+
+  const int delta_vertices = index_.size();
+  const int interaction_sign = configuration_.getSign(index_[0]);
+  const int non_empty_sector = sector_indices_[0].size() ? 0 : 1;
+
+  Scalar mc_weight_ratio = acceptance_probability;
+  Real K = total_interaction_;
+
+  for (int v_id = 0; v_id < delta_vertices; ++v_id) {
+    const auto field_type = configuration_.getSector(non_empty_sector)
+                                .getAuxFieldType(sector_indices_[non_empty_sector][v_id]);
+    const auto b =
+        configuration_.getSector(non_empty_sector).getLeftB(sector_indices_[non_empty_sector][v_id]);
+    K *= beta_ * prob_const_[field_type][b] * interaction_sign;
+
+    const Scalar weight_term = prob_const_[field_type][b] * configuration_.getStrength(index_[v_id]);
+    if (move_type == INSERTION)
+      mc_weight_ratio *= weight_term;
+    else
+      mc_weight_ratio /= weight_term;
+  }
+
+  // Account for combinatorial factor and update acceptance probability.
+  if (move_type == INSERTION) {
+    if (delta_vertices == 1) {
+      acceptance_probability *= K / (n_ + 1);
+    }
+    else if (delta_vertices == 2) {
+      const Real possible_partners = configuration_.possiblePartners(index_[0]);
+      const Real combinatorial_factor =
+          (n_ + 2) * (configuration_.nPartners(index_[0]) + 1) / possible_partners;
+      acceptance_probability *= configuration_.getStrength(index_[1]) * K / combinatorial_factor;
+    }
+    else
+      throw(std::logic_error("Not implemented"));
+  }
+  else if (move_type == REMOVAL) {
+    if (delta_vertices == 1) {
+      acceptance_probability *= n_ / K;
+    }
+    else if (delta_vertices == 2) {
+      const Real possible_partners = configuration_.possiblePartners(index_[0]);
+      const Real combinatorial_factor = n_ * configuration_.nPartners(index_[0]) / possible_partners;
+      acceptance_probability *= combinatorial_factor / (configuration_.getStrength(index_[1]) * K);
+    }
+    else
+      throw(std::logic_error("Not implemented"));
+  }
+
+  return std::make_tuple(acceptance_probability, mc_weight_ratio);
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::updateGammaInv(int s) {
+  Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_);
+  const int delta = s_[s].nrRows();
+  if (delta == 0)
+    return;
+  const int old_size = Gamma_inv_[s].nrRows();
+  Gamma_inv_[s].resize(old_size + delta);
+
+  if (old_size > 0) {
+    MatrixView bulk(Gamma_inv_[s], 0, 0, old_size, old_size);
+    MatrixView q_inv(Gamma_inv_[s], 0, old_size, old_size, delta);
+    MatrixView r_inv(Gamma_inv_[s], old_size, 0, delta, old_size);
+    MatrixView s_inv(Gamma_inv_[s], old_size, old_size, delta, delta);
+
+    details::smallInverse(s_[s], s_inv);
+
+    auto& Gamma_q = Gamma_q_[s];
+    linalg::matrixop::gemm(Scalar(-1.), Gamma_q, s_inv, Scalar(0.), q_inv);
+
+    auto& r_Gamma = workspace_;
+    r_Gamma.resizeNoCopy(r_[s].size());
+    linalg::matrixop::gemm(r_[s], bulk, r_Gamma);
+    linalg::matrixop::gemm(Scalar(-1.), s_inv, r_Gamma, Scalar(0.), r_inv);
+
+    // Gamma_ += Gamma_ * q_ * s_^-1 * r_ * Gamma_
+    linalg::matrixop::gemm(Scalar(-1.), q_inv, r_Gamma, Scalar(1.), bulk);
+  }
+  else {
+    Gamma_inv_[s].resizeNoCopy(delta);
+    details::smallInverse(s_[s], Gamma_inv_[s]);
+  }
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::findSectorIndices(const int s) {
+  sector_indices_[s].clear();
+  for (auto index : index_) {
+    configuration_.findIndices(sector_indices_[s], index, s);
+  }
+
+  if (index_.size() > 1)
+    std::sort(sector_indices_[s].begin(), sector_indices_[s].end());
+  nbr_of_indices_[s] = sector_indices_[s].size();
+}
+
+// Remove row and column of Gamma_inv with Woodbury's formula.
+// Gamma <- Gamma - U.V => GammaInv <- GammaInv + GammaInv.U.(Id-V.GammaInv.U)^(-1).V.GammaInv.
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::removeRowAndColOfGammaInv() {
+  Profiler profiler(__FUNCTION__, "CT-INT walker", __LINE__, thread_id_);
+  for (int s = 0; s < 2; ++s) {
+    const int delta = Gamma_indices_[s].size();
+    if (delta == 0)
+      continue;
+    const int n_init = Gamma_inv_[s].nrRows();
+    const int n = n_init - delta;
+
+    if (n) {
+      q_[s].resizeNoCopy(std::make_pair(n, delta));
+      r_[s].resizeNoCopy(std::make_pair(delta, n));
+      auto& q_s = workspace_;
+      q_s.resizeNoCopy(q_[s].size());
+
+      //  TODO: check if gamma indices are ordered.
+      for (int j = 0; j < delta; ++j) {
+        int idx = 0;
+        for (int i = 0; i < n_init; ++i) {
+          if (idx == delta || i != Gamma_indices_[s][idx])
+            q_[s](i - idx, j) = Gamma_inv_[s](i, Gamma_indices_[s][j]);
+          else
+            ++idx;
+        }
+      }
+
+      int idx = 0;
+      for (int j = 0; j < n_init; ++j) {
+        if (idx == delta || j != Gamma_indices_[s][idx])
+          for (int i = 0; i < delta; ++i)
+            r_[s](i, j - idx) = Gamma_inv_[s](Gamma_indices_[s][i], j);
+        else
+          ++idx;
+      }
+
+      // TODO: this could be avoided in the case of pure removal.
+      s_[s].resizeNoCopy(delta);
+      for (int j = 0; j < delta; ++j)
+        for (int i = 0; i < delta; ++i)
+          s_[s](i, j) = Gamma_inv_[s](Gamma_indices_[s][i], Gamma_indices_[s][j]);
+
+      for (int ind = delta - 1; ind >= 0; --ind)
+        linalg::matrixop::removeRowAndCol(Gamma_inv_[s], Gamma_indices_[s][ind]);
+
+      details::smallInverse(s_[s]);
+
+      linalg::matrixop::gemm(q_[s], s_[s], q_s);
+
+      // Gamma_inv_ -= Q*S^-1*R
+      linalg::matrixop::gemm(Scalar(-1.), q_s, r_[s], Scalar(1.), Gamma_inv_[s]);
+    }  // if n
+    else {
+      Gamma_inv_[s].resizeNoCopy(0);
+    }
+  }
+}
+
+// This method is unused and left to potentially use as a testing reference.
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::recomputeGammaInv() {
+  for (int s = 0; s < 2; ++s) {
+    if (Gamma_inv_[s].nrRows() > 0)
+      linalg::matrixop::inverse(Gamma_inv_[s]);
+
+    Gamma_inv_[s].resize(Gamma_inv_[s].nrRows() + nbr_of_indices_[s]);
+
+    for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+      for (int i = 0; i < Gamma_inv_[s].nrRows(); ++i) {
+        Gamma_inv_[s](Gamma_inv_[s].nrRows() + ind, i) =
+            G_[s](sector_indices_[s][ind], move_indices_[s][i]);
+        Gamma_inv_[s](i, Gamma_inv_[s].nrRows() + ind) =
+            G_[s](move_indices_[s][i], sector_indices_[s][ind]);
+      }
+
+      for (int ind_2 = 0; ind_2 < nbr_of_indices_[s]; ++ind_2) {
+        Gamma_inv_[s](Gamma_inv_[s].nrRows() + ind, Gamma_inv_[s].nrRows() + ind_2) =
+            G_[s](sector_indices_[s][ind], sector_indices_[s][ind_2]);
+      }
+      Gamma_inv_[s](Gamma_inv_[s].nrRows() + ind, Gamma_inv_[s].nrRows() + ind) -=
+          (1 + gamma_[s].end()[-nbr_of_indices_[s] + ind]) /
+          gamma_[s].end()[-nbr_of_indices_[s] + ind];
+    }
+
+    Gamma_inv_[s].nrRows() = Gamma_inv_[s].nrRows();
+
+    if (Gamma_inv_[s].nrRows() > 0)
+      details::smallInverse(Gamma_inv_[s]);
+  }
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::transformM() {
+  for (int s = 0; s < 2; ++s) {
+    for (int j = 0; j < M_[s].size().second; ++j) {
+      for (int i = 0; i < M_[s].size().first; ++i) {
+        const auto field_type = configuration_.getSector(s).getAuxFieldType(i);
+        const auto b = configuration_.getSector(s).getLeftB(i);
+        const Scalar f_i = -(f_[field_type][b] - 1);
+        M_[s](i, j) /= f_i;
+      }
+    }
+  }
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::computeInsertionMatrices(
+    const std::vector<int>& insertion_indices, const int s) {
+  const int delta = insertion_indices.size();
+
+  s_[s].resizeNoCopy(delta);
+  if (delta == 0)
+    return;
+
+  for (int i = 0; i < delta; ++i)
+    for (int j = 0; j < delta; ++j) {
+      s_[s](i, j) = G_[s](insertion_indices[i], insertion_indices[j]);
+      if (i == j) {
+        const Real gamma_val = gamma_[s].at(gamma_[s].size() + i - nbr_of_indices_[s]);
+        s_[s](i, j) -= (1 + gamma_val) / gamma_val;
+      }
+    }
+
+  assert(Gamma_inv_[s].nrRows() == move_indices_[s].size());
+  if (Gamma_inv_[s].nrRows() > 0) {
+    q_[s].resizeNoCopy(std::make_pair(Gamma_inv_[s].nrRows(), delta));
+    r_[s].resizeNoCopy(std::make_pair(delta, Gamma_inv_[s].nrRows()));
+
+    for (int i = 0; i < Gamma_inv_[s].nrRows(); ++i)
+      for (int j = 0; j < delta; ++j) {
+        q_[s](i, j) = G_[s](move_indices_[s][i], insertion_indices[j]);
+        r_[s](j, i) = G_[s](insertion_indices[j], move_indices_[s][i]);
+      }
+
+    auto& Gamma_q = Gamma_q_[s];
+    Gamma_q.resizeNoCopy(q_[s].size());
+    linalg::matrixop::gemm(Gamma_inv_[s], q_[s], Gamma_q);
+    linalg::matrixop::gemm(Scalar(-1.), r_[s], Gamma_q, Scalar(1.), s_[s]);
+  }
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::computeRemovalMatrix(const int s) {
+  const int delta = Gamma_indices_[s].size();
+  s_[s].resizeNoCopy(delta);
+  for (int j = 0; j < delta; ++j)
+    for (int i = 0; i < delta; ++i)
+      s_[s](i, j) = Gamma_inv_[s](Gamma_indices_[s][i], Gamma_indices_[s][j]);
+}
+
+template <class Parameters, DistType DIST>
+void CtintWalkerSubmatrixBase<Parameters, DIST>::computeMixedInsertionAndRemoval(int s) {
+  Gamma_inv_cpy_[s] = Gamma_inv_[s];
+
+  if (sector_indices_[s].size() == 0) {
+    s_[s].resizeNoCopy(0);
+    return;
+  }
+
+  // TODO: avoid reallocation.
+  std::vector<int> insertion_indices;
+  for (int i = 0; i < nbr_of_indices_[s]; ++i)
+    if (!recentlyAdded(i, s))
+      insertion_indices.push_back(sector_indices_[s][i]);
+  assert(Gamma_indices_[s].size() + insertion_indices.size() == sector_indices_.size());
+
+  computeInsertionMatrices(insertion_indices, s);
+  det_ratio_ *= details::smallDeterminant(s_[s]);
+  updateGammaInv(s);
+
+  computeRemovalMatrix(s);
+}
+
+}  // namespace ctint
+}  // namespace solver
+}  // namespace phys
+}  // namespace dca
+
+#endif
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/submat_impl.cpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/submat_impl.cpp
new file mode 100644
index 000000000..ec86d92d6
--- /dev/null
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/submat_impl.cpp
@@ -0,0 +1,221 @@
+#include "submat_impl.hpp"
+
+namespace dca {
+namespace phys {
+namespace solver {
+namespace ctint {
+
+template <typename SCALAR>
+void SubmatImpl<SCALAR>::mainSubmatrixProcess() {
+  for (int s = 0; s < 2; ++s) {
+    move_indices_[s].clear();
+    insertion_list_[s].clear();
+    insertion_Gamma_indices_[s].clear();
+  }
+
+  std::vector<int> aux_spin_type, new_aux_spin_type, move_band;
+
+  SCALAR acceptance_prob_ = 1.0;
+  for (int delay_ind = 0; delay_ind < delayed_moves_.size(); ++delay_ind) {
+    current_move_ = &delayed_moves_[delay_ind];
+    const auto move_type = current_move_->move_type;
+    // there seems to be no need that this be persistent.
+    det_ratio_ = 1.;
+    std::array<int, 2> indices_array;
+
+    if (move_type == INSERTION) {
+      indices_array = current_move_->indices;
+    }
+    else {  // move_type == REMOVAL
+      indices_array = configuration_.randomRemovalCandidate(delayed_moves_[delay_ind].removal_rng);
+    }
+
+    index_.clear();
+    for (auto idx : indices_array) {
+      if (idx >= 0)
+        index_.push_back(idx);
+    }
+
+    // This leads to a bunch of complex branchy and premature looking optimization
+    // The branch predictor must be weeping
+    bool at_least_one_recently_added = false;
+    bool all_recently_added = false;
+    if (move_type == REMOVAL) {
+      // Check if the vertex to remove was inserted during the current submatrix update.
+      const auto recently_added = [=](int id) { return id >= config_size_init_; };
+      all_recently_added = math::util::all(index_, recently_added);
+      at_least_one_recently_added = math::util::any(index_, recently_added);
+    }
+
+    for (int s = 0; s < 2; ++s) {
+      Gamma_indices_[s].clear();
+      new_aux_spin_type.clear();
+      aux_spin_type.clear();
+      move_band.clear();
+
+      findSectorIndices(s);
+
+      if (move_type == INSERTION) {
+        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+          new_aux_spin_type.push_back(
+              configuration_.getSector(s).getAuxFieldType(sector_indices_[s][ind]));
+          aux_spin_type.push_back(0);
+        }
+      }
+      else if (move_type == REMOVAL) {
+        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+          new_aux_spin_type.push_back(0);
+          aux_spin_type.push_back(
+              configuration_.getSector(s).getAuxFieldType(sector_indices_[s][ind]));
+
+          if (recentlyAdded(ind, s)) {
+            // Find pre-existing Gamma_indices_.
+            insertion_list_it_ = std::find(insertion_list_[s].begin(), insertion_list_[s].end(),
+                                           sector_indices_[s][ind]);
+            Gamma_indices_[s].push_back(insertion_Gamma_indices_[s][std::distance(
+                insertion_list_[s].begin(), insertion_list_it_)]);
+          }
+        }
+      }  // endif removal
+      for (int index : sector_indices_[s])
+        move_band.push_back(configuration_.getSector(s).getLeftB(index));
+
+      for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+        if (move_type == INSERTION || !recentlyAdded(ind, s))
+          gamma_[s].push_back(
+              gamma_values_[std::make_pair(aux_spin_type[ind], new_aux_spin_type[ind])][move_band[ind]]);
+        else {
+          // TODO: find instead of adding.
+          gamma_[s].push_back(
+              gamma_values_[std::make_pair(new_aux_spin_type[ind], aux_spin_type[ind])][move_band[ind]]);
+        }
+      }
+
+      if (!at_least_one_recently_added)
+        computeInsertionMatrices(sector_indices_[s], s);
+      else {
+        if (all_recently_added)
+          computeRemovalMatrix(s);
+        else
+          computeMixedInsertionAndRemoval(s);
+      }
+
+      det_ratio_ *= details::smallDeterminant(s_[s]);
+    }  // s loop.
+
+    if (n_ == 0 && move_type == REMOVAL)
+      continue;
+
+    // Compute acceptance probability.
+    auto [acceptance_prob, mc_weight_ratio] = computeAcceptanceProbability();
+    acceptance_prob_ = acceptance_prob;
+
+    // Note: acceptance and rejection can be forced for testing with the appropriate "acceptance_rng".
+    const bool accepted =
+        delayed_moves_[delay_ind].acceptance_rng < std::min(std::abs(acceptance_prob_), Real(1.));
+
+    // NB: recomputeGammaInv is just a inefficient alternative to updateGammaInv. Only for testing
+    // or debbuging.
+    // recomputeGammaInv();
+
+    // Update
+    if (accepted) {
+      ++BaseClass::n_accepted_;
+
+      BaseClass::mc_log_weight_ += std::log(std::abs(mc_weight_ratio));
+
+      // Are we capturing the avg sign properly wrt multiple delayed moves
+      phase_.multiply(acceptance_prob_);
+
+      // Update GammaInv if necessary.
+      if (!at_least_one_recently_added)
+        for (int s = 0; s < 2; ++s)
+          updateGammaInv(s);
+      else {
+        removeRowAndColOfGammaInv();
+        for (int s = 0; s < 2; ++s) {
+          for (int ind = sector_indices_[s].size() - 1; ind >= 0; --ind) {
+            if (!recentlyAdded(ind, s))
+              continue;
+            insertion_list_[s].erase(std::remove(insertion_list_[s].begin(),
+                                                 insertion_list_[s].end(), sector_indices_[s][ind]),
+                                     insertion_list_[s].end());
+          }
+          for (int ind = Gamma_indices_[s].size() - 1; ind >= 0; --ind) {
+            insertion_Gamma_indices_[s].erase(std::remove(insertion_Gamma_indices_[s].begin(),
+                                                          insertion_Gamma_indices_[s].end(),
+                                                          Gamma_indices_[s][ind]),
+                                              insertion_Gamma_indices_[s].end());
+            gamma_[s].erase(gamma_[s].begin() + Gamma_indices_[s][ind]);
+
+            for (int i = 0; i < insertion_Gamma_indices_[s].size(); ++i) {
+              if (insertion_Gamma_indices_[s][i] > Gamma_indices_[s][ind])
+                --insertion_Gamma_indices_[s][i];
+            }
+          }
+        }
+      }
+
+      for (int s = 0; s < 2; ++s) {
+        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+          if (move_type == INSERTION || !recentlyAdded(ind, s)) {
+            move_indices_[s].push_back(sector_indices_[s][ind]);
+          }
+          else {
+            gamma_[s].pop_back();
+            move_indices_[s].erase(std::remove(move_indices_[s].begin(), move_indices_[s].end(),
+                                               sector_indices_[s][ind]),
+                                   move_indices_[s].end());
+          }
+        }
+      }
+
+      if (move_type == INSERTION) {
+        n_ += index_.size();
+        for (auto idx : index_)
+          configuration_.commitInsertion(idx);
+
+        for (int s = 0; s < 2; ++s) {
+          for (int ind = 0; ind < nbr_of_indices_[s]; ++ind) {
+            insertion_list_[s].push_back(sector_indices_[s][ind]);
+            insertion_Gamma_indices_[s].push_back(Gamma_inv_[s].nrRows() - nbr_of_indices_[s] + ind);
+          }
+        }
+
+        // TODO: cleanup
+        for (int idx : index_)
+          conf_removal_list_.erase(
+              std::remove(conf_removal_list_.begin(), conf_removal_list_.end(), idx),
+              conf_removal_list_.end());
+      }
+      else if (move_type == REMOVAL) {
+        n_ -= index_.size();
+        for (auto idx : index_)
+          configuration_.markForRemoval(idx);
+
+        // TODO: cleanup.
+        for (int idx : index_)
+          conf_removal_list_.push_back(idx);
+      }
+    }
+    else {  // The move is rejected:
+      for (int s = 0; s < 2; ++s) {
+        for (int ind = 0; ind < nbr_of_indices_[s]; ++ind)
+          gamma_[s].pop_back();
+      }
+      if (at_least_one_recently_added && !all_recently_added)
+        for (int s = 0; s < 2; ++s) {
+          Gamma_inv_[s].swap(Gamma_inv_cpy_[s]);
+        }
+    }
+  }
+
+  // This seems correct in that we count all steps
+  // just as is done for the non submatrix cpu version
+  BaseClass::n_steps_ += delayed_moves_.size();
+}
+
+}
+}
+}
+}
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/submat_impl.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/submat_impl.hpp
new file mode 100644
index 000000000..1f93b81c8
--- /dev/null
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/submat_impl.hpp
@@ -0,0 +1,51 @@
+#ifndef SUBMATIMPL_HPP
+#define SUBMATIMPL_HPP
+
+#include <array>
+#include <vector>
+#include "dca/linalg/linalg.hpp"
+
+namespace dca {
+namespace phys {
+namespace solver {
+namespace ctint {
+
+template<typename SCALAR>
+class SubmatImpl {
+public:
+  struct DelayedMoveType {
+    Move move_type;
+    std::array<double, 3> removal_rng{1., 1., 1.};
+    Real acceptance_rng;
+    std::array<int, 2> indices{-1, -1};
+  };
+
+private:
+  const DelayedMoveType* current_move_;
+  std::array<int, 2> Gamma_size_;
+  std::array<std::vector<int>, 2> Gamma_indices_;
+  std::array<std::vector<int>, 2> sector_indices_;
+
+  std::array<unsigned, 2> nbr_of_indices_;
+
+  std::vector<int> index_;
+
+  std::vector<DelayedMoveType> delayed_moves_;
+  std::array<linalg::util::HostVector<int>, 2> move_indices;
+  std::array<linalg::util::HostVector<int>, 2> removal_list;
+  std::array<linalg::util::HostVector<int>, 2> source_list;
+  std::array<std::vector<int>, 2> insertion_list;
+  std::array<std::vector<int>, 2> insertion_Gamma_indices;
+  std::vector<int> conf_removal_list;
+  std::vector<int>::iterator insertion_list_it;
+
+public:
+  void mainSubmatrixProcess();
+};
+
+}  // namespace ctint
+}  // namespace solver
+}  // namespace phys
+}  // namespace dca
+
+#endif
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp
index 6601fb439..ecec506e0 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder.hpp
@@ -86,7 +86,7 @@ class DMatrixBuilder<linalg::CPU, Scalar> {
 #ifdef DCA_HAVE_GPU
   virtual void computeG0(linalg::Matrix<Scalar, linalg::GPU>& /*G0*/,
                          const details::DeviceConfiguration& /*configuration*/, int /*n_init*/,
-                         bool /*right_section*/, GpuStream /*stream*/) const {
+                         bool /*right_section*/, const GpuStream& /*stream*/) const {
     throw(std::runtime_error("Not implemented."));
   }
 #endif  // DCA_HAVE_GPU
diff --git a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp
index 752c3283a..0e742d182 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp
@@ -39,6 +39,7 @@ class DMatrixBuilder<linalg::GPU, Scalar> final : public DMatrixBuilder<linalg::
   using BaseClass = DMatrixBuilder<linalg::CPU, Scalar>;
   using GpuStream = dca::linalg::util::GpuStream;
 public:
+  using BaseClass::setAlphas;
   template <class RDmn>
   DMatrixBuilder(const G0Interpolation<linalg::GPU, Scalar>& g0, int nb, const RDmn& /*rdmn*/);
 
diff --git a/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp b/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp
index 825abdcba..554b2e652 100644
--- a/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_cluster_solver.hpp
@@ -366,7 +366,7 @@ void StdThreadQmciClusterSolver<QmciSolver>::startWalker(int id) {
   const int walker_index = thread_task_handler_.walkerIDToRngIndex(id);
 
   auto walker_log = last_iteration_ ? BaseClass::writer_ : nullptr;
-  Walker walker(parameters_, data_, rng_vector_[walker_index], concurrency_.get_id(), id,
+  Walker walker(parameters_, data_, rng_vector_[walker_index], BaseClass::getResource(), concurrency_.get_id(), id,
                 walker_log, BaseClass::g0_);
 
   std::unique_ptr<std::exception> exception_ptr;
@@ -588,7 +588,7 @@ void StdThreadQmciClusterSolver<QmciSolver>::startWalkerAndAccumulator(int id,
 
   // Create and warm a walker.
   auto walker_log = BaseClass::writer_;
-  Walker walker(parameters_, data_, rng_vector_[id], concurrency_.get_id(), id, walker_log,
+  Walker walker(parameters_, data_, rng_vector_[id], BaseClass::getResource(), concurrency_.get_id(), id, walker_log,
                 BaseClass::g0_);
   initializeAndWarmUp(walker, id, id);
 
diff --git a/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_walker.hpp b/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_walker.hpp
index 10fe87f99..032076278 100644
--- a/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_walker.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/stdthread_qmci/stdthread_qmci_walker.hpp
@@ -35,12 +35,13 @@ class StdThreadQmciWalker final
   using Concurrency = typename Parameters::concurrency_type;
   using Rng = typename Parameters::random_number_generator;
   using Real = typename QmciWalker::Scalar;
-
+  using WalkerResource = QmciWalker::Resource;
+  
   constexpr static auto device = QmciWalker::device;
   constexpr static int bands = Parameters::bands;
 
 public:
-  StdThreadQmciWalker(Parameters& parameters_ref, DATA& data_ref, Rng& rng, int concurrency_id,
+  StdThreadQmciWalker(Parameters& parameters_ref, DATA& data_ref, Rng& rng, WalkerResource& d_matrix_builder, int concurrency_id,
                       int id, const std::shared_ptr<io::Writer<Concurrency>>& writer,
                       G0Interpolation<device, Real>& g0);
 
@@ -80,9 +81,10 @@ class StdThreadQmciWalker final
 
 template <class QmciWalker, class DATA>
 StdThreadQmciWalker<QmciWalker, DATA>::StdThreadQmciWalker(
-    Parameters& parameters, DATA& data_ref, Rng& rng, int concurrency_id, int id,
-    const std::shared_ptr<io::Writer<Concurrency>>& writer, [[maybe_unused]]G0Interpolation<device, Real>& g0)
-    : QmciWalker(parameters, data_ref, rng, id),
+    Parameters& parameters, DATA& data_ref, Rng& rng, WalkerResource& d_matrix_builder,
+    int concurrency_id, int id, const std::shared_ptr<io::Writer<Concurrency>>& writer,
+    [[maybe_unused]] G0Interpolation<device, Real>& g0)
+    : QmciWalker(parameters, data_ref, rng, d_matrix_builder, id),
       // QmciAutocorrelationData<QmciWalker>(parameters, id, g0),
       stamping_period_(parameters.stamping_period()),
       concurrency_id_(concurrency_id),
@@ -113,10 +115,13 @@ void StdThreadQmciWalker<QmciWalker, DATA>::doSweep() {
 
 template <class QmciWalker, class DATA>
 void StdThreadQmciWalker<QmciWalker, DATA>::logConfiguration() const {
-  const bool print_to_log = writer_ && static_cast<bool>(*writer_);  // File exists and it is open. \todo possibly this should always be true
+  const bool print_to_log =
+      writer_ &&
+      static_cast<bool>(
+          *writer_);  // File exists and it is open. \todo possibly this should always be true
   if (print_to_log &&
-      (writer_->isADIOS2() ||
-       (writer_->isHDF5() && writer_->get_concurrency().id() == writer_->get_concurrency().first()))) {
+      (writer_->isADIOS2() || (writer_->isHDF5() && writer_->get_concurrency().id() ==
+                                                        writer_->get_concurrency().first()))) {
     if (stamping_period_ && (meas_id_ % stamping_period_) == 0) {
       const std::string stamp_name = "r_" + std::to_string(concurrency_id_) + "_meas_" +
                                      std::to_string(meas_id_) + "_w_" + std::to_string(thread_id_);
diff --git a/include/dca/phys/models/tight_binding_model.hpp b/include/dca/phys/models/tight_binding_model.hpp
index bfb51d8e3..565da7005 100644
--- a/include/dca/phys/models/tight_binding_model.hpp
+++ b/include/dca/phys/models/tight_binding_model.hpp
@@ -28,6 +28,7 @@ namespace models {
 template <typename Lattice>
 class TightBindingModel {
 public:
+  using model_type = TightBindingModel<Lattice>;
   using lattice_type = Lattice;
   using b = func::dmn_0<domains::electron_band_domain>;
   using s = func::dmn_0<domains::electron_spin_domain>;
diff --git a/src/io/hdf5/hdf5_writer.cpp b/src/io/hdf5/hdf5_writer.cpp
index 855924a42..bdce48c58 100644
--- a/src/io/hdf5/hdf5_writer.cpp
+++ b/src/io/hdf5/hdf5_writer.cpp
@@ -27,6 +27,7 @@ void HDF5Writer::open_file(std::string file_name, bool overwrite) {
   if (file_)
     throw std::logic_error(__FUNCTION__);
 
+  try {
   if (overwrite) {
     file_ = std::make_unique<H5::H5File>(file_name.c_str(), H5F_ACC_TRUNC);
   }
@@ -36,7 +37,9 @@ void HDF5Writer::open_file(std::string file_name, bool overwrite) {
     else
       file_ = std::make_unique<H5::H5File>(file_name.c_str(), H5F_ACC_EXCL);
   }
-
+  } catch (const H5::Exception& exc) {
+    std::cerr << exc.getDetailMsg() << '\n';
+  }
   file_id_ = file_->getId();
 }
 
diff --git a/test/performance/phys/ctint/ctint_walker_performance_test.cpp b/test/performance/phys/ctint/ctint_walker_performance_test.cpp
index 6e62bfba5..e58793e58 100644
--- a/test/performance/phys/ctint/ctint_walker_performance_test.cpp
+++ b/test/performance/phys/ctint/ctint_walker_performance_test.cpp
@@ -52,18 +52,25 @@ using Real = double;
 
 using RngType = dca::math::random::StdRandomWrapper<std::ranlux48_base>;
 using Lattice = dca::phys::models::bilayer_lattice<dca::phys::domains::D4>;
+constexpr int bands = Lattice::BANDS;
+
 using Model = dca::phys::models::TightBindingModel<Lattice>;
 using NoThreading = dca::parallel::NoThreading;
 using TestConcurrency = dca::parallel::NoConcurrency;
 using Profiler = dca::profiling::CountingProfiler<dca::profiling::time_event<std::size_t>>;
 using Scalar = double;
-using Parameters = dca::phys::params::Parameters<TestConcurrency, NoThreading, Profiler, Model, RngType,
-                                                 dca::ClusterSolverId::CT_INT, dca::NumericalTraits<dca::util::RealAlias<Scalar>, Scalar>>;
+using Parameters =
+    dca::phys::params::Parameters<TestConcurrency, NoThreading, Profiler, Model, RngType,
+                                  dca::ClusterSolverId::CT_INT,
+                                  dca::NumericalTraits<dca::util::RealAlias<Scalar>, Scalar>>;
 using Data = dca::phys::DcaData<Parameters>;
-template <dca::linalg::DeviceType device_t>
 
+template <dca::linalg::DeviceType device_t>
 using Walker = dca::phys::solver::ctint::CtintWalkerChoice<device_t, Parameters, true>;
 
+template <dca::linalg::DeviceType DEVICE, typename SCALAR>
+using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<DEVICE, SCALAR>;
+
 using BDmn = dca::func::dmn_0<dca::phys::domains::electron_band_domain>;
 using RDmn = Parameters::RClusterDmn;
 using BBRDmn = dca::func::dmn_variadic<BDmn, BDmn, RDmn>;
@@ -111,9 +118,6 @@ int main(int argc, char** argv) {
       dca::phys::solver::details::shrinkG0(data.G0_r_t));
 
   BBRDmn bbr_dmn;
-  Walker<device>::setDMatrixBuilder(g0);
-  Walker<device>::setDMatrixAlpha(parameters.getAlphas(), false);
-  Walker<device>::setInteractionVertices(data, parameters);
 
   auto printTime = [](const std::string& str, const auto& start, const auto& end) {
     dca::profiling::Duration time(end, start);
@@ -138,9 +142,12 @@ int main(int argc, char** argv) {
     if (!test_gpu)
       Profiler::start();
 
+    DMatrixBuilder<dca::linalg::CPU, Scalar> d_matrix_builder(g0, bands, RDmn());
+    Walker<dca::linalg::CPU>::setInteractionVertices(data, parameters);
+
     // Do one integration step.
     RngType rng(0, 1, 0);
-    Walker<dca::linalg::CPU> walker(parameters, data, rng);
+    Walker<dca::linalg::CPU> walker(parameters, data, rng, d_matrix_builder);
     walker.initialize(0);
 
     // Timed section.
@@ -164,7 +171,11 @@ int main(int argc, char** argv) {
 
     RngType::resetCounter();
     RngType rng(0, 1, 0);
-    Walker<dca::linalg::GPU> walker_gpu(parameters, data, rng, 0);
+
+    DMatrixBuilder<dca::linalg::GPU, Scalar> d_matrix_builder_gpu(g0, bands, RDmn());
+    Walker<dca::linalg::GPU>::setInteractionVertices(data, parameters);
+
+    Walker<dca::linalg::GPU> walker_gpu(parameters, data, rng, d_matrix_builder_gpu, 0);
     walker_gpu.initialize(0);
 
     // Timed section.
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp
index 98d85a934..b8669830a 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_multinsertion_test.cpp
@@ -46,23 +46,30 @@ using Walker = testing::phys::solver::ctint::WalkerWrapper<Scalar, G0Setup::Para
 using Matrix = Walker::Matrix;
 using MatrixPair = std::array<Matrix, 2>;
 
+constexpr int bands = dca::testing::LatticeHund::BANDS;
+
 auto getVertexRng = [](int id) {
   const double n_vertices = 12.;
   return (n_vertices - id - 0.5) / n_vertices;
 };
 
+using CDA = dca::phys::ClusterDomainAliases<dca::testing::LatticeHund::DIMENSION>;
+using RDmn = typename CDA::RClusterDmn;
+
+
 // Compare single versus double update without a submatrix update.
 TEST_F(G0Setup, NoSubmatrix) {
   G0Setup::RngType rng(std::vector<double>{});
   G0Interpolation<dca::linalg::CPU, double> g0(
       dca::phys::solver::ctint::details::shrinkG0(data_->G0_r_t));
   G0Setup::LabelDomain label_dmn;
-  Walker::setDMatrixBuilder(g0);
-  Walker::setDMatrixAlpha(parameters_.getAlphas(), false);
+  using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
+  DMatrixBuilder d_matrix_builder(g0, bands, RDmn());
+  d_matrix_builder.setAlphas(parameters_.getAlphas(), false);
   Walker::setInteractionVertices(*data_, parameters_);
 
   parameters_.setDoubleUpdateProbability(0);
-  Walker walker_single(parameters_, rng);
+  Walker walker_single(parameters_, rng, d_matrix_builder);
   walker_single.setInteractionVertices(*data_, parameters_);
 
   // interaction, tau, aux, accept
@@ -101,7 +108,7 @@ TEST_F(G0Setup, NoSubmatrix) {
   parameters_.setDoubleUpdateProbability(1);
 
   Walker::setInteractionVertices(*data_, parameters_);
-  Walker walker_double(parameters_, rng);
+  Walker walker_double(parameters_, rng, d_matrix_builder);
 
   ////////////////////////////////
   // interaction, partner, tau, aux, tau, aux, accept
@@ -153,12 +160,13 @@ TEST_F(G0Setup, Submatrix) {
   G0Interpolation<dca::linalg::CPU, double> g0(
       dca::phys::solver::ctint::details::shrinkG0(data_->G0_r_t));
   G0Setup::LabelDomain label_dmn;
-  Walker::setDMatrixBuilder(g0);
-  Walker::setDMatrixAlpha(parameters_.getAlphas(), false);
+  using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
+  DMatrixBuilder d_matrix_builder(g0, bands, RDmn());
+  d_matrix_builder.setAlphas(parameters_.getAlphas(), false);
   parameters_.setDoubleUpdateProbability(1);
   Walker::setInteractionVertices(*data_, parameters_);
 
-  Walker walker(parameters_, rng);
+  Walker walker(parameters_, rng, d_matrix_builder);
 
   // interaction, partner, tau, aux, tau, aux, accept
   rng.setNewValues(std::vector<double>{getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0});
@@ -173,12 +181,12 @@ TEST_F(G0Setup, Submatrix) {
   const double prob1 = walker.getAcceptanceProbability();
 
   /////////////////////////////
-
-  WalkerSubmatrix::setDMatrixBuilder(g0);
-  WalkerSubmatrix::setDMatrixAlpha(parameters_.getAlphas(), false);
+  using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
+  DMatrixBuilder d_matrix_cpu(g0, bands, RDmn());
+  d_matrix_builder.setAlphas(parameters_.getAlphas(), false);
   WalkerSubmatrix::setInteractionVertices(*data_, parameters_);
 
-  WalkerSubmatrix walker_subm(parameters_, rng);
+  WalkerSubmatrix walker_subm(parameters_, rng, d_matrix_builder);
 
   std::vector<double> random_vals{// (insert, first_id, partner_id, tau, aux, tau, aux, accept) x 2
                                   0, getVertexRng(6), 0, 0.41, 0, 0.53, 1, 0, 0, getVertexRng(7),
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp
index bd2c9b5e4..157ceee38 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_complex_test.cpp
@@ -39,6 +39,13 @@ constexpr char input_name[] =
 template <typename Scalar>
 using CtintWalkerSubmatrixGpuComplexTest =
   typename dca::testing::G0Setup<Scalar, dca::testing::LatticeRashba, dca::ClusterSolverId::CT_INT, input_name>;
+constexpr int bands = dca::testing::LatticeRashba::BANDS;
+
+template<dca::linalg::DeviceType DEVICE, typename SCALAR>
+using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<DEVICE, SCALAR>;
+
+using CDA = dca::phys::ClusterDomainAliases<dca::testing::LatticeBilayer::DIMENSION>;
+using RDmn = typename CDA::RClusterDmn;
 
 using namespace dca::phys::solver;
 
@@ -70,15 +77,12 @@ TYPED_TEST(CtintWalkerSubmatrixGpuComplexTest, doSteps) {
   G0Interpolation<GPU, Scalar> g0_gpu(g0_func);
   typename TestFixture::LabelDomain label_dmn;
 
-  // TODO: improve API.
-  SbmWalkerCpu::setDMatrixBuilder(g0_cpu);
-  SbmWalkerCpu::setDMatrixAlpha(parameters.getAlphas(), false);
-  SbmWalkerGpu::setDMatrixBuilder(g0_gpu);
-  SbmWalkerGpu::setDMatrixAlpha(parameters.getAlphas(), false);
-
+  DMatrixBuilder<dca::linalg::CPU, Scalar> d_matrix_cpu(g0_cpu, bands, RDmn());
   SbmWalkerCpu::setInteractionVertices(data, parameters);
-  SbmWalkerGpu::setInteractionVertices(data, parameters);
-
+  d_matrix_cpu.setAlphas(parameters.getAlphas(), parameters.adjustAlphaDd());
+  DMatrixBuilder<dca::linalg::GPU, Scalar> d_matrix_gpu(g0_gpu, bands, RDmn());
+  d_matrix_gpu.setAlphas(parameters.getAlphas(), parameters.adjustAlphaDd());
+  
   // ************************************
   // Test vertex insertion / removal ****
   // ************************************
@@ -105,9 +109,9 @@ TYPED_TEST(CtintWalkerSubmatrixGpuComplexTest, doSteps) {
 
     for (int steps = 1; steps <= 8; ++steps) {
       rng.setNewValues(setup_rngs);
-      SbmWalkerCpu walker_cpu(parameters, rng);
+      SbmWalkerCpu walker_cpu(parameters, rng, d_matrix_cpu);
       rng.setNewValues(setup_rngs);
-      SbmWalkerGpu walker_gpu(parameters, rng);
+      SbmWalkerGpu walker_gpu(parameters, rng, d_matrix_gpu);
 
       rng.setNewValues(rng_vals);
       walker_cpu.doStep(steps);
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp
index 7f8478f60..1f727cd4e 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_gpu_test.cpp
@@ -51,11 +51,17 @@ struct CtINTWalkerSubmatrixGPUTestT : public ::testing::Test {
   G0Setup gpu_setup;
 };
 
+using CDA = dca::phys::ClusterDomainAliases<dca::testing::LatticeBilayer::DIMENSION>;
+using RDmn = typename CDA::RClusterDmn;
+
 using namespace dca::phys::solver;
 
 template <typename Scalar>
 using CtintWalkerSubmatrixGpuTest = CtINTWalkerSubmatrixGPUTestT<Scalar>;
 
+template<dca::linalg::DeviceType DEVICE>
+using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<DEVICE, Scalar>;
+
 // Currently testing float isn't really possible due to the way the Scalar type is
 // carried through from mc_options. See test_setup.hpp PD
 using ScalarTypes = ::testing::Types<double>; //double,
@@ -91,14 +97,15 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) {
   G0Interpolation<GPU, Scalar> g0_gpu(g0_func_gpu);
   typename CtintWalkerSubmatrixGpuTest<Scalar>::G0Setup::LabelDomain label_dmn;
 
-  // TODO: improve API.
-  Walker::setDMatrixBuilder(g0_cpu);
-  Walker::setDMatrixAlpha(cpu_parameters.getAlphas(), false);
-  Walker::setInteractionVertices(cpu_data, cpu_parameters);
 
-  SbmWalkerGpu::setDMatrixBuilder(g0_gpu);
-  SbmWalkerGpu::setDMatrixAlpha(gpu_parameters.getAlphas(), false);
-  SbmWalkerGpu::setInteractionVertices(gpu_data, gpu_parameters);
+  constexpr int bands = dca::testing::LatticeBilayer::BANDS;
+
+  DMatrixBuilder<dca::linalg::CPU> d_matrix_cpu(g0_cpu, bands, RDmn());
+  SbmWalkerCpu::setInteractionVertices(cpu_data, cpu_parameters);
+  d_matrix_cpu.setAlphas(cpu_parameters.getAlphas(), false); //cpu_parameters.adjustAlphaDd());
+  DMatrixBuilder<dca::linalg::GPU> d_matrix_gpu(g0_gpu, bands, RDmn());
+  SbmWalkerGpu::setInteractionVertices(cpu_data, gpu_parameters);
+  d_matrix_gpu.setAlphas(gpu_parameters.getAlphas(), false); //gpu_parameters.adjustAlphaDd());
 
   // ************************************
   // Test vertex insertion / removal ****
@@ -126,9 +133,9 @@ TYPED_TEST(CtintWalkerSubmatrixGpuTest, doSteps) {
     gpu_parameters.setInitialConfigurationSize(initial_size);
     for (int steps = 1; steps <= 8; ++steps) {
       cpu_rng.setNewValues(setup_rngs);
-      SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng);
+      SbmWalkerCpu walker_cpu(cpu_parameters, cpu_rng, d_matrix_cpu);
       gpu_rng.setNewValues(setup_rngs);
-      SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng);
+      SbmWalkerGpu walker_gpu(gpu_parameters, gpu_rng, d_matrix_gpu);
 
       cpu_rng.setNewValues(rng_vals);
       walker_cpu.doStep(steps);
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp
index 155e5a67a..a35bf3ce7 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_submatrix_test.cpp
@@ -36,6 +36,10 @@ template <typename Scalar>
 using CtintWalkerSubmatrixTest =
   typename dca::testing::G0Setup<Scalar, dca::testing::LatticeBilayer, dca::ClusterSolverId::CT_INT, input_name>;
 
+  using CDA = dca::phys::ClusterDomainAliases<dca::testing::LatticeBilayer::DIMENSION>;
+  using RDmn = typename CDA::RClusterDmn;
+
+
 using namespace dca::phys::solver;
 
 // Currently testing float isn't really possible due to the way the Scalar type is
@@ -55,7 +59,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) {
   using MatrixPair = std::array<Matrix, 2>;
   using SubmatrixWalker =
     testing::phys::solver::ctint::WalkerWrapperSubmatrix<Scalar, Parameters, dca::linalg::CPU>;
-
+  
   std::vector<double> setup_rngs{0., 0.00, 0.9,  0.5, 0.01, 0,    0.75, 0.02,
                                  0,  0.6,  0.03, 1,   0.99, 0.04, 0.99};
   typename TestFixture::RngType rng(setup_rngs);
@@ -66,9 +70,10 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) {
   G0Interpolation<dca::linalg::CPU, Scalar> g0(
       dca::phys::solver::ctint::details::shrinkG0(data.G0_r_t));
   typename TestFixture::LabelDomain label_dmn;
-  Walker::setDMatrixBuilder(g0);
-  Walker::setDMatrixAlpha(parameters.getAlphas(), false);
-  Walker::setInteractionVertices(data, parameters);
+  using DMatrixBuilder = dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar>;
+  DMatrixBuilder d_matrix_builder(g0, 2, RDmn());
+  d_matrix_builder.setAlphas(parameters.getAlphas(), false);
+  SubmatrixWalker::setInteractionVertices(data, parameters);
 
   // ************************************
   // Test vertex insertion / removal ****
@@ -93,7 +98,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) {
 
   for (int steps = 1; steps <= 8; ++steps) {
     rng.setNewValues(setup_rngs);
-    SubmatrixWalker walker(parameters, rng);
+    SubmatrixWalker walker(parameters, rng, d_matrix_builder);
 
     MatrixPair old_M(walker.getM());
     rng.setNewValues(rng_vals);
@@ -116,7 +121,7 @@ TYPED_TEST(CtintWalkerSubmatrixTest, doSteps) {
 
     // Compare with non submatrix walker.
     rng.setNewValues(setup_rngs);
-    Walker walker_nosub(parameters, rng);
+    Walker walker_nosub(parameters, rng, d_matrix_builder);
 
     rng.setNewValues(rng_vals);
     for (int i = 0; i < steps; ++i)
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp
index e3a49c7ec..065f651bb 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/ct_int_walker_test.cpp
@@ -16,7 +16,7 @@
 #include "walker_wrapper.hpp"
 #include "dca/phys/dca_step/cluster_solver/ctint/details/solver_methods.hpp"
 #include "test/unit/phys/dca_step/cluster_solver/test_setup.hpp"
-
+#include "dca/phys/domains/cluster/cluster_domain_aliases.hpp"
 #include "test/mock_mcconfig.hpp"
 namespace dca {
 namespace config {
@@ -70,12 +70,16 @@ TYPED_TEST(CtintWalkerTest, InsertAndRemoveVertex) {
   typename CtintWalkerTest<Scalar>::LabelDomain label_dmn;
 
   using Parameters = typename CtintWalkerTest<Scalar>::Parameters;
+  using CDA = dca::phys::ClusterDomainAliases<Parameters::lattice_type::DIMENSION>;
+  using RDmn = typename CDA::RClusterDmn;
+
   using Walker = testing::phys::solver::ctint::WalkerWrapper<Scalar, Parameters>;
-  Walker::setDMatrixBuilder(g0);
-  Walker::setDMatrixAlpha(parameters.getAlphas(), 0);
+  dca::phys::solver::ctint::DMatrixBuilder<dca::linalg::CPU, Scalar> d_matrix_builder(g0, 1, RDmn());
+  d_matrix_builder.setAlphas(parameters.getAlphas(), parameters.adjustAlphaDd());
+
   Walker::setInteractionVertices(data, parameters);
 
-  Walker walker(parameters, rng);
+  Walker walker(parameters, rng, d_matrix_builder);
 
   // *******************************
   // Test vertex removal ***********
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp
index 6e0dba929..bdeb041ef 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper.hpp
@@ -33,8 +33,8 @@ class WalkerWrapper : public CtintWalker<dca::linalg::CPU, Parameters, DIST> {
   using Real = dca::util::RealAlias<Scalar>;
   using Rng = typename Base::Rng;
 
-  WalkerWrapper(Parameters& parameters_ref, Rng& rng_ref)
-      : Base(parameters_ref, dca::phys::DcaData<Parameters>(parameters_ref), rng_ref, 0) {
+    WalkerWrapper(Parameters& parameters_ref, Rng& rng_ref, DMatrixBuilder<dca::linalg::CPU, Scalar>& d_matrix_builder)
+      : Base(parameters_ref, dca::phys::DcaData<Parameters>(parameters_ref), rng_ref, d_matrix_builder, 0) {
     Base::initialize(0);
   }
 
diff --git a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
index da20194e3..2d5dc66a8 100644
--- a/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
+++ b/test/unit/phys/dca_step/cluster_solver/ctint/walker/walker_wrapper_submatrix.hpp
@@ -57,9 +57,10 @@ struct WalkerWrapperSubmatrix : public WalkerSelector<Parameters, device_t, DIST
   using Real = dca::util::RealAlias<Scalar>;
   using Rng = typename BaseClass::Rng;
   using Data = typename BaseClass::Data;
+  using BaseClass::setMFromConfig;
 
-  WalkerWrapperSubmatrix(/*const*/ Parameters& parameters_ref, Rng& rng_ref)
-      : BaseClass(parameters_ref, dca::phys::DcaData<Parameters>(parameters_ref), rng_ref, 0),
+    WalkerWrapperSubmatrix(/*const*/ Parameters& parameters_ref, Rng& rng_ref, DMatrixBuilder<device_t, Scalar>& d_matrix_builder)
+      : BaseClass(parameters_ref, dca::phys::DcaData<Parameters>(parameters_ref), rng_ref, d_matrix_builder, 0),
         streams_(3) {
     BaseClass::initialize(0);
 
@@ -67,7 +68,7 @@ struct WalkerWrapperSubmatrix : public WalkerSelector<Parameters, device_t, DIST
 
   // This purposefully shadows
   void doStep(const int n_steps_to_delay) {
-    BaseClass::doStep(n_steps_to_delay);
+    BaseClass::SubmatrixBase::doStep(n_steps_to_delay);
   }
 
   using Matrix = dca::linalg::Matrix<Scalar, CPU>;
@@ -85,8 +86,6 @@ struct WalkerWrapperSubmatrix : public WalkerSelector<Parameters, device_t, DIST
     return M_copy;
   }
 
-  using BaseClass::setMFromConfig;
-
   const auto& getWalkerConfiguration() const {
     return BaseClass::configuration_;
   }