diff --git a/projects/CUDA/CMakeLists.txt b/projects/CUDA/CMakeLists.txt
index 805b88cfc2..fb97d98aac 100644
--- a/projects/CUDA/CMakeLists.txt
+++ b/projects/CUDA/CMakeLists.txt
@@ -48,6 +48,7 @@ set_target_properties(zeno
 # wrangler
 target_sources(zeno PRIVATE
   utils/IndexBuckets.cu
+  utils/Primitives.cu
   wrangle/PW.cu
   wrangle/P2W.cu
   wrangle/PNW.cu
diff --git a/projects/CUDA/utils/Primitives.cu b/projects/CUDA/utils/Primitives.cu
new file mode 100644
index 0000000000..27c28dd8cb
--- /dev/null
+++ b/projects/CUDA/utils/Primitives.cu
@@ -0,0 +1,130 @@
+#include "Structures.hpp"
+#include "zensim/cuda/execution/ExecutionPolicy.cuh"
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <zeno/types/NumericObject.h>
+#include <zeno/types/PrimitiveObject.h>
+#include <zeno/utils/parallel_reduce.h>
+#include <zeno/utils/vec.h>
+#include <zeno/zeno.h>
+
+namespace zeno {
+
+/// utilities
+constexpr std::size_t count_warps(std::size_t n) noexcept {
+    return (n + 31) / 32;
+}
+constexpr int warp_index(int n) noexcept {
+    return n / 32;
+}
+constexpr auto warp_mask(int i, int n) noexcept {
+    int k = n % 32;
+    const int tail = n - k;
+    if (i < tail)
+        return zs::make_tuple(0xFFFFFFFFu, 32);
+    return zs::make_tuple(((unsigned)(1ull << k) - 1), k);
+}
+
+template <typename T, typename Op> __forceinline__ __device__ void reduce_to(int i, int n, T val, T &dst, Op op) {
+    auto [mask, numValid] = warp_mask(i, n);
+    __syncwarp(mask);
+    auto locid = threadIdx.x & 31;
+    for (int stride = 1; stride < 32; stride <<= 1) {
+        auto tmp = __shfl_down_sync(mask, val, stride);
+        if (locid + stride < numValid)
+            val = op(val, tmp);
+    }
+    if (locid == 0)
+        dst = val;
+}
+
+template <typename TransOp, typename ReduceOp>
+static float prim_reduce(typename ZenoParticles::particles_t &verts, float e, TransOp top, ReduceOp rop,
+                         std::string attrToReduce) {
+    using namespace zs;
+    constexpr auto space = execspace_e::cuda;
+    using T = typename ZenoParticles::particles_t::value_type;
+    auto nchn = verts.getPropertySize(attrToReduce);
+    auto offset = verts.getPropertyOffset(attrToReduce);
+    const auto nwarps = count_warps(verts.size());
+
+    auto cudaPol = cuda_exec().device(0);
+
+    Vector<float> res{verts.get_allocator(), nwarps};
+    // cudaPol(res, [e] ZS_LAMBDA(auto &v) { v = e; });
+    cudaPol(range(verts.size()), [res = proxy<space>(res), verts = proxy<space>({}, verts), offset, nwarps, nchn, top,
+                                  rop] ZS_LAMBDA(int i) mutable {
+        auto [mask, numValid] = warp_mask(i, nwarps);
+        auto locid = threadIdx.x & 31;
+        float v = top(verts(offset, i));
+        while (--nchn) {
+            v = rop(top(verts(offset++, i)), v);
+        }
+        reduce_to(i, nwarps, v, res[i / 32], rop);
+    });
+
+    Vector<float> ret{res.get_allocator(), 1};
+    zs::reduce(cudaPol, std::begin(res), std::end(res), std::begin(ret), e, rop);
+    return ret.getVal();
+}
+
+struct ZSPrimitiveReduction : zeno::INode {
+    struct pass_on {
+        constexpr auto operator()(auto v) const noexcept {
+            return v;
+        }
+    };
+    struct getabs {
+        constexpr auto operator()(auto v) const noexcept {
+            return zs::abs(v);
+        }
+    };
+    virtual void apply() override {
+        using namespace zs;
+        constexpr auto space = execspace_e::cuda;
+        auto prim = get_input<ZenoParticles>("ZSParticles");
+        auto &verts = prim->getParticles();
+        auto attrToReduce = get_input2<std::string>("attr");
+        if (attrToReduce == "pos")
+            attrToReduce = "x";
+        if (attrToReduce == "vel")
+            attrToReduce = "v";
+
+        if (!verts.hasProperty(attrToReduce))
+            throw std::runtime_error(fmt::format("verts do not have property [{}]\n", attrToReduce));
+
+        auto opStr = get_input2<std::string>("op");
+        zeno::NumericValue result;
+        if (opStr == "avg") {
+            result = prim_reduce(verts, 0, pass_on{}, std::plus<float>{}, attrToReduce) / verts.size();
+        } else if (opStr == "max") {
+            result = prim_reduce(verts, limits<float>::lowest(), pass_on{}, getmax<float>{}, attrToReduce);
+        } else if (opStr == "min") {
+            result = prim_reduce(verts, limits<float>::max(), pass_on{}, getmin<float>{}, attrToReduce);
+        } else if (opStr == "absmax") {
+            result = prim_reduce(verts, 0, getabs{}, getmax<float>{}, attrToReduce);
+        }
+
+        auto out = std::make_shared<zeno::NumericObject>();
+        out->set(result);
+        set_output("result", std::move(out));
+    }
+};
+ZENDEFNODE(ZSPrimitiveReduction, {/* inputs: */ {
+                                      "ZSParticles",
+                                      {"string", "attr", "pos"},
+                                      {"enum avg max min absmax", "op", "avg"},
+                                  },
+                                  /* outputs: */
+                                  {
+                                      "result",
+                                  },
+                                  /* params: */
+                                  {},
+                                  /* category: */
+                                  {
+                                      "primitive",
+                                  }});
+
+} // namespace zeno
\ No newline at end of file
diff --git a/projects/CUDA/wrangle/P2W.cu b/projects/CUDA/wrangle/P2W.cu
index 8e47c63a2b..1719363a45 100644
--- a/projects/CUDA/wrangle/P2W.cu
+++ b/projects/CUDA/wrangle/P2W.cu
@@ -153,15 +153,19 @@ struct ZSParticlesTwoWrangler : zeno::INode {
                 auto key = name.substr(1);
                 if (!checkDuplication(key))
                     newChns.push_back(PropertyTag{key, dim});
+                if (dim != 3 && dim != 1)
+                    err_printf("ERROR: bad attribute dimension for primitive: %d\n", dim);
             }
             if (newChns.size() > 0)
                 pars.append_channels(cudaPol, newChns);
-            props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
             if (_cuModule == nullptr) {
                 auto wrangleKernelPtxs = cudri::load_all_ptx_files_at();
                 void *state;
                 cuLinkCreate(0, nullptr, nullptr, (CUlinkState *)&state);
+                // fmt::print(">>>>>>>>>>>>>>>>>>>");
+                // fmt::print("{}\n", jitCode);
+                // fmt::print("<<<<<<<<<<<<<<<<<<<");
 
                 auto jitSrc = cudri::compile_cuda_source_to_ptx(jitCode);
                 cuLinkAddData((CUlinkState)state, CU_JIT_INPUT_PTX, (void *)jitSrc.data(), (size_t)jitSrc.size(),
@@ -188,27 +192,29 @@ struct ZSParticlesTwoWrangler : zeno::INode {
             /// symbols
             for (int i = 0; i < prog->symbols.size(); i++) {
                 auto [name, dimid] = prog->symbols[i];
-#if 0
-        fmt::print("channel {}: {}.{}. chn offset: {} (of {})\n", i,
-                   name.c_str(), dimid, pars.getChannelOffset(name.substr(1)),
-                   pars.numChannels());
-#endif
-                void *parsPtr = nullptr;
-                if (name[1] == '@') {
+
+                unsigned short aux = name[1] == '@' ? 1 : 0;
+                float *parsPtr = nullptr;
+                if (aux) {
+                    aux = 1;
                     parsPtr = pars2.data();
                     name = name.substr(2);
                 } else {
                     parsPtr = pars.data();
                     name = name.substr(1);
                 }
-                const auto &curPars = name[1] == '@' ? pars2 : pars;
+                const auto &curPars = aux ? pars2 : pars;
                 haccessors[i] = zs::AccessorAoSoA{zs::aosoa_c,
                                                   parsPtr,
                                                   (unsigned short)unitBytes,
                                                   (unsigned short)tileSize,
                                                   (unsigned short)curPars.numChannels(),
                                                   (unsigned short)(curPars.getChannelOffset(name) + dimid),
-                                                  (unsigned short)0};
+                                                  aux};
+#if 0
+                fmt::print("channel {}: {}.{}. chn offset: {} (of {})\n", i, name.c_str(), dimid,
+                           curPars.getChannelOffset(name), curPars.numChannels());
+#endif
             }
             auto daccessors = haccessors.clone({zs::memsrc_e::device, 0});
 
diff --git a/projects/CUDA/wrangle/PNBVHW.cu b/projects/CUDA/wrangle/PNBVHW.cu
index 8c63716fc8..1d58879e83 100644
--- a/projects/CUDA/wrangle/PNBVHW.cu
+++ b/projects/CUDA/wrangle/PNBVHW.cu
@@ -180,7 +180,6 @@ struct ZSParticleNeighborBvhWrangler : INode {
         }
         if (newChns.size() > 0)
             pars.append_channels(cudaPol, newChns);
-        props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
         if (_cuModule == nullptr) {
             /// execute on the current particle object
diff --git a/projects/CUDA/wrangle/PNW.cu b/projects/CUDA/wrangle/PNW.cu
index a1e64c8b71..26179a412c 100644
--- a/projects/CUDA/wrangle/PNW.cu
+++ b/projects/CUDA/wrangle/PNW.cu
@@ -184,7 +184,6 @@ struct ZSParticleNeighborWrangler : INode {
         }
         if (newChns.size() > 0)
             pars.append_channels(cudaPol, newChns);
-        props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
         if (_cuModule == nullptr) {
             /// execute on the current particle object
diff --git a/projects/CUDA/wrangle/PPW.cu b/projects/CUDA/wrangle/PPW.cu
index 9a27e4f9d3..416f2dc962 100644
--- a/projects/CUDA/wrangle/PPW.cu
+++ b/projects/CUDA/wrangle/PPW.cu
@@ -171,7 +171,6 @@ struct ZSParticleParticleWrangler : INode {
         }
         if (newChns.size() > 0)
             pars.append_channels(cudaPol, newChns);
-        props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
         if (_cuModule == nullptr) {
             /// execute on the current particle object
diff --git a/projects/CUDA/wrangle/PW.cu b/projects/CUDA/wrangle/PW.cu
index c502a6a1ed..8f4e19b1dd 100644
--- a/projects/CUDA/wrangle/PW.cu
+++ b/projects/CUDA/wrangle/PW.cu
@@ -144,7 +144,6 @@ struct ZSParticlesWrangler : zeno::INode {
             }
             if (newChns.size() > 0)
                 pars.append_channels(cudaPol, newChns);
-            props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
             if (_cuModule == nullptr) {
                 auto wrangleKernelPtxs = cudri::load_all_ptx_files_at();
diff --git a/projects/CUDA/zpc b/projects/CUDA/zpc
index c4092d787e..d260e81a9f 160000
--- a/projects/CUDA/zpc
+++ b/projects/CUDA/zpc
@@ -1 +1 @@
-Subproject commit c4092d787e119c2c7b0d4e2b9646e2d32bc074f2
+Subproject commit d260e81a9ffe21c3533d56cfe7f6e741f9d9df15
diff --git a/projects/CuLagrange/fem/Solver.cuh b/projects/CuLagrange/fem/Solver.cuh
index faa7aa5043..6bc79154fb 100644
--- a/projects/CuLagrange/fem/Solver.cuh
+++ b/projects/CuLagrange/fem/Solver.cuh
@@ -3,23 +3,68 @@
 #include "Structures.hpp"
 #include "zensim/container/Bvh.hpp"
 #include "zensim/container/Bvs.hpp"
+#include "zensim/container/Bvtt.hpp"
 #include "zensim/container/HashTable.hpp"
 #include "zensim/container/Vector.hpp"
 #include "zensim/cuda/Cuda.h"
 #include "zensim/cuda/execution/ExecutionPolicy.cuh"
 #include "zensim/math/Vec.h"
 #include <zeno/types/PrimitiveObject.h>
-#include <zeno/utils/logger.h>
+#include <zeno/utils/log.h>
 #include <zeno/zeno.h>
 
 namespace zeno {
 
+template <int n = 1> struct HessianPiece {
+    using HessT = zs::vec<float, n * 3, n * 3>;
+    using IndsT = zs::vec<int, n>;
+    zs::Vector<HessT> hess;
+    zs::Vector<IndsT> inds;
+    zs::Vector<int> cnt;
+    using allocator_t = typename zs::Vector<int>::allocator_type;
+    void init(const allocator_t &allocator, std::size_t size = 0) {
+        hess = zs::Vector<HessT>{allocator, size};
+        inds = zs::Vector<IndsT>{allocator, size};
+        cnt = zs::Vector<int>{allocator, 1};
+    }
+    int count() const {
+        return cnt.getVal();
+    }
+    int increaseCount(int inc) {
+        int v = cnt.getVal();
+        cnt.setVal(v + inc);
+        hess.resize((std::size_t)(v + inc));
+        inds.resize((std::size_t)(v + inc));
+        return v;
+    }
+    void reset(bool setZero = true, std::size_t count = 0) {
+        if (setZero)
+            hess.reset(0);
+        cnt.setVal(count);
+    }
+};
+template <typename HessianPieceT> struct HessianView {
+    static constexpr bool is_const_structure = std::is_const_v<HessianPieceT>;
+    using HT = typename HessianPieceT::HessT;
+    using IT = typename HessianPieceT::IndsT;
+    zs::conditional_t<is_const_structure, const HT *, HT *> hess;
+    zs::conditional_t<is_const_structure, const IT *, IT *> inds;
+    zs::conditional_t<is_const_structure, const int *, int *> cnt;
+};
+template <zs::execspace_e space, int n> HessianView<HessianPiece<n>> proxy(HessianPiece<n> &hp) {
+    return HessianView<HessianPiece<n>>{hp.hess.data(), hp.inds.data(), hp.cnt.data()};
+}
+template <zs::execspace_e space, int n> HessianView<const HessianPiece<n>> proxy(const HessianPiece<n> &hp) {
+    return HessianView<const HessianPiece<n>>{hp.hess.data(), hp.inds.data(), hp.cnt.data()};
+}
+
 struct IPCSystem : IObject {
     using T = double;
     using Ti = zs::conditional_t<zs::is_same_v<T, double>, zs::i64, zs::i32>;
     using dtiles_t = zs::TileVector<T, 32>;
     using tiles_t = typename ZenoParticles::particles_t;
     using vec3 = zs::vec<T, 3>;
+    using vec3f = zs::vec<float, 3>;
     using ivec3 = zs::vec<int, 3>;
     using ivec2 = zs::vec<int, 2>;
     using mat2 = zs::vec<T, 2, 2>;
@@ -32,6 +77,7 @@ struct IPCSystem : IObject {
     using dpair4_t = zs::vec<Ti, 4>;
     // using bvh_t = zeno::ZenoLBvh<3, 32, int, T>;
     using bvh_t = zs::LBvh<3, int, T>;
+    using bvfront_t = zs::BvttFront<int, int>;
     using bv_t = zs::AABBBox<3, T>;
 
     inline static const char s_meanMassTag[] = "MeanMass";
@@ -161,6 +207,9 @@ struct IPCSystem : IObject {
         return zs::make_tuple(nPP.getVal(), nPE.getVal(), nPT.getVal(), nEE.getVal(), nPPM.getVal(), nPEM.getVal(),
                               nEEM.getVal(), ncsPT.getVal(), ncsEE.getVal());
     }
+    auto getCollisionCnts() const {
+        return zs::make_tuple(ncsPT.getVal(), ncsEE.getVal());
+    }
     void findCollisionConstraints(zs::CudaExecutionPolicy &pol, T dHat, T xi = 0);
     void findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHat, T xi, bool withBoundary = false);
     void precomputeFrictions(zs::CudaExecutionPolicy &pol, T dHat, T xi = 0);
@@ -178,8 +227,14 @@ struct IPCSystem : IObject {
     void computeFrictionBarrierGradientAndHessian(zs::CudaExecutionPolicy &pol, const zs::SmallString &gTag,
                                                   bool includeHessian = true);
     // krylov solver
+    void convertHessian(zs::CudaExecutionPolicy &pol);
     void project(zs::CudaExecutionPolicy &pol, const zs::SmallString tag);
+    void precondition(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString srcTag,
+                      const zs::SmallString dstTag);
     void precondition(zs::CudaExecutionPolicy &pol, const zs::SmallString srcTag, const zs::SmallString dstTag);
+
+    void multiply(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString dxTag,
+                  const zs::SmallString bTag);
     void multiply(zs::CudaExecutionPolicy &pol, const zs::SmallString dxTag, const zs::SmallString bTag);
     void cgsolve(zs::CudaExecutionPolicy &cudaPol);
     void groundIntersectionFreeStepsize(zs::CudaExecutionPolicy &pol, T &stepSize);
@@ -225,7 +280,7 @@ struct IPCSystem : IObject {
     //
     std::vector<PrimitiveHandle> prims;
     std::vector<PrimitiveHandle> auxPrims;
-    std::size_t coOffset, numDofs;
+    std::size_t coOffset, numDofs, numBouDofs;
     std::size_t sfOffset, seOffset, svOffset;
 
     // (scripted) collision objects
@@ -277,6 +332,13 @@ struct IPCSystem : IObject {
     zs::Vector<pair4_t> csPT, csEE;
     zs::Vector<int> ncsPT, ncsEE;
 
+    // for faster linear system solve
+    HessianPiece<1> hess1;
+    HessianPiece<2> hess2;
+    HessianPiece<3> hess3;
+    HessianPiece<4> hess4;
+    tiles_t cgtemp;
+
     // boundary contacts
     // auxiliary data (spatial acceleration)
     tiles_t stInds, seInds, svInds;
@@ -285,6 +347,9 @@ struct IPCSystem : IObject {
     bvs_t stBvs, seBvs;       // STQ
     bvh_t bouStBvh, bouSeBvh; // for collision objects
     bvs_t bouStBvs, bouSeBvs; // STQ
+    bvfront_t selfStFront, boundaryStFront;
+    bvfront_t selfSeFront, boundarySeFront;
+    bool frontManageRequired;
     T dt, framedt, curRatio;
 };
 
diff --git a/projects/CuLagrange/fem/SolverInit.cu b/projects/CuLagrange/fem/SolverInit.cu
index 20056e1959..dd15b1f2d3 100644
--- a/projects/CuLagrange/fem/SolverInit.cu
+++ b/projects/CuLagrange/fem/SolverInit.cu
@@ -218,6 +218,20 @@ void IPCSystem::initialize(zs::CudaExecutionPolicy &pol) {
     seInds = tiles_t{vtemp.get_allocator(), {{"inds", 2}}, seOffset};
     svInds = tiles_t{vtemp.get_allocator(), {{"inds", 1}}, svOffset};
 
+    auto deduce_node_cnt = [](std::size_t numLeaves) {
+        if (numLeaves <= 2)
+            return numLeaves;
+        return numLeaves * 2 - 1;
+    };
+    selfStFront = bvfront_t{(int)deduce_node_cnt(stInds.size()), (int)estNumCps, zs::memsrc_e::um, vtemp.devid()};
+    selfSeFront = bvfront_t{(int)deduce_node_cnt(seInds.size()), (int)estNumCps, zs::memsrc_e::um, vtemp.devid()};
+    if (coVerts) {
+        boundaryStFront =
+            bvfront_t{(int)deduce_node_cnt(coEles->size()), (int)estNumCps, zs::memsrc_e::um, vtemp.devid()};
+        boundarySeFront =
+            bvfront_t{(int)deduce_node_cnt(coEdges->size()), (int)estNumCps, zs::memsrc_e::um, vtemp.devid()};
+    }
+
     meanEdgeLength = averageSurfEdgeLength(pol);
     meanSurfaceArea = averageSurfArea(pol);
     avgNodeMass = averageNodalMass(pol);
@@ -286,8 +300,9 @@ IPCSystem::IPCSystem(std::vector<ZenoParticles *> zsprims, const typename IPCSys
       //
       temp{estNumCps, zs::memsrc_e::um, zsprims[0]->getParticles<true>().devid()}, csPT{estNumCps, zs::memsrc_e::um, 0},
       csEE{estNumCps, zs::memsrc_e::um, 0}, ncsPT{zsprims[0]->getParticles<true>().get_allocator(), 1},
-      ncsEE{zsprims[0]->getParticles<true>().get_allocator(), 1}, dt{dt}, framedt{dt}, curRatio{0},
-      estNumCps{estNumCps}, enableGround{withGround}, enableContact{withContact},
+      ncsEE{zsprims[0]->getParticles<true>().get_allocator(), 1},
+      //
+      dt{dt}, framedt{dt}, curRatio{0}, estNumCps{estNumCps}, enableGround{withGround}, enableContact{withContact},
       enableMollification{withMollification}, s_groundNormal{gn[0], gn[1], gn[2]},
       augLagCoeff{augLagCoeff}, pnRel{pnRel}, cgRel{cgRel}, PNCap{PNCap}, CGCap{CGCap}, CCDCap{CCDCap}, kappa{kappa0},
       kappa0{kappa0}, kappaMin{0}, kappaMax{kappa0}, fricMu{fricMu}, dHat{dHat_}, epsv{epsv_}, extForce{0, gravity, 0} {
@@ -300,11 +315,14 @@ IPCSystem::IPCSystem(std::vector<ZenoParticles *> zsprims, const typename IPCSys
         else if (primPtr->category == ZenoParticles::category_e::tet)
             prims.emplace_back(*primPtr, coOffset, sfOffset, seOffset, svOffset, zs::wrapv<4>{});
     }
-    fmt::print("num total obj <verts, surfV, surfE, surfT>: {}, {}, {}, {}\n", coOffset, svOffset, seOffset, sfOffset);
-
     numDofs = coOffset;
     if (coVerts)
         numDofs += coVerts->size();
+    numBouDofs = numDofs - coOffset;
+
+    fmt::print("num total obj <verts, bouVerts, surfV, surfE, surfT>: {}, {}, {}, {}, {}\n", coOffset, numBouDofs,
+               svOffset, seOffset, sfOffset);
+
     vtemp = dtiles_t{zsprims[0]->getParticles<true>().get_allocator(),
                      {{"grad", 3},
                       {"P", 9},
@@ -334,6 +352,18 @@ IPCSystem::IPCSystem(std::vector<ZenoParticles *> zsprims, const typename IPCSys
     // inertial hessian
     tempI = dtiles_t{vtemp.get_allocator(), {{"Hi", 9}}, coOffset};
 
+    // connect vtemp with "dir", "grad"
+    cgtemp = tiles_t{vtemp.get_allocator(),
+                     {{"P", 9},
+
+                      {"dir", 3},
+
+                      {"temp", 3},
+                      {"r", 3},
+                      {"p", 3},
+                      {"q", 3}},
+                     numDofs};
+
     auto cudaPol = zs::cuda_exec();
     // average edge length (for CCD filtering)
     initialize(cudaPol); // update vtemp, bvh, boxsize, targetGRes
@@ -463,28 +493,57 @@ void IPCSystem::reinitialize(zs::CudaExecutionPolicy &pol, typename IPCSystem::T
         }
 
     // spatial accel structs
+    frontManageRequired = true;
+#define init_front(sInds, front)                                                                                 \
+    {                                                                                                            \
+        auto numNodes = front.numNodes();                                                                        \
+        if (numNodes <= 2) {                                                                                     \
+            front.reserve(sInds.size() * numNodes);                                                              \
+            front.setCounter(sInds.size() * numNodes);                                                           \
+            pol(Collapse{sInds.size()}, [front = proxy<space>(selfStFront), numNodes] ZS_LAMBDA(int i) mutable { \
+                for (int j = 0; j != numNodes; ++j)                                                              \
+                    front.assign(i *numNodes + j, i, j);                                                         \
+            });                                                                                                  \
+        } else {                                                                                                 \
+            front.reserve(sInds.size());                                                                         \
+            front.setCounter(sInds.size());                                                                      \
+            pol(Collapse{sInds.size()},                                                                          \
+                [front = proxy<space>(front)] ZS_LAMBDA(int i) mutable { front.assign(i, i, 0); });              \
+        }                                                                                                        \
+    }
     {
         auto triBvs = retrieve_bounding_volumes(pol, vtemp, "xn", stInds, zs::wrapv<3>{}, 0);
         stBvh.build(pol, triBvs);
+        init_front(svInds, selfStFront);
+
         auto edgeBvs = retrieve_bounding_volumes(pol, vtemp, "xn", seInds, zs::wrapv<2>{}, 0);
         seBvh.build(pol, edgeBvs);
+        init_front(seInds, selfSeFront);
     }
     if (coVerts)
         if (coVerts->size()) {
             auto triBvs = retrieve_bounding_volumes(pol, vtemp, "xn", *coEles, zs::wrapv<3>{}, coOffset);
             bouStBvh.build(pol, triBvs);
+            init_front(svInds, boundaryStFront);
+
             auto edgeBvs = retrieve_bounding_volumes(pol, vtemp, "xn", *coEdges, zs::wrapv<2>{}, coOffset);
             bouSeBvh.build(pol, edgeBvs);
+            init_front(seInds, boundarySeFront);
         }
 
     updateWholeBoundingBoxSize(pol);
     /// update grad pn residual tolerance
     targetGRes = pnRel * std::sqrt(boxDiagSize2);
     // zeno::log_info("box diag size: {}, targetGRes: {}\n", std::sqrt(boxDiagSize2), targetGRes);
+
+    /// for faster linear solve
+    hess1.init(vtemp.get_allocator(), numDofs);
+    hess2.init(PP.get_allocator(), estNumCps);
+    hess3.init(PP.get_allocator(), estNumCps);
+    hess4.init(PP.get_allocator(), estNumCps);
 }
 void IPCSystem::suggestKappa(zs::CudaExecutionPolicy &pol) {
     using namespace zs;
-    constexpr auto space = execspace_e::cuda;
     auto cudaPol = zs::cuda_exec();
     if (kappa0 == 0) {
         /// kappaMin
@@ -594,9 +653,9 @@ void IPCSystem::updateVelocities(zs::CudaExecutionPolicy &pol) {
         auto newX = vtemp.pack<3>("xn", vi);
         auto dv = (newX - vtemp.pack<3>("xtilde", vi)) / dt;
         auto vn = vtemp.pack<3>("vn", vi);
+        if (dv.length() > 4)
+            dv = dv.normalized() * 4;
         vn += dv;
-        // if (vn.length() > 2)
-        //     vn = vn.normalized() * 2;
         int BCorder = vtemp("BCorder", vi);
         auto BCbasis = vtemp.pack<3, 3>("BCbasis", vi);
         auto projVec = [&BCbasis, BCorder](auto &dx) {
diff --git a/projects/CuLagrange/fem/SolverLinearSystem.cu b/projects/CuLagrange/fem/SolverLinearSystem.cu
index 11f9b23e64..fa147a6752 100644
--- a/projects/CuLagrange/fem/SolverLinearSystem.cu
+++ b/projects/CuLagrange/fem/SolverLinearSystem.cu
@@ -385,7 +385,6 @@ void computeElasticGradientAndHessianImpl(zs::CudaExecutionPolicy &cudaPol, cons
 void IPCSystem::computeElasticGradientAndHessian(zs::CudaExecutionPolicy &cudaPol, const zs::SmallString &gTag,
                                                  bool includeHessian) {
     using namespace zs;
-    constexpr auto space = execspace_e::cuda;
     for (auto &primHandle : prims) {
         match([&](auto &elasticModel) {
             computeElasticGradientAndHessianImpl(cudaPol, gTag, vtemp, primHandle, elasticModel, dt, projectDBC,
@@ -506,4 +505,232 @@ void IPCSystem::computeBoundaryBarrierGradientAndHessian(zs::CudaExecutionPolicy
     return;
 }
 
+void IPCSystem::convertHessian(zs::CudaExecutionPolicy &pol) {
+    using namespace zs;
+    constexpr execspace_e space = execspace_e::cuda;
+    constexpr auto execTag = wrapv<space>{};
+
+    hess1.reset(true, numDofs); // additive style
+    hess2.reset(false, 0);      // overwrite style
+    hess3.reset(false, 0);
+    hess4.reset(false, 0);
+    // inertial
+    pol(zs::range(coOffset), [tempI = proxy<space>({}, tempI), hess1 = proxy<space>(hess1)] __device__(int i) mutable {
+        auto Hi = tempI.template pack<3, 3>("Hi", i);
+        hess1.hess[i] = Hi;
+        hess1.inds[i][0] = i;
+    });
+
+    // elasticity
+    for (auto &primHandle : prims) {
+        auto &eles = primHandle.getEles();
+        // elasticity
+        if (primHandle.category == ZenoParticles::curve) {
+            if (primHandle.isBoundary() && !primHandle.isAuxiliary())
+                continue;
+            auto offset = hess2.increaseCount(eles.size());
+            pol(zs::range(eles.size()),
+                [etemp = proxy<space>({}, primHandle.etemp), eles = proxy<space>({}, eles), hess2 = proxy<space>(hess2),
+                 vOffset = primHandle.vOffset, offset] ZS_LAMBDA(int ei) mutable {
+                    auto He = etemp.template pack<6, 6>("He", ei);
+                    auto inds = eles.template pack<2>("inds", ei).template reinterpret_bits<int>() + vOffset;
+                    hess2.hess[offset + ei] = He;
+                    hess2.inds[offset + ei] = inds;
+                });
+        } else if (primHandle.category == ZenoParticles::surface) {
+            if (primHandle.isBoundary())
+                continue;
+            auto offset = hess3.increaseCount(eles.size());
+            pol(zs::range(eles.size()),
+                [etemp = proxy<space>({}, primHandle.etemp), eles = proxy<space>({}, eles), hess3 = proxy<space>(hess3),
+                 vOffset = primHandle.vOffset, offset] ZS_LAMBDA(int ei) mutable {
+                    auto He = etemp.template pack<9, 9>("He", ei);
+                    auto inds = eles.template pack<3>("inds", ei).template reinterpret_bits<int>() + vOffset;
+                    hess3.hess[offset + ei] = He;
+                    hess3.inds[offset + ei] = inds;
+                });
+        } else if (primHandle.category == ZenoParticles::tet) {
+            auto offset = hess4.increaseCount(eles.size());
+            pol(zs::range(eles.size()),
+                [etemp = proxy<space>({}, primHandle.etemp), eles = proxy<space>({}, eles), hess4 = proxy<space>(hess4),
+                 vOffset = primHandle.vOffset, offset] ZS_LAMBDA(int ei) mutable {
+                    auto He = etemp.template pack<12, 12>("He", ei);
+                    auto inds = eles.template pack<4>("inds", ei).template reinterpret_bits<int>() + vOffset;
+                    hess4.hess[offset + ei] = He;
+                    hess4.inds[offset + ei] = inds;
+                });
+        }
+        for (auto &primHandle : auxPrims) {
+            auto &eles = primHandle.getEles();
+            // soft bindings
+            if (primHandle.category == ZenoParticles::curve) {
+                auto offset = hess2.increaseCount(eles.size());
+                pol(zs::range(eles.size()),
+                    [etemp = proxy<space>({}, primHandle.etemp), eles = proxy<space>({}, eles),
+                     hess2 = proxy<space>(hess2), vOffset = primHandle.vOffset, offset] ZS_LAMBDA(int ei) mutable {
+                        auto He = etemp.template pack<6, 6>("He", ei);
+                        auto inds = eles.template pack<2>("inds", ei).template reinterpret_bits<int>() + vOffset;
+                        hess2.hess[offset + ei] = He;
+                        hess2.inds[offset + ei] = inds;
+                    });
+            }
+        }
+
+        // contacts
+        if (enableContact) {
+            auto numPP = nPP.getVal();
+            auto offset = hess2.increaseCount(numPP);
+            pol(zs::range(numPP), [tempPP = proxy<space>({}, tempPP), PP = proxy<space>(PP),
+                                   hess2 = proxy<space>(hess2), offset] ZS_LAMBDA(int ppi) mutable {
+                auto H = tempPP.template pack<6, 6>("H", ppi);
+                auto inds = PP[ppi];
+                hess2.hess[offset + ppi] = H;
+                hess2.inds[offset + ppi] = inds;
+            });
+
+            auto numPE = nPE.getVal();
+            offset = hess3.increaseCount(numPE);
+            pol(zs::range(numPE), [tempPE = proxy<space>({}, tempPE), PE = proxy<space>(PE),
+                                   hess3 = proxy<space>(hess3), offset] ZS_LAMBDA(int pei) mutable {
+                auto H = tempPE.template pack<9, 9>("H", pei);
+                auto inds = PE[pei];
+                hess3.hess[offset + pei] = H;
+                hess3.inds[offset + pei] = inds;
+            });
+
+            auto numPT = nPT.getVal();
+            offset = hess4.increaseCount(numPT);
+            pol(zs::range(numPT), [tempPT = proxy<space>({}, tempPT), PT = proxy<space>(PT),
+                                   hess4 = proxy<space>(hess4), offset] ZS_LAMBDA(int pti) mutable {
+                auto H = tempPT.template pack<12, 12>("H", pti);
+                auto inds = PT[pti];
+                hess4.hess[offset + pti] = H;
+                hess4.inds[offset + pti] = inds;
+            });
+
+            auto numEE = nEE.getVal();
+            offset = hess4.increaseCount(numEE);
+            pol(zs::range(numEE), [tempEE = proxy<space>({}, tempEE), EE = proxy<space>(EE),
+                                   hess4 = proxy<space>(hess4), offset] ZS_LAMBDA(int eei) mutable {
+                auto H = tempEE.template pack<12, 12>("H", eei);
+                auto inds = EE[eei];
+                hess4.hess[offset + eei] = H;
+                hess4.inds[offset + eei] = inds;
+            });
+
+            if (enableMollification) {
+                auto numEEM = nEEM.getVal();
+                offset = hess4.increaseCount(numEEM);
+                pol(zs::range(numEEM), [tempEEM = proxy<space>({}, tempEEM), EEM = proxy<space>(EEM),
+                                        hess4 = proxy<space>(hess4), offset] ZS_LAMBDA(int eemi) mutable {
+                    auto H = tempEEM.template pack<12, 12>("H", eemi);
+                    auto inds = EEM[eemi];
+                    hess4.hess[offset + eemi] = H;
+                    hess4.inds[offset + eemi] = inds;
+                });
+
+                auto numPPM = nPPM.getVal();
+                offset = hess4.increaseCount(numPPM);
+                pol(zs::range(numPPM), [tempPPM = proxy<space>({}, tempPPM), PPM = proxy<space>(PPM),
+                                        hess4 = proxy<space>(hess4), offset] ZS_LAMBDA(int ppmi) mutable {
+                    auto H = tempPPM.template pack<12, 12>("H", ppmi);
+                    auto inds = PPM[ppmi];
+                    hess4.hess[offset + ppmi] = H;
+                    hess4.inds[offset + ppmi] = inds;
+                });
+
+                auto numPEM = nPEM.getVal();
+                offset = hess4.increaseCount(numPEM);
+                pol(zs::range(numPEM), [tempPEM = proxy<space>({}, tempPEM), PEM = proxy<space>(PEM),
+                                        hess4 = proxy<space>(hess4), offset] ZS_LAMBDA(int pemi) mutable {
+                    auto H = tempPEM.template pack<12, 12>("H", pemi);
+                    auto inds = PEM[pemi];
+                    hess4.hess[offset + pemi] = H;
+                    hess4.inds[offset + pemi] = inds;
+                });
+            } // end mollification
+
+            if (s_enableFriction) {
+                if (fricMu != 0) {
+                    if (s_enableSelfFriction) {
+                        auto numFPP = nFPP.getVal();
+                        offset = hess2.increaseCount(numFPP);
+                        pol(zs::range(numFPP), [fricPP = proxy<space>({}, fricPP), FPP = proxy<space>(FPP),
+                                                hess2 = proxy<space>(hess2), offset] ZS_LAMBDA(int fppi) mutable {
+                            auto H = fricPP.template pack<6, 6>("H", fppi);
+                            auto inds = FPP[fppi];
+                            hess2.hess[offset + fppi] = H;
+                            hess2.inds[offset + fppi] = inds;
+                        });
+
+                        auto numFPE = nFPE.getVal();
+                        offset = hess3.increaseCount(numFPE);
+                        pol(zs::range(numFPE), [fricPE = proxy<space>({}, fricPE), FPE = proxy<space>(FPE),
+                                                hess3 = proxy<space>(hess3), offset] ZS_LAMBDA(int fpei) mutable {
+                            auto H = fricPE.template pack<9, 9>("H", fpei);
+                            auto inds = FPE[fpei];
+                            hess3.hess[offset + fpei] = H;
+                            hess3.inds[offset + fpei] = inds;
+                        });
+
+                        auto numFPT = nFPT.getVal();
+                        offset = hess4.increaseCount(numFPT);
+                        pol(zs::range(numFPT), [fricPT = proxy<space>({}, fricPT), FPT = proxy<space>(FPT),
+                                                hess4 = proxy<space>(hess4), offset] ZS_LAMBDA(int fpti) mutable {
+                            auto H = fricPT.template pack<12, 12>("H", fpti);
+                            auto inds = FPT[fpti];
+                            hess4.hess[offset + fpti] = H;
+                            hess4.inds[offset + fpti] = inds;
+                        });
+
+                        auto numFEE = nFEE.getVal();
+                        offset = hess4.increaseCount(numFEE);
+                        pol(zs::range(numFEE), [fricEE = proxy<space>({}, fricEE), FEE = proxy<space>(FEE),
+                                                hess4 = proxy<space>(hess4), offset] ZS_LAMBDA(int feei) mutable {
+                            auto H = fricEE.template pack<12, 12>("H", feei);
+                            auto inds = FEE[feei];
+                            hess4.hess[offset + feei] = H;
+                            hess4.inds[offset + feei] = inds;
+                        });
+                    } // self friction
+                }     //fricmu
+            }         //enable friction
+        }             //enable contact
+
+        // ground contact
+        if (enableGround) {
+            for (auto &primHandle : prims) {
+                if (primHandle.isBoundary()) // skip soft boundary
+                    continue;
+                const auto &svs = primHandle.getSurfVerts();
+
+                pol(zs::range(svs.size()),
+                    [svtemp = proxy<space>({}, primHandle.svtemp), svs = proxy<space>({}, svs),
+                     svOffset = primHandle.svOffset, hess1 = proxy<space>(hess1), execTag] __device__(int svi) mutable {
+                        const auto vi = reinterpret_bits<int>(svs("inds", svi)) + svOffset;
+                        auto pbHess = svtemp.template pack<3, 3>("H", svi);
+                        for (int i = 0; i != 3; ++i)
+                            for (int j = 0; j != 3; ++j)
+                                atomic_add(execTag, &hess1.hess[vi](i, j), (float)pbHess(i, j));
+                        // hess1.hess[i] = Hi;
+                    });
+            }
+        }
+
+        // constraint hessian
+        if (!BCsatisfied) {
+            pol(zs::range(numDofs), [vtemp = proxy<space>({}, vtemp), hess1 = proxy<space>(hess1),
+                                     boundaryKappa = boundaryKappa, execTag] __device__(int vi) mutable {
+                auto w = vtemp("ws", vi);
+                int BCfixed = vtemp("BCfixed", vi);
+                if (!BCfixed) {
+                    int BCorder = vtemp("BCorder", vi);
+                    for (int d = 0; d != BCorder; ++d)
+                        atomic_add(execTag, &hess1.hess[vi](d, d), (float)(boundaryKappa * w));
+                }
+            });
+        }
+    }
+}
+
 } // namespace zeno
\ No newline at end of file
diff --git a/projects/CuLagrange/fem/SolverPipeline.cu b/projects/CuLagrange/fem/SolverPipeline.cu
index 03636fa797..8064ef2406 100644
--- a/projects/CuLagrange/fem/SolverPipeline.cu
+++ b/projects/CuLagrange/fem/SolverPipeline.cu
@@ -1,10 +1,14 @@
-#include "Utils.hpp"
 #include "Ccds.hpp"
 #include "Solver.cuh"
+#include "Utils.hpp"
 #include "zensim/geometry/Distance.hpp"
 #include "zensim/geometry/Friction.hpp"
 #include "zensim/geometry/SpatialQuery.hpp"
+#include "zensim/profile/CppTimers.hpp"
 #include "zensim/types/SmallVector.hpp"
+#include <cooperative_groups.h>
+
+#define PROFILE_IPC 0
 
 namespace zeno {
 
@@ -70,6 +74,17 @@ typename IPCSystem::T IPCSystem::constraintResidual(zs::CudaExecutionPolicy &pol
     return ret < 1e-6 ? 0 : ret;
 }
 
+// https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
+namespace cg = cooperative_groups;
+__forceinline__ __device__ int atomicAggInc(int *ctr) noexcept {
+    auto g = cg::coalesced_threads();
+    int warp_res;
+    if (g.thread_rank() == 0)
+        warp_res = atomicAdd(ctr, g.size());
+    return g.shfl(warp_res, 0) + g.thread_rank();
+}
+#define USE_COALESCED 1
+
 void IPCSystem::findCollisionConstraints(zs::CudaExecutionPolicy &pol, T dHat, T xi) {
     nPP.setVal(0);
     nPE.setVal(0);
@@ -81,6 +96,9 @@ void IPCSystem::findCollisionConstraints(zs::CudaExecutionPolicy &pol, T dHat, T
 
     ncsPT.setVal(0);
     ncsEE.setVal(0);
+
+    zs::CppTimer timer;
+    timer.tick();
     {
         auto triBvs = retrieve_bounding_volumes(pol, vtemp, "xn", stInds, zs::wrapv<3>{}, 0);
         stBvh.refit(pol, triBvs);
@@ -97,24 +115,33 @@ void IPCSystem::findCollisionConstraints(zs::CudaExecutionPolicy &pol, T dHat, T
             bouSeBvh.refit(pol, edgeBvs);
             findCollisionConstraintsImpl(pol, dHat, xi, true);
         }
+    auto [npt, nee] = getCollisionCnts();
+    timer.tock(fmt::format("dcd broad phase [pt, ee]({}, {})", npt, nee));
+
+    frontManageRequired = false;
 }
 void IPCSystem::findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHat, T xi, bool withBoundary) {
     using namespace zs;
     constexpr auto space = execspace_e::cuda;
 
+    pol.profile(PROFILE_IPC);
     /// pt
-    pol(Collapse{svInds.size()},
+    const auto &stbvh = withBoundary ? bouStBvh : stBvh;
+    auto &stfront = withBoundary ? boundaryStFront : selfStFront;
+    pol(Collapse{stfront.size()},
         [svInds = proxy<space>({}, svInds), eles = proxy<space>({}, withBoundary ? *coEles : stInds),
-         vtemp = proxy<space>({}, vtemp), bvh = proxy<space>(withBoundary ? bouStBvh : stBvh), PP = proxy<space>(PP),
-         nPP = proxy<space>(nPP), PE = proxy<space>(PE), nPE = proxy<space>(nPE), PT = proxy<space>(PT),
-         nPT = proxy<space>(nPT), csPT = proxy<space>(csPT), ncsPT = proxy<space>(ncsPT), dHat, xi,
-         thickness = xi + dHat, voffset = withBoundary ? coOffset : 0] __device__(int vi) mutable {
+         vtemp = proxy<space>({}, vtemp), bvh = proxy<space>(stbvh), front = proxy<space>(stfront),
+         PP = proxy<space>(PP), nPP = proxy<space>(nPP), PE = proxy<space>(PE), nPE = proxy<space>(nPE),
+         PT = proxy<space>(PT), nPT = proxy<space>(nPT), csPT = proxy<space>(csPT), ncsPT = proxy<space>(ncsPT), dHat,
+         xi, thickness = xi + dHat, voffset = withBoundary ? coOffset : 0,
+         frontManageRequired = frontManageRequired] __device__(int i) mutable {
+            auto vi = front.prim(i);
             vi = reinterpret_bits<int>(svInds("inds", vi));
             const auto dHat2 = zs::sqr(dHat + xi);
             int BCorder0 = vtemp("BCorder", vi);
             auto p = vtemp.template pack<3>("xn", vi);
             auto bv = bv_t{get_bounding_box(p - thickness, p + thickness)};
-            bvh.iter_neighbors(bv, [&](int stI) {
+            auto f = [&](int stI) {
                 auto tri = eles.template pack<3>("inds", stI).template reinterpret_bits<int>() + voffset;
                 if (vi == tri[0] || vi == tri[1] || vi == tri[2])
                     return;
@@ -186,21 +213,29 @@ void IPCSystem::findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHa
                 }
                 default: break;
                 }
-            });
+            };
+            if (frontManageRequired)
+                bvh.iter_neighbors(bv, i, front, f);
+            else
+                bvh.iter_neighbors(bv, front.node(i), f);
         });
+    if (frontManageRequired)
+        stfront.reorder(pol);
     /// ee
-    pol(Collapse{seInds.size()},
+    const auto &sebvh = withBoundary ? bouSeBvh : seBvh;
+    auto &sefront = withBoundary ? boundarySeFront : selfSeFront;
+    pol(Collapse{sefront.size()},
         [seInds = proxy<space>({}, seInds), sedges = proxy<space>({}, withBoundary ? *coEdges : seInds),
-         vtemp = proxy<space>({}, vtemp), bvh = proxy<space>(withBoundary ? bouSeBvh : seBvh), PP = proxy<space>(PP),
-         nPP = proxy<space>(nPP), PE = proxy<space>(PE), nPE = proxy<space>(nPE), EE = proxy<space>(EE),
-         nEE = proxy<space>(nEE),
+         vtemp = proxy<space>({}, vtemp), bvh = proxy<space>(sebvh), front = proxy<space>(sefront),
+         PP = proxy<space>(PP), nPP = proxy<space>(nPP), PE = proxy<space>(PE), nPE = proxy<space>(nPE),
+         EE = proxy<space>(EE), nEE = proxy<space>(nEE),
          // mollifier
          PPM = proxy<space>(PPM), nPPM = proxy<space>(nPPM), PEM = proxy<space>(PEM), nPEM = proxy<space>(nPEM),
          EEM = proxy<space>(EEM), nEEM = proxy<space>(nEEM), enableMollification = enableMollification,
          //
-         csEE = proxy<space>(csEE), ncsEE = proxy<space>(ncsEE), dHat, xi, thickness = xi + dHat,
-         voffset = withBoundary ? coOffset : 0] __device__(int sei) mutable {
-            const auto dHat2 = zs::sqr(dHat + xi);
+         csEE = proxy<space>(csEE), ncsEE = proxy<space>(ncsEE), dHat2 = zs::sqr(dHat + xi), xi, thickness = xi + dHat,
+         voffset = withBoundary ? coOffset : 0, frontManageRequired = frontManageRequired] __device__(int i) mutable {
+            auto sei = front.prim(i);
             auto eiInds = seInds.template pack<2>("inds", sei).template reinterpret_bits<int>();
             bool selfFixed = vtemp("BCorder", eiInds[0]) == 3 && vtemp("BCorder", eiInds[1]) == 3;
             auto v0 = vtemp.template pack<3>("xn", eiInds[0]);
@@ -209,7 +244,7 @@ void IPCSystem::findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHa
             auto rv1 = vtemp.template pack<3>("x0", eiInds[1]);
             auto [mi, ma] = get_bounding_box(v0, v1);
             auto bv = bv_t{mi - thickness, ma + thickness};
-            bvh.iter_neighbors(bv, [&](int sej) {
+            auto f = [&](int sej) {
                 if (voffset == 0 && sei < sej)
                     return;
                 auto ejInds = sedges.template pack<2>("inds", sej).template reinterpret_bits<int>() + voffset;
@@ -378,8 +413,15 @@ void IPCSystem::findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHa
                 }
                 default: break;
                 }
-            });
+            };
+            if (frontManageRequired)
+                bvh.iter_neighbors(bv, i, front, f);
+            else
+                bvh.iter_neighbors(bv, front.node(i), f);
         });
+    if (frontManageRequired)
+        sefront.reorder(pol);
+    pol.profile(false);
 }
 void IPCSystem::findCCDConstraints(zs::CudaExecutionPolicy &pol, T alpha, T xi) {
     ncsPT.setVal(0);
@@ -390,6 +432,9 @@ void IPCSystem::findCCDConstraints(zs::CudaExecutionPolicy &pol, T alpha, T xi)
         auto edgeBvs = retrieve_bounding_volumes(pol, vtemp, "xn", seInds, zs::wrapv<2>{}, vtemp, "dir", alpha, 0);
         seBvh.refit(pol, edgeBvs);
     }
+
+    zs::CppTimer timer;
+    timer.tick();
     findCCDConstraintsImpl(pol, alpha, xi, false);
 
     auto checkSize = [this](const auto &cnt, std::string_view msg) {
@@ -412,26 +457,31 @@ void IPCSystem::findCCDConstraints(zs::CudaExecutionPolicy &pol, T alpha, T xi)
             checkSize(ncsPT, "PT");
             checkSize(ncsEE, "EE");
         }
+    auto [npt, nee] = getCollisionCnts();
+    timer.tock(fmt::format("ccd broad phase [pt, ee]({}, {})", npt, nee));
 }
 void IPCSystem::findCCDConstraintsImpl(zs::CudaExecutionPolicy &pol, T alpha, T xi, bool withBoundary) {
     using namespace zs;
     constexpr auto space = execspace_e::cuda;
     const auto dHat2 = dHat * dHat;
 
+    pol.profile(PROFILE_IPC);
     /// pt
-    pol(Collapse{svInds.size()},
+    const auto &stbvh = withBoundary ? bouStBvh : stBvh;
+    auto &stfront = withBoundary ? boundaryStFront : selfStFront;
+    pol(Collapse{stfront.size()},
         [svInds = proxy<space>({}, svInds), eles = proxy<space>({}, withBoundary ? *coEles : stInds),
-         vtemp = proxy<space>({}, vtemp), bvh = proxy<space>(withBoundary ? bouStBvh : stBvh), PP = proxy<space>(PP),
-         nPP = proxy<space>(nPP), PE = proxy<space>(PE), nPE = proxy<space>(nPE), PT = proxy<space>(PT),
-         nPT = proxy<space>(nPT), csPT = proxy<space>(csPT), ncsPT = proxy<space>(ncsPT), xi, alpha,
-         voffset = withBoundary ? coOffset : 0] __device__(int vi) mutable {
+         vtemp = proxy<space>({}, vtemp), bvh = proxy<space>(stbvh), front = proxy<space>(stfront),
+         csPT = proxy<space>(csPT), ncsPT = proxy<space>(ncsPT), xi, alpha,
+         voffset = withBoundary ? coOffset : 0] __device__(int i) mutable {
+            auto vi = front.prim(i);
             vi = reinterpret_bits<int>(svInds("inds", vi));
             auto p = vtemp.template pack<3>("xn", vi);
             auto dir = vtemp.template pack<3>("dir", vi);
             auto bv = bv_t{get_bounding_box(p, p + alpha * dir)};
             bv._min -= xi;
             bv._max += xi;
-            bvh.iter_neighbors(bv, [&](int stI) {
+            bvh.iter_neighbors(bv, front.node(i), [&](int stI) {
                 auto tri = eles.template pack<3>("inds", stI).template reinterpret_bits<int>() + voffset;
                 if (vi == tri[0] || vi == tri[1] || vi == tri[2])
                     return;
@@ -443,12 +493,14 @@ void IPCSystem::findCCDConstraintsImpl(zs::CudaExecutionPolicy &pol, T alpha, T
             });
         });
     /// ee
-    pol(Collapse{seInds.size()},
+    const auto &sebvh = withBoundary ? bouSeBvh : seBvh;
+    auto &sefront = withBoundary ? boundarySeFront : selfSeFront;
+    pol(Collapse{sefront.size()},
         [seInds = proxy<space>({}, seInds), sedges = proxy<space>({}, withBoundary ? *coEdges : seInds),
-         vtemp = proxy<space>({}, vtemp), bvh = proxy<space>(withBoundary ? bouSeBvh : seBvh), PP = proxy<space>(PP),
-         nPP = proxy<space>(nPP), PE = proxy<space>(PE), nPE = proxy<space>(nPE), EE = proxy<space>(PT),
-         nEE = proxy<space>(nPT), csEE = proxy<space>(csEE), ncsEE = proxy<space>(ncsEE), xi, alpha,
-         voffset = withBoundary ? coOffset : 0] __device__(int sei) mutable {
+         vtemp = proxy<space>({}, vtemp), bvh = proxy<space>(sebvh), front = proxy<space>(sefront),
+         csEE = proxy<space>(csEE), ncsEE = proxy<space>(ncsEE), xi, alpha,
+         voffset = withBoundary ? coOffset : 0] __device__(int i) mutable {
+            auto sei = front.prim(i);
             auto eiInds = seInds.template pack<2>("inds", sei).template reinterpret_bits<int>();
             bool selfFixed = vtemp("BCorder", eiInds[0]) == 3 && vtemp("BCorder", eiInds[1]) == 3;
             auto v0 = vtemp.template pack<3>("xn", eiInds[0]);
@@ -460,7 +512,7 @@ void IPCSystem::findCCDConstraintsImpl(zs::CudaExecutionPolicy &pol, T alpha, T
             merge(bv, v1 + alpha * dir1);
             bv._min -= xi;
             bv._max += xi;
-            bvh.iter_neighbors(bv, [&](int sej) {
+            bvh.iter_neighbors(bv, front.node(i), [&](int sej) {
                 if (voffset == 0 && sei < sej)
                     return;
                 auto ejInds = sedges.template pack<2>("inds", sej).template reinterpret_bits<int>() + voffset;
@@ -473,6 +525,7 @@ void IPCSystem::findCCDConstraintsImpl(zs::CudaExecutionPolicy &pol, T alpha, T
                 csEE[atomic_add(exec_cuda, &ncsEE[0], 1)] = pair4_t{eiInds[0], eiInds[1], ejInds[0], ejInds[1]};
             });
         });
+    pol.profile(false);
 }
 void IPCSystem::precomputeFrictions(zs::CudaExecutionPolicy &pol, T dHat, T xi) {
     using namespace zs;
@@ -578,15 +631,38 @@ void IPCSystem::project(zs::CudaExecutionPolicy &pol, const zs::SmallString tag)
     using namespace zs;
     constexpr execspace_e space = execspace_e::cuda;
     // projection
-    pol(zs::range(numDofs), [vtemp = proxy<space>({}, vtemp), projectDBC = projectDBC, tag] ZS_LAMBDA(int vi) mutable {
-        int BCfixed = vtemp("BCfixed", vi);
+    pol(zs::range(numDofs), [vtemp = proxy<space>({}, vtemp), projectDBC = projectDBC,
+                             tagOffset = vtemp.getPropertyOffset(tag), fixedOffset = vtemp.getPropertyOffset("BCfixed"),
+                             orderOffset = vtemp.getPropertyOffset("BCorder")] ZS_LAMBDA(int vi) mutable {
+        int BCfixed = vtemp(fixedOffset, vi);
         if (projectDBC || (!projectDBC && BCfixed)) {
-            int BCorder = vtemp("BCorder", vi);
+            int BCorder = vtemp(orderOffset, vi);
             for (int d = 0; d != BCorder; ++d)
-                vtemp(tag, d, vi) = 0;
+                vtemp(tagOffset + d, vi) = 0;
         }
     });
 }
+void IPCSystem::precondition(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString srcTag,
+                             const zs::SmallString dstTag) {
+    using namespace zs;
+    constexpr execspace_e space = execspace_e::cuda;
+    // precondition
+    //pol(zs::range(numDofs), [cgtemp = proxy<space>({}, cgtemp), srcTag, dstTag] ZS_LAMBDA(int vi) mutable {
+    //    cgtemp.template tuple<3>(dstTag, vi) =
+    //        cgtemp.template pack<3, 3>("P", vi) * cgtemp.template pack<3>(srcTag, vi);
+    //});
+    pol(zs::range(numDofs * 3), [cgtemp = proxy<space>({}, cgtemp), srcOffset = cgtemp.getPropertyOffset(srcTag),
+                                 dstOffset = cgtemp.getPropertyOffset(dstTag),
+                                 POffset = cgtemp.getPropertyOffset("P")] ZS_LAMBDA(int vi) mutable {
+        int d = vi % 3;
+        vi /= 3;
+        float sum = 0;
+        POffset += d * 3;
+        for (int j = 0; j != 3; ++j)
+            sum += cgtemp(POffset + j, vi) * cgtemp(srcOffset + j, vi);
+        cgtemp(dstOffset + d, vi) = sum;
+    });
+}
 void IPCSystem::precondition(zs::CudaExecutionPolicy &pol, const zs::SmallString srcTag, const zs::SmallString dstTag) {
     using namespace zs;
     constexpr execspace_e space = execspace_e::cuda;
@@ -596,6 +672,116 @@ void IPCSystem::precondition(zs::CudaExecutionPolicy &pol, const zs::SmallString
     });
 }
 
+void IPCSystem::multiply(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString dxTag,
+                         const zs::SmallString bTag) {
+    using namespace zs;
+    constexpr execspace_e space = execspace_e::cuda;
+    constexpr auto execTag = wrapv<space>{};
+    using T = typename RM_CVREF_T(cgtemp)::value_type;
+    // dx -> b
+    pol(range(numDofs), [execTag, cgtemp = proxy<space>({}, cgtemp), bTag] ZS_LAMBDA(int vi) mutable {
+        cgtemp.template tuple<3>(bTag, vi) = vec3f::zeros();
+    });
+    // hess1
+    pol(zs::range(numDofs), [execTag, hess1 = proxy<space>(hess1), cgtemp = proxy<space>({}, cgtemp),
+                             dxOffset = cgtemp.getPropertyOffset(dxTag),
+                             bOffset = cgtemp.getPropertyOffset(bTag)] __device__(int i) mutable {
+        auto H = hess1.hess[i];
+        zs::vec<float, 3> dx{cgtemp(dxOffset, i), cgtemp(dxOffset + 1, i), cgtemp(dxOffset + 2, i)};
+        // auto dx = cgtemp.template pack<3>(dxTag, i);
+        dx = H * dx;
+        for (int d = 0; d != 3; ++d)
+            atomic_add(execTag, &cgtemp(bOffset + d, i), dx(d));
+    });
+    // hess2
+    pol(Collapse{hess2.count(), 32}, [execTag, hess2 = proxy<space>(hess2), cgtemp = proxy<space>({}, cgtemp),
+                                      dxOffset = cgtemp.getPropertyOffset(dxTag),
+                                      bOffset = cgtemp.getPropertyOffset(bTag)] ZS_LAMBDA(int ei, int tid) mutable {
+        int rowid = tid / 5;
+        int colid = tid % 5;
+        auto inds = hess2.inds[ei];
+        auto H = hess2.hess[ei];
+        T entryH = 0, entryDx = 0, entryG = 0;
+        if (tid < 30) {
+            entryH = H.val(rowid * 6 + colid);
+            entryDx = cgtemp(dxOffset + colid % 3, inds[colid / 3]);
+            entryG = entryH * entryDx;
+            if (colid == 0) {
+                entryG += H.val(rowid * 6 + 5) * cgtemp(dxOffset + 2, inds[1]);
+            }
+        }
+        for (int iter = 1; iter <= 4; iter <<= 1) {
+            T tmp = __shfl_down_sync(0xFFFFFFFF, entryG, iter);
+            if (colid + iter < 5 && tid < 30)
+                entryG += tmp;
+        }
+        if (colid == 0 && rowid < 6)
+            atomic_add(execTag, &cgtemp(bOffset + rowid % 3, inds[rowid / 3]), entryG);
+    });
+    // hess3
+    {
+        auto numRows = hess3.count() * 9;
+        auto numWarps = (numRows + 3) / 4; // 8 threads per row
+        pol(Collapse{numWarps * 32}, [execTag, hess3 = proxy<space>(hess3), cgtemp = proxy<space>({}, cgtemp),
+                                      dxOffset = cgtemp.getPropertyOffset(dxTag),
+                                      bOffset = cgtemp.getPropertyOffset(bTag), numRows] ZS_LAMBDA(int tid) mutable {
+            int growid = tid / 8;
+            int rowid = growid % 9;
+            int i = growid / 9;
+            int colid = tid % 8;
+
+            auto inds = hess3.inds[i];
+            auto H = hess3.hess[i];
+            T entryG = 0;
+            if (growid < numRows) {
+                entryG = H.val(rowid * 9 + colid) * cgtemp(dxOffset + colid % 3, inds[colid / 3]);
+                if (colid == 0) {
+                    auto cid = colid + 8;
+                    entryG += H.val(rowid * 9 + cid) * cgtemp(dxOffset + cid % 3, inds[cid / 3]);
+                }
+            }
+            for (int iter = 1; iter <= 4; iter <<= 1) {
+                T tmp = __shfl_down_sync(0xFFFFFFFF, entryG, iter);
+                if (colid + iter < 8 && growid < numRows)
+                    entryG += tmp;
+            }
+            if (colid == 0 && growid < numRows)
+                atomic_add(execTag, &cgtemp(bOffset + rowid % 3, inds[rowid / 3]), entryG);
+        });
+    }
+    // hess4
+    {
+        // 0, 1, ..., 7, 0, 1, 2, 3
+        pol(Collapse{hess4.count(), 32 * 3},
+            [execTag, hess4 = proxy<space>(hess4), cgtemp = proxy<space>({}, cgtemp),
+             dxOffset = cgtemp.getPropertyOffset(dxTag),
+             bOffset = cgtemp.getPropertyOffset(bTag)] ZS_LAMBDA(int i, int tid) mutable {
+                int rowid = tid / 8;
+                int colid = tid % 8;
+
+                auto inds = hess4.inds[i];
+                auto H = hess4.hess[i];
+                T entryH = 0, entryDx = 0, entryG = 0;
+                {
+                    entryH = H.val(rowid * 12 + colid);
+                    entryDx = cgtemp(dxOffset + colid % 3, inds[colid / 3]);
+                    entryG = entryH * entryDx;
+                    if (colid < 4) {
+                        auto cid = colid + 8;
+                        entryG += H.val(rowid * 12 + cid) * cgtemp(dxOffset + cid % 3, inds[cid / 3]);
+                    }
+                }
+                for (int iter = 1; iter <= 4; iter <<= 1) {
+                    T tmp = __shfl_down_sync(0xFFFFFFFF, entryG, iter);
+                    if (colid + iter < 8)
+                        entryG += tmp;
+                }
+                if (colid == 0)
+                    atomic_add(execTag, &cgtemp(bOffset + rowid % 3, inds[rowid / 3]), entryG);
+            });
+    }
+}
+
 void IPCSystem::multiply(zs::CudaExecutionPolicy &pol, const zs::SmallString dxTag, const zs::SmallString bTag) {
     using namespace zs;
     constexpr execspace_e space = execspace_e::cuda;
@@ -1428,7 +1614,6 @@ void IPCSystem::multiply(zs::CudaExecutionPolicy &pol, const zs::SmallString dxT
     if (!BCsatisfied) {
         pol(range(numDofs), [execTag, vtemp = proxy<space>({}, vtemp), dxTag, bTag,
                              boundaryKappa = boundaryKappa] ZS_LAMBDA(int vi) mutable {
-            auto cons = vtemp.template pack<3>("cons", vi);
             auto dx = vtemp.template pack<3>(dxTag, vi);
             auto w = vtemp("ws", vi);
             int BCfixed = vtemp("BCfixed", vi);
@@ -1989,65 +2174,81 @@ void IPCSystem::cgsolve(zs::CudaExecutionPolicy &cudaPol) {
     using namespace zs;
     constexpr auto space = execspace_e::cuda;
 
-    // solve for A dir = grad;
-    cudaPol(zs::range(numDofs), [vtemp = proxy<space>({}, vtemp)] ZS_LAMBDA(int i) mutable {
-        vtemp.tuple<3>("dir", i) = vec3::zeros();
-        vtemp.tuple<3>("temp", i) = vec3::zeros();
-    });
+    /// copy diagonal block preconditioners
+    cudaPol.sync(false);
+    cudaPol(zs::range(numDofs), [vtemp = proxy<space>({}, vtemp), cgtemp = proxy<space>({}, cgtemp)] ZS_LAMBDA(
+                                    int i) mutable { cgtemp.tuple<9>("P", i) = vtemp.pack<3, 3>("P", i); });
+    /// solve for A dir = grad;
     // initial guess for hard boundary constraints
-    if (coVerts)
-        cudaPol(zs::range(coVerts->size()),
-                [vtemp = proxy<space>({}, vtemp), coOffset = coOffset, dt = dt] ZS_LAMBDA(int i) mutable {
-                    i += coOffset;
-                    vtemp.tuple<3>("dir", i) = (vtemp.pack<3>("xtilde", i) - vtemp.pack<3>("xn", i)) * dt;
-                });
+    cudaPol(zs::range(numDofs),
+            [cgtemp = proxy<space>({}, cgtemp), vtemp = proxy<space>({}, vtemp), coOffset = coOffset, dt = dt,
+             dirOffset = cgtemp.getPropertyOffset("dir"), xtildeOffset = vtemp.getPropertyOffset("xtilde"),
+             xnOffset = vtemp.getPropertyOffset("xn")] ZS_LAMBDA(int i) mutable {
+                if (i < coOffset) {
+                    cgtemp.tuple<3>(dirOffset, i) = vec3::zeros();
+                } else {
+                    cgtemp.tuple<3>(dirOffset, i) = (vtemp.pack<3>(xtildeOffset, i) - vtemp.pack<3>(xnOffset, i)) * dt;
+                }
+            });
     // temp = A * dir
-    multiply(cudaPol, "dir", "temp");
+    multiply(cudaPol, true_c, "dir", "temp");
     // r = grad - temp
-    cudaPol(zs::range(numDofs), [vtemp = proxy<space>({}, vtemp)] ZS_LAMBDA(int i) mutable {
-        vtemp.tuple<3>("r", i) = vtemp.pack<3>("grad", i) - vtemp.pack<3>("temp", i);
+    cudaPol(zs::range(numDofs), [vtemp = proxy<space>({}, vtemp), cgtemp = proxy<space>({}, cgtemp),
+                                 rOffset = cgtemp.getPropertyOffset("r"), gradOffset = vtemp.getPropertyOffset("grad"),
+                                 tempOffset = cgtemp.getPropertyOffset("temp")] ZS_LAMBDA(int i) mutable {
+        cgtemp.tuple<3>(rOffset, i) = vtemp.pack<3>(gradOffset, i) - cgtemp.pack<3>(tempOffset, i);
     });
     // project(cudaPol, "r");
-    precondition(cudaPol, "r", "q");
-    cudaPol(zs::range(numDofs), [vtemp = proxy<space>({}, vtemp)] ZS_LAMBDA(int i) mutable {
-        vtemp.tuple<3>("p", i) = vtemp.pack<3>("q", i);
+    precondition(cudaPol, true_c, "r", "q");
+    cudaPol(zs::range(numDofs), [cgtemp = proxy<space>({}, cgtemp), pOffset = cgtemp.getPropertyOffset("p"),
+                                 qOffset = cgtemp.getPropertyOffset("q")] ZS_LAMBDA(int i) mutable {
+        cgtemp.tuple<3>(pOffset, i) = cgtemp.pack<3>(qOffset, i);
     });
-    T zTrk = dot(cudaPol, vtemp, "r", "q");
-    auto residualPreconditionedNorm2 = zTrk;
-    auto localTol2 = cgRel * cgRel * residualPreconditionedNorm2;
+    double zTrk = dot(cudaPol, wrapt<double>{}, cgtemp, "r", "q");
+    double residualPreconditionedNorm2 = zTrk;
+    double localTol2 = cgRel * cgRel * residualPreconditionedNorm2;
     int iter = 0;
 
     //
     auto [npp, npe, npt, nee, nppm, npem, neem, ncspt, ncsee] = getCnts();
+    fmt::print("npp: {}, npe: {}, npt: {}, nee: {}, nppm: {}, npem: {}, neem: {}, ncspt: {}, ncsee: {}\n", npp, npe,
+               npt, nee, nppm, npem, neem, ncspt, ncsee);
 
+    CppTimer timer;
+    timer.tick();
     for (; iter != CGCap; ++iter) {
         if (iter % 50 == 0)
-            fmt::print("cg iter: {}, norm2: {} (zTrk: {}) npp: {}, npe: {}, "
-                       "npt: {}, nee: {}, nppm: {}, npem: {}, neem: {}, ncspt: "
-                       "{}, ncsee: {}\n",
-                       iter, residualPreconditionedNorm2, zTrk, npp, npe, npt, nee, nppm, npem, neem, ncspt, ncsee);
+            fmt::print("cg iter: {}, norm2: {} (zTrk: {})\n", iter, residualPreconditionedNorm2, zTrk);
 
         if (residualPreconditionedNorm2 <= localTol2)
             break;
-        multiply(cudaPol, "p", "temp");
+        multiply(cudaPol, true_c, "p", "temp");
         // project(cudaPol, "temp"); // need further checking hessian!
 
-        T alpha = zTrk / dot(cudaPol, vtemp, "temp", "p");
-        cudaPol(range(numDofs), [vtemp = proxy<space>({}, vtemp), alpha] ZS_LAMBDA(int vi) mutable {
-            vtemp.tuple<3>("dir", vi) = vtemp.pack<3>("dir", vi) + alpha * vtemp.pack<3>("p", vi);
-            vtemp.tuple<3>("r", vi) = vtemp.pack<3>("r", vi) - alpha * vtemp.pack<3>("temp", vi);
+        double alpha = zTrk / dot(cudaPol, wrapt<double>{}, cgtemp, "temp", "p");
+        cudaPol(range(numDofs), [cgtemp = proxy<space>({}, cgtemp), dirOffset = cgtemp.getPropertyOffset("dir"),
+                                 pOffset = cgtemp.getPropertyOffset("p"), rOffset = cgtemp.getPropertyOffset("r"),
+                                 tempOffset = cgtemp.getPropertyOffset("temp"), alpha] ZS_LAMBDA(int vi) mutable {
+            cgtemp.tuple<3>(dirOffset, vi) = cgtemp.pack<3>(dirOffset, vi) + alpha * cgtemp.pack<3>(pOffset, vi);
+            cgtemp.tuple<3>(rOffset, vi) = cgtemp.pack<3>(rOffset, vi) - alpha * cgtemp.pack<3>(tempOffset, vi);
         });
 
-        precondition(cudaPol, "r", "q");
-        auto zTrkLast = zTrk;
-        zTrk = dot(cudaPol, vtemp, "q", "r");
-        auto beta = zTrk / zTrkLast;
-        cudaPol(range(numDofs), [vtemp = proxy<space>({}, vtemp), beta] ZS_LAMBDA(int vi) mutable {
-            vtemp.tuple<3>("p", vi) = vtemp.pack<3>("q", vi) + beta * vtemp.pack<3>("p", vi);
+        precondition(cudaPol, true_c, "r", "q");
+        double zTrkLast = zTrk;
+        zTrk = dot(cudaPol, wrapt<double>{}, cgtemp, "q", "r");
+        double beta = zTrk / zTrkLast;
+        cudaPol(range(numDofs), [cgtemp = proxy<space>({}, cgtemp), beta, pOffset = cgtemp.getPropertyOffset("p"),
+                                 qOffset = cgtemp.getPropertyOffset("q")] ZS_LAMBDA(int vi) mutable {
+            cgtemp.tuple<3>(pOffset, vi) = cgtemp.pack<3>(qOffset, vi) + beta * cgtemp.pack<3>(pOffset, vi);
         });
 
         residualPreconditionedNorm2 = zTrk;
     } // end cg step
+    /// copy back results
+    cudaPol(zs::range(numDofs), [vtemp = proxy<space>({}, vtemp), cgtemp = proxy<space>({}, cgtemp)] ZS_LAMBDA(
+                                    int i) mutable { vtemp.tuple<3>("dir", i) = cgtemp.pack<3>("dir", i); });
+    cudaPol.sync(true);
+    timer.tock(fmt::format("{} cgiters", iter));
 }
 void IPCSystem::groundIntersectionFreeStepsize(zs::CudaExecutionPolicy &pol, T &stepSize) {
     using namespace zs;
@@ -2083,6 +2284,9 @@ void IPCSystem::intersectionFreeStepsize(zs::CudaExecutionPolicy &pol, T xi, T &
     Vector<T> alpha{vtemp.get_allocator(), 1};
     alpha.setVal(stepSize);
     auto npt = ncsPT.getVal();
+    CppTimer timer;
+    timer.tick();
+    pol.profile(PROFILE_IPC);
     pol(range(npt), [csPT = proxy<space>(csPT), vtemp = proxy<space>({}, vtemp), alpha = proxy<space>(alpha), stepSize,
                      xi, coOffset = (int)coOffset] __device__(int pti) {
         auto ids = csPT[pti];
@@ -2126,6 +2330,8 @@ void IPCSystem::intersectionFreeStepsize(zs::CudaExecutionPolicy &pol, T xi, T &
 #endif
             atomic_min(exec_cuda, &alpha[0], tmp);
     });
+    pol.profile(false);
+    timer.tock("ccd time");
     stepSize = alpha.getVal();
 }
 void IPCSystem::lineSearch(zs::CudaExecutionPolicy &cudaPol, T &alpha) {
@@ -2241,6 +2447,8 @@ void IPCSystem::newtonKrylov(zs::CudaExecutionPolicy &pol) {
             else
                 vtemp.tuple<9>("P", i) = mat3::identity();
         });
+        // prepare float edition
+        convertHessian(pol);
         // CG SOLVE
         cgsolve(pol);
         // ROTATE BACK
diff --git a/projects/CuLagrange/fem/SolverUtils.cuh b/projects/CuLagrange/fem/SolverUtils.cuh
index 5f1f72dc12..444798e361 100644
--- a/projects/CuLagrange/fem/SolverUtils.cuh
+++ b/projects/CuLagrange/fem/SolverUtils.cuh
@@ -88,31 +88,50 @@ retrieve_bounding_volumes(zs::CudaExecutionPolicy &pol, const TileVecT0 &verts,
     });
     return ret;
 }
-template <typename Op = std::plus<typename IPCSystem::T>>
-inline typename IPCSystem::T reduce(zs::CudaExecutionPolicy &cudaPol, const zs::Vector<typename IPCSystem::T> &res,
-                                    Op op = {}) {
+template <typename T, typename Op = std::plus<T>>
+inline T reduce(zs::CudaExecutionPolicy &cudaPol, const zs::Vector<T> &res, Op op = {}) {
     using namespace zs;
-    using T = typename IPCSystem::T;
     Vector<T> ret{res.get_allocator(), 1};
+    bool shouldSync = cudaPol.shouldSync();
+    cudaPol.sync(true);
     zs::reduce(cudaPol, std::begin(res), std::end(res), std::begin(ret), (T)0, op);
+    cudaPol.sync(shouldSync);
     return ret.getVal();
 }
-inline typename IPCSystem::T dot(zs::CudaExecutionPolicy &cudaPol, typename IPCSystem::dtiles_t &vertData,
-                                 const zs::SmallString tag0, const zs::SmallString tag1) {
+template <typename TT, typename T, typename AllocatorT>
+inline TT dot(zs::CudaExecutionPolicy &cudaPol, zs::wrapt<TT>, zs::TileVector<T, 32, AllocatorT> &vertData,
+              const zs::SmallString tag0, const zs::SmallString tag1) {
+    using namespace zs;
+    constexpr auto space = execspace_e::cuda;
+    Vector<TT> res{vertData.get_allocator(), count_warps(vertData.size())};
+    zs::memset(zs::mem_device, res.data(), 0, sizeof(TT) * count_warps(vertData.size()));
+    cudaPol(range(vertData.size()), [data = proxy<space>({}, vertData), res = proxy<space>(res), n = vertData.size(),
+                                     offset0 = vertData.getPropertyOffset(tag0),
+                                     offset1 = vertData.getPropertyOffset(tag1)] __device__(int pi) mutable {
+        auto v0 = data.template pack<3>(offset0, pi).template cast<TT>();
+        auto v1 = data.template pack<3>(offset1, pi).template cast<TT>();
+        auto v = v0.dot(v1);
+        reduce_to(pi, n, v, res[pi / 32]);
+    });
+    return reduce(cudaPol, res, thrust::plus<TT>{});
+}
+template <typename T, typename AllocatorT>
+inline T dot(zs::CudaExecutionPolicy &cudaPol, zs::TileVector<T, 32, AllocatorT> &vertData, const zs::SmallString tag0,
+             const zs::SmallString tag1) {
     using namespace zs;
     constexpr auto space = execspace_e::cuda;
     // Vector<double> res{vertData.get_allocator(), vertData.size()};
-    Vector<double> res{vertData.get_allocator(), count_warps(vertData.size())};
-    zs::memset(zs::mem_device, res.data(), 0, sizeof(double) * count_warps(vertData.size()));
+    Vector<T> res{vertData.get_allocator(), count_warps(vertData.size())};
+    zs::memset(zs::mem_device, res.data(), 0, sizeof(T) * count_warps(vertData.size()));
     cudaPol(range(vertData.size()), [data = proxy<space>({}, vertData), res = proxy<space>(res), tag0, tag1,
                                      n = vertData.size()] __device__(int pi) mutable {
-        auto v0 = data.pack<3>(tag0, pi);
-        auto v1 = data.pack<3>(tag1, pi);
+        auto v0 = data.template pack<3>(tag0, pi);
+        auto v1 = data.template pack<3>(tag1, pi);
         auto v = v0.dot(v1);
         // res[pi] = v;
         reduce_to(pi, n, v, res[pi / 32]);
     });
-    return reduce(cudaPol, res, thrust::plus<double>{});
+    return reduce(cudaPol, res, thrust::plus<T>{});
 }
 inline typename IPCSystem::T infNorm(zs::CudaExecutionPolicy &cudaPol, typename IPCSystem::dtiles_t &vertData,
                                      const zs::SmallString tag = "dir") {
@@ -121,9 +140,9 @@ inline typename IPCSystem::T infNorm(zs::CudaExecutionPolicy &cudaPol, typename
     constexpr auto space = execspace_e::cuda;
     Vector<T> res{vertData.get_allocator(), count_warps(vertData.size())};
     zs::memset(zs::mem_device, res.data(), 0, sizeof(T) * count_warps(vertData.size()));
-    cudaPol(range(vertData.size()), [data = proxy<space>({}, vertData), res = proxy<space>(res), tag,
-                                     n = vertData.size()] __device__(int pi) mutable {
-        auto v = data.pack<3>(tag, pi);
+    cudaPol(range(vertData.size()), [data = proxy<space>({}, vertData), res = proxy<space>(res), n = vertData.size(),
+                                     offset = vertData.getPropertyOffset(tag)] __device__(int pi) mutable {
+        auto v = data.pack<3>(offset, pi);
         auto val = v.abs().max();
 
         auto [mask, numValid] = warp_mask(pi, n);
diff --git a/projects/CuOcean/swe/FDGather.cu b/projects/CuOcean/swe/FDGather.cu
index da21c9846b..1fb12532eb 100644
--- a/projects/CuOcean/swe/FDGather.cu
+++ b/projects/CuOcean/swe/FDGather.cu
@@ -1,7 +1,9 @@
 #include "Structures.hpp"
 #include "zensim/Logger.hpp"
 #include "zensim/cuda/execution/ExecutionPolicy.cuh"
+#include "zensim/omp/execution/ExecutionPolicy.hpp"
 #include "zensim/types/Property.h"
+#include <zeno/types/ListObject.h>
 #include <zeno/types/NumericObject.h>
 #include <zeno/types/StringObject.h>
 
@@ -19,8 +21,8 @@ void edgeLoop(typename ZenoParticles::particles_t &prim, int nx, int ny, const s
     const SmallString tag = channel;
     prim.append_channels(
         pol, {{lTag.asString(), nchn}, {rTag.asString(), nchn}, {tTag.asString(), nchn}, {bTag.asString(), nchn}});
-    pol(Collapse{nx, ny},
-        [verts = proxy<space>({}, prim), nx, ny, lTag, rTag, tTag, bTag, tag] ZS_LAMBDA(int i, int j) mutable {
+    pol(Collapse{ny, nx},
+        [verts = proxy<space>({}, prim), nx, ny, lTag, rTag, tTag, bTag, tag] ZS_LAMBDA(int j, int i) mutable {
             size_t lidx = j * nx + math::max(i - 1, 0);
             size_t ridx = j * nx + math::min(i + 1, nx - 1);
             size_t tidx = math::min(j + 1, ny - 1) * nx + i;
@@ -34,6 +36,92 @@ void edgeLoop(typename ZenoParticles::particles_t &prim, int nx, int ny, const s
         });
 }
 template <int nchn>
+void checkEdgeLoop(PrimitiveObject *prim, typename ZenoParticles::particles_t &zsprim, int nx, int ny,
+                   const std::string &channel) {
+    using namespace zs;
+    constexpr auto space = execspace_e::host;
+    auto pol = omp_exec();
+    using T = conditional_t<nchn == 1, float, zeno::vec3f>;
+    auto &l = prim->add_attr<T>("l" + channel);
+    auto &r = prim->add_attr<T>("r" + channel);
+    auto &t = prim->add_attr<T>("t" + channel);
+    auto &b = prim->add_attr<T>("b" + channel);
+    auto &channels = prim->add_attr<T>(channel);
+
+    const SmallString lTag = std::string("l") + channel;
+    const SmallString rTag = std::string("r") + channel;
+    const SmallString tTag = std::string("t") + channel;
+    const SmallString bTag = std::string("b") + channel;
+    const SmallString tag = channel;
+    pol(Collapse{nx, ny}, [&, verts = proxy<space>({}, zsprim)](int i, int j) {
+        size_t lidx = j * nx + math::max(i - 1, 0);
+        size_t ridx = j * nx + math::min(i + 1, nx - 1);
+        size_t tidx = math::min(j + 1, ny - 1) * nx + i;
+        size_t bidx = math::max(j - 1, 0) * nx + i;
+        size_t idx = j * nx + i;
+
+        if constexpr (nchn == 1) {
+            auto dev_results = [&]() {
+                auto li = verts(lTag, idx);
+                auto ri = verts(rTag, idx);
+                auto ti = verts(tTag, idx);
+                auto bi = verts(bTag, idx);
+                auto inli = verts(tag, lidx);
+                auto inri = verts(tag, ridx);
+                auto inti = verts(tag, tidx);
+                auto inbi = verts(tag, bidx);
+                return zs::make_tuple(li, ri, ti, bi, inli, inri, inti, inbi);
+            };
+            auto host_results = [&]() {
+                auto li = l[idx];
+                auto ri = r[idx];
+                auto ti = t[idx];
+                auto bi = b[idx];
+                auto inli = channels[lidx];
+                auto inri = channels[ridx];
+                auto inti = channels[tidx];
+                auto inbi = channels[bidx];
+                return zs::make_tuple(li, ri, ti, bi, inli, inri, inti, inbi);
+            };
+            auto [da, db, dc, dd, dain, dbin, dcin, ddin] = dev_results();
+            auto [ha, hb, hc, hd, hain, hbin, hcin, hdin] = host_results();
+            if (da != ha || db != hb || dc != hc || dd != hd || dain != hain || dbin != hbin || dcin != hcin ||
+                ddin != hdin)
+                printf("damn wrong at <%d, %d>!\n\tdev: [%f(%f), %f(%f), %f(%f), %f(%f)]\n\thost_ref: [%f(%f), %f(%f), "
+                       "%f(%f), %f(%f)]\n",
+                       i, j, da, dain, db, dbin, dc, dcin, dd, ddin, ha, hain, hb, hbin, hc, hcin, hd, hdin);
+        } else if constexpr (nchn == 3) {
+            auto dev_results = [&]() {
+                auto li = verts.template pack<nchn>(lTag, idx).to_array();
+                auto ri = verts.template pack<nchn>(rTag, idx).to_array();
+                auto ti = verts.template pack<nchn>(tTag, idx).to_array();
+                auto bi = verts.template pack<nchn>(bTag, idx).to_array();
+                auto inli = verts.template pack<nchn>(tag, lidx).to_array();
+                auto inri = verts.template pack<nchn>(tag, ridx).to_array();
+                auto inti = verts.template pack<nchn>(tag, tidx).to_array();
+                auto inbi = verts.template pack<nchn>(tag, bidx).to_array();
+                return zs::make_tuple(li, ri, ti, bi, inli, inri, inti, inbi);
+            };
+            auto host_results = [&]() {
+                auto li = l[idx];
+                auto ri = r[idx];
+                auto ti = t[idx];
+                auto bi = b[idx];
+                auto inli = channels[lidx];
+                auto inri = channels[ridx];
+                auto inti = channels[tidx];
+                auto inbi = channels[bidx];
+                return zs::make_tuple(li, ri, ti, bi, inli, inri, inti, inbi);
+            };
+            auto [da, db, dc, dd, dain, dbin, dcin, ddin] = dev_results();
+            auto [ha, hb, hc, hd, hain, hbin, hcin, hdin] = host_results();
+            if (da != ha || db != hb || dc != hc || dd != hd || dain != hain || dbin != hbin || dcin != hcin ||
+                ddin != hdin)
+                printf("damn wrong vec3f! <%d, %d>\n", i, j);
+        }
+    });
+}
+template <int nchn>
 void edgeLoopSum(typename ZenoParticles::particles_t &prim, int nx, int ny, const std::string &channel,
                  const std::string &addChannel) {
     using namespace zs;
@@ -44,8 +132,8 @@ void edgeLoopSum(typename ZenoParticles::particles_t &prim, int nx, int ny, cons
     const SmallString tTag = std::string("t") + channel;
     const SmallString bTag = std::string("b") + channel;
     const SmallString tag = addChannel;
-    pol(Collapse{nx, ny},
-        [verts = proxy<space>({}, prim), nx, ny, lTag, rTag, tTag, bTag, tag] ZS_LAMBDA(int i, int j) mutable {
+    pol(Collapse{ny, nx},
+        [verts = proxy<space>({}, prim), nx, ny, lTag, rTag, tTag, bTag, tag] ZS_LAMBDA(int j, int i) mutable {
             size_t lidx = j * nx + math::max(i - 1, 0);
             size_t ridx = j * nx + math::min(i + 1, nx - 1);
             size_t tidx = math::min(j + 1, ny - 1) * nx + i;
@@ -70,8 +158,8 @@ void cornerLoop(typename ZenoParticles::particles_t &prim, int nx, int ny, const
     const SmallString tag = channel;
     prim.append_channels(
         pol, {{ltTag.asString(), nchn}, {rtTag.asString(), nchn}, {lbTag.asString(), nchn}, {rbTag.asString(), nchn}});
-    pol(Collapse{nx, ny},
-        [verts = proxy<space>({}, prim), nx, ny, ltTag, rtTag, lbTag, rbTag, tag] ZS_LAMBDA(int i, int j) mutable {
+    pol(Collapse{ny, nx},
+        [verts = proxy<space>({}, prim), nx, ny, ltTag, rtTag, lbTag, rbTag, tag] ZS_LAMBDA(int j, int i) mutable {
             size_t ltidx = math::min(j + 1, ny - 1) * nx + math::max(i - 1, 0);
             size_t rtidx = math::min(j + 1, ny - 1) * nx + math::min(i + 1, nx - 1);
             size_t lbidx = math::max(j - 1, 0) * nx + math::max(i - 1, 0);
@@ -95,8 +183,8 @@ void cornerLoopSum(typename ZenoParticles::particles_t &prim, int nx, int ny, co
     const SmallString lbTag = std::string("lb") + channel;
     const SmallString rbTag = std::string("rb") + channel;
     const SmallString tag = addChannel;
-    pol(Collapse{nx, ny},
-        [verts = proxy<space>({}, prim), nx, ny, ltTag, rtTag, lbTag, rbTag, tag] ZS_LAMBDA(int i, int j) mutable {
+    pol(Collapse{ny, nx},
+        [verts = proxy<space>({}, prim), nx, ny, ltTag, rtTag, lbTag, rbTag, tag] ZS_LAMBDA(int j, int i) mutable {
             size_t ltidx = math::min(j + 1, ny - 1) * nx + math::max(i - 1, 0);
             size_t rtidx = math::min(j + 1, ny - 1) * nx + math::min(i + 1, nx - 1);
             size_t lbidx = math::max(j - 1, 0) * nx + math::max(i - 1, 0);
@@ -152,6 +240,124 @@ ZENDEFNODE(ZSGather2DFiniteDifference, {
                                            {"zenofx"},
                                        });
 
+struct ZSCheckGather2DFiniteDifference : zeno::INode {
+    virtual void apply() override {
+        auto nx = get_input2<int>("nx");
+        auto ny = get_input2<int>("ny");
+        auto grid = get_input<ZenoParticles>("ZSParticles");
+        auto gridRef = get_input<PrimitiveObject>("grid");
+        auto attrT = get_input2<std::string>("attrT");
+        auto type = get_input2<std::string>("OpType");
+        auto channel = get_input2<std::string>("channel");
+
+        if (auto &verts_ = grid->getParticles(); verts_.hasProperty(channel)) {
+            auto verts = verts_.clone({zs::memsrc_e::host, -1});
+            if (type == "FIVE_STENCIL" || type == "NINE_STENCIL") {
+                puts("begin edge loop checking");
+                if (attrT == "float") {
+                    checkEdgeLoop<1>(gridRef.get(), verts, nx, ny, channel);
+                }
+                if (attrT == "vec3") {
+                    checkEdgeLoop<3>(gridRef.get(), verts, nx, ny, channel);
+                }
+                puts("done edge loop checking");
+            }
+#if 0
+            if (type == "NINE_STENCIL") {
+                if (attrT == "float") {
+                    checkCornerLoop<1>(gridRef.get(), verts, nx, ny, channel);
+                }
+                if (attrT == "vec3") {
+                    checkCornerLoop<3>(gridRef.get(), verts, nx, ny, channel);
+                }
+            }
+#endif
+        }
+
+        set_output("prim", std::move(grid));
+    }
+};
+
+ZENDEFNODE(ZSCheckGather2DFiniteDifference, {
+                                                {{"PrimitiveObject", "grid"},
+                                                 {"ZSParticles", "ZSParticles"},
+                                                 {"int", "nx", "1"},
+                                                 {"int", "ny", "1"},
+                                                 {"string", "channel", "pos"},
+                                                 {"enum vec3 float", "attrT", "float"},
+                                                 {"enum FIVE_STENCIL NINE_STENCIL", "OpType", "FIVE_STENCIL"}},
+                                                {{"ZSParticles", "prim"}},
+                                                {},
+                                                {"zenofx"},
+                                            });
+
+struct ZSCheckPrimAttribs : zeno::INode {
+    virtual void apply() override {
+        auto grid = get_input<ZenoParticles>("ZSParticles");
+        auto gridRef = get_input<PrimitiveObject>("grid");
+        auto attribs = get_input<zeno::ListObject>("attribs");
+
+        const auto &verts = grid->getParticles();
+        for (auto &attrib_ : attribs->get2<std::string>()) {
+            auto attrib = attrib_;
+            auto attribAct = attrib_;
+            if (attribAct == "pos")
+                attribAct = "x";
+            if (attribAct == "vel")
+                attribAct = "v";
+            if (!verts.hasProperty(attribAct) || !gridRef->has_attr(attrib))
+                throw std::runtime_error(fmt::format("about prop [{}], zspar [{}], ref [{}]\n", attrib,
+                                                     verts.hasProperty(attribAct), gridRef->has_attr(attrib)));
+            if (verts.size() != gridRef->size())
+                throw std::runtime_error("size mismatch!\n");
+            auto nchn = verts.getPropertySize(attribAct);
+            auto nchnRef = gridRef->attr_is<float>(attrib) ? 1 : 3;
+            if (nchn != nchnRef)
+                throw std::runtime_error("attrib dimension mismatch!\n");
+            auto compare = [&](auto dim_v) {
+                fmt::print("begin checking prim [{}] of size {}\n", attrib, nchn);
+                using namespace zs;
+                constexpr auto dim = RM_CVREF_T(dim_v)::value;
+                using T = conditional_t<dim == 1, float, zeno::vec3f>;
+                auto ompPol = omp_exec();
+                constexpr auto space = execspace_e::host;
+                ompPol(range(verts.size()),
+                       [&, verts = proxy<space>({}, verts), tag = SmallString{attribAct}, nchn](int vi) {
+                           float v, vref;
+                           for (int d = 0; d != nchn; ++d) {
+                               v = verts(tag, d, vi);
+                               if constexpr (dim == 1)
+                                   vref = gridRef->attr<T>(attrib)[vi];
+                               else
+                                   vref = gridRef->attr<T>(attrib)[vi][d];
+                               if (v != vref) {
+                                   fmt::print("damn this, {}-th vert prop [{}, {}] actual: {}, ref: {}\n", vi, attrib,
+                                              d, v, vref);
+                               }
+                           }
+                       });
+                fmt::print("done checking prim [{}]\n", attrib);
+            };
+            std::variant<zs::wrapv<1>, zs::wrapv<3>> tmp{};
+            if (nchn == 1)
+                tmp = zs::wrapv<1>{};
+            else
+                tmp = zs::wrapv<3>{};
+            zs::match(compare)(tmp);
+        }
+
+        set_output("prim", std::move(grid));
+    }
+};
+
+ZENDEFNODE(ZSCheckPrimAttribs,
+           {
+               {{"PrimitiveObject", "grid"}, {"ZSParticles", "ZSParticles"}, {"ListObject", "attribs"}},
+               {{"ZSParticles", "prim"}},
+               {},
+               {"zenofx"},
+           });
+
 struct ZSMomentumTransfer2DFiniteDifference : zeno::INode {
     void apply() override {
         auto nx = get_input2<int>("nx");
diff --git a/zeno/include/zeno/utils/vec.h b/zeno/include/zeno/utils/vec.h
index b6859f60a6..2e7c5d907c 100644
--- a/zeno/include/zeno/utils/vec.h
+++ b/zeno/include/zeno/utils/vec.h
@@ -33,6 +33,8 @@ struct vec : std::array<T, N> {
     }
   }
 
+/*
+  // will not be called for implicit or explicit conversion
   operator std::array<T, N>() const {
     std::array<T, N> res;
     for (size_t i = 0; i < N; i++) {
@@ -40,6 +42,7 @@ struct vec : std::array<T, N> {
     }
     return res;
   }
+*/
 
   template <class S>
   explicit vec(vec<N, S> const &x) {