[release] Merge branch 'master' of https://github.com/zenustech/zeno

zenustech · Sep 27, 2022 · c1e7afb · c1e7afb
2 parents 4ef9434 + 03a504c
commit c1e7afb
Show file tree

Hide file tree

Showing 15 changed files with 1,036 additions and 116 deletions.
diff --git a/projects/CUDA/CMakeLists.txt b/projects/CUDA/CMakeLists.txt
@@ -48,6 +48,7 @@ set_target_properties(zeno
 # wrangler
 target_sources(zeno PRIVATE
   utils/IndexBuckets.cu
+  utils/Primitives.cu
   wrangle/PW.cu
   wrangle/P2W.cu
   wrangle/PNW.cu

diff --git a/projects/CUDA/utils/Primitives.cu b/projects/CUDA/utils/Primitives.cu
@@ -0,0 +1,130 @@
+#include "Structures.hpp"
+#include "zensim/cuda/execution/ExecutionPolicy.cuh"
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <zeno/types/NumericObject.h>
+#include <zeno/types/PrimitiveObject.h>
+#include <zeno/utils/parallel_reduce.h>
+#include <zeno/utils/vec.h>
+#include <zeno/zeno.h>
+
+namespace zeno {
+
+/// utilities
+constexpr std::size_t count_warps(std::size_t n) noexcept {
+    return (n + 31) / 32;
+}
+constexpr int warp_index(int n) noexcept {
+    return n / 32;
+}
+constexpr auto warp_mask(int i, int n) noexcept {
+    int k = n % 32;
+    const int tail = n - k;
+    if (i < tail)
+        return zs::make_tuple(0xFFFFFFFFu, 32);
+    return zs::make_tuple(((unsigned)(1ull << k) - 1), k);
+}
+
+template <typename T, typename Op> __forceinline__ __device__ void reduce_to(int i, int n, T val, T &dst, Op op) {
+    auto [mask, numValid] = warp_mask(i, n);
+    __syncwarp(mask);
+    auto locid = threadIdx.x & 31;
+    for (int stride = 1; stride < 32; stride <<= 1) {
+        auto tmp = __shfl_down_sync(mask, val, stride);
+        if (locid + stride < numValid)
+            val = op(val, tmp);
+    }
+    if (locid == 0)
+        dst = val;
+}
+
+template <typename TransOp, typename ReduceOp>
+static float prim_reduce(typename ZenoParticles::particles_t &verts, float e, TransOp top, ReduceOp rop,
+                         std::string attrToReduce) {
+    using namespace zs;
+    constexpr auto space = execspace_e::cuda;
+    using T = typename ZenoParticles::particles_t::value_type;
+    auto nchn = verts.getPropertySize(attrToReduce);
+    auto offset = verts.getPropertyOffset(attrToReduce);
+    const auto nwarps = count_warps(verts.size());
+
+    auto cudaPol = cuda_exec().device(0);
+
+    Vector<float> res{verts.get_allocator(), nwarps};
+    // cudaPol(res, [e] ZS_LAMBDA(auto &v) { v = e; });
+    cudaPol(range(verts.size()), [res = proxy<space>(res), verts = proxy<space>({}, verts), offset, nwarps, nchn, top,
+                                  rop] ZS_LAMBDA(int i) mutable {
+        auto [mask, numValid] = warp_mask(i, nwarps);
+        auto locid = threadIdx.x & 31;
+        float v = top(verts(offset, i));
+        while (--nchn) {
+            v = rop(top(verts(offset++, i)), v);
+        }
+        reduce_to(i, nwarps, v, res[i / 32], rop);
+    });
+
+    Vector<float> ret{res.get_allocator(), 1};
+    zs::reduce(cudaPol, std::begin(res), std::end(res), std::begin(ret), e, rop);
+    return ret.getVal();
+}
+
+struct ZSPrimitiveReduction : zeno::INode {
+    struct pass_on {
+        constexpr auto operator()(auto v) const noexcept {
+            return v;
+        }
+    };
+    struct getabs {
+        constexpr auto operator()(auto v) const noexcept {
+            return zs::abs(v);
+        }
+    };
+    virtual void apply() override {
+        using namespace zs;
+        constexpr auto space = execspace_e::cuda;
+        auto prim = get_input<ZenoParticles>("ZSParticles");
+        auto &verts = prim->getParticles();
+        auto attrToReduce = get_input2<std::string>("attr");
+        if (attrToReduce == "pos")
+            attrToReduce = "x";
+        if (attrToReduce == "vel")
+            attrToReduce = "v";
+
+        if (!verts.hasProperty(attrToReduce))
+            throw std::runtime_error(fmt::format("verts do not have property [{}]\n", attrToReduce));
+
+        auto opStr = get_input2<std::string>("op");
+        zeno::NumericValue result;
+        if (opStr == "avg") {
+            result = prim_reduce(verts, 0, pass_on{}, std::plus<float>{}, attrToReduce) / verts.size();
+        } else if (opStr == "max") {
+            result = prim_reduce(verts, limits<float>::lowest(), pass_on{}, getmax<float>{}, attrToReduce);
+        } else if (opStr == "min") {
+            result = prim_reduce(verts, limits<float>::max(), pass_on{}, getmin<float>{}, attrToReduce);
+        } else if (opStr == "absmax") {
+            result = prim_reduce(verts, 0, getabs{}, getmax<float>{}, attrToReduce);
+        }
+
+        auto out = std::make_shared<zeno::NumericObject>();
+        out->set(result);
+        set_output("result", std::move(out));
+    }
+};
+ZENDEFNODE(ZSPrimitiveReduction, {/* inputs: */ {
+                                      "ZSParticles",
+                                      {"string", "attr", "pos"},
+                                      {"enum avg max min absmax", "op", "avg"},
+                                  },
+                                  /* outputs: */
+                                  {
+                                      "result",
+                                  },
+                                  /* params: */
+                                  {},
+                                  /* category: */
+                                  {
+                                      "primitive",
+                                  }});
+
+} // namespace zeno
diff --git a/projects/CUDA/wrangle/P2W.cu b/projects/CUDA/wrangle/P2W.cu
@@ -153,15 +153,19 @@ struct ZSParticlesTwoWrangler : zeno::INode {
                 auto key = name.substr(1);
                 if (!checkDuplication(key))
                     newChns.push_back(PropertyTag{key, dim});
+                if (dim != 3 && dim != 1)
+                    err_printf("ERROR: bad attribute dimension for primitive: %d\n", dim);
             }
             if (newChns.size() > 0)
                 pars.append_channels(cudaPol, newChns);
-            props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
             if (_cuModule == nullptr) {
                 auto wrangleKernelPtxs = cudri::load_all_ptx_files_at();
                 void *state;
                 cuLinkCreate(0, nullptr, nullptr, (CUlinkState *)&state);
+                // fmt::print(">>>>>>>>>>>>>>>>>>>");
+                // fmt::print("{}\n", jitCode);
+                // fmt::print("<<<<<<<<<<<<<<<<<<<");
 
                 auto jitSrc = cudri::compile_cuda_source_to_ptx(jitCode);
                 cuLinkAddData((CUlinkState)state, CU_JIT_INPUT_PTX, (void *)jitSrc.data(), (size_t)jitSrc.size(),
@@ -188,27 +192,29 @@ struct ZSParticlesTwoWrangler : zeno::INode {
             /// symbols
             for (int i = 0; i < prog->symbols.size(); i++) {
                 auto [name, dimid] = prog->symbols[i];
-#if 0
-        fmt::print("channel {}: {}.{}. chn offset: {} (of {})\n", i,
-                   name.c_str(), dimid, pars.getChannelOffset(name.substr(1)),
-                   pars.numChannels());
-#endif
-                void *parsPtr = nullptr;
-                if (name[1] == '@') {
+
+                unsigned short aux = name[1] == '@' ? 1 : 0;
+                float *parsPtr = nullptr;
+                if (aux) {
+                    aux = 1;
                     parsPtr = pars2.data();
                     name = name.substr(2);
                 } else {
                     parsPtr = pars.data();
                     name = name.substr(1);
                 }
-                const auto &curPars = name[1] == '@' ? pars2 : pars;
+                const auto &curPars = aux ? pars2 : pars;
                 haccessors[i] = zs::AccessorAoSoA{zs::aosoa_c,
                                                   parsPtr,
                                                   (unsigned short)unitBytes,
                                                   (unsigned short)tileSize,
                                                   (unsigned short)curPars.numChannels(),
                                                   (unsigned short)(curPars.getChannelOffset(name) + dimid),
-                                                  (unsigned short)0};
+                                                  aux};
+#if 0
+                fmt::print("channel {}: {}.{}. chn offset: {} (of {})\n", i, name.c_str(), dimid,
+                           curPars.getChannelOffset(name), curPars.numChannels());
+#endif
             }
             auto daccessors = haccessors.clone({zs::memsrc_e::device, 0});
 

diff --git a/projects/CUDA/wrangle/PNBVHW.cu b/projects/CUDA/wrangle/PNBVHW.cu
@@ -180,7 +180,6 @@ struct ZSParticleNeighborBvhWrangler : INode {
         }
         if (newChns.size() > 0)
             pars.append_channels(cudaPol, newChns);
-        props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
         if (_cuModule == nullptr) {
             /// execute on the current particle object

diff --git a/projects/CUDA/wrangle/PNW.cu b/projects/CUDA/wrangle/PNW.cu
@@ -184,7 +184,6 @@ struct ZSParticleNeighborWrangler : INode {
         }
         if (newChns.size() > 0)
             pars.append_channels(cudaPol, newChns);
-        props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
         if (_cuModule == nullptr) {
             /// execute on the current particle object

diff --git a/projects/CUDA/wrangle/PPW.cu b/projects/CUDA/wrangle/PPW.cu
@@ -171,7 +171,6 @@ struct ZSParticleParticleWrangler : INode {
         }
         if (newChns.size() > 0)
             pars.append_channels(cudaPol, newChns);
-        props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
         if (_cuModule == nullptr) {
             /// execute on the current particle object

diff --git a/projects/CUDA/wrangle/PW.cu b/projects/CUDA/wrangle/PW.cu
@@ -144,7 +144,6 @@ struct ZSParticlesWrangler : zeno::INode {
             }
             if (newChns.size() > 0)
                 pars.append_channels(cudaPol, newChns);
-            props.insert(std::end(props), std::begin(newChns), std::end(newChns));
 
             if (_cuModule == nullptr) {
                 auto wrangleKernelPtxs = cudri::load_all_ptx_files_at();

diff --git a/projects/CUDA/zpc b/projects/CUDA/zpc
diff --git a/projects/CuLagrange/fem/Solver.cuh b/projects/CuLagrange/fem/Solver.cuh
@@ -3,23 +3,68 @@
 #include "Structures.hpp"
 #include "zensim/container/Bvh.hpp"
 #include "zensim/container/Bvs.hpp"
+#include "zensim/container/Bvtt.hpp"
 #include "zensim/container/HashTable.hpp"
 #include "zensim/container/Vector.hpp"
 #include "zensim/cuda/Cuda.h"
 #include "zensim/cuda/execution/ExecutionPolicy.cuh"
 #include "zensim/math/Vec.h"
 #include <zeno/types/PrimitiveObject.h>
-#include <zeno/utils/logger.h>
+#include <zeno/utils/log.h>
 #include <zeno/zeno.h>
 
 namespace zeno {
 
+template <int n = 1> struct HessianPiece {
+    using HessT = zs::vec<float, n * 3, n * 3>;
+    using IndsT = zs::vec<int, n>;
+    zs::Vector<HessT> hess;
+    zs::Vector<IndsT> inds;
+    zs::Vector<int> cnt;
+    using allocator_t = typename zs::Vector<int>::allocator_type;
+    void init(const allocator_t &allocator, std::size_t size = 0) {
+        hess = zs::Vector<HessT>{allocator, size};
+        inds = zs::Vector<IndsT>{allocator, size};
+        cnt = zs::Vector<int>{allocator, 1};
+    }
+    int count() const {
+        return cnt.getVal();
+    }
+    int increaseCount(int inc) {
+        int v = cnt.getVal();
+        cnt.setVal(v + inc);
+        hess.resize((std::size_t)(v + inc));
+        inds.resize((std::size_t)(v + inc));
+        return v;
+    }
+    void reset(bool setZero = true, std::size_t count = 0) {
+        if (setZero)
+            hess.reset(0);
+        cnt.setVal(count);
+    }
+};
+template <typename HessianPieceT> struct HessianView {
+    static constexpr bool is_const_structure = std::is_const_v<HessianPieceT>;
+    using HT = typename HessianPieceT::HessT;
+    using IT = typename HessianPieceT::IndsT;
+    zs::conditional_t<is_const_structure, const HT *, HT *> hess;
+    zs::conditional_t<is_const_structure, const IT *, IT *> inds;
+    zs::conditional_t<is_const_structure, const int *, int *> cnt;
+};
+template <zs::execspace_e space, int n> HessianView<HessianPiece<n>> proxy(HessianPiece<n> &hp) {
+    return HessianView<HessianPiece<n>>{hp.hess.data(), hp.inds.data(), hp.cnt.data()};
+}
+template <zs::execspace_e space, int n> HessianView<const HessianPiece<n>> proxy(const HessianPiece<n> &hp) {
+    return HessianView<const HessianPiece<n>>{hp.hess.data(), hp.inds.data(), hp.cnt.data()};
+}
+
 struct IPCSystem : IObject {
     using T = double;
     using Ti = zs::conditional_t<zs::is_same_v<T, double>, zs::i64, zs::i32>;
     using dtiles_t = zs::TileVector<T, 32>;
     using tiles_t = typename ZenoParticles::particles_t;
     using vec3 = zs::vec<T, 3>;
+    using vec3f = zs::vec<float, 3>;
     using ivec3 = zs::vec<int, 3>;
     using ivec2 = zs::vec<int, 2>;
     using mat2 = zs::vec<T, 2, 2>;
@@ -32,6 +77,7 @@ struct IPCSystem : IObject {
     using dpair4_t = zs::vec<Ti, 4>;
     // using bvh_t = zeno::ZenoLBvh<3, 32, int, T>;
     using bvh_t = zs::LBvh<3, int, T>;
+    using bvfront_t = zs::BvttFront<int, int>;
     using bv_t = zs::AABBBox<3, T>;
 
     inline static const char s_meanMassTag[] = "MeanMass";
@@ -161,6 +207,9 @@ struct IPCSystem : IObject {
         return zs::make_tuple(nPP.getVal(), nPE.getVal(), nPT.getVal(), nEE.getVal(), nPPM.getVal(), nPEM.getVal(),
                               nEEM.getVal(), ncsPT.getVal(), ncsEE.getVal());
     }
+    auto getCollisionCnts() const {
+        return zs::make_tuple(ncsPT.getVal(), ncsEE.getVal());
+    }
     void findCollisionConstraints(zs::CudaExecutionPolicy &pol, T dHat, T xi = 0);
     void findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHat, T xi, bool withBoundary = false);
     void precomputeFrictions(zs::CudaExecutionPolicy &pol, T dHat, T xi = 0);
@@ -178,8 +227,14 @@ struct IPCSystem : IObject {
     void computeFrictionBarrierGradientAndHessian(zs::CudaExecutionPolicy &pol, const zs::SmallString &gTag,
                                                   bool includeHessian = true);
     // krylov solver
+    void convertHessian(zs::CudaExecutionPolicy &pol);
     void project(zs::CudaExecutionPolicy &pol, const zs::SmallString tag);
+    void precondition(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString srcTag,
+                      const zs::SmallString dstTag);
     void precondition(zs::CudaExecutionPolicy &pol, const zs::SmallString srcTag, const zs::SmallString dstTag);
+
+    void multiply(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString dxTag,
+                  const zs::SmallString bTag);
     void multiply(zs::CudaExecutionPolicy &pol, const zs::SmallString dxTag, const zs::SmallString bTag);
     void cgsolve(zs::CudaExecutionPolicy &cudaPol);
     void groundIntersectionFreeStepsize(zs::CudaExecutionPolicy &pol, T &stepSize);
@@ -225,7 +280,7 @@ struct IPCSystem : IObject {
     //
     std::vector<PrimitiveHandle> prims;
     std::vector<PrimitiveHandle> auxPrims;
-    std::size_t coOffset, numDofs;
+    std::size_t coOffset, numDofs, numBouDofs;
     std::size_t sfOffset, seOffset, svOffset;
 
     // (scripted) collision objects
@@ -277,6 +332,13 @@ struct IPCSystem : IObject {
     zs::Vector<pair4_t> csPT, csEE;
     zs::Vector<int> ncsPT, ncsEE;
 
+    // for faster linear system solve
+    HessianPiece<1> hess1;
+    HessianPiece<2> hess2;
+    HessianPiece<3> hess3;
+    HessianPiece<4> hess4;
+    tiles_t cgtemp;
+
     // boundary contacts
     // auxiliary data (spatial acceleration)
     tiles_t stInds, seInds, svInds;
@@ -285,6 +347,9 @@ struct IPCSystem : IObject {
     bvs_t stBvs, seBvs;       // STQ
     bvh_t bouStBvh, bouSeBvh; // for collision objects
     bvs_t bouStBvs, bouSeBvs; // STQ
+    bvfront_t selfStFront, boundaryStFront;
+    bvfront_t selfSeFront, boundarySeFront;
+    bool frontManageRequired;
     T dt, framedt, curRatio;
 };
+1 −1		include/zensim/CMakeLists.txt
+1 −1		include/zensim/Reflection.h
+5 −4		include/zensim/container/Bvtt.hpp
+15 −2		include/zensim/container/TileVector.hpp
+5 −0		include/zensim/cuda/execution/ExecutionPolicy.cuh
+5 −3		include/zensim/geometry/Distance.hpp
+3 −4		include/zensim/math/matrix/Eigen.hpp
+1 −1		include/zensim/math/matrix/QRSVD.hpp
+22 −12		include/zensim/memory/Allocator.h
+7 −0		include/zensim/meta/Meta.h
+9 −8		include/zensim/physics/ConstitutiveModel.hpp
+1 −1		include/zensim/physics/constitutive_models/NeoHookean.hpp
+2 −1		include/zensim/types/Iterator.h
+15 −11		include/zensim/types/Tuple.h