diff --git a/projects/CUDA/CMakeLists.txt b/projects/CUDA/CMakeLists.txt index 805b88cfc2..fb97d98aac 100644 --- a/projects/CUDA/CMakeLists.txt +++ b/projects/CUDA/CMakeLists.txt @@ -48,6 +48,7 @@ set_target_properties(zeno # wrangler target_sources(zeno PRIVATE utils/IndexBuckets.cu + utils/Primitives.cu wrangle/PW.cu wrangle/P2W.cu wrangle/PNW.cu diff --git a/projects/CUDA/utils/Primitives.cu b/projects/CUDA/utils/Primitives.cu new file mode 100644 index 0000000000..27c28dd8cb --- /dev/null +++ b/projects/CUDA/utils/Primitives.cu @@ -0,0 +1,130 @@ +#include "Structures.hpp" +#include "zensim/cuda/execution/ExecutionPolicy.cuh" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace zeno { + +/// utilities +constexpr std::size_t count_warps(std::size_t n) noexcept { + return (n + 31) / 32; +} +constexpr int warp_index(int n) noexcept { + return n / 32; +} +constexpr auto warp_mask(int i, int n) noexcept { + int k = n % 32; + const int tail = n - k; + if (i < tail) + return zs::make_tuple(0xFFFFFFFFu, 32); + return zs::make_tuple(((unsigned)(1ull << k) - 1), k); +} + +template __forceinline__ __device__ void reduce_to(int i, int n, T val, T &dst, Op op) { + auto [mask, numValid] = warp_mask(i, n); + __syncwarp(mask); + auto locid = threadIdx.x & 31; + for (int stride = 1; stride < 32; stride <<= 1) { + auto tmp = __shfl_down_sync(mask, val, stride); + if (locid + stride < numValid) + val = op(val, tmp); + } + if (locid == 0) + dst = val; +} + +template +static float prim_reduce(typename ZenoParticles::particles_t &verts, float e, TransOp top, ReduceOp rop, + std::string attrToReduce) { + using namespace zs; + constexpr auto space = execspace_e::cuda; + using T = typename ZenoParticles::particles_t::value_type; + auto nchn = verts.getPropertySize(attrToReduce); + auto offset = verts.getPropertyOffset(attrToReduce); + const auto nwarps = count_warps(verts.size()); + + auto cudaPol = cuda_exec().device(0); + + Vector res{verts.get_allocator(), nwarps}; + // cudaPol(res, [e] ZS_LAMBDA(auto &v) { v = e; }); + cudaPol(range(verts.size()), [res = proxy(res), verts = proxy({}, verts), offset, nwarps, nchn, top, + rop] ZS_LAMBDA(int i) mutable { + auto [mask, numValid] = warp_mask(i, nwarps); + auto locid = threadIdx.x & 31; + float v = top(verts(offset, i)); + while (--nchn) { + v = rop(top(verts(offset++, i)), v); + } + reduce_to(i, nwarps, v, res[i / 32], rop); + }); + + Vector ret{res.get_allocator(), 1}; + zs::reduce(cudaPol, std::begin(res), std::end(res), std::begin(ret), e, rop); + return ret.getVal(); +} + +struct ZSPrimitiveReduction : zeno::INode { + struct pass_on { + constexpr auto operator()(auto v) const noexcept { + return v; + } + }; + struct getabs { + constexpr auto operator()(auto v) const noexcept { + return zs::abs(v); + } + }; + virtual void apply() override { + using namespace zs; + constexpr auto space = execspace_e::cuda; + auto prim = get_input("ZSParticles"); + auto &verts = prim->getParticles(); + auto attrToReduce = get_input2("attr"); + if (attrToReduce == "pos") + attrToReduce = "x"; + if (attrToReduce == "vel") + attrToReduce = "v"; + + if (!verts.hasProperty(attrToReduce)) + throw std::runtime_error(fmt::format("verts do not have property [{}]\n", attrToReduce)); + + auto opStr = get_input2("op"); + zeno::NumericValue result; + if (opStr == "avg") { + result = prim_reduce(verts, 0, pass_on{}, std::plus{}, attrToReduce) / verts.size(); + } else if (opStr == "max") { + result = prim_reduce(verts, limits::lowest(), pass_on{}, getmax{}, attrToReduce); + } else if (opStr == "min") { + result = prim_reduce(verts, limits::max(), pass_on{}, getmin{}, attrToReduce); + } else if (opStr == "absmax") { + result = prim_reduce(verts, 0, getabs{}, getmax{}, attrToReduce); + } + + auto out = std::make_shared(); + out->set(result); + set_output("result", std::move(out)); + } +}; +ZENDEFNODE(ZSPrimitiveReduction, {/* inputs: */ { + "ZSParticles", + {"string", "attr", "pos"}, + {"enum avg max min absmax", "op", "avg"}, + }, + /* outputs: */ + { + "result", + }, + /* params: */ + {}, + /* category: */ + { + "primitive", + }}); + +} // namespace zeno \ No newline at end of file diff --git a/projects/CUDA/wrangle/P2W.cu b/projects/CUDA/wrangle/P2W.cu index 8e47c63a2b..1719363a45 100644 --- a/projects/CUDA/wrangle/P2W.cu +++ b/projects/CUDA/wrangle/P2W.cu @@ -153,15 +153,19 @@ struct ZSParticlesTwoWrangler : zeno::INode { auto key = name.substr(1); if (!checkDuplication(key)) newChns.push_back(PropertyTag{key, dim}); + if (dim != 3 && dim != 1) + err_printf("ERROR: bad attribute dimension for primitive: %d\n", dim); } if (newChns.size() > 0) pars.append_channels(cudaPol, newChns); - props.insert(std::end(props), std::begin(newChns), std::end(newChns)); if (_cuModule == nullptr) { auto wrangleKernelPtxs = cudri::load_all_ptx_files_at(); void *state; cuLinkCreate(0, nullptr, nullptr, (CUlinkState *)&state); + // fmt::print(">>>>>>>>>>>>>>>>>>>"); + // fmt::print("{}\n", jitCode); + // fmt::print("<<<<<<<<<<<<<<<<<<<"); auto jitSrc = cudri::compile_cuda_source_to_ptx(jitCode); cuLinkAddData((CUlinkState)state, CU_JIT_INPUT_PTX, (void *)jitSrc.data(), (size_t)jitSrc.size(), @@ -188,27 +192,29 @@ struct ZSParticlesTwoWrangler : zeno::INode { /// symbols for (int i = 0; i < prog->symbols.size(); i++) { auto [name, dimid] = prog->symbols[i]; -#if 0 - fmt::print("channel {}: {}.{}. chn offset: {} (of {})\n", i, - name.c_str(), dimid, pars.getChannelOffset(name.substr(1)), - pars.numChannels()); -#endif - void *parsPtr = nullptr; - if (name[1] == '@') { + + unsigned short aux = name[1] == '@' ? 1 : 0; + float *parsPtr = nullptr; + if (aux) { + aux = 1; parsPtr = pars2.data(); name = name.substr(2); } else { parsPtr = pars.data(); name = name.substr(1); } - const auto &curPars = name[1] == '@' ? pars2 : pars; + const auto &curPars = aux ? pars2 : pars; haccessors[i] = zs::AccessorAoSoA{zs::aosoa_c, parsPtr, (unsigned short)unitBytes, (unsigned short)tileSize, (unsigned short)curPars.numChannels(), (unsigned short)(curPars.getChannelOffset(name) + dimid), - (unsigned short)0}; + aux}; +#if 0 + fmt::print("channel {}: {}.{}. chn offset: {} (of {})\n", i, name.c_str(), dimid, + curPars.getChannelOffset(name), curPars.numChannels()); +#endif } auto daccessors = haccessors.clone({zs::memsrc_e::device, 0}); diff --git a/projects/CUDA/wrangle/PNBVHW.cu b/projects/CUDA/wrangle/PNBVHW.cu index 8c63716fc8..1d58879e83 100644 --- a/projects/CUDA/wrangle/PNBVHW.cu +++ b/projects/CUDA/wrangle/PNBVHW.cu @@ -180,7 +180,6 @@ struct ZSParticleNeighborBvhWrangler : INode { } if (newChns.size() > 0) pars.append_channels(cudaPol, newChns); - props.insert(std::end(props), std::begin(newChns), std::end(newChns)); if (_cuModule == nullptr) { /// execute on the current particle object diff --git a/projects/CUDA/wrangle/PNW.cu b/projects/CUDA/wrangle/PNW.cu index a1e64c8b71..26179a412c 100644 --- a/projects/CUDA/wrangle/PNW.cu +++ b/projects/CUDA/wrangle/PNW.cu @@ -184,7 +184,6 @@ struct ZSParticleNeighborWrangler : INode { } if (newChns.size() > 0) pars.append_channels(cudaPol, newChns); - props.insert(std::end(props), std::begin(newChns), std::end(newChns)); if (_cuModule == nullptr) { /// execute on the current particle object diff --git a/projects/CUDA/wrangle/PPW.cu b/projects/CUDA/wrangle/PPW.cu index 9a27e4f9d3..416f2dc962 100644 --- a/projects/CUDA/wrangle/PPW.cu +++ b/projects/CUDA/wrangle/PPW.cu @@ -171,7 +171,6 @@ struct ZSParticleParticleWrangler : INode { } if (newChns.size() > 0) pars.append_channels(cudaPol, newChns); - props.insert(std::end(props), std::begin(newChns), std::end(newChns)); if (_cuModule == nullptr) { /// execute on the current particle object diff --git a/projects/CUDA/wrangle/PW.cu b/projects/CUDA/wrangle/PW.cu index c502a6a1ed..8f4e19b1dd 100644 --- a/projects/CUDA/wrangle/PW.cu +++ b/projects/CUDA/wrangle/PW.cu @@ -144,7 +144,6 @@ struct ZSParticlesWrangler : zeno::INode { } if (newChns.size() > 0) pars.append_channels(cudaPol, newChns); - props.insert(std::end(props), std::begin(newChns), std::end(newChns)); if (_cuModule == nullptr) { auto wrangleKernelPtxs = cudri::load_all_ptx_files_at(); diff --git a/projects/CUDA/zpc b/projects/CUDA/zpc index c4092d787e..d260e81a9f 160000 --- a/projects/CUDA/zpc +++ b/projects/CUDA/zpc @@ -1 +1 @@ -Subproject commit c4092d787e119c2c7b0d4e2b9646e2d32bc074f2 +Subproject commit d260e81a9ffe21c3533d56cfe7f6e741f9d9df15 diff --git a/projects/CuLagrange/fem/Solver.cuh b/projects/CuLagrange/fem/Solver.cuh index faa7aa5043..6bc79154fb 100644 --- a/projects/CuLagrange/fem/Solver.cuh +++ b/projects/CuLagrange/fem/Solver.cuh @@ -3,23 +3,68 @@ #include "Structures.hpp" #include "zensim/container/Bvh.hpp" #include "zensim/container/Bvs.hpp" +#include "zensim/container/Bvtt.hpp" #include "zensim/container/HashTable.hpp" #include "zensim/container/Vector.hpp" #include "zensim/cuda/Cuda.h" #include "zensim/cuda/execution/ExecutionPolicy.cuh" #include "zensim/math/Vec.h" #include -#include +#include #include namespace zeno { +template struct HessianPiece { + using HessT = zs::vec; + using IndsT = zs::vec; + zs::Vector hess; + zs::Vector inds; + zs::Vector cnt; + using allocator_t = typename zs::Vector::allocator_type; + void init(const allocator_t &allocator, std::size_t size = 0) { + hess = zs::Vector{allocator, size}; + inds = zs::Vector{allocator, size}; + cnt = zs::Vector{allocator, 1}; + } + int count() const { + return cnt.getVal(); + } + int increaseCount(int inc) { + int v = cnt.getVal(); + cnt.setVal(v + inc); + hess.resize((std::size_t)(v + inc)); + inds.resize((std::size_t)(v + inc)); + return v; + } + void reset(bool setZero = true, std::size_t count = 0) { + if (setZero) + hess.reset(0); + cnt.setVal(count); + } +}; +template struct HessianView { + static constexpr bool is_const_structure = std::is_const_v; + using HT = typename HessianPieceT::HessT; + using IT = typename HessianPieceT::IndsT; + zs::conditional_t hess; + zs::conditional_t inds; + zs::conditional_t cnt; +}; +template HessianView> proxy(HessianPiece &hp) { + return HessianView>{hp.hess.data(), hp.inds.data(), hp.cnt.data()}; +} +template HessianView> proxy(const HessianPiece &hp) { + return HessianView>{hp.hess.data(), hp.inds.data(), hp.cnt.data()}; +} + struct IPCSystem : IObject { using T = double; using Ti = zs::conditional_t, zs::i64, zs::i32>; using dtiles_t = zs::TileVector; using tiles_t = typename ZenoParticles::particles_t; using vec3 = zs::vec; + using vec3f = zs::vec; using ivec3 = zs::vec; using ivec2 = zs::vec; using mat2 = zs::vec; @@ -32,6 +77,7 @@ struct IPCSystem : IObject { using dpair4_t = zs::vec; // using bvh_t = zeno::ZenoLBvh<3, 32, int, T>; using bvh_t = zs::LBvh<3, int, T>; + using bvfront_t = zs::BvttFront; using bv_t = zs::AABBBox<3, T>; inline static const char s_meanMassTag[] = "MeanMass"; @@ -161,6 +207,9 @@ struct IPCSystem : IObject { return zs::make_tuple(nPP.getVal(), nPE.getVal(), nPT.getVal(), nEE.getVal(), nPPM.getVal(), nPEM.getVal(), nEEM.getVal(), ncsPT.getVal(), ncsEE.getVal()); } + auto getCollisionCnts() const { + return zs::make_tuple(ncsPT.getVal(), ncsEE.getVal()); + } void findCollisionConstraints(zs::CudaExecutionPolicy &pol, T dHat, T xi = 0); void findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHat, T xi, bool withBoundary = false); void precomputeFrictions(zs::CudaExecutionPolicy &pol, T dHat, T xi = 0); @@ -178,8 +227,14 @@ struct IPCSystem : IObject { void computeFrictionBarrierGradientAndHessian(zs::CudaExecutionPolicy &pol, const zs::SmallString &gTag, bool includeHessian = true); // krylov solver + void convertHessian(zs::CudaExecutionPolicy &pol); void project(zs::CudaExecutionPolicy &pol, const zs::SmallString tag); + void precondition(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString srcTag, + const zs::SmallString dstTag); void precondition(zs::CudaExecutionPolicy &pol, const zs::SmallString srcTag, const zs::SmallString dstTag); + + void multiply(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString dxTag, + const zs::SmallString bTag); void multiply(zs::CudaExecutionPolicy &pol, const zs::SmallString dxTag, const zs::SmallString bTag); void cgsolve(zs::CudaExecutionPolicy &cudaPol); void groundIntersectionFreeStepsize(zs::CudaExecutionPolicy &pol, T &stepSize); @@ -225,7 +280,7 @@ struct IPCSystem : IObject { // std::vector prims; std::vector auxPrims; - std::size_t coOffset, numDofs; + std::size_t coOffset, numDofs, numBouDofs; std::size_t sfOffset, seOffset, svOffset; // (scripted) collision objects @@ -277,6 +332,13 @@ struct IPCSystem : IObject { zs::Vector csPT, csEE; zs::Vector ncsPT, ncsEE; + // for faster linear system solve + HessianPiece<1> hess1; + HessianPiece<2> hess2; + HessianPiece<3> hess3; + HessianPiece<4> hess4; + tiles_t cgtemp; + // boundary contacts // auxiliary data (spatial acceleration) tiles_t stInds, seInds, svInds; @@ -285,6 +347,9 @@ struct IPCSystem : IObject { bvs_t stBvs, seBvs; // STQ bvh_t bouStBvh, bouSeBvh; // for collision objects bvs_t bouStBvs, bouSeBvs; // STQ + bvfront_t selfStFront, boundaryStFront; + bvfront_t selfSeFront, boundarySeFront; + bool frontManageRequired; T dt, framedt, curRatio; }; diff --git a/projects/CuLagrange/fem/SolverInit.cu b/projects/CuLagrange/fem/SolverInit.cu index 20056e1959..dd15b1f2d3 100644 --- a/projects/CuLagrange/fem/SolverInit.cu +++ b/projects/CuLagrange/fem/SolverInit.cu @@ -218,6 +218,20 @@ void IPCSystem::initialize(zs::CudaExecutionPolicy &pol) { seInds = tiles_t{vtemp.get_allocator(), {{"inds", 2}}, seOffset}; svInds = tiles_t{vtemp.get_allocator(), {{"inds", 1}}, svOffset}; + auto deduce_node_cnt = [](std::size_t numLeaves) { + if (numLeaves <= 2) + return numLeaves; + return numLeaves * 2 - 1; + }; + selfStFront = bvfront_t{(int)deduce_node_cnt(stInds.size()), (int)estNumCps, zs::memsrc_e::um, vtemp.devid()}; + selfSeFront = bvfront_t{(int)deduce_node_cnt(seInds.size()), (int)estNumCps, zs::memsrc_e::um, vtemp.devid()}; + if (coVerts) { + boundaryStFront = + bvfront_t{(int)deduce_node_cnt(coEles->size()), (int)estNumCps, zs::memsrc_e::um, vtemp.devid()}; + boundarySeFront = + bvfront_t{(int)deduce_node_cnt(coEdges->size()), (int)estNumCps, zs::memsrc_e::um, vtemp.devid()}; + } + meanEdgeLength = averageSurfEdgeLength(pol); meanSurfaceArea = averageSurfArea(pol); avgNodeMass = averageNodalMass(pol); @@ -286,8 +300,9 @@ IPCSystem::IPCSystem(std::vector zsprims, const typename IPCSys // temp{estNumCps, zs::memsrc_e::um, zsprims[0]->getParticles().devid()}, csPT{estNumCps, zs::memsrc_e::um, 0}, csEE{estNumCps, zs::memsrc_e::um, 0}, ncsPT{zsprims[0]->getParticles().get_allocator(), 1}, - ncsEE{zsprims[0]->getParticles().get_allocator(), 1}, dt{dt}, framedt{dt}, curRatio{0}, - estNumCps{estNumCps}, enableGround{withGround}, enableContact{withContact}, + ncsEE{zsprims[0]->getParticles().get_allocator(), 1}, + // + dt{dt}, framedt{dt}, curRatio{0}, estNumCps{estNumCps}, enableGround{withGround}, enableContact{withContact}, enableMollification{withMollification}, s_groundNormal{gn[0], gn[1], gn[2]}, augLagCoeff{augLagCoeff}, pnRel{pnRel}, cgRel{cgRel}, PNCap{PNCap}, CGCap{CGCap}, CCDCap{CCDCap}, kappa{kappa0}, kappa0{kappa0}, kappaMin{0}, kappaMax{kappa0}, fricMu{fricMu}, dHat{dHat_}, epsv{epsv_}, extForce{0, gravity, 0} { @@ -300,11 +315,14 @@ IPCSystem::IPCSystem(std::vector zsprims, const typename IPCSys else if (primPtr->category == ZenoParticles::category_e::tet) prims.emplace_back(*primPtr, coOffset, sfOffset, seOffset, svOffset, zs::wrapv<4>{}); } - fmt::print("num total obj : {}, {}, {}, {}\n", coOffset, svOffset, seOffset, sfOffset); - numDofs = coOffset; if (coVerts) numDofs += coVerts->size(); + numBouDofs = numDofs - coOffset; + + fmt::print("num total obj : {}, {}, {}, {}, {}\n", coOffset, numBouDofs, + svOffset, seOffset, sfOffset); + vtemp = dtiles_t{zsprims[0]->getParticles().get_allocator(), {{"grad", 3}, {"P", 9}, @@ -334,6 +352,18 @@ IPCSystem::IPCSystem(std::vector zsprims, const typename IPCSys // inertial hessian tempI = dtiles_t{vtemp.get_allocator(), {{"Hi", 9}}, coOffset}; + // connect vtemp with "dir", "grad" + cgtemp = tiles_t{vtemp.get_allocator(), + {{"P", 9}, + + {"dir", 3}, + + {"temp", 3}, + {"r", 3}, + {"p", 3}, + {"q", 3}}, + numDofs}; + auto cudaPol = zs::cuda_exec(); // average edge length (for CCD filtering) initialize(cudaPol); // update vtemp, bvh, boxsize, targetGRes @@ -463,28 +493,57 @@ void IPCSystem::reinitialize(zs::CudaExecutionPolicy &pol, typename IPCSystem::T } // spatial accel structs + frontManageRequired = true; +#define init_front(sInds, front) \ + { \ + auto numNodes = front.numNodes(); \ + if (numNodes <= 2) { \ + front.reserve(sInds.size() * numNodes); \ + front.setCounter(sInds.size() * numNodes); \ + pol(Collapse{sInds.size()}, [front = proxy(selfStFront), numNodes] ZS_LAMBDA(int i) mutable { \ + for (int j = 0; j != numNodes; ++j) \ + front.assign(i *numNodes + j, i, j); \ + }); \ + } else { \ + front.reserve(sInds.size()); \ + front.setCounter(sInds.size()); \ + pol(Collapse{sInds.size()}, \ + [front = proxy(front)] ZS_LAMBDA(int i) mutable { front.assign(i, i, 0); }); \ + } \ + } { auto triBvs = retrieve_bounding_volumes(pol, vtemp, "xn", stInds, zs::wrapv<3>{}, 0); stBvh.build(pol, triBvs); + init_front(svInds, selfStFront); + auto edgeBvs = retrieve_bounding_volumes(pol, vtemp, "xn", seInds, zs::wrapv<2>{}, 0); seBvh.build(pol, edgeBvs); + init_front(seInds, selfSeFront); } if (coVerts) if (coVerts->size()) { auto triBvs = retrieve_bounding_volumes(pol, vtemp, "xn", *coEles, zs::wrapv<3>{}, coOffset); bouStBvh.build(pol, triBvs); + init_front(svInds, boundaryStFront); + auto edgeBvs = retrieve_bounding_volumes(pol, vtemp, "xn", *coEdges, zs::wrapv<2>{}, coOffset); bouSeBvh.build(pol, edgeBvs); + init_front(seInds, boundarySeFront); } updateWholeBoundingBoxSize(pol); /// update grad pn residual tolerance targetGRes = pnRel * std::sqrt(boxDiagSize2); // zeno::log_info("box diag size: {}, targetGRes: {}\n", std::sqrt(boxDiagSize2), targetGRes); + + /// for faster linear solve + hess1.init(vtemp.get_allocator(), numDofs); + hess2.init(PP.get_allocator(), estNumCps); + hess3.init(PP.get_allocator(), estNumCps); + hess4.init(PP.get_allocator(), estNumCps); } void IPCSystem::suggestKappa(zs::CudaExecutionPolicy &pol) { using namespace zs; - constexpr auto space = execspace_e::cuda; auto cudaPol = zs::cuda_exec(); if (kappa0 == 0) { /// kappaMin @@ -594,9 +653,9 @@ void IPCSystem::updateVelocities(zs::CudaExecutionPolicy &pol) { auto newX = vtemp.pack<3>("xn", vi); auto dv = (newX - vtemp.pack<3>("xtilde", vi)) / dt; auto vn = vtemp.pack<3>("vn", vi); + if (dv.length() > 4) + dv = dv.normalized() * 4; vn += dv; - // if (vn.length() > 2) - // vn = vn.normalized() * 2; int BCorder = vtemp("BCorder", vi); auto BCbasis = vtemp.pack<3, 3>("BCbasis", vi); auto projVec = [&BCbasis, BCorder](auto &dx) { diff --git a/projects/CuLagrange/fem/SolverLinearSystem.cu b/projects/CuLagrange/fem/SolverLinearSystem.cu index 11f9b23e64..fa147a6752 100644 --- a/projects/CuLagrange/fem/SolverLinearSystem.cu +++ b/projects/CuLagrange/fem/SolverLinearSystem.cu @@ -385,7 +385,6 @@ void computeElasticGradientAndHessianImpl(zs::CudaExecutionPolicy &cudaPol, cons void IPCSystem::computeElasticGradientAndHessian(zs::CudaExecutionPolicy &cudaPol, const zs::SmallString &gTag, bool includeHessian) { using namespace zs; - constexpr auto space = execspace_e::cuda; for (auto &primHandle : prims) { match([&](auto &elasticModel) { computeElasticGradientAndHessianImpl(cudaPol, gTag, vtemp, primHandle, elasticModel, dt, projectDBC, @@ -506,4 +505,232 @@ void IPCSystem::computeBoundaryBarrierGradientAndHessian(zs::CudaExecutionPolicy return; } +void IPCSystem::convertHessian(zs::CudaExecutionPolicy &pol) { + using namespace zs; + constexpr execspace_e space = execspace_e::cuda; + constexpr auto execTag = wrapv{}; + + hess1.reset(true, numDofs); // additive style + hess2.reset(false, 0); // overwrite style + hess3.reset(false, 0); + hess4.reset(false, 0); + // inertial + pol(zs::range(coOffset), [tempI = proxy({}, tempI), hess1 = proxy(hess1)] __device__(int i) mutable { + auto Hi = tempI.template pack<3, 3>("Hi", i); + hess1.hess[i] = Hi; + hess1.inds[i][0] = i; + }); + + // elasticity + for (auto &primHandle : prims) { + auto &eles = primHandle.getEles(); + // elasticity + if (primHandle.category == ZenoParticles::curve) { + if (primHandle.isBoundary() && !primHandle.isAuxiliary()) + continue; + auto offset = hess2.increaseCount(eles.size()); + pol(zs::range(eles.size()), + [etemp = proxy({}, primHandle.etemp), eles = proxy({}, eles), hess2 = proxy(hess2), + vOffset = primHandle.vOffset, offset] ZS_LAMBDA(int ei) mutable { + auto He = etemp.template pack<6, 6>("He", ei); + auto inds = eles.template pack<2>("inds", ei).template reinterpret_bits() + vOffset; + hess2.hess[offset + ei] = He; + hess2.inds[offset + ei] = inds; + }); + } else if (primHandle.category == ZenoParticles::surface) { + if (primHandle.isBoundary()) + continue; + auto offset = hess3.increaseCount(eles.size()); + pol(zs::range(eles.size()), + [etemp = proxy({}, primHandle.etemp), eles = proxy({}, eles), hess3 = proxy(hess3), + vOffset = primHandle.vOffset, offset] ZS_LAMBDA(int ei) mutable { + auto He = etemp.template pack<9, 9>("He", ei); + auto inds = eles.template pack<3>("inds", ei).template reinterpret_bits() + vOffset; + hess3.hess[offset + ei] = He; + hess3.inds[offset + ei] = inds; + }); + } else if (primHandle.category == ZenoParticles::tet) { + auto offset = hess4.increaseCount(eles.size()); + pol(zs::range(eles.size()), + [etemp = proxy({}, primHandle.etemp), eles = proxy({}, eles), hess4 = proxy(hess4), + vOffset = primHandle.vOffset, offset] ZS_LAMBDA(int ei) mutable { + auto He = etemp.template pack<12, 12>("He", ei); + auto inds = eles.template pack<4>("inds", ei).template reinterpret_bits() + vOffset; + hess4.hess[offset + ei] = He; + hess4.inds[offset + ei] = inds; + }); + } + for (auto &primHandle : auxPrims) { + auto &eles = primHandle.getEles(); + // soft bindings + if (primHandle.category == ZenoParticles::curve) { + auto offset = hess2.increaseCount(eles.size()); + pol(zs::range(eles.size()), + [etemp = proxy({}, primHandle.etemp), eles = proxy({}, eles), + hess2 = proxy(hess2), vOffset = primHandle.vOffset, offset] ZS_LAMBDA(int ei) mutable { + auto He = etemp.template pack<6, 6>("He", ei); + auto inds = eles.template pack<2>("inds", ei).template reinterpret_bits() + vOffset; + hess2.hess[offset + ei] = He; + hess2.inds[offset + ei] = inds; + }); + } + } + + // contacts + if (enableContact) { + auto numPP = nPP.getVal(); + auto offset = hess2.increaseCount(numPP); + pol(zs::range(numPP), [tempPP = proxy({}, tempPP), PP = proxy(PP), + hess2 = proxy(hess2), offset] ZS_LAMBDA(int ppi) mutable { + auto H = tempPP.template pack<6, 6>("H", ppi); + auto inds = PP[ppi]; + hess2.hess[offset + ppi] = H; + hess2.inds[offset + ppi] = inds; + }); + + auto numPE = nPE.getVal(); + offset = hess3.increaseCount(numPE); + pol(zs::range(numPE), [tempPE = proxy({}, tempPE), PE = proxy(PE), + hess3 = proxy(hess3), offset] ZS_LAMBDA(int pei) mutable { + auto H = tempPE.template pack<9, 9>("H", pei); + auto inds = PE[pei]; + hess3.hess[offset + pei] = H; + hess3.inds[offset + pei] = inds; + }); + + auto numPT = nPT.getVal(); + offset = hess4.increaseCount(numPT); + pol(zs::range(numPT), [tempPT = proxy({}, tempPT), PT = proxy(PT), + hess4 = proxy(hess4), offset] ZS_LAMBDA(int pti) mutable { + auto H = tempPT.template pack<12, 12>("H", pti); + auto inds = PT[pti]; + hess4.hess[offset + pti] = H; + hess4.inds[offset + pti] = inds; + }); + + auto numEE = nEE.getVal(); + offset = hess4.increaseCount(numEE); + pol(zs::range(numEE), [tempEE = proxy({}, tempEE), EE = proxy(EE), + hess4 = proxy(hess4), offset] ZS_LAMBDA(int eei) mutable { + auto H = tempEE.template pack<12, 12>("H", eei); + auto inds = EE[eei]; + hess4.hess[offset + eei] = H; + hess4.inds[offset + eei] = inds; + }); + + if (enableMollification) { + auto numEEM = nEEM.getVal(); + offset = hess4.increaseCount(numEEM); + pol(zs::range(numEEM), [tempEEM = proxy({}, tempEEM), EEM = proxy(EEM), + hess4 = proxy(hess4), offset] ZS_LAMBDA(int eemi) mutable { + auto H = tempEEM.template pack<12, 12>("H", eemi); + auto inds = EEM[eemi]; + hess4.hess[offset + eemi] = H; + hess4.inds[offset + eemi] = inds; + }); + + auto numPPM = nPPM.getVal(); + offset = hess4.increaseCount(numPPM); + pol(zs::range(numPPM), [tempPPM = proxy({}, tempPPM), PPM = proxy(PPM), + hess4 = proxy(hess4), offset] ZS_LAMBDA(int ppmi) mutable { + auto H = tempPPM.template pack<12, 12>("H", ppmi); + auto inds = PPM[ppmi]; + hess4.hess[offset + ppmi] = H; + hess4.inds[offset + ppmi] = inds; + }); + + auto numPEM = nPEM.getVal(); + offset = hess4.increaseCount(numPEM); + pol(zs::range(numPEM), [tempPEM = proxy({}, tempPEM), PEM = proxy(PEM), + hess4 = proxy(hess4), offset] ZS_LAMBDA(int pemi) mutable { + auto H = tempPEM.template pack<12, 12>("H", pemi); + auto inds = PEM[pemi]; + hess4.hess[offset + pemi] = H; + hess4.inds[offset + pemi] = inds; + }); + } // end mollification + + if (s_enableFriction) { + if (fricMu != 0) { + if (s_enableSelfFriction) { + auto numFPP = nFPP.getVal(); + offset = hess2.increaseCount(numFPP); + pol(zs::range(numFPP), [fricPP = proxy({}, fricPP), FPP = proxy(FPP), + hess2 = proxy(hess2), offset] ZS_LAMBDA(int fppi) mutable { + auto H = fricPP.template pack<6, 6>("H", fppi); + auto inds = FPP[fppi]; + hess2.hess[offset + fppi] = H; + hess2.inds[offset + fppi] = inds; + }); + + auto numFPE = nFPE.getVal(); + offset = hess3.increaseCount(numFPE); + pol(zs::range(numFPE), [fricPE = proxy({}, fricPE), FPE = proxy(FPE), + hess3 = proxy(hess3), offset] ZS_LAMBDA(int fpei) mutable { + auto H = fricPE.template pack<9, 9>("H", fpei); + auto inds = FPE[fpei]; + hess3.hess[offset + fpei] = H; + hess3.inds[offset + fpei] = inds; + }); + + auto numFPT = nFPT.getVal(); + offset = hess4.increaseCount(numFPT); + pol(zs::range(numFPT), [fricPT = proxy({}, fricPT), FPT = proxy(FPT), + hess4 = proxy(hess4), offset] ZS_LAMBDA(int fpti) mutable { + auto H = fricPT.template pack<12, 12>("H", fpti); + auto inds = FPT[fpti]; + hess4.hess[offset + fpti] = H; + hess4.inds[offset + fpti] = inds; + }); + + auto numFEE = nFEE.getVal(); + offset = hess4.increaseCount(numFEE); + pol(zs::range(numFEE), [fricEE = proxy({}, fricEE), FEE = proxy(FEE), + hess4 = proxy(hess4), offset] ZS_LAMBDA(int feei) mutable { + auto H = fricEE.template pack<12, 12>("H", feei); + auto inds = FEE[feei]; + hess4.hess[offset + feei] = H; + hess4.inds[offset + feei] = inds; + }); + } // self friction + } //fricmu + } //enable friction + } //enable contact + + // ground contact + if (enableGround) { + for (auto &primHandle : prims) { + if (primHandle.isBoundary()) // skip soft boundary + continue; + const auto &svs = primHandle.getSurfVerts(); + + pol(zs::range(svs.size()), + [svtemp = proxy({}, primHandle.svtemp), svs = proxy({}, svs), + svOffset = primHandle.svOffset, hess1 = proxy(hess1), execTag] __device__(int svi) mutable { + const auto vi = reinterpret_bits(svs("inds", svi)) + svOffset; + auto pbHess = svtemp.template pack<3, 3>("H", svi); + for (int i = 0; i != 3; ++i) + for (int j = 0; j != 3; ++j) + atomic_add(execTag, &hess1.hess[vi](i, j), (float)pbHess(i, j)); + // hess1.hess[i] = Hi; + }); + } + } + + // constraint hessian + if (!BCsatisfied) { + pol(zs::range(numDofs), [vtemp = proxy({}, vtemp), hess1 = proxy(hess1), + boundaryKappa = boundaryKappa, execTag] __device__(int vi) mutable { + auto w = vtemp("ws", vi); + int BCfixed = vtemp("BCfixed", vi); + if (!BCfixed) { + int BCorder = vtemp("BCorder", vi); + for (int d = 0; d != BCorder; ++d) + atomic_add(execTag, &hess1.hess[vi](d, d), (float)(boundaryKappa * w)); + } + }); + } + } +} + } // namespace zeno \ No newline at end of file diff --git a/projects/CuLagrange/fem/SolverPipeline.cu b/projects/CuLagrange/fem/SolverPipeline.cu index 03636fa797..8064ef2406 100644 --- a/projects/CuLagrange/fem/SolverPipeline.cu +++ b/projects/CuLagrange/fem/SolverPipeline.cu @@ -1,10 +1,14 @@ -#include "Utils.hpp" #include "Ccds.hpp" #include "Solver.cuh" +#include "Utils.hpp" #include "zensim/geometry/Distance.hpp" #include "zensim/geometry/Friction.hpp" #include "zensim/geometry/SpatialQuery.hpp" +#include "zensim/profile/CppTimers.hpp" #include "zensim/types/SmallVector.hpp" +#include + +#define PROFILE_IPC 0 namespace zeno { @@ -70,6 +74,17 @@ typename IPCSystem::T IPCSystem::constraintResidual(zs::CudaExecutionPolicy &pol return ret < 1e-6 ? 0 : ret; } +// https://developer.nvidia.com/blog/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ +namespace cg = cooperative_groups; +__forceinline__ __device__ int atomicAggInc(int *ctr) noexcept { + auto g = cg::coalesced_threads(); + int warp_res; + if (g.thread_rank() == 0) + warp_res = atomicAdd(ctr, g.size()); + return g.shfl(warp_res, 0) + g.thread_rank(); +} +#define USE_COALESCED 1 + void IPCSystem::findCollisionConstraints(zs::CudaExecutionPolicy &pol, T dHat, T xi) { nPP.setVal(0); nPE.setVal(0); @@ -81,6 +96,9 @@ void IPCSystem::findCollisionConstraints(zs::CudaExecutionPolicy &pol, T dHat, T ncsPT.setVal(0); ncsEE.setVal(0); + + zs::CppTimer timer; + timer.tick(); { auto triBvs = retrieve_bounding_volumes(pol, vtemp, "xn", stInds, zs::wrapv<3>{}, 0); stBvh.refit(pol, triBvs); @@ -97,24 +115,33 @@ void IPCSystem::findCollisionConstraints(zs::CudaExecutionPolicy &pol, T dHat, T bouSeBvh.refit(pol, edgeBvs); findCollisionConstraintsImpl(pol, dHat, xi, true); } + auto [npt, nee] = getCollisionCnts(); + timer.tock(fmt::format("dcd broad phase [pt, ee]({}, {})", npt, nee)); + + frontManageRequired = false; } void IPCSystem::findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHat, T xi, bool withBoundary) { using namespace zs; constexpr auto space = execspace_e::cuda; + pol.profile(PROFILE_IPC); /// pt - pol(Collapse{svInds.size()}, + const auto &stbvh = withBoundary ? bouStBvh : stBvh; + auto &stfront = withBoundary ? boundaryStFront : selfStFront; + pol(Collapse{stfront.size()}, [svInds = proxy({}, svInds), eles = proxy({}, withBoundary ? *coEles : stInds), - vtemp = proxy({}, vtemp), bvh = proxy(withBoundary ? bouStBvh : stBvh), PP = proxy(PP), - nPP = proxy(nPP), PE = proxy(PE), nPE = proxy(nPE), PT = proxy(PT), - nPT = proxy(nPT), csPT = proxy(csPT), ncsPT = proxy(ncsPT), dHat, xi, - thickness = xi + dHat, voffset = withBoundary ? coOffset : 0] __device__(int vi) mutable { + vtemp = proxy({}, vtemp), bvh = proxy(stbvh), front = proxy(stfront), + PP = proxy(PP), nPP = proxy(nPP), PE = proxy(PE), nPE = proxy(nPE), + PT = proxy(PT), nPT = proxy(nPT), csPT = proxy(csPT), ncsPT = proxy(ncsPT), dHat, + xi, thickness = xi + dHat, voffset = withBoundary ? coOffset : 0, + frontManageRequired = frontManageRequired] __device__(int i) mutable { + auto vi = front.prim(i); vi = reinterpret_bits(svInds("inds", vi)); const auto dHat2 = zs::sqr(dHat + xi); int BCorder0 = vtemp("BCorder", vi); auto p = vtemp.template pack<3>("xn", vi); auto bv = bv_t{get_bounding_box(p - thickness, p + thickness)}; - bvh.iter_neighbors(bv, [&](int stI) { + auto f = [&](int stI) { auto tri = eles.template pack<3>("inds", stI).template reinterpret_bits() + voffset; if (vi == tri[0] || vi == tri[1] || vi == tri[2]) return; @@ -186,21 +213,29 @@ void IPCSystem::findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHa } default: break; } - }); + }; + if (frontManageRequired) + bvh.iter_neighbors(bv, i, front, f); + else + bvh.iter_neighbors(bv, front.node(i), f); }); + if (frontManageRequired) + stfront.reorder(pol); /// ee - pol(Collapse{seInds.size()}, + const auto &sebvh = withBoundary ? bouSeBvh : seBvh; + auto &sefront = withBoundary ? boundarySeFront : selfSeFront; + pol(Collapse{sefront.size()}, [seInds = proxy({}, seInds), sedges = proxy({}, withBoundary ? *coEdges : seInds), - vtemp = proxy({}, vtemp), bvh = proxy(withBoundary ? bouSeBvh : seBvh), PP = proxy(PP), - nPP = proxy(nPP), PE = proxy(PE), nPE = proxy(nPE), EE = proxy(EE), - nEE = proxy(nEE), + vtemp = proxy({}, vtemp), bvh = proxy(sebvh), front = proxy(sefront), + PP = proxy(PP), nPP = proxy(nPP), PE = proxy(PE), nPE = proxy(nPE), + EE = proxy(EE), nEE = proxy(nEE), // mollifier PPM = proxy(PPM), nPPM = proxy(nPPM), PEM = proxy(PEM), nPEM = proxy(nPEM), EEM = proxy(EEM), nEEM = proxy(nEEM), enableMollification = enableMollification, // - csEE = proxy(csEE), ncsEE = proxy(ncsEE), dHat, xi, thickness = xi + dHat, - voffset = withBoundary ? coOffset : 0] __device__(int sei) mutable { - const auto dHat2 = zs::sqr(dHat + xi); + csEE = proxy(csEE), ncsEE = proxy(ncsEE), dHat2 = zs::sqr(dHat + xi), xi, thickness = xi + dHat, + voffset = withBoundary ? coOffset : 0, frontManageRequired = frontManageRequired] __device__(int i) mutable { + auto sei = front.prim(i); auto eiInds = seInds.template pack<2>("inds", sei).template reinterpret_bits(); bool selfFixed = vtemp("BCorder", eiInds[0]) == 3 && vtemp("BCorder", eiInds[1]) == 3; auto v0 = vtemp.template pack<3>("xn", eiInds[0]); @@ -209,7 +244,7 @@ void IPCSystem::findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHa auto rv1 = vtemp.template pack<3>("x0", eiInds[1]); auto [mi, ma] = get_bounding_box(v0, v1); auto bv = bv_t{mi - thickness, ma + thickness}; - bvh.iter_neighbors(bv, [&](int sej) { + auto f = [&](int sej) { if (voffset == 0 && sei < sej) return; auto ejInds = sedges.template pack<2>("inds", sej).template reinterpret_bits() + voffset; @@ -378,8 +413,15 @@ void IPCSystem::findCollisionConstraintsImpl(zs::CudaExecutionPolicy &pol, T dHa } default: break; } - }); + }; + if (frontManageRequired) + bvh.iter_neighbors(bv, i, front, f); + else + bvh.iter_neighbors(bv, front.node(i), f); }); + if (frontManageRequired) + sefront.reorder(pol); + pol.profile(false); } void IPCSystem::findCCDConstraints(zs::CudaExecutionPolicy &pol, T alpha, T xi) { ncsPT.setVal(0); @@ -390,6 +432,9 @@ void IPCSystem::findCCDConstraints(zs::CudaExecutionPolicy &pol, T alpha, T xi) auto edgeBvs = retrieve_bounding_volumes(pol, vtemp, "xn", seInds, zs::wrapv<2>{}, vtemp, "dir", alpha, 0); seBvh.refit(pol, edgeBvs); } + + zs::CppTimer timer; + timer.tick(); findCCDConstraintsImpl(pol, alpha, xi, false); auto checkSize = [this](const auto &cnt, std::string_view msg) { @@ -412,26 +457,31 @@ void IPCSystem::findCCDConstraints(zs::CudaExecutionPolicy &pol, T alpha, T xi) checkSize(ncsPT, "PT"); checkSize(ncsEE, "EE"); } + auto [npt, nee] = getCollisionCnts(); + timer.tock(fmt::format("ccd broad phase [pt, ee]({}, {})", npt, nee)); } void IPCSystem::findCCDConstraintsImpl(zs::CudaExecutionPolicy &pol, T alpha, T xi, bool withBoundary) { using namespace zs; constexpr auto space = execspace_e::cuda; const auto dHat2 = dHat * dHat; + pol.profile(PROFILE_IPC); /// pt - pol(Collapse{svInds.size()}, + const auto &stbvh = withBoundary ? bouStBvh : stBvh; + auto &stfront = withBoundary ? boundaryStFront : selfStFront; + pol(Collapse{stfront.size()}, [svInds = proxy({}, svInds), eles = proxy({}, withBoundary ? *coEles : stInds), - vtemp = proxy({}, vtemp), bvh = proxy(withBoundary ? bouStBvh : stBvh), PP = proxy(PP), - nPP = proxy(nPP), PE = proxy(PE), nPE = proxy(nPE), PT = proxy(PT), - nPT = proxy(nPT), csPT = proxy(csPT), ncsPT = proxy(ncsPT), xi, alpha, - voffset = withBoundary ? coOffset : 0] __device__(int vi) mutable { + vtemp = proxy({}, vtemp), bvh = proxy(stbvh), front = proxy(stfront), + csPT = proxy(csPT), ncsPT = proxy(ncsPT), xi, alpha, + voffset = withBoundary ? coOffset : 0] __device__(int i) mutable { + auto vi = front.prim(i); vi = reinterpret_bits(svInds("inds", vi)); auto p = vtemp.template pack<3>("xn", vi); auto dir = vtemp.template pack<3>("dir", vi); auto bv = bv_t{get_bounding_box(p, p + alpha * dir)}; bv._min -= xi; bv._max += xi; - bvh.iter_neighbors(bv, [&](int stI) { + bvh.iter_neighbors(bv, front.node(i), [&](int stI) { auto tri = eles.template pack<3>("inds", stI).template reinterpret_bits() + voffset; if (vi == tri[0] || vi == tri[1] || vi == tri[2]) return; @@ -443,12 +493,14 @@ void IPCSystem::findCCDConstraintsImpl(zs::CudaExecutionPolicy &pol, T alpha, T }); }); /// ee - pol(Collapse{seInds.size()}, + const auto &sebvh = withBoundary ? bouSeBvh : seBvh; + auto &sefront = withBoundary ? boundarySeFront : selfSeFront; + pol(Collapse{sefront.size()}, [seInds = proxy({}, seInds), sedges = proxy({}, withBoundary ? *coEdges : seInds), - vtemp = proxy({}, vtemp), bvh = proxy(withBoundary ? bouSeBvh : seBvh), PP = proxy(PP), - nPP = proxy(nPP), PE = proxy(PE), nPE = proxy(nPE), EE = proxy(PT), - nEE = proxy(nPT), csEE = proxy(csEE), ncsEE = proxy(ncsEE), xi, alpha, - voffset = withBoundary ? coOffset : 0] __device__(int sei) mutable { + vtemp = proxy({}, vtemp), bvh = proxy(sebvh), front = proxy(sefront), + csEE = proxy(csEE), ncsEE = proxy(ncsEE), xi, alpha, + voffset = withBoundary ? coOffset : 0] __device__(int i) mutable { + auto sei = front.prim(i); auto eiInds = seInds.template pack<2>("inds", sei).template reinterpret_bits(); bool selfFixed = vtemp("BCorder", eiInds[0]) == 3 && vtemp("BCorder", eiInds[1]) == 3; auto v0 = vtemp.template pack<3>("xn", eiInds[0]); @@ -460,7 +512,7 @@ void IPCSystem::findCCDConstraintsImpl(zs::CudaExecutionPolicy &pol, T alpha, T merge(bv, v1 + alpha * dir1); bv._min -= xi; bv._max += xi; - bvh.iter_neighbors(bv, [&](int sej) { + bvh.iter_neighbors(bv, front.node(i), [&](int sej) { if (voffset == 0 && sei < sej) return; auto ejInds = sedges.template pack<2>("inds", sej).template reinterpret_bits() + voffset; @@ -473,6 +525,7 @@ void IPCSystem::findCCDConstraintsImpl(zs::CudaExecutionPolicy &pol, T alpha, T csEE[atomic_add(exec_cuda, &ncsEE[0], 1)] = pair4_t{eiInds[0], eiInds[1], ejInds[0], ejInds[1]}; }); }); + pol.profile(false); } void IPCSystem::precomputeFrictions(zs::CudaExecutionPolicy &pol, T dHat, T xi) { using namespace zs; @@ -578,15 +631,38 @@ void IPCSystem::project(zs::CudaExecutionPolicy &pol, const zs::SmallString tag) using namespace zs; constexpr execspace_e space = execspace_e::cuda; // projection - pol(zs::range(numDofs), [vtemp = proxy({}, vtemp), projectDBC = projectDBC, tag] ZS_LAMBDA(int vi) mutable { - int BCfixed = vtemp("BCfixed", vi); + pol(zs::range(numDofs), [vtemp = proxy({}, vtemp), projectDBC = projectDBC, + tagOffset = vtemp.getPropertyOffset(tag), fixedOffset = vtemp.getPropertyOffset("BCfixed"), + orderOffset = vtemp.getPropertyOffset("BCorder")] ZS_LAMBDA(int vi) mutable { + int BCfixed = vtemp(fixedOffset, vi); if (projectDBC || (!projectDBC && BCfixed)) { - int BCorder = vtemp("BCorder", vi); + int BCorder = vtemp(orderOffset, vi); for (int d = 0; d != BCorder; ++d) - vtemp(tag, d, vi) = 0; + vtemp(tagOffset + d, vi) = 0; } }); } +void IPCSystem::precondition(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString srcTag, + const zs::SmallString dstTag) { + using namespace zs; + constexpr execspace_e space = execspace_e::cuda; + // precondition + //pol(zs::range(numDofs), [cgtemp = proxy({}, cgtemp), srcTag, dstTag] ZS_LAMBDA(int vi) mutable { + // cgtemp.template tuple<3>(dstTag, vi) = + // cgtemp.template pack<3, 3>("P", vi) * cgtemp.template pack<3>(srcTag, vi); + //}); + pol(zs::range(numDofs * 3), [cgtemp = proxy({}, cgtemp), srcOffset = cgtemp.getPropertyOffset(srcTag), + dstOffset = cgtemp.getPropertyOffset(dstTag), + POffset = cgtemp.getPropertyOffset("P")] ZS_LAMBDA(int vi) mutable { + int d = vi % 3; + vi /= 3; + float sum = 0; + POffset += d * 3; + for (int j = 0; j != 3; ++j) + sum += cgtemp(POffset + j, vi) * cgtemp(srcOffset + j, vi); + cgtemp(dstOffset + d, vi) = sum; + }); +} void IPCSystem::precondition(zs::CudaExecutionPolicy &pol, const zs::SmallString srcTag, const zs::SmallString dstTag) { using namespace zs; constexpr execspace_e space = execspace_e::cuda; @@ -596,6 +672,116 @@ void IPCSystem::precondition(zs::CudaExecutionPolicy &pol, const zs::SmallString }); } +void IPCSystem::multiply(zs::CudaExecutionPolicy &pol, std::true_type, const zs::SmallString dxTag, + const zs::SmallString bTag) { + using namespace zs; + constexpr execspace_e space = execspace_e::cuda; + constexpr auto execTag = wrapv{}; + using T = typename RM_CVREF_T(cgtemp)::value_type; + // dx -> b + pol(range(numDofs), [execTag, cgtemp = proxy({}, cgtemp), bTag] ZS_LAMBDA(int vi) mutable { + cgtemp.template tuple<3>(bTag, vi) = vec3f::zeros(); + }); + // hess1 + pol(zs::range(numDofs), [execTag, hess1 = proxy(hess1), cgtemp = proxy({}, cgtemp), + dxOffset = cgtemp.getPropertyOffset(dxTag), + bOffset = cgtemp.getPropertyOffset(bTag)] __device__(int i) mutable { + auto H = hess1.hess[i]; + zs::vec dx{cgtemp(dxOffset, i), cgtemp(dxOffset + 1, i), cgtemp(dxOffset + 2, i)}; + // auto dx = cgtemp.template pack<3>(dxTag, i); + dx = H * dx; + for (int d = 0; d != 3; ++d) + atomic_add(execTag, &cgtemp(bOffset + d, i), dx(d)); + }); + // hess2 + pol(Collapse{hess2.count(), 32}, [execTag, hess2 = proxy(hess2), cgtemp = proxy({}, cgtemp), + dxOffset = cgtemp.getPropertyOffset(dxTag), + bOffset = cgtemp.getPropertyOffset(bTag)] ZS_LAMBDA(int ei, int tid) mutable { + int rowid = tid / 5; + int colid = tid % 5; + auto inds = hess2.inds[ei]; + auto H = hess2.hess[ei]; + T entryH = 0, entryDx = 0, entryG = 0; + if (tid < 30) { + entryH = H.val(rowid * 6 + colid); + entryDx = cgtemp(dxOffset + colid % 3, inds[colid / 3]); + entryG = entryH * entryDx; + if (colid == 0) { + entryG += H.val(rowid * 6 + 5) * cgtemp(dxOffset + 2, inds[1]); + } + } + for (int iter = 1; iter <= 4; iter <<= 1) { + T tmp = __shfl_down_sync(0xFFFFFFFF, entryG, iter); + if (colid + iter < 5 && tid < 30) + entryG += tmp; + } + if (colid == 0 && rowid < 6) + atomic_add(execTag, &cgtemp(bOffset + rowid % 3, inds[rowid / 3]), entryG); + }); + // hess3 + { + auto numRows = hess3.count() * 9; + auto numWarps = (numRows + 3) / 4; // 8 threads per row + pol(Collapse{numWarps * 32}, [execTag, hess3 = proxy(hess3), cgtemp = proxy({}, cgtemp), + dxOffset = cgtemp.getPropertyOffset(dxTag), + bOffset = cgtemp.getPropertyOffset(bTag), numRows] ZS_LAMBDA(int tid) mutable { + int growid = tid / 8; + int rowid = growid % 9; + int i = growid / 9; + int colid = tid % 8; + + auto inds = hess3.inds[i]; + auto H = hess3.hess[i]; + T entryG = 0; + if (growid < numRows) { + entryG = H.val(rowid * 9 + colid) * cgtemp(dxOffset + colid % 3, inds[colid / 3]); + if (colid == 0) { + auto cid = colid + 8; + entryG += H.val(rowid * 9 + cid) * cgtemp(dxOffset + cid % 3, inds[cid / 3]); + } + } + for (int iter = 1; iter <= 4; iter <<= 1) { + T tmp = __shfl_down_sync(0xFFFFFFFF, entryG, iter); + if (colid + iter < 8 && growid < numRows) + entryG += tmp; + } + if (colid == 0 && growid < numRows) + atomic_add(execTag, &cgtemp(bOffset + rowid % 3, inds[rowid / 3]), entryG); + }); + } + // hess4 + { + // 0, 1, ..., 7, 0, 1, 2, 3 + pol(Collapse{hess4.count(), 32 * 3}, + [execTag, hess4 = proxy(hess4), cgtemp = proxy({}, cgtemp), + dxOffset = cgtemp.getPropertyOffset(dxTag), + bOffset = cgtemp.getPropertyOffset(bTag)] ZS_LAMBDA(int i, int tid) mutable { + int rowid = tid / 8; + int colid = tid % 8; + + auto inds = hess4.inds[i]; + auto H = hess4.hess[i]; + T entryH = 0, entryDx = 0, entryG = 0; + { + entryH = H.val(rowid * 12 + colid); + entryDx = cgtemp(dxOffset + colid % 3, inds[colid / 3]); + entryG = entryH * entryDx; + if (colid < 4) { + auto cid = colid + 8; + entryG += H.val(rowid * 12 + cid) * cgtemp(dxOffset + cid % 3, inds[cid / 3]); + } + } + for (int iter = 1; iter <= 4; iter <<= 1) { + T tmp = __shfl_down_sync(0xFFFFFFFF, entryG, iter); + if (colid + iter < 8) + entryG += tmp; + } + if (colid == 0) + atomic_add(execTag, &cgtemp(bOffset + rowid % 3, inds[rowid / 3]), entryG); + }); + } +} + void IPCSystem::multiply(zs::CudaExecutionPolicy &pol, const zs::SmallString dxTag, const zs::SmallString bTag) { using namespace zs; constexpr execspace_e space = execspace_e::cuda; @@ -1428,7 +1614,6 @@ void IPCSystem::multiply(zs::CudaExecutionPolicy &pol, const zs::SmallString dxT if (!BCsatisfied) { pol(range(numDofs), [execTag, vtemp = proxy({}, vtemp), dxTag, bTag, boundaryKappa = boundaryKappa] ZS_LAMBDA(int vi) mutable { - auto cons = vtemp.template pack<3>("cons", vi); auto dx = vtemp.template pack<3>(dxTag, vi); auto w = vtemp("ws", vi); int BCfixed = vtemp("BCfixed", vi); @@ -1989,65 +2174,81 @@ void IPCSystem::cgsolve(zs::CudaExecutionPolicy &cudaPol) { using namespace zs; constexpr auto space = execspace_e::cuda; - // solve for A dir = grad; - cudaPol(zs::range(numDofs), [vtemp = proxy({}, vtemp)] ZS_LAMBDA(int i) mutable { - vtemp.tuple<3>("dir", i) = vec3::zeros(); - vtemp.tuple<3>("temp", i) = vec3::zeros(); - }); + /// copy diagonal block preconditioners + cudaPol.sync(false); + cudaPol(zs::range(numDofs), [vtemp = proxy({}, vtemp), cgtemp = proxy({}, cgtemp)] ZS_LAMBDA( + int i) mutable { cgtemp.tuple<9>("P", i) = vtemp.pack<3, 3>("P", i); }); + /// solve for A dir = grad; // initial guess for hard boundary constraints - if (coVerts) - cudaPol(zs::range(coVerts->size()), - [vtemp = proxy({}, vtemp), coOffset = coOffset, dt = dt] ZS_LAMBDA(int i) mutable { - i += coOffset; - vtemp.tuple<3>("dir", i) = (vtemp.pack<3>("xtilde", i) - vtemp.pack<3>("xn", i)) * dt; - }); + cudaPol(zs::range(numDofs), + [cgtemp = proxy({}, cgtemp), vtemp = proxy({}, vtemp), coOffset = coOffset, dt = dt, + dirOffset = cgtemp.getPropertyOffset("dir"), xtildeOffset = vtemp.getPropertyOffset("xtilde"), + xnOffset = vtemp.getPropertyOffset("xn")] ZS_LAMBDA(int i) mutable { + if (i < coOffset) { + cgtemp.tuple<3>(dirOffset, i) = vec3::zeros(); + } else { + cgtemp.tuple<3>(dirOffset, i) = (vtemp.pack<3>(xtildeOffset, i) - vtemp.pack<3>(xnOffset, i)) * dt; + } + }); // temp = A * dir - multiply(cudaPol, "dir", "temp"); + multiply(cudaPol, true_c, "dir", "temp"); // r = grad - temp - cudaPol(zs::range(numDofs), [vtemp = proxy({}, vtemp)] ZS_LAMBDA(int i) mutable { - vtemp.tuple<3>("r", i) = vtemp.pack<3>("grad", i) - vtemp.pack<3>("temp", i); + cudaPol(zs::range(numDofs), [vtemp = proxy({}, vtemp), cgtemp = proxy({}, cgtemp), + rOffset = cgtemp.getPropertyOffset("r"), gradOffset = vtemp.getPropertyOffset("grad"), + tempOffset = cgtemp.getPropertyOffset("temp")] ZS_LAMBDA(int i) mutable { + cgtemp.tuple<3>(rOffset, i) = vtemp.pack<3>(gradOffset, i) - cgtemp.pack<3>(tempOffset, i); }); // project(cudaPol, "r"); - precondition(cudaPol, "r", "q"); - cudaPol(zs::range(numDofs), [vtemp = proxy({}, vtemp)] ZS_LAMBDA(int i) mutable { - vtemp.tuple<3>("p", i) = vtemp.pack<3>("q", i); + precondition(cudaPol, true_c, "r", "q"); + cudaPol(zs::range(numDofs), [cgtemp = proxy({}, cgtemp), pOffset = cgtemp.getPropertyOffset("p"), + qOffset = cgtemp.getPropertyOffset("q")] ZS_LAMBDA(int i) mutable { + cgtemp.tuple<3>(pOffset, i) = cgtemp.pack<3>(qOffset, i); }); - T zTrk = dot(cudaPol, vtemp, "r", "q"); - auto residualPreconditionedNorm2 = zTrk; - auto localTol2 = cgRel * cgRel * residualPreconditionedNorm2; + double zTrk = dot(cudaPol, wrapt{}, cgtemp, "r", "q"); + double residualPreconditionedNorm2 = zTrk; + double localTol2 = cgRel * cgRel * residualPreconditionedNorm2; int iter = 0; // auto [npp, npe, npt, nee, nppm, npem, neem, ncspt, ncsee] = getCnts(); + fmt::print("npp: {}, npe: {}, npt: {}, nee: {}, nppm: {}, npem: {}, neem: {}, ncspt: {}, ncsee: {}\n", npp, npe, + npt, nee, nppm, npem, neem, ncspt, ncsee); + CppTimer timer; + timer.tick(); for (; iter != CGCap; ++iter) { if (iter % 50 == 0) - fmt::print("cg iter: {}, norm2: {} (zTrk: {}) npp: {}, npe: {}, " - "npt: {}, nee: {}, nppm: {}, npem: {}, neem: {}, ncspt: " - "{}, ncsee: {}\n", - iter, residualPreconditionedNorm2, zTrk, npp, npe, npt, nee, nppm, npem, neem, ncspt, ncsee); + fmt::print("cg iter: {}, norm2: {} (zTrk: {})\n", iter, residualPreconditionedNorm2, zTrk); if (residualPreconditionedNorm2 <= localTol2) break; - multiply(cudaPol, "p", "temp"); + multiply(cudaPol, true_c, "p", "temp"); // project(cudaPol, "temp"); // need further checking hessian! - T alpha = zTrk / dot(cudaPol, vtemp, "temp", "p"); - cudaPol(range(numDofs), [vtemp = proxy({}, vtemp), alpha] ZS_LAMBDA(int vi) mutable { - vtemp.tuple<3>("dir", vi) = vtemp.pack<3>("dir", vi) + alpha * vtemp.pack<3>("p", vi); - vtemp.tuple<3>("r", vi) = vtemp.pack<3>("r", vi) - alpha * vtemp.pack<3>("temp", vi); + double alpha = zTrk / dot(cudaPol, wrapt{}, cgtemp, "temp", "p"); + cudaPol(range(numDofs), [cgtemp = proxy({}, cgtemp), dirOffset = cgtemp.getPropertyOffset("dir"), + pOffset = cgtemp.getPropertyOffset("p"), rOffset = cgtemp.getPropertyOffset("r"), + tempOffset = cgtemp.getPropertyOffset("temp"), alpha] ZS_LAMBDA(int vi) mutable { + cgtemp.tuple<3>(dirOffset, vi) = cgtemp.pack<3>(dirOffset, vi) + alpha * cgtemp.pack<3>(pOffset, vi); + cgtemp.tuple<3>(rOffset, vi) = cgtemp.pack<3>(rOffset, vi) - alpha * cgtemp.pack<3>(tempOffset, vi); }); - precondition(cudaPol, "r", "q"); - auto zTrkLast = zTrk; - zTrk = dot(cudaPol, vtemp, "q", "r"); - auto beta = zTrk / zTrkLast; - cudaPol(range(numDofs), [vtemp = proxy({}, vtemp), beta] ZS_LAMBDA(int vi) mutable { - vtemp.tuple<3>("p", vi) = vtemp.pack<3>("q", vi) + beta * vtemp.pack<3>("p", vi); + precondition(cudaPol, true_c, "r", "q"); + double zTrkLast = zTrk; + zTrk = dot(cudaPol, wrapt{}, cgtemp, "q", "r"); + double beta = zTrk / zTrkLast; + cudaPol(range(numDofs), [cgtemp = proxy({}, cgtemp), beta, pOffset = cgtemp.getPropertyOffset("p"), + qOffset = cgtemp.getPropertyOffset("q")] ZS_LAMBDA(int vi) mutable { + cgtemp.tuple<3>(pOffset, vi) = cgtemp.pack<3>(qOffset, vi) + beta * cgtemp.pack<3>(pOffset, vi); }); residualPreconditionedNorm2 = zTrk; } // end cg step + /// copy back results + cudaPol(zs::range(numDofs), [vtemp = proxy({}, vtemp), cgtemp = proxy({}, cgtemp)] ZS_LAMBDA( + int i) mutable { vtemp.tuple<3>("dir", i) = cgtemp.pack<3>("dir", i); }); + cudaPol.sync(true); + timer.tock(fmt::format("{} cgiters", iter)); } void IPCSystem::groundIntersectionFreeStepsize(zs::CudaExecutionPolicy &pol, T &stepSize) { using namespace zs; @@ -2083,6 +2284,9 @@ void IPCSystem::intersectionFreeStepsize(zs::CudaExecutionPolicy &pol, T xi, T & Vector alpha{vtemp.get_allocator(), 1}; alpha.setVal(stepSize); auto npt = ncsPT.getVal(); + CppTimer timer; + timer.tick(); + pol.profile(PROFILE_IPC); pol(range(npt), [csPT = proxy(csPT), vtemp = proxy({}, vtemp), alpha = proxy(alpha), stepSize, xi, coOffset = (int)coOffset] __device__(int pti) { auto ids = csPT[pti]; @@ -2126,6 +2330,8 @@ void IPCSystem::intersectionFreeStepsize(zs::CudaExecutionPolicy &pol, T xi, T & #endif atomic_min(exec_cuda, &alpha[0], tmp); }); + pol.profile(false); + timer.tock("ccd time"); stepSize = alpha.getVal(); } void IPCSystem::lineSearch(zs::CudaExecutionPolicy &cudaPol, T &alpha) { @@ -2241,6 +2447,8 @@ void IPCSystem::newtonKrylov(zs::CudaExecutionPolicy &pol) { else vtemp.tuple<9>("P", i) = mat3::identity(); }); + // prepare float edition + convertHessian(pol); // CG SOLVE cgsolve(pol); // ROTATE BACK diff --git a/projects/CuLagrange/fem/SolverUtils.cuh b/projects/CuLagrange/fem/SolverUtils.cuh index 5f1f72dc12..444798e361 100644 --- a/projects/CuLagrange/fem/SolverUtils.cuh +++ b/projects/CuLagrange/fem/SolverUtils.cuh @@ -88,31 +88,50 @@ retrieve_bounding_volumes(zs::CudaExecutionPolicy &pol, const TileVecT0 &verts, }); return ret; } -template > -inline typename IPCSystem::T reduce(zs::CudaExecutionPolicy &cudaPol, const zs::Vector &res, - Op op = {}) { +template > +inline T reduce(zs::CudaExecutionPolicy &cudaPol, const zs::Vector &res, Op op = {}) { using namespace zs; - using T = typename IPCSystem::T; Vector ret{res.get_allocator(), 1}; + bool shouldSync = cudaPol.shouldSync(); + cudaPol.sync(true); zs::reduce(cudaPol, std::begin(res), std::end(res), std::begin(ret), (T)0, op); + cudaPol.sync(shouldSync); return ret.getVal(); } -inline typename IPCSystem::T dot(zs::CudaExecutionPolicy &cudaPol, typename IPCSystem::dtiles_t &vertData, - const zs::SmallString tag0, const zs::SmallString tag1) { +template +inline TT dot(zs::CudaExecutionPolicy &cudaPol, zs::wrapt, zs::TileVector &vertData, + const zs::SmallString tag0, const zs::SmallString tag1) { + using namespace zs; + constexpr auto space = execspace_e::cuda; + Vector res{vertData.get_allocator(), count_warps(vertData.size())}; + zs::memset(zs::mem_device, res.data(), 0, sizeof(TT) * count_warps(vertData.size())); + cudaPol(range(vertData.size()), [data = proxy({}, vertData), res = proxy(res), n = vertData.size(), + offset0 = vertData.getPropertyOffset(tag0), + offset1 = vertData.getPropertyOffset(tag1)] __device__(int pi) mutable { + auto v0 = data.template pack<3>(offset0, pi).template cast(); + auto v1 = data.template pack<3>(offset1, pi).template cast(); + auto v = v0.dot(v1); + reduce_to(pi, n, v, res[pi / 32]); + }); + return reduce(cudaPol, res, thrust::plus{}); +} +template +inline T dot(zs::CudaExecutionPolicy &cudaPol, zs::TileVector &vertData, const zs::SmallString tag0, + const zs::SmallString tag1) { using namespace zs; constexpr auto space = execspace_e::cuda; // Vector res{vertData.get_allocator(), vertData.size()}; - Vector res{vertData.get_allocator(), count_warps(vertData.size())}; - zs::memset(zs::mem_device, res.data(), 0, sizeof(double) * count_warps(vertData.size())); + Vector res{vertData.get_allocator(), count_warps(vertData.size())}; + zs::memset(zs::mem_device, res.data(), 0, sizeof(T) * count_warps(vertData.size())); cudaPol(range(vertData.size()), [data = proxy({}, vertData), res = proxy(res), tag0, tag1, n = vertData.size()] __device__(int pi) mutable { - auto v0 = data.pack<3>(tag0, pi); - auto v1 = data.pack<3>(tag1, pi); + auto v0 = data.template pack<3>(tag0, pi); + auto v1 = data.template pack<3>(tag1, pi); auto v = v0.dot(v1); // res[pi] = v; reduce_to(pi, n, v, res[pi / 32]); }); - return reduce(cudaPol, res, thrust::plus{}); + return reduce(cudaPol, res, thrust::plus{}); } inline typename IPCSystem::T infNorm(zs::CudaExecutionPolicy &cudaPol, typename IPCSystem::dtiles_t &vertData, const zs::SmallString tag = "dir") { @@ -121,9 +140,9 @@ inline typename IPCSystem::T infNorm(zs::CudaExecutionPolicy &cudaPol, typename constexpr auto space = execspace_e::cuda; Vector res{vertData.get_allocator(), count_warps(vertData.size())}; zs::memset(zs::mem_device, res.data(), 0, sizeof(T) * count_warps(vertData.size())); - cudaPol(range(vertData.size()), [data = proxy({}, vertData), res = proxy(res), tag, - n = vertData.size()] __device__(int pi) mutable { - auto v = data.pack<3>(tag, pi); + cudaPol(range(vertData.size()), [data = proxy({}, vertData), res = proxy(res), n = vertData.size(), + offset = vertData.getPropertyOffset(tag)] __device__(int pi) mutable { + auto v = data.pack<3>(offset, pi); auto val = v.abs().max(); auto [mask, numValid] = warp_mask(pi, n); diff --git a/projects/CuOcean/swe/FDGather.cu b/projects/CuOcean/swe/FDGather.cu index da21c9846b..1fb12532eb 100644 --- a/projects/CuOcean/swe/FDGather.cu +++ b/projects/CuOcean/swe/FDGather.cu @@ -1,7 +1,9 @@ #include "Structures.hpp" #include "zensim/Logger.hpp" #include "zensim/cuda/execution/ExecutionPolicy.cuh" +#include "zensim/omp/execution/ExecutionPolicy.hpp" #include "zensim/types/Property.h" +#include #include #include @@ -19,8 +21,8 @@ void edgeLoop(typename ZenoParticles::particles_t &prim, int nx, int ny, const s const SmallString tag = channel; prim.append_channels( pol, {{lTag.asString(), nchn}, {rTag.asString(), nchn}, {tTag.asString(), nchn}, {bTag.asString(), nchn}}); - pol(Collapse{nx, ny}, - [verts = proxy({}, prim), nx, ny, lTag, rTag, tTag, bTag, tag] ZS_LAMBDA(int i, int j) mutable { + pol(Collapse{ny, nx}, + [verts = proxy({}, prim), nx, ny, lTag, rTag, tTag, bTag, tag] ZS_LAMBDA(int j, int i) mutable { size_t lidx = j * nx + math::max(i - 1, 0); size_t ridx = j * nx + math::min(i + 1, nx - 1); size_t tidx = math::min(j + 1, ny - 1) * nx + i; @@ -34,6 +36,92 @@ void edgeLoop(typename ZenoParticles::particles_t &prim, int nx, int ny, const s }); } template +void checkEdgeLoop(PrimitiveObject *prim, typename ZenoParticles::particles_t &zsprim, int nx, int ny, + const std::string &channel) { + using namespace zs; + constexpr auto space = execspace_e::host; + auto pol = omp_exec(); + using T = conditional_t; + auto &l = prim->add_attr("l" + channel); + auto &r = prim->add_attr("r" + channel); + auto &t = prim->add_attr("t" + channel); + auto &b = prim->add_attr("b" + channel); + auto &channels = prim->add_attr(channel); + + const SmallString lTag = std::string("l") + channel; + const SmallString rTag = std::string("r") + channel; + const SmallString tTag = std::string("t") + channel; + const SmallString bTag = std::string("b") + channel; + const SmallString tag = channel; + pol(Collapse{nx, ny}, [&, verts = proxy({}, zsprim)](int i, int j) { + size_t lidx = j * nx + math::max(i - 1, 0); + size_t ridx = j * nx + math::min(i + 1, nx - 1); + size_t tidx = math::min(j + 1, ny - 1) * nx + i; + size_t bidx = math::max(j - 1, 0) * nx + i; + size_t idx = j * nx + i; + + if constexpr (nchn == 1) { + auto dev_results = [&]() { + auto li = verts(lTag, idx); + auto ri = verts(rTag, idx); + auto ti = verts(tTag, idx); + auto bi = verts(bTag, idx); + auto inli = verts(tag, lidx); + auto inri = verts(tag, ridx); + auto inti = verts(tag, tidx); + auto inbi = verts(tag, bidx); + return zs::make_tuple(li, ri, ti, bi, inli, inri, inti, inbi); + }; + auto host_results = [&]() { + auto li = l[idx]; + auto ri = r[idx]; + auto ti = t[idx]; + auto bi = b[idx]; + auto inli = channels[lidx]; + auto inri = channels[ridx]; + auto inti = channels[tidx]; + auto inbi = channels[bidx]; + return zs::make_tuple(li, ri, ti, bi, inli, inri, inti, inbi); + }; + auto [da, db, dc, dd, dain, dbin, dcin, ddin] = dev_results(); + auto [ha, hb, hc, hd, hain, hbin, hcin, hdin] = host_results(); + if (da != ha || db != hb || dc != hc || dd != hd || dain != hain || dbin != hbin || dcin != hcin || + ddin != hdin) + printf("damn wrong at <%d, %d>!\n\tdev: [%f(%f), %f(%f), %f(%f), %f(%f)]\n\thost_ref: [%f(%f), %f(%f), " + "%f(%f), %f(%f)]\n", + i, j, da, dain, db, dbin, dc, dcin, dd, ddin, ha, hain, hb, hbin, hc, hcin, hd, hdin); + } else if constexpr (nchn == 3) { + auto dev_results = [&]() { + auto li = verts.template pack(lTag, idx).to_array(); + auto ri = verts.template pack(rTag, idx).to_array(); + auto ti = verts.template pack(tTag, idx).to_array(); + auto bi = verts.template pack(bTag, idx).to_array(); + auto inli = verts.template pack(tag, lidx).to_array(); + auto inri = verts.template pack(tag, ridx).to_array(); + auto inti = verts.template pack(tag, tidx).to_array(); + auto inbi = verts.template pack(tag, bidx).to_array(); + return zs::make_tuple(li, ri, ti, bi, inli, inri, inti, inbi); + }; + auto host_results = [&]() { + auto li = l[idx]; + auto ri = r[idx]; + auto ti = t[idx]; + auto bi = b[idx]; + auto inli = channels[lidx]; + auto inri = channels[ridx]; + auto inti = channels[tidx]; + auto inbi = channels[bidx]; + return zs::make_tuple(li, ri, ti, bi, inli, inri, inti, inbi); + }; + auto [da, db, dc, dd, dain, dbin, dcin, ddin] = dev_results(); + auto [ha, hb, hc, hd, hain, hbin, hcin, hdin] = host_results(); + if (da != ha || db != hb || dc != hc || dd != hd || dain != hain || dbin != hbin || dcin != hcin || + ddin != hdin) + printf("damn wrong vec3f! <%d, %d>\n", i, j); + } + }); +} +template void edgeLoopSum(typename ZenoParticles::particles_t &prim, int nx, int ny, const std::string &channel, const std::string &addChannel) { using namespace zs; @@ -44,8 +132,8 @@ void edgeLoopSum(typename ZenoParticles::particles_t &prim, int nx, int ny, cons const SmallString tTag = std::string("t") + channel; const SmallString bTag = std::string("b") + channel; const SmallString tag = addChannel; - pol(Collapse{nx, ny}, - [verts = proxy({}, prim), nx, ny, lTag, rTag, tTag, bTag, tag] ZS_LAMBDA(int i, int j) mutable { + pol(Collapse{ny, nx}, + [verts = proxy({}, prim), nx, ny, lTag, rTag, tTag, bTag, tag] ZS_LAMBDA(int j, int i) mutable { size_t lidx = j * nx + math::max(i - 1, 0); size_t ridx = j * nx + math::min(i + 1, nx - 1); size_t tidx = math::min(j + 1, ny - 1) * nx + i; @@ -70,8 +158,8 @@ void cornerLoop(typename ZenoParticles::particles_t &prim, int nx, int ny, const const SmallString tag = channel; prim.append_channels( pol, {{ltTag.asString(), nchn}, {rtTag.asString(), nchn}, {lbTag.asString(), nchn}, {rbTag.asString(), nchn}}); - pol(Collapse{nx, ny}, - [verts = proxy({}, prim), nx, ny, ltTag, rtTag, lbTag, rbTag, tag] ZS_LAMBDA(int i, int j) mutable { + pol(Collapse{ny, nx}, + [verts = proxy({}, prim), nx, ny, ltTag, rtTag, lbTag, rbTag, tag] ZS_LAMBDA(int j, int i) mutable { size_t ltidx = math::min(j + 1, ny - 1) * nx + math::max(i - 1, 0); size_t rtidx = math::min(j + 1, ny - 1) * nx + math::min(i + 1, nx - 1); size_t lbidx = math::max(j - 1, 0) * nx + math::max(i - 1, 0); @@ -95,8 +183,8 @@ void cornerLoopSum(typename ZenoParticles::particles_t &prim, int nx, int ny, co const SmallString lbTag = std::string("lb") + channel; const SmallString rbTag = std::string("rb") + channel; const SmallString tag = addChannel; - pol(Collapse{nx, ny}, - [verts = proxy({}, prim), nx, ny, ltTag, rtTag, lbTag, rbTag, tag] ZS_LAMBDA(int i, int j) mutable { + pol(Collapse{ny, nx}, + [verts = proxy({}, prim), nx, ny, ltTag, rtTag, lbTag, rbTag, tag] ZS_LAMBDA(int j, int i) mutable { size_t ltidx = math::min(j + 1, ny - 1) * nx + math::max(i - 1, 0); size_t rtidx = math::min(j + 1, ny - 1) * nx + math::min(i + 1, nx - 1); size_t lbidx = math::max(j - 1, 0) * nx + math::max(i - 1, 0); @@ -152,6 +240,124 @@ ZENDEFNODE(ZSGather2DFiniteDifference, { {"zenofx"}, }); +struct ZSCheckGather2DFiniteDifference : zeno::INode { + virtual void apply() override { + auto nx = get_input2("nx"); + auto ny = get_input2("ny"); + auto grid = get_input("ZSParticles"); + auto gridRef = get_input("grid"); + auto attrT = get_input2("attrT"); + auto type = get_input2("OpType"); + auto channel = get_input2("channel"); + + if (auto &verts_ = grid->getParticles(); verts_.hasProperty(channel)) { + auto verts = verts_.clone({zs::memsrc_e::host, -1}); + if (type == "FIVE_STENCIL" || type == "NINE_STENCIL") { + puts("begin edge loop checking"); + if (attrT == "float") { + checkEdgeLoop<1>(gridRef.get(), verts, nx, ny, channel); + } + if (attrT == "vec3") { + checkEdgeLoop<3>(gridRef.get(), verts, nx, ny, channel); + } + puts("done edge loop checking"); + } +#if 0 + if (type == "NINE_STENCIL") { + if (attrT == "float") { + checkCornerLoop<1>(gridRef.get(), verts, nx, ny, channel); + } + if (attrT == "vec3") { + checkCornerLoop<3>(gridRef.get(), verts, nx, ny, channel); + } + } +#endif + } + + set_output("prim", std::move(grid)); + } +}; + +ZENDEFNODE(ZSCheckGather2DFiniteDifference, { + {{"PrimitiveObject", "grid"}, + {"ZSParticles", "ZSParticles"}, + {"int", "nx", "1"}, + {"int", "ny", "1"}, + {"string", "channel", "pos"}, + {"enum vec3 float", "attrT", "float"}, + {"enum FIVE_STENCIL NINE_STENCIL", "OpType", "FIVE_STENCIL"}}, + {{"ZSParticles", "prim"}}, + {}, + {"zenofx"}, + }); + +struct ZSCheckPrimAttribs : zeno::INode { + virtual void apply() override { + auto grid = get_input("ZSParticles"); + auto gridRef = get_input("grid"); + auto attribs = get_input("attribs"); + + const auto &verts = grid->getParticles(); + for (auto &attrib_ : attribs->get2()) { + auto attrib = attrib_; + auto attribAct = attrib_; + if (attribAct == "pos") + attribAct = "x"; + if (attribAct == "vel") + attribAct = "v"; + if (!verts.hasProperty(attribAct) || !gridRef->has_attr(attrib)) + throw std::runtime_error(fmt::format("about prop [{}], zspar [{}], ref [{}]\n", attrib, + verts.hasProperty(attribAct), gridRef->has_attr(attrib))); + if (verts.size() != gridRef->size()) + throw std::runtime_error("size mismatch!\n"); + auto nchn = verts.getPropertySize(attribAct); + auto nchnRef = gridRef->attr_is(attrib) ? 1 : 3; + if (nchn != nchnRef) + throw std::runtime_error("attrib dimension mismatch!\n"); + auto compare = [&](auto dim_v) { + fmt::print("begin checking prim [{}] of size {}\n", attrib, nchn); + using namespace zs; + constexpr auto dim = RM_CVREF_T(dim_v)::value; + using T = conditional_t; + auto ompPol = omp_exec(); + constexpr auto space = execspace_e::host; + ompPol(range(verts.size()), + [&, verts = proxy({}, verts), tag = SmallString{attribAct}, nchn](int vi) { + float v, vref; + for (int d = 0; d != nchn; ++d) { + v = verts(tag, d, vi); + if constexpr (dim == 1) + vref = gridRef->attr(attrib)[vi]; + else + vref = gridRef->attr(attrib)[vi][d]; + if (v != vref) { + fmt::print("damn this, {}-th vert prop [{}, {}] actual: {}, ref: {}\n", vi, attrib, + d, v, vref); + } + } + }); + fmt::print("done checking prim [{}]\n", attrib); + }; + std::variant, zs::wrapv<3>> tmp{}; + if (nchn == 1) + tmp = zs::wrapv<1>{}; + else + tmp = zs::wrapv<3>{}; + zs::match(compare)(tmp); + } + + set_output("prim", std::move(grid)); + } +}; + +ZENDEFNODE(ZSCheckPrimAttribs, + { + {{"PrimitiveObject", "grid"}, {"ZSParticles", "ZSParticles"}, {"ListObject", "attribs"}}, + {{"ZSParticles", "prim"}}, + {}, + {"zenofx"}, + }); + struct ZSMomentumTransfer2DFiniteDifference : zeno::INode { void apply() override { auto nx = get_input2("nx"); diff --git a/zeno/include/zeno/utils/vec.h b/zeno/include/zeno/utils/vec.h index b6859f60a6..2e7c5d907c 100644 --- a/zeno/include/zeno/utils/vec.h +++ b/zeno/include/zeno/utils/vec.h @@ -33,6 +33,8 @@ struct vec : std::array { } } +/* + // will not be called for implicit or explicit conversion operator std::array() const { std::array res; for (size_t i = 0; i < N; i++) { @@ -40,6 +42,7 @@ struct vec : std::array { } return res; } +*/ template explicit vec(vec const &x) {