From c8864c906b14b0875e755b7ba1bba5c8b51fea50 Mon Sep 17 00:00:00 2001
From: Samuel Ayala <samayala22@gmail.com>
Date: Tue, 7 May 2024 18:07:47 -0400
Subject: [PATCH] parallel wake rollup & fix taskflow

---
 packages/taskflow.lua                         |  2 -
 tests/test_uvlm_theodorsen.cpp                |  6 +--
 vlm/backends/cpu/include/vlm_backend_cpu.hpp  |  1 +
 vlm/backends/cpu/src/vlm_backend_cpu.cpp      | 54 +++++++++++++------
 .../cpu/src/vlm_backend_cpu_kernels.ispc      | 11 ++--
 vlm/include/vlm_backend.hpp                   |  1 +
 vlm/include/vlm_types.hpp                     |  6 +--
 7 files changed, 51 insertions(+), 30 deletions(-)
diff --git a/packages/taskflow.lua b/packages/taskflow.lua
index 17f043c..fc70e46 100644
--- a/packages/taskflow.lua
+++ b/packages/taskflow.lua
@@ -1,7 +1,5 @@
 package("taskflow_custom")
     set_urls("https://github.com/samayala22/taskflow.git")
-    add_urls("https://github.com/samayala22/taskflow/archive/$(version).tar.gz")
-    add_versions("v3.6.2", "0a1d306f90e8e17cb98b95eaae9e8b9455beeeca0f0a72afee7719b27706c68c")
 
     if is_plat("linux") then
         add_syslinks("pthread")
diff --git a/tests/test_uvlm_theodorsen.cpp b/tests/test_uvlm_theodorsen.cpp
index 021b917..c70dd0c 100644
--- a/tests/test_uvlm_theodorsen.cpp
+++ b/tests/test_uvlm_theodorsen.cpp
@@ -182,7 +182,7 @@ int main() {
 
         mesh->resize_wake(vec_t.size()-1); // +1 for the initial pose
         const std::unique_ptr<Backend> backend = create_backend(backend_name, *mesh); // create after mesh has been resized
-
+        
         // Precompute the LHS since wing geometry is constant
         backend->compute_lhs();
         backend->lu_factor();
@@ -219,8 +219,8 @@ int main() {
                 velocities.z[idx] = local_velocity.z;
 
                 if (idx == 0) {
-                    f32 analytical_vel = - amplitude * omega * std::cos(omega * t);
-                    f32 rel_error = 100.0f * std::abs((analytical_vel - local_velocity.z) / analytical_vel);
+                    const f32 analytical_vel = - amplitude * omega * std::cos(omega * t);
+                    const f32 rel_error = 100.0f * std::abs((analytical_vel - local_velocity.z) / analytical_vel);
                     std::cout << "vel error:" << rel_error << "%\n";
                     avg_vel_error += rel_error;
                 }
diff --git a/vlm/backends/cpu/include/vlm_backend_cpu.hpp b/vlm/backends/cpu/include/vlm_backend_cpu.hpp
index 057d1f4..7bcf6e1 100644
--- a/vlm/backends/cpu/include/vlm_backend_cpu.hpp
+++ b/vlm/backends/cpu/include/vlm_backend_cpu.hpp
@@ -14,6 +14,7 @@ class BackendCPU : public Backend {
         std::vector<f32> panel_uw;
         std::vector<f32> panel_vw;
         std::vector<f32> panel_ww;
+        SoA_3D_t<f32> rollup_vertices;
         std::vector<f32> wake_buffer; // (ns*nc) * ns
         std::vector<f32> rhs;
         std::vector<i32> ipiv;
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
index fcffaa4..50d2a15 100644
--- a/vlm/backends/cpu/src/vlm_backend_cpu.cpp
+++ b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
@@ -39,6 +39,7 @@ BackendCPU::BackendCPU(Mesh& mesh) : Backend(mesh) {
     lhs.resize(mesh.nb_panels_wing() * mesh.nb_panels_wing());
     wake_buffer.resize(mesh.nb_panels_wing() * mesh.ns);
 
+    rollup_vertices.resize(mesh.nb_vertices_total());
     uw.resize(mesh.nb_panels_wing() * mesh.ns);
     vw.resize(mesh.nb_panels_wing() * mesh.ns);
     ww.resize(mesh.nb_panels_wing() * mesh.ns);
@@ -131,7 +132,6 @@ void BackendCPU::compute_rhs(const FlowData& flow) {
     kernel_cpu_rhs(m.nb_panels_wing(), m.normal.x.data(), m.normal.y.data(), m.normal.z.data(), flow.freestream.x, flow.freestream.y, flow.freestream.z, rhs.data());
 }
 
-// TODO: consider changing FlowData to SolverData
 void BackendCPU::add_wake_influence() {
     // const tiny::ScopedTimer timer("Wake Influence");
 
@@ -214,10 +214,30 @@ void BackendCPU::wake_rollup(float dt) {
     const u64 wake_vertices_begin = (mesh.nc + mesh.nw - mesh.current_nw + 1) * (mesh.ns+1);
     const u64 wake_vertices_end = (mesh.nc + mesh.nw + 1) * (mesh.ns + 1);
 
-    // parallel for
-    for (u64 vidx = wake_vertices_begin; vidx < wake_vertices_end; vidx++) {
-        ispc::kernel_induced_vel(mesh_view, vidx, dt, gamma.data(), sigma_vatistas);
-    }
+    tf::Taskflow taskflow;
+
+    auto init = taskflow.placeholder();
+    auto rollup = taskflow.for_each_index(wake_vertices_begin, wake_vertices_end, [&] (u64 vidx) {
+        ispc::kernel_induced_vel(mesh_view, dt, rollup_vertices.x.data(), rollup_vertices.y.data(), rollup_vertices.z.data(), vidx, gamma.data(), sigma_vatistas);
+    });
+    auto copy = taskflow.emplace([&]{
+        std::copy(rollup_vertices.x.data() + wake_vertices_begin, rollup_vertices.x.data() + wake_vertices_end, mesh.v.x.data() + wake_vertices_begin);
+        std::copy(rollup_vertices.y.data() + wake_vertices_begin, rollup_vertices.y.data() + wake_vertices_end, mesh.v.y.data() + wake_vertices_begin);
+        std::copy(rollup_vertices.z.data() + wake_vertices_begin, rollup_vertices.z.data() + wake_vertices_end, mesh.v.z.data() + wake_vertices_begin);
+    });
+    auto sync = taskflow.placeholder();
+    init.precede(rollup);
+    rollup.precede(copy);
+    copy.precede(sync);
+
+    Executor::get().run(taskflow).wait();
+
+    // for (u64 vidx = wake_vertices_begin; vidx < wake_vertices_end; vidx++) {
+    //     ispc::kernel_induced_vel(mesh_view, dt, rollup_vertices.x.data(), rollup_vertices.y.data(), rollup_vertices.z.data(), vidx, gamma.data(), sigma_vatistas);
+    // }
+    // std::copy(rollup_vertices.x.data() + wake_vertices_begin, rollup_vertices.x.data() + rollup_vertices.size, mesh.v.x.data() + wake_vertices_begin);
+    // std::copy(rollup_vertices.y.data() + wake_vertices_begin, rollup_vertices.y.data() + rollup_vertices.size, mesh.v.y.data() + wake_vertices_begin);
+    // std::copy(rollup_vertices.z.data() + wake_vertices_begin, rollup_vertices.z.data() + rollup_vertices.size, mesh.v.z.data() + wake_vertices_begin);
 }
 
 void BackendCPU::shed_gamma() {
@@ -298,9 +318,9 @@ f32 BackendCPU::compute_coefficient_unsteady_cl(const SoA_3D_t<f32>& vel, f32 dt
 
             linalg::alias::float3 freestream{vel.x[li], vel.y[li], vel.z[li]};
             //std::cout << "Without: " << freestream << "\n";
-            freestream.x += panel_uw[li];
-            freestream.y += panel_vw[li];
-            freestream.z += panel_ww[li];
+            // freestream.x += panel_uw[li];
+            // freestream.y += panel_vw[li];
+            // freestream.z += panel_ww[li];
             // std::cout << "With: " << freestream << "\n";
             
             const linalg::alias::float3 lift_axis = compute_lift_axis(freestream);
@@ -315,17 +335,17 @@ f32 BackendCPU::compute_coefficient_unsteady_cl(const SoA_3D_t<f32>& vel, f32 dt
             const f32 gamma_dt = (gamma[li] - gamma_prev[li]) / dt; // backward difference
 
             // Joukowski method
-            // force += rho * delta_gamma[li] * linalg::cross(freestream, v1 - v0);
-            // force += rho * gamma_dt * mesh.area[li] * normal;
+            force += rho * delta_gamma[li] * linalg::cross(freestream, v1 - v0);
+            force += rho * gamma_dt * mesh.area[li] * normal;
 
             // Katz Plotkin method
-            linalg::alias::float3 delta_p = {0.0f, 0.0f, 0.0f};
-            const f32 delta_gamma_i = (u == 0) ? gamma[li] : gamma[li] - gamma[(u-1) * mesh.ns + v];
-            const f32 delta_gamma_j = (v == 0) ? gamma[li] : gamma[li] - gamma[u * mesh.ns + v - 1];
-            delta_p += rho * linalg::dot(freestream, linalg::normalize(v1 - v0)) * delta_gamma_j / mesh.panel_width_y(u, v);
-            delta_p += rho * linalg::dot(freestream, linalg::normalize(v3 - v0)) * delta_gamma_i / mesh.panel_length(u, v);
-            delta_p += gamma_dt;
-            force = (delta_p * mesh.area[li]) * normal;
+            // linalg::alias::float3 delta_p = {0.0f, 0.0f, 0.0f};
+            // const f32 delta_gamma_i = (u == 0) ? gamma[li] : gamma[li] - gamma[(u-1) * mesh.ns + v];
+            // const f32 delta_gamma_j = (v == 0) ? gamma[li] : gamma[li] - gamma[u * mesh.ns + v - 1];
+            // delta_p += rho * linalg::dot(freestream, linalg::normalize(v1 - v0)) * delta_gamma_j / mesh.panel_width_y(u, v);
+            // delta_p += rho * linalg::dot(freestream, linalg::normalize(v3 - v0)) * delta_gamma_i / mesh.panel_length(u, v);
+            // delta_p += gamma_dt;
+            // force = (delta_p * mesh.area[li]) * normal;
 
             // force /= linalg::length2(freestream);
             
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
index 87744c0..c0e41ad 100644
--- a/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
+++ b/vlm/backends/cpu/src/vlm_backend_cpu_kernels.ispc
@@ -181,8 +181,9 @@ export void kernel_influence2(
 }
 
 export void kernel_induced_vel(
-    uniform const MeshView& m, uniform uint64 vidx, uniform float dt,
-    uniform float* uniform gamma, uniform float sigma
+    uniform MeshView& m, uniform float dt, 
+    uniform float* uniform rollup_vx, uniform float* uniform rollup_vy, uniform float* uniform rollup_vz,
+    uniform uint64 vidx, uniform float* uniform gamma, uniform float sigma
     ) {
 
     const uniform float3 vertex_influenced = {m.v.x[vidx], m.v.y[vidx], m.v.z[vidx]};
@@ -237,9 +238,9 @@ export void kernel_induced_vel(
         }
     }
 
-    m.v.x[vidx] += reduce_add(induced_vel.x) * dt;
-    m.v.y[vidx] += reduce_add(induced_vel.y) * dt;
-    m.v.z[vidx] += reduce_add(induced_vel.z) * dt;
+    rollup_vx[vidx] = m.v.x[vidx] + reduce_add(induced_vel.x) * dt;
+    rollup_vy[vidx] = m.v.y[vidx] + reduce_add(induced_vel.y) * dt;
+    rollup_vz[vidx] = m.v.z[vidx] + reduce_add(induced_vel.z) * dt;
 }
 
 export uniform float kernel_trefftz_cd(
diff --git a/vlm/include/vlm_backend.hpp b/vlm/include/vlm_backend.hpp
index 602e181..8b03cc7 100644
--- a/vlm/include/vlm_backend.hpp
+++ b/vlm/include/vlm_backend.hpp
@@ -2,6 +2,7 @@
 
 #include "vlm_fwd.hpp"
 #include "vlm_types.hpp"
+#include "vlm_mesh.hpp"
 
 namespace vlm {
 
diff --git a/vlm/include/vlm_types.hpp b/vlm/include/vlm_types.hpp
index e90698e..4cbfda0 100644
--- a/vlm/include/vlm_types.hpp
+++ b/vlm/include/vlm_types.hpp
@@ -54,9 +54,9 @@ struct SoA_3D_t {
     
     void resize(u64 size_) {
         size = size_;
-        x.resize(size);
-        y.resize(size);
-        z.resize(size);
+        x.resize(size, 0.f);
+        y.resize(size, 0.f);
+        z.resize(size, 0.f);
     }
 
     void reserve(u64 size_) {