From cc5bbb505df07eaebef4b1fc7aa8f38ef12528fa Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Mon, 17 Jun 2024 02:12:01 -0700 Subject: [PATCH 1/4] Simplify and Cleanup AtomicQuadTree --- src/atomic.h | 2 + src/atomic_quad_tree.h | 132 +++++++++++++---------------------- src/barnes_hut.h | 7 +- src/kernels.h | 154 +++++++++++++++++------------------------ src/vec.h | 23 +++++- 5 files changed, 137 insertions(+), 181 deletions(-) diff --git a/src/atomic.h b/src/atomic.h index 1982df0..4c66029 100644 --- a/src/atomic.h +++ b/src/atomic.h @@ -8,6 +8,7 @@ template using atomic_ref = cuda::atomic_ref template using atomic = std::atomic; @@ -15,6 +16,7 @@ template using atomic_ref = std::atomic_ref; inline constexpr auto memory_order_relaxed = std::memory_order_relaxed; inline constexpr auto memory_order_acquire = std::memory_order_acquire; inline constexpr auto memory_order_release = std::memory_order_release; +inline constexpr auto memory_order_acq_rel = std::memory_order_acq_rel; #endif // _NVHPC_STDPAR_GPU #endif // ATOMIC_H diff --git a/src/atomic_quad_tree.h b/src/atomic_quad_tree.h index b1f6f7f..5ee56b2 100644 --- a/src/atomic_quad_tree.h +++ b/src/atomic_quad_tree.h @@ -20,98 +20,66 @@ enum class NodeStatus : uint32_t { }; static_assert(atomic::is_always_lock_free); -// container class for lambdas -template -class ConstAtomicQuadTreeContainer { -public: - T const root_side_length; - // T const root_x; - // T const root_y; - T const * const total_masses; - T const * const centre_masses_x; - T const * const centre_masses_y; - Index_t const * const first_child; - Index_t const * const next_nodes; - // atomic const * const node_status; - - // atomic const * const leaf_count; - // atomic const * const child_mass_complete; - Index_t const * const parent; - - // only pointer field not a vector - // atomic const * const bump_allocator; -}; - -// container class for lambdas -template -class AtomicQuadTreeContainer { -public: - T root_side_length; - T root_x; - T root_y; - T * const total_masses; - T * const centre_masses_x; - T * const centre_masses_y; - Index_t * const first_child; - Index_t * const next_nodes; - atomic * const node_status; - - atomic * const leaf_count; - atomic * const child_mass_complete; - Index_t * const parent; - - // only a pointer field not a vector - atomic * const bump_allocator; - - auto to_const() { - return ConstAtomicQuadTreeContainer{ - root_side_length, // root_x, root_y, - total_masses, centre_masses_x, centre_masses_y, - first_child, next_nodes, // node_status, - //leaf_count, child_mass_complete, - parent, - //bump_allocator - }; - } -}; - // Quad tree using Class of Vectors (CoV) (i.e. Structure of Arrays) template class AtomicQuadTree { public: + Index_t capacity; T root_side_length; - T root_x; - T root_y; - std::vector total_masses; - std::vector centre_masses_x; - std::vector centre_masses_y; - std::vector first_child; - std::vector next_nodes; - std::vector> node_status; - - // used for mass calc - std::vector> leaf_count; // stores total number of sub leaves a node has - std::vector> child_mass_complete; // stores number of children that have correct mass - std::vector parent; - + vec root_x; + Index_t* first_child; + Index_t* next_nodes; + Index_t* parent; + atomic* node_status; // bump ptr used to keep track of allocated nodes - std::unique_ptr> bump_allocator = std::make_unique>(1); + atomic* bump_allocator; + + T* total_masses; + vec* centre_masses; - explicit AtomicQuadTree(size_t size) - : total_masses(size, 0), centre_masses_x(size, 0), centre_masses_y(size, 0), - first_child(size, is_leaf), next_nodes(size, is_leaf), node_status(size), - leaf_count(size), child_mass_complete(size), parent(size, is_leaf){ + // used for mass calc + atomic* leaf_count; // stores total number of sub leaves a node has + atomic* child_mass_complete; // stores number of children that have correct mass + + void clear(Index_t i) { + if (i == 0) bump_allocator->store(1, memory_order_relaxed); + first_child[i] = is_leaf; + next_nodes[i] = is_leaf; + parent[i] = is_leaf; + node_status[i].store(NodeStatus::EmptyLeaf, memory_order_relaxed); + total_masses[i] = T(0); + centre_masses[i] = vec::splat(0); + leaf_count[i].store(0, memory_order_relaxed); + child_mass_complete[i].store(0, memory_order_relaxed); + } + static AtomicQuadTree alloc(size_t size) { + AtomicQuadTree qt; + qt.capacity = size; + qt.first_child = new Index_t[size]; + qt.next_nodes = new Index_t[size]; + qt.parent = new Index_t[size]; + qt.node_status = new atomic[size]; + qt.bump_allocator = new atomic(1); + + qt.total_masses = new T[size]; + qt.centre_masses = new vec[size]; + + qt.leaf_count = new atomic[size]; + qt.child_mass_complete = new atomic[size]; + return qt; } - auto get_container() { - return AtomicQuadTreeContainer{ - root_side_length, root_x, root_y, - total_masses.data(), centre_masses_x.data(), centre_masses_y.data(), - first_child.data(), next_nodes.data(), node_status.data(), - leaf_count.data(), child_mass_complete.data(), parent.data(), - bump_allocator.get() - }; + static void dealloc(AtomicQuadTree* qt) { + delete qt->first_child[]; + delete qt->next_nodes[]; + delete qt->parent[]; + delete qt->node_status[]; + delete qt->total_masses[]; + delete qt->centre_masses[]; + delete qt->leaf_count[]; + delete qt->child_mass_complete[]; + delete qt->bump_allocator; } }; diff --git a/src/barnes_hut.h b/src/barnes_hut.h index 32f2cef..6a3bfa8 100644 --- a/src/barnes_hut.h +++ b/src/barnes_hut.h @@ -14,7 +14,7 @@ using clock_timer = std::chrono::steady_clock; template -void barnes_hut_step(System& system, Arguments arguments, AtomicQuadTreeContainer tree) { +void barnes_hut_step(System& system, Arguments arguments, AtomicQuadTree tree) { auto start_timer = clock_timer::now(); clear_tree(system, tree); @@ -25,7 +25,7 @@ void barnes_hut_step(System& system, Arguments arguments, AtomicQuadTreeConta calc_mass_atomic_tree(system, tree); auto calc_mass_timer = clock_timer::now(); - calc_force_atomic_tree(system, tree.to_const(), static_cast(arguments.theta)); + calc_force_atomic_tree(system, tree, static_cast(arguments.theta)); auto force_timer = clock_timer::now(); system.accelerate_step(); @@ -52,8 +52,7 @@ void run_barnes_hut(System& system, Arguments arguments) { saver.save_points(system); // init tree structure - auto vector_tree = AtomicQuadTree(system.max_tree_node_size); - auto tree = vector_tree.get_container(); + auto tree = AtomicQuadTree::alloc(system.max_tree_node_size); if (arguments.print_info) { std::cout << "Tree init complete\n"; } diff --git a/src/kernels.h b/src/kernels.h index b54957f..c5ea742 100644 --- a/src/kernels.h +++ b/src/kernels.h @@ -4,10 +4,9 @@ #include "atomic_quad_tree.h" // raw kernels - template -void atomic_calc_mass(AtomicQuadTreeContainer tree) { - Index_t tree_index = 0; +void atomic_calc_mass(AtomicQuadTree tree, Index_t tree_index) { + tree_index = 0; // navigate to leaf while (tree.node_status[tree_index].load(memory_order_acquire) != NodeStatus::FullLeaf) { @@ -19,48 +18,44 @@ void atomic_calc_mass(AtomicQuadTreeContainer tree) { else if (tree.leaf_count[child_index + 3].fetch_sub(1, memory_order_relaxed) > 0) tree_index = child_index + 3; } - // sum child masses + // Accumulate masses up to the root do { // move up to parent tree_index = tree.parent[tree_index]; if (tree_index == is_leaf) break; // if reached root - // count how many leafs this node has - auto leaf_child_index = tree.first_child[tree_index]; - uint32_t local_leaf_count = static_cast(tree.node_status[leaf_child_index + 0].load(memory_order_acquire) == NodeStatus::EmptyLeaf) - + static_cast(tree.node_status[leaf_child_index + 1].load(memory_order_acquire) == NodeStatus::EmptyLeaf) - + static_cast(tree.node_status[leaf_child_index + 2].load(memory_order_acquire) == NodeStatus::EmptyLeaf) - + static_cast(tree.node_status[leaf_child_index + 3].load(memory_order_acquire) == NodeStatus::EmptyLeaf); - if (tree.child_mass_complete[tree_index].fetch_add(1, memory_order_relaxed) != 4 - 1 - local_leaf_count) break; // not this threads job + // No thread will be arriving from siblings that are empty leaves, + // so count those: + uint32_t local_leaf_count = 0; + auto leaf_child_index = tree.first_child[tree_index]; + for (int i = 0; i < 4; ++i) + local_leaf_count += static_cast(tree.node_status[leaf_child_index + i].load(memory_order_relaxed) == NodeStatus::EmptyLeaf); + uint32_t expected_count = 4 - 1 - local_leaf_count; - // reset masses as node contains old information - tree.total_masses[tree_index] = 0; - tree.centre_masses_x[tree_index] = 0; - tree.centre_masses_y[tree_index] = 0; + // Arrive at parent releasing previous masses accumulated, and acquiring masses accumulated by other threads: + if (tree.child_mass_complete[tree_index].fetch_add(1, memory_order_acq_rel) != expected_count) + break; // pick up all child masses + T mass = 0; + auto centre_masses = vec::splat(0); for (auto i = 0; i < 4; i++) { auto child_index = tree.first_child[tree_index] + i; auto child_total_mass = tree.total_masses[child_index]; - tree.total_masses[tree_index] += child_total_mass; - tree.centre_masses_x[tree_index] += child_total_mass * tree.centre_masses_x[child_index]; - tree.centre_masses_y[tree_index] += child_total_mass * tree.centre_masses_y[child_index]; + mass += child_total_mass; + centre_masses += child_total_mass * tree.centre_masses[child_index]; } - tree.centre_masses_x[tree_index] /= tree.total_masses[tree_index]; - tree.centre_masses_y[tree_index] /= tree.total_masses[tree_index]; + centre_masses /= mass; + + tree.total_masses[tree_index] = mass; + tree.centre_masses[tree_index] = centre_masses; } while(true); } template -void atomic_insert( - // particle data - T mass, T pos_x, T pos_y, - // tree data - AtomicQuadTreeContainer tree -) { +void atomic_insert(T mass, vec pos, AtomicQuadTree tree) { Index_t tree_index = 0; // insert into root - T divide_x = tree.root_x; - T divide_y = tree.root_y; + vec divide = tree.root_x; T side_length = tree.root_side_length; while (true) { @@ -69,35 +64,33 @@ void atomic_insert( if (local_node_status == NodeStatus::NotLeaf) { // if the node has children T half_length = side_length / static_cast(4); // / 2 is for new quad length, then / 2 is for half length - if (pos_x < divide_x) { // left - divide_x -= half_length; - if (pos_y > divide_y) { // top left - divide_y += half_length; - tree_index = tree.first_child[tree_index] + 0; + Index_t child_pos; + if (pos[0] < divide[0]) { // left + divide[0] -= half_length; + if (pos[1] > divide[1]) { // top left + divide[1] += half_length; + child_pos = 0; + } else { // bottom left + divide[1] -= half_length; + child_pos = 1; } - else { // bottom left - divide_y -= half_length; - tree_index = tree.first_child[tree_index] + 1; - } - } - else { // right - divide_x += half_length; - if (pos_y > divide_y) { // top right - divide_y += half_length; - tree_index = tree.first_child[tree_index] + 2; - } - else { // bottom right - divide_y -= half_length; - tree_index = tree.first_child[tree_index] + 3; + } else { // right + divide[0] += half_length; + if (pos[1] > divide[1]) { // top right + divide[1] += half_length; + child_pos = 2; + } else { // bottom right + divide[1] -= half_length; + child_pos = 3; } } + tree_index = tree.first_child[tree_index] + child_pos; tree.leaf_count[tree_index].fetch_add(1, memory_order_relaxed); // count needed for mass traversal side_length /= static_cast(2); } else if (local_node_status == NodeStatus::EmptyLeaf && tree.node_status[tree_index].compare_exchange_weak(local_node_status, NodeStatus::Locked, memory_order_acquire, memory_order_relaxed)) { tree.total_masses[tree_index] = mass; - tree.centre_masses_x[tree_index] = pos_x; - tree.centre_masses_y[tree_index] = pos_y; + tree.centre_masses[tree_index] = pos; tree.node_status[tree_index].store(NodeStatus::FullLeaf, memory_order_release); break; } else if (local_node_status == NodeStatus::FullLeaf && tree.node_status[tree_index].compare_exchange_weak(local_node_status, NodeStatus::Locked, memory_order_acquire, memory_order_relaxed)) { @@ -110,36 +103,29 @@ void atomic_insert( tree.next_nodes[first_child_index + 2] = first_child_index + 3; tree.next_nodes[first_child_index + 3] = tree_index; // link back to parent - tree.parent[first_child_index + 0] = tree_index; - tree.parent[first_child_index + 1] = tree_index; - tree.parent[first_child_index + 2] = tree_index; - tree.parent[first_child_index + 3] = tree_index; + for (int i = 0; i < 4; ++i) { + tree.parent[first_child_index + i] = tree_index; + tree.node_status[first_child_index + i].store(NodeStatus::EmptyLeaf, memory_order_relaxed); + } - // relaxed as values released with a later atomic - tree.node_status[first_child_index + 0].store(NodeStatus::EmptyLeaf, memory_order_relaxed); - tree.node_status[first_child_index + 1].store(NodeStatus::EmptyLeaf, memory_order_relaxed); - tree.node_status[first_child_index + 2].store(NodeStatus::EmptyLeaf, memory_order_relaxed); - tree.node_status[first_child_index + 3].store(NodeStatus::EmptyLeaf, memory_order_relaxed); // end of children creation // evict body at current index and insert into children keeping node locked - T p_x = tree.centre_masses_x[tree_index]; - T p_y = tree.centre_masses_y[tree_index]; - Index_t evicted_index = first_child_index + 2 * static_cast(p_x >= divide_x) + 1 * static_cast(p_y <= divide_y); - tree.centre_masses_x[evicted_index] = p_x; - tree.centre_masses_y[evicted_index] = p_y; + auto p_x = tree.centre_masses[tree_index]; + Index_t evicted_index = first_child_index + 2 * static_cast(p_x[0] >= divide[0]) + 1 * static_cast(p_x[1] <= divide[1]); + tree.centre_masses[evicted_index] = p_x; tree.total_masses[evicted_index] = tree.total_masses[tree_index]; tree.node_status[evicted_index].store(NodeStatus::FullLeaf, memory_order_relaxed); - tree.leaf_count[evicted_index].fetch_add(1, memory_order_relaxed); // release node and continue to try to insert body tree.node_status[tree_index].store(NodeStatus::NotLeaf, memory_order_release); + tree.leaf_count[evicted_index].fetch_add(1, memory_order_relaxed); } } } template -vec bh_calc_force(vec x, T const theta, ConstAtomicQuadTreeContainer const tree) { +vec bh_calc_force(vec x, T const theta, AtomicQuadTree const tree) { Index_t tree_index = 0; T side_length = tree.root_side_length; auto a = vec::splat(0); @@ -149,7 +135,7 @@ vec bh_calc_force(vec x, T const theta, ConstAtomicQuadTreeContainer while (tree_index != is_leaf) { Index_t next_node_index = tree.next_nodes[tree_index]; if (came_forwards) { // child or sibling node - vec xj{{tree.centre_masses_x[tree_index], tree.centre_masses_y[tree_index]}}; + vec xj = tree.centre_masses[tree_index]; // check if below threshold if (tree.first_child[tree_index] == is_leaf || side_length / dist(x, xj) < theta) { T mj = tree.total_masses[tree_index]; @@ -171,32 +157,32 @@ vec bh_calc_force(vec x, T const theta, ConstAtomicQuadTreeContainer // launch kernels template -auto build_atomic_tree(System& system, AtomicQuadTreeContainer tree) { +auto build_atomic_tree(System& system, AtomicQuadTree tree) { auto r = system.body_indices(); std::for_each( std::execution::par, r.begin(), r.end(), [s=system.state(),tree] (Index_t i) { - atomic_insert(s.m[i], s.x[i][0], s.x[i][1], tree); + atomic_insert(s.m[i], s.x[i], tree); } ); } template -auto calc_mass_atomic_tree(System& system, AtomicQuadTreeContainer tree) { +auto calc_mass_atomic_tree(System& system, AtomicQuadTree tree) { auto r = system.body_indices(); std::for_each( std::execution::par, r.begin(), r.end(), - [tree] (auto) { - atomic_calc_mass(tree); + [tree] (auto i) { + atomic_calc_mass(tree, i); } ); } template -auto calc_force_atomic_tree(System& system, ConstAtomicQuadTreeContainer const tree, T const theta) { +auto calc_force_atomic_tree(System& system, AtomicQuadTree const tree, T const theta) { auto r = system.body_indices(); std::for_each( std::execution::par_unseq, @@ -208,7 +194,7 @@ auto calc_force_atomic_tree(System& system, ConstAtomicQuadTreeContainer -auto compute_bounded_atomic_quad_tree(System& system, AtomicQuadTreeContainer& tree){ +auto compute_bounded_atomic_quad_tree(System& system, AtomicQuadTree& tree){ // find the minimum and maximum xy co-ordinate auto r = system.body_indices(); auto [min_size, max_size] = std::transform_reduce( @@ -231,35 +217,19 @@ auto compute_bounded_atomic_quad_tree(System& system, AtomicQuadTreeContainer // add root node to tree tree.root_side_length = max_size - min_size; - tree.root_x = divide; - tree.root_y = divide; + tree.root_x = vec::splat(divide); tree.next_nodes[0] = is_leaf; tree.node_status[0].store(NodeStatus::EmptyLeaf, memory_order_relaxed); } template -auto clear_tree(System& system, AtomicQuadTreeContainer tree) { +auto clear_tree(System& system, AtomicQuadTree tree) { // clear the tree, ready for next iteration auto r = system.body_indices(); std::for_each_n( std::execution::par_unseq, r.begin(), tree.bump_allocator->load(memory_order_acquire), - [tree] (auto tree_index) { - if (tree_index == 0) { - tree.bump_allocator->store(1, memory_order_relaxed); - } - - tree.total_masses[tree_index] = 0; - tree.centre_masses_x[tree_index] = 0; - tree.centre_masses_y[tree_index] = 0; - tree.first_child[tree_index] = is_leaf; - tree.next_nodes[tree_index] = is_leaf; - - tree.leaf_count[tree_index].store(0, memory_order_relaxed); - tree.child_mass_complete[tree_index].store(0, memory_order_relaxed); - tree.parent[tree_index] = is_leaf; - tree.node_status[tree_index].store(NodeStatus::EmptyLeaf, memory_order_release); - }); + [tree] (auto tree_index) mutable { tree.clear(tree_index); }); } #endif //HPC_SENDERS_KERNELS_H diff --git a/src/vec.h b/src/vec.h index f751195..7495321 100644 --- a/src/vec.h +++ b/src/vec.h @@ -58,23 +58,40 @@ constexpr vec operator-(vec a, vec b) { return a; } +template +constexpr vec& operator*=(vec& a, T s) { + for (int i = 0; i < N; ++i) a[i] *= s; + return a; +} + template constexpr vec operator*(T s, vec b) { - for (int i = 0; i < N; ++i) b[i] = s * b[i]; + b *= s; return b; } template constexpr vec operator*(vec a, T s) { - for (int i = 0; i < N; ++i) a[i] = a[i] * s; + a *= s; return a; } template -constexpr vec operator/(vec a, T s) { +constexpr vec& operator/=(vec& a, T s) { for (int i = 0; i < N; ++i) a[i] /= s; return a; } +template +constexpr vec operator/(T s, vec b) { + b /= s; + return b; +} +template +constexpr vec operator/(vec a, T s) { + a /= s; + return a; +} + //////////////////////////////////////////////////////////////////////////////// // Horizontal reductions From 99e6a716cf4706ee660fa144d3a31bff5d2b844f Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Mon, 17 Jun 2024 02:49:49 -0700 Subject: [PATCH 2/4] Remove leaf_count Start one thread per tree node, and filter out threads that do not start at a leaf node, instead of preparing and updating a leaf node count. --- src/atomic_quad_tree.h | 4 ---- src/kernels.h | 20 +++++--------------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/src/atomic_quad_tree.h b/src/atomic_quad_tree.h index 5ee56b2..cb5b84c 100644 --- a/src/atomic_quad_tree.h +++ b/src/atomic_quad_tree.h @@ -38,7 +38,6 @@ class AtomicQuadTree { vec* centre_masses; // used for mass calc - atomic* leaf_count; // stores total number of sub leaves a node has atomic* child_mass_complete; // stores number of children that have correct mass void clear(Index_t i) { @@ -49,7 +48,6 @@ class AtomicQuadTree { node_status[i].store(NodeStatus::EmptyLeaf, memory_order_relaxed); total_masses[i] = T(0); centre_masses[i] = vec::splat(0); - leaf_count[i].store(0, memory_order_relaxed); child_mass_complete[i].store(0, memory_order_relaxed); } @@ -65,7 +63,6 @@ class AtomicQuadTree { qt.total_masses = new T[size]; qt.centre_masses = new vec[size]; - qt.leaf_count = new atomic[size]; qt.child_mass_complete = new atomic[size]; return qt; } @@ -77,7 +74,6 @@ class AtomicQuadTree { delete qt->node_status[]; delete qt->total_masses[]; delete qt->centre_masses[]; - delete qt->leaf_count[]; delete qt->child_mass_complete[]; delete qt->bump_allocator; } diff --git a/src/kernels.h b/src/kernels.h index c5ea742..d56f655 100644 --- a/src/kernels.h +++ b/src/kernels.h @@ -6,17 +6,9 @@ // raw kernels template void atomic_calc_mass(AtomicQuadTree tree, Index_t tree_index) { - tree_index = 0; - - // navigate to leaf - while (tree.node_status[tree_index].load(memory_order_acquire) != NodeStatus::FullLeaf) { - // work out which child to go to ... - auto child_index = tree.first_child[tree_index]; - if (tree.leaf_count[child_index + 0].fetch_sub(1, memory_order_relaxed) > 0) tree_index = child_index + 0; - else if (tree.leaf_count[child_index + 1].fetch_sub(1, memory_order_relaxed) > 0) tree_index = child_index + 1; - else if (tree.leaf_count[child_index + 2].fetch_sub(1, memory_order_relaxed) > 0) tree_index = child_index + 2; - else if (tree.leaf_count[child_index + 3].fetch_sub(1, memory_order_relaxed) > 0) tree_index = child_index + 3; - } + // If this node is not a leaf node with a body, we are done: + if (tree.node_status[tree_index].load(memory_order_relaxed) != NodeStatus::FullLeaf) + return; // Accumulate masses up to the root do { @@ -86,7 +78,6 @@ void atomic_insert(T mass, vec pos, AtomicQuadTree tree) { } tree_index = tree.first_child[tree_index] + child_pos; - tree.leaf_count[tree_index].fetch_add(1, memory_order_relaxed); // count needed for mass traversal side_length /= static_cast(2); } else if (local_node_status == NodeStatus::EmptyLeaf && tree.node_status[tree_index].compare_exchange_weak(local_node_status, NodeStatus::Locked, memory_order_acquire, memory_order_relaxed)) { tree.total_masses[tree_index] = mass; @@ -119,7 +110,6 @@ void atomic_insert(T mass, vec pos, AtomicQuadTree tree) { // release node and continue to try to insert body tree.node_status[tree_index].store(NodeStatus::NotLeaf, memory_order_release); - tree.leaf_count[evicted_index].fetch_add(1, memory_order_relaxed); } } } @@ -172,9 +162,9 @@ auto build_atomic_tree(System& system, AtomicQuadTree tree) { template auto calc_mass_atomic_tree(System& system, AtomicQuadTree tree) { auto r = system.body_indices(); - std::for_each( + std::for_each_n( std::execution::par, - r.begin(), r.end(), + r.begin(), tree.capacity, [tree] (auto i) { atomic_calc_mass(tree, i); } From 08a97f8f27c0565441486a6e52b24f192b21053d Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Sun, 23 Jun 2024 11:40:19 -0700 Subject: [PATCH 3/4] Fix memory reclamation --- src/atomic_quad_tree.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/atomic_quad_tree.h b/src/atomic_quad_tree.h index cb5b84c..4ccd20a 100644 --- a/src/atomic_quad_tree.h +++ b/src/atomic_quad_tree.h @@ -68,13 +68,13 @@ class AtomicQuadTree { } static void dealloc(AtomicQuadTree* qt) { - delete qt->first_child[]; - delete qt->next_nodes[]; - delete qt->parent[]; - delete qt->node_status[]; - delete qt->total_masses[]; - delete qt->centre_masses[]; - delete qt->child_mass_complete[]; + delete[] qt->first_child; + delete[] qt->next_nodes; + delete[] qt->parent; + delete[] qt->node_status; + delete[] qt->total_masses; + delete[] qt->centre_masses; + delete[] qt->child_mass_complete; delete qt->bump_allocator; } }; From 8970037c08cb0cb119124f5f031029c7494991e3 Mon Sep 17 00:00:00 2001 From: Gonzalo Brito Gadeschi Date: Sun, 23 Jun 2024 11:53:07 -0700 Subject: [PATCH 4/4] Clean all tree nodes in first iteration --- src/barnes_hut.h | 8 ++++---- src/kernels.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/barnes_hut.h b/src/barnes_hut.h index 6a3bfa8..fed852b 100644 --- a/src/barnes_hut.h +++ b/src/barnes_hut.h @@ -14,10 +14,10 @@ using clock_timer = std::chrono::steady_clock; template -void barnes_hut_step(System& system, Arguments arguments, AtomicQuadTree tree) { +void barnes_hut_step(System& system, Arguments arguments, AtomicQuadTree tree, bool first) { auto start_timer = clock_timer::now(); - clear_tree(system, tree); + clear_tree(system, tree, first? tree.capacity : tree.bump_allocator->load(memory_order_relaxed)); compute_bounded_atomic_quad_tree(system, tree); build_atomic_tree(system, tree); auto built_tree_timer = clock_timer::now(); @@ -57,8 +57,8 @@ void run_barnes_hut(System& system, Arguments arguments) { std::cout << "Tree init complete\n"; } for (size_t step = 0; step < arguments.steps; step++) { - barnes_hut_step(system, arguments, tree); - saver.save_points(system); + barnes_hut_step(system, arguments, tree, step == 0); + saver.save_points(system); } } diff --git a/src/kernels.h b/src/kernels.h index d56f655..ffe744a 100644 --- a/src/kernels.h +++ b/src/kernels.h @@ -213,12 +213,12 @@ auto compute_bounded_atomic_quad_tree(System& system, AtomicQuadTree -auto clear_tree(System& system, AtomicQuadTree tree) { +auto clear_tree(System& system, AtomicQuadTree tree, Index_t last_node) { // clear the tree, ready for next iteration auto r = system.body_indices(); std::for_each_n( std::execution::par_unseq, - r.begin(), tree.bump_allocator->load(memory_order_acquire), + r.begin(), last_node, [tree] (auto tree_index) mutable { tree.clear(tree_index); }); }