From 10a37f2359f17d08f6ecb5abbb66e265ff3818ec Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 22 Jan 2025 19:58:22 +0100 Subject: [PATCH] Add semi-fast second and slower third approach --- ext/PointNeighborsCUDAExt.jl | 290 +++++++++++++++++++++++++++++------ 1 file changed, 244 insertions(+), 46 deletions(-) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index b9d8bc5..fc93d1b 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -8,6 +8,7 @@ using UnsafeAtomics: UnsafeAtomics using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index using PointNeighbors.Adapt: Adapt using Base: @propagate_inbounds +using PointNeighbors.BenchmarkTools const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} @@ -67,61 +68,72 @@ end CUDA.device!(0) end -function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) - CUDA.LLVM.Interop.@asmcall( - "atom.sys.global.add.u32 \$0, [\$1], \$2;", - "=r,l,r,~{memory}", - true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, - ptr, val - ) -end +############################################################################################ +# First approach: Use actual system atomics on unified memory +############################################################################################ +# function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) +# CUDA.LLVM.Interop.@asmcall( +# "atom.sys.global.add.u32 \$0, [\$1], \$2;", +# "=r,l,r,~{memory}", +# true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, +# ptr, val +# ) +# end -@inline function pushat_atomic_system!(vov, i, value) - (; backend, lengths) = vov +# @inline function pushat_atomic_system!(vov, i, value) +# (; backend, lengths) = vov - @boundscheck checkbounds(vov, i) +# @boundscheck checkbounds(vov, i) - # Increment the column length with an atomic add to avoid race conditions. - # Store the new value since it might be changed immediately afterwards by another - # thread. - # new_length = Atomix.@atomic lengths[i] += 1 - new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 +# # Increment the column length with an atomic add to avoid race conditions. +# # Store the new value since it might be changed immediately afterwards by another +# # thread. +# # new_length = Atomix.@atomic lengths[i] += 1 +# new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 - # We can write here without race conditions, since the atomic add guarantees - # that `new_length` is different for each thread. - backend[new_length, i] = value +# # We can write here without race conditions, since the atomic add guarantees +# # that `new_length` is different for each thread. 
+# backend[new_length, i] = value - return vov -end +# return vov +# end -const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} +# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} -function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) - (; cell_list) = neighborhood_search +# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) +# (; cell_list) = neighborhood_search - cell_list.cells.lengths .= 0 +# cell_list.cells.lengths .= 0 - if neighborhood_search.search_radius < eps() - # Cannot initialize with zero search radius. - # This is used in TrixiParticles when a neighborhood search is not used. - return neighborhood_search - end +# if neighborhood_search.search_radius < eps() +# # Cannot initialize with zero search radius. +# # This is used in TrixiParticles when a neighborhood search is not used. +# return neighborhood_search +# end - # Faster on a single GPU - @threaded CUDABackend() for point in axes(y, 2) - # Get cell index of the point's cell - point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) - cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) +# # Faster on a single GPU +# # CUDA.NVTX.@range "threaded loop" begin +# @threaded y for point in axes(y, 2) +# # Get cell index of the point's cell +# point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) +# cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) - # Add point to corresponding cell - pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) - end +# # Add point to corresponding cell +# pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) +# end +# # end - return neighborhood_search -end +# return neighborhood_search +# end -# This might be slightly faster, but causes problems with synchronization in the interaction -# kernel because we are carrying around device memory. +# function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, +# y::AbstractMatrix; parallelization_backend = y) +# PointNeighbors.initialize_grid!(neighborhood_search, y) +# end + +############################################################################################ +# Second approach: Use a device array to update and copy to unified memory +############################################################################################ # struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}} # vov_unified :: VU # vov_device :: VD @@ -133,9 +145,8 @@ end # # Adapt.@adapt_structure(MultiGPUVectorOfVectors) -# function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors) -# @info "" to -# return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device)) +# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors) +# return Adapt.adapt(to, vov.vov_unified) # end # @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i) @@ -182,7 +193,7 @@ end # # The atomic version of `pushat!` uses atomics to avoid race conditions when # # `pushat!` is used in a parallel loop. 
-# @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point) +# @inbounds PointNeighbors.pushat_atomic!(vov_device, cell_index(cell_list, cell), point) # end # # Copy vector of vectors to unified memory @@ -203,4 +214,191 @@ end # PointNeighbors.initialize_grid!(neighborhood_search, y) # end +############################################################################################ +# Third approach: Like second approach, but a device array for each GPU. + +# This should have the advantage that each GPU updates the part of the grid that should +# already be in its memory and we can avoid moving everything to the first GPU. +############################################################################################ +KernelAbstractions.@kernel function generic_kernel2(f, gpu) + i = KernelAbstractions.@index(Global) + @inline f(i, gpu) +end + +@inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T} + # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` + # and index with `iterator[eachindex(iterator)[i]]`. + # Note that this only works with vector-like iterators that support arbitrary indexing. + indices = eachindex(iterator) + + # Skip empty loops + length(indices) == 0 && return + + # Partition `ndrange` to the GPUs + n_gpus = length(CUDA.devices()) + indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus)) + @assert length(indices_split) <= n_gpus + + backend = CUDABackend() + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Launch kernel on each device + for (i, indices_) in enumerate(indices_split) + # Select the correct device for this partition + CUDA.device!(i - 1) + + # Call the generic kernel, which only calls a function with the global GPU index + generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu + @inbounds @inline f(iterator[indices_[j]], vov_per_gpu) + end + end + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Select first device again + CUDA.device!(0) +end + +struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}} + vov_unified::VOV + vov_per_gpu::V +end + +function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu) + MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu) +end + +# Adapt.@adapt_structure(MultiGPUVectorOfVectors) + +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors) + return Adapt.adapt(to, vov.vov_unified) +end + +function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T} + max_inner_length, max_outer_length = size(vov.backend) + + n_gpus = length(CUDA.devices()) + vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus] + + vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths)) + vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus) + for vov__ in vov_per_gpu_ + vov__.length_[] = vov.length_[] + end + + return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_) +end + +const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}} + +KernelAbstractions.@kernel function kernel3(vov, vov_unified, 
previous_lengths, ::Val{gpu}, ::Val{islast}) where {gpu, islast} + cell = KernelAbstractions.@index(Global) + + length = @inbounds vov.lengths[cell] + if length > 0 || islast + offset = 0 + for length_ in previous_lengths + offset += @inbounds length_[cell] + end + + for i in 1:length + @inbounds vov_unified.backend[offset + i, cell] = vov.backend[i, cell] + end + + if islast + @inbounds vov_unified.lengths[cell] = offset + length + end + end +end + +function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) + (; cell_list) = neighborhood_search + (; cells) = cell_list + (; vov_unified, vov_per_gpu) = cells + + for vov_ in vov_per_gpu + vov_.lengths .= 0 + end + + if neighborhood_search.search_radius < eps() + # Cannot initialize with zero search radius. + # This is used in TrixiParticles when a neighborhood search is not used. + return neighborhood_search + end + + # Fill cell lists per GPU + parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov + # Get cell index of the point's cell + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + + # Add point to corresponding cell + @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell) + + # The atomic version of `pushat!` uses atomics to avoid race conditions when + # `pushat!` is used in a parallel loop. + @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point) + end + + n_gpus = length(CUDA.devices()) + backend = CUDABackend() + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Launch kernel on each device + for gpu in 1:n_gpus + # Select the correct device + CUDA.device!(gpu - 1) + + previous_lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, gpu - 1) + + # Call the generic kernel, which only calls a function with the global GPU index + kernel3(backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val(gpu), Val(gpu == n_gpus), ndrange = length(vov_per_gpu[gpu])) + end + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Select first device again + CUDA.device!(0) + + # # Merge cell lists + # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu + # offset = offsets[gpu][cell] + + # points = vov_per_gpu[gpu][cell] + # for i in eachindex(points) + # vov_unified.backend[offset + i, cell] = points[i] + # end + # end + + return neighborhood_search +end + +function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch, + x::AbstractMatrix, y::AbstractMatrix; + points_moving = (true, true), parallelization_backend = x) + # The coordinates of the first set of points are irrelevant for this NHS. + # Only update when the second set is moving. + points_moving[2] || return neighborhood_search + + PointNeighbors.initialize_grid!(neighborhood_search, y) +end + end # module
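
For reference, below is a minimal CPU sketch of the per-cell merge step that `kernel3` performs in the third approach: each GPU fills its own `(backend, lengths)` pair, and the unified cell list is assembled by offsetting each GPU's entries by the lengths contributed by the preceding GPUs. All names here are illustrative placeholders, not the PointNeighbors.jl API, and the sequential loop stands in for the one-kernel-launch-per-GPU scheme in the patch (where each launch only sums the `previous_lengths` tuple and the last GPU writes the final lengths).

    # Hypothetical helper: merge per-GPU cell lists into one unified cell list.
    # `backends[gpu][slot, cell]` holds the point indices a GPU collected for `cell`,
    # `lengths_per_gpu[gpu][cell]` how many slots are filled, mirroring the
    # (backend, lengths) layout of `DynamicVectorOfVectors`.
    function merge_cell_lists!(unified_backend, unified_lengths, backends, lengths_per_gpu)
        for cell in axes(unified_backend, 2)
            offset = 0

            for gpu in eachindex(backends)
                n = lengths_per_gpu[gpu][cell]

                # Copy this GPU's points for `cell` behind the points of the previous GPUs
                for i in 1:n
                    unified_backend[offset + i, cell] = backends[gpu][i, cell]
                end

                offset += n
            end

            # Total number of points in this cell across all GPUs
            unified_lengths[cell] = offset
        end

        return unified_backend, unified_lengths
    end

    # Toy example: two "GPUs", 3 cells, at most 4 points per cell and GPU.
    backends = [zeros(Int, 4, 3), zeros(Int, 4, 3)]
    lengths_per_gpu = [zeros(Int, 3), zeros(Int, 3)]

    # GPU 1 saw points 1 and 2 in cell 1; GPU 2 saw point 3 in cell 1 and point 4 in cell 2
    backends[1][1:2, 1] .= [1, 2]; lengths_per_gpu[1][1] = 2
    backends[2][1, 1] = 3;         lengths_per_gpu[2][1] = 1
    backends[2][1, 2] = 4;         lengths_per_gpu[2][2] = 1

    unified_backend = zeros(Int, 8, 3)
    unified_lengths = zeros(Int, 3)
    merge_cell_lists!(unified_backend, unified_lengths, backends, lengths_per_gpu)
    # unified_backend[1:3, 1] == [1, 2, 3] and unified_lengths == [3, 1, 0]

The point of this ordering is that the relative offsets only depend on the per-GPU counters, so in the actual patch every GPU can write its own slice of the unified memory concurrently without atomics once the fill phase is done.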