From 54f6a077ca9b5270a36ecbbfb8feceecf0661769 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 10 Jan 2025 23:53:18 +0100 Subject: [PATCH 1/7] Add backend to use unified memory and multiple GPUs --- Project.toml | 7 +++++ ext/PointNeighborsCUDAExt.jl | 59 ++++++++++++++++++++++++++++++++++++ src/gpu.jl | 3 -- src/util.jl | 13 ++++++-- 4 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 ext/PointNeighborsCUDAExt.jl diff --git a/Project.toml b/Project.toml index feab2f25..913f2c87 100644 --- a/Project.toml +++ b/Project.toml @@ -13,9 +13,16 @@ Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[extensions] +PointNeighborsCUDAExt = "CUDA" + [compat] Adapt = "3, 4" Atomix = "0.1, 1" +CUDA = "5.4.3" GPUArraysCore = "0.1, 0.2" KernelAbstractions = "0.9" LinearAlgebra = "1" diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl new file mode 100644 index 00000000..0719305e --- /dev/null +++ b/ext/PointNeighborsCUDAExt.jl @@ -0,0 +1,59 @@ +module PointNeighborsCUDAExt + +using PointNeighbors: PointNeighbors, generic_kernel, CUDAMultiGPUBackend, KernelAbstractions +using CUDA: CUDA, CuArray, CUDABackend + +const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} + +# This is needed because TrixiParticles passes `get_backend(coords)` to distinguish between +# `nothing` (Polyester.jl) and `KernelAbstractions.CPU`. +PointNeighbors.get_backend(x::UnifiedCuArray) = CUDAMultiGPUBackend() + +# Convert input array to `CuArray` with unified memory +function PointNeighbors.Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array) + return CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array) +end + +@inline function PointNeighbors.parallel_foreach(f, iterator, x::UnifiedCuArray) + PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) +end + +# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl +@inline function PointNeighbors.parallel_foreach(f, iterator, x::CUDAMultiGPUBackend) + # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` + # and index with `iterator[eachindex(iterator)[i]]`. + # Note that this only works with vector-like iterators that support arbitrary indexing. 
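+    # For example, for a hypothetical `iterator = 42:51`, we have `eachindex(iterator) == 1:10`,
+    # so the kernel with global index `i` processes the element `iterator[i] == 41 + i`.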
+ indices = eachindex(iterator) + + # Skip empty loops + length(indices) == 0 && return + + # Partition `ndrange` to the GPUs + n_gpus = length(CUDA.devices()) + indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus)) + @assert length(indices_split) <= n_gpus + + backend = CUDABackend() + + # Spawn kernel on each device + for (i, indices_) in enumerate(indices_split) + # Select the correct device for this partition + CUDA.device!(i - 1) + + # Call the generic kernel, which only calls a function with the global GPU index + generic_kernel(backend)(ndrange = length(indices_)) do j + @inbounds @inline f(iterator[indices_[j]]) + end + end + + # Synchronize each device + for i in 1:length(indices_split) + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Select first device again + CUDA.device!(0) +end + +end # module diff --git a/src/gpu.jl b/src/gpu.jl index 9293e19f..cd418c2e 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -31,6 +31,3 @@ function Adapt.adapt_structure(to, nhs::GridNeighborhoodSearch) return GridNeighborhoodSearch(cell_list, search_radius, periodic_box, n_cells, cell_size, update_buffer, nhs.update_strategy) end - -# This is useful to pass the backend directly to `@threaded` -KernelAbstractions.get_backend(backend::KernelAbstractions.Backend) = backend diff --git a/src/util.jl b/src/util.jl index 7cfbfedc..0fdc1176 100644 --- a/src/util.jl +++ b/src/util.jl @@ -60,7 +60,9 @@ with `Threads.@threads :static`. """ struct ThreadsStaticBackend <: AbstractThreadingBackend end -const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend} +struct CUDAMultiGPUBackend end + +const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend, CUDAMultiGPUBackend} """ @threaded x for ... end @@ -140,7 +142,9 @@ end # Skip empty loops ndrange == 0 && return - backend = KernelAbstractions.get_backend(x) + # Use our `get_backend`, which usually forwards to `KernelAbstractions.get_backend`, + # but also works when `x` is already a backend. + backend = get_backend(x) # Call the generic kernel that is defined below, which only calls a function with # the global GPU index. @@ -155,3 +159,8 @@ end i = @index(Global) @inline f(i) end + +get_backend(x) = KernelAbstractions.get_backend(x) + +# This is useful to pass the backend directly to `@threaded` +get_backend(backend::KernelAbstractions.Backend) = backend From fc7180f6c725872ae00c69ed24c6178e331443ef Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 17 Jan 2025 18:01:06 +0100 Subject: [PATCH 2/7] Update on each GPU in device memory and then copy to unified memory --- ext/PointNeighborsCUDAExt.jl | 130 ++++++++++++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 9 deletions(-) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index 0719305e..446b2713 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -1,7 +1,12 @@ module PointNeighborsCUDAExt -using PointNeighbors: PointNeighbors, generic_kernel, CUDAMultiGPUBackend, KernelAbstractions -using CUDA: CUDA, CuArray, CUDABackend +using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVectors, + GridNeighborhoodSearch, FullGridCellList, extract_svector, + cell_coords, cell_index, check_cell_bounds, pushat_atomic! 
+using CUDA: CUDA, CuArray, CUDABackend, cu +using UnsafeAtomics: UnsafeAtomics +using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index +using PointNeighbors.Adapt: Adapt const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} @@ -10,7 +15,7 @@ const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} PointNeighbors.get_backend(x::UnifiedCuArray) = CUDAMultiGPUBackend() # Convert input array to `CuArray` with unified memory -function PointNeighbors.Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array) +function Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array) return CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array) end @@ -18,8 +23,20 @@ end PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) end -# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl -@inline function PointNeighbors.parallel_foreach(f, iterator, x::CUDAMultiGPUBackend) +@inline function PointNeighbors.parallel_foreach(f::T, iterator, x::CUDAMultiGPUBackend) where {T} + parallel_foreach2((i, gpu) -> @inline f(i), iterator, x) +end + +@inline function parallel_foreach2(f, iterator, x::UnifiedCuArray) + parallel_foreach2(f, iterator, CUDAMultiGPUBackend()) +end + +KernelAbstractions.@kernel function generic_kernel2(f, gpu) + i = KernelAbstractions.@index(Global) + @inline f(i, gpu) +end + +@inline function parallel_foreach2(f, iterator, x::CUDAMultiGPUBackend) # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` # and index with `iterator[eachindex(iterator)[i]]`. # Note that this only works with vector-like iterators that support arbitrary indexing. @@ -35,19 +52,25 @@ end backend = CUDABackend() - # Spawn kernel on each device + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Launch kernel on each device for (i, indices_) in enumerate(indices_split) # Select the correct device for this partition CUDA.device!(i - 1) # Call the generic kernel, which only calls a function with the global GPU index - generic_kernel(backend)(ndrange = length(indices_)) do j - @inbounds @inline f(iterator[indices_[j]]) + generic_kernel2(backend)(i, ndrange = length(indices_)) do j, gpu + @inbounds @inline f(iterator[indices_[j]], gpu) end end # Synchronize each device - for i in 1:length(indices_split) + for i in 1:n_gpus CUDA.device!(i - 1) KernelAbstractions.synchronize(backend) end @@ -56,4 +79,93 @@ end CUDA.device!(0) end +struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}} + vov::VOV + vov_per_gpu::V +end + +function MultiGPUVectorOfVectors(vov, vov_per_gpu) + MultiGPUVectorOfVectors{eltype(vov.backend), typeof(vov), typeof(vov_per_gpu)}(vov, vov_per_gpu) +end + +Adapt.@adapt_structure(MultiGPUVectorOfVectors) + +function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T} + max_inner_length, max_outer_length = size(vov.backend) + + n_gpus = length(CUDA.devices()) + vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus] + + vov_ = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths)) + vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus) + for vov__ in vov_per_gpu_ + vov__.length_[] = vov.length_[] + end + + return MultiGPUVectorOfVectors{T, typeof(vov_), typeof(vov_per_gpu_)}(vov_, vov_per_gpu_) +end + +const MultiGPUNeighborhoodSearch = 
GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}} + +function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) + (; cell_list) = neighborhood_search + (; cells) = cell_list + (; vov, vov_per_gpu) = cells + + for vov_ in vov_per_gpu + vov_.lengths .= 0 + end + + if neighborhood_search.search_radius < eps() + # Cannot initialize with zero search radius. + # This is used in TrixiParticles when a neighborhood search is not used. + return neighborhood_search + end + + # Fill cell lists per GPU + # TODO split range into chunks for each GPU + parallel_foreach2(axes(y, 2), y) do point, gpu + # Get cell index of the point's cell + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + + # Add point to corresponding cell + @boundscheck check_cell_bounds(cell_list, cell) + + # The atomic version of `pushat!` uses atomics to avoid race conditions when + # `pushat!` is used in a parallel loop. + @inbounds pushat_atomic!(vov_per_gpu[gpu], cell_index(cell_list, cell), point) + end + + lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, length(vov_per_gpu)) + CUDA.synchronize() + offsets_ = cu(cumsum(lengths), unified = true) + CUDA.synchronize() + vov.lengths .= offsets_[end] + CUDA.synchronize() + offsets = offsets_ .- lengths + + # Merge cell lists + parallel_foreach2(axes(vov, 2), y) do cell, gpu + offset = offsets[gpu][cell] + + points = vov_per_gpu[gpu][cell] + for i in eachindex(points) + vov.backend[offset + i, cell] = points[i] + end + end + + return neighborhood_search +end + +function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch, + x::AbstractMatrix, y::AbstractMatrix; + points_moving = (true, true), parallelization_backend = x) + # The coordinates of the first set of points are irrelevant for this NHS. + # Only update when the second set is moving. + points_moving[2] || return neighborhood_search + + PointNeighbors.initialize_grid!(neighborhood_search, y) +end + end # module From 234dbb573e12f620437eb03cf470202e88269a3d Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Jan 2025 14:37:21 +0100 Subject: [PATCH 3/7] Add workaround for system atomics Co-authored-by: Valentin Churavy --- ext/PointNeighborsCUDAExt.jl | 187 +++++++++++++++++++++-------------- 1 file changed, 111 insertions(+), 76 deletions(-) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index 446b2713..b9d8bc57 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -2,11 +2,12 @@ module PointNeighborsCUDAExt using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVectors, GridNeighborhoodSearch, FullGridCellList, extract_svector, - cell_coords, cell_index, check_cell_bounds, pushat_atomic! 
+ cell_coords, cell_index, @threaded, generic_kernel using CUDA: CUDA, CuArray, CUDABackend, cu using UnsafeAtomics: UnsafeAtomics using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index using PointNeighbors.Adapt: Adapt +using Base: @propagate_inbounds const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} @@ -24,19 +25,6 @@ end end @inline function PointNeighbors.parallel_foreach(f::T, iterator, x::CUDAMultiGPUBackend) where {T} - parallel_foreach2((i, gpu) -> @inline f(i), iterator, x) -end - -@inline function parallel_foreach2(f, iterator, x::UnifiedCuArray) - parallel_foreach2(f, iterator, CUDAMultiGPUBackend()) -end - -KernelAbstractions.@kernel function generic_kernel2(f, gpu) - i = KernelAbstractions.@index(Global) - @inline f(i, gpu) -end - -@inline function parallel_foreach2(f, iterator, x::CUDAMultiGPUBackend) # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` # and index with `iterator[eachindex(iterator)[i]]`. # Note that this only works with vector-like iterators that support arbitrary indexing. @@ -64,8 +52,8 @@ end CUDA.device!(i - 1) # Call the generic kernel, which only calls a function with the global GPU index - generic_kernel2(backend)(i, ndrange = length(indices_)) do j, gpu - @inbounds @inline f(iterator[indices_[j]], gpu) + generic_kernel(backend)(ndrange = length(indices_)) do j + @inbounds @inline f(iterator[indices_[j]]) end end @@ -79,42 +67,39 @@ end CUDA.device!(0) end -struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}} - vov::VOV - vov_per_gpu::V -end - -function MultiGPUVectorOfVectors(vov, vov_per_gpu) - MultiGPUVectorOfVectors{eltype(vov.backend), typeof(vov), typeof(vov_per_gpu)}(vov, vov_per_gpu) +function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) + CUDA.LLVM.Interop.@asmcall( + "atom.sys.global.add.u32 \$0, [\$1], \$2;", + "=r,l,r,~{memory}", + true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, + ptr, val + ) end -Adapt.@adapt_structure(MultiGPUVectorOfVectors) +@inline function pushat_atomic_system!(vov, i, value) + (; backend, lengths) = vov -function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T} - max_inner_length, max_outer_length = size(vov.backend) + @boundscheck checkbounds(vov, i) - n_gpus = length(CUDA.devices()) - vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus] + # Increment the column length with an atomic add to avoid race conditions. + # Store the new value since it might be changed immediately afterwards by another + # thread. + # new_length = Atomix.@atomic lengths[i] += 1 + new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 - vov_ = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths)) - vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus) - for vov__ in vov_per_gpu_ - vov__.length_[] = vov.length_[] - end + # We can write here without race conditions, since the atomic add guarantees + # that `new_length` is different for each thread. 
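+    # (`backend` is the matrix storage behind the vector of vectors: column `i` holds the
+    # points of cell `i`, so this writes the new point into the next free row of that column.)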
+ backend[new_length, i] = value - return MultiGPUVectorOfVectors{T, typeof(vov_), typeof(vov_per_gpu_)}(vov_, vov_per_gpu_) + return vov end -const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}} +const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) (; cell_list) = neighborhood_search - (; cells) = cell_list - (; vov, vov_per_gpu) = cells - for vov_ in vov_per_gpu - vov_.lengths .= 0 - end + cell_list.cells.lengths .= 0 if neighborhood_search.search_radius < eps() # Cannot initialize with zero search radius. @@ -122,50 +107,100 @@ function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborho return neighborhood_search end - # Fill cell lists per GPU - # TODO split range into chunks for each GPU - parallel_foreach2(axes(y, 2), y) do point, gpu + # Faster on a single GPU + @threaded CUDABackend() for point in axes(y, 2) # Get cell index of the point's cell - point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) - cell = cell_coords(point_coords, neighborhood_search) + point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) # Add point to corresponding cell - @boundscheck check_cell_bounds(cell_list, cell) - - # The atomic version of `pushat!` uses atomics to avoid race conditions when - # `pushat!` is used in a parallel loop. - @inbounds pushat_atomic!(vov_per_gpu[gpu], cell_index(cell_list, cell), point) - end - - lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, length(vov_per_gpu)) - CUDA.synchronize() - offsets_ = cu(cumsum(lengths), unified = true) - CUDA.synchronize() - vov.lengths .= offsets_[end] - CUDA.synchronize() - offsets = offsets_ .- lengths - - # Merge cell lists - parallel_foreach2(axes(vov, 2), y) do cell, gpu - offset = offsets[gpu][cell] - - points = vov_per_gpu[gpu][cell] - for i in eachindex(points) - vov.backend[offset + i, cell] = points[i] - end + pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) end return neighborhood_search end -function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch, - x::AbstractMatrix, y::AbstractMatrix; - points_moving = (true, true), parallelization_backend = x) - # The coordinates of the first set of points are irrelevant for this NHS. - # Only update when the second set is moving. - points_moving[2] || return neighborhood_search +# This might be slightly faster, but causes problems with synchronization in the interaction +# kernel because we are carrying around device memory. 
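+# The idea of the sketch below: build the cell list in device memory on a single GPU with
+# the regular atomic `pushat!` (no system-scope atomics on unified memory needed) and then
+# copy `backend`, `lengths` and `length_` over to the unified-memory vector of vectors
+# that the kernels read from.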
+# struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}} +# vov_unified :: VU +# vov_device :: VD +# end - PointNeighbors.initialize_grid!(neighborhood_search, y) -end +# function MultiGPUVectorOfVectors(vov_unified, vov_device) +# MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_device)}(vov_unified, vov_device) +# end + +# # Adapt.@adapt_structure(MultiGPUVectorOfVectors) + +# function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors) +# @info "" to +# return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device)) +# end + +# @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i) +# return getindex(vov.vov_unified, i) +# end + +# function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T} +# max_inner_length, max_outer_length = size(vov.backend) + +# # Make sure the vector of vectors in device memory lives on the first GPU +# CUDA.device!(0) + +# vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths)) +# vov_device = Adapt.adapt(CuArray, vov) + +# return MultiGPUVectorOfVectors(vov_unified, vov_device) +# end + +# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}} + +# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) +# (; cell_list) = neighborhood_search +# (; cells) = cell_list +# (; vov_unified, vov_device) = cells + +# if neighborhood_search.search_radius < eps() +# # Cannot initialize with zero search radius. +# # This is used in TrixiParticles when a neighborhood search is not used. +# return neighborhood_search +# end + +# vov_device.lengths .= 0 + +# CUDA.device!(0) + +# # Fill vector of vectors in device memory (on a single GPU) +# @threaded CUDABackend() for point in axes(y, 2) +# # Get cell index of the point's cell +# point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) +# cell = cell_coords(point_coords, neighborhood_search) + +# # Add point to corresponding cell +# @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell) + +# # The atomic version of `pushat!` uses atomics to avoid race conditions when +# # `pushat!` is used in a parallel loop. +# @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point) +# end + +# # Copy vector of vectors to unified memory +# vov_unified.backend .= vov_device.backend +# vov_unified.lengths .= vov_device.lengths +# vov_unified.length_[] = vov_device.length_[] + +# return neighborhood_search +# end + +# function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch, +# x::AbstractMatrix, y::AbstractMatrix; +# points_moving = (true, true), parallelization_backend = x) +# # The coordinates of the first set of points are irrelevant for this NHS. +# # Only update when the second set is moving. 
+# points_moving[2] || return neighborhood_search + +# PointNeighbors.initialize_grid!(neighborhood_search, y) +# end end # module From 10a37f2359f17d08f6ecb5abbb66e265ff3818ec Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 22 Jan 2025 19:58:22 +0100 Subject: [PATCH 4/7] Add semi-fast second and slower third approach --- ext/PointNeighborsCUDAExt.jl | 290 +++++++++++++++++++++++++++++------ 1 file changed, 244 insertions(+), 46 deletions(-) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index b9d8bc57..fc93d1b2 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -8,6 +8,7 @@ using UnsafeAtomics: UnsafeAtomics using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index using PointNeighbors.Adapt: Adapt using Base: @propagate_inbounds +using PointNeighbors.BenchmarkTools const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} @@ -67,61 +68,72 @@ end CUDA.device!(0) end -function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) - CUDA.LLVM.Interop.@asmcall( - "atom.sys.global.add.u32 \$0, [\$1], \$2;", - "=r,l,r,~{memory}", - true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, - ptr, val - ) -end +############################################################################################ +# First approach: Use actual system atomics on unified memory +############################################################################################ +# function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) +# CUDA.LLVM.Interop.@asmcall( +# "atom.sys.global.add.u32 \$0, [\$1], \$2;", +# "=r,l,r,~{memory}", +# true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, +# ptr, val +# ) +# end -@inline function pushat_atomic_system!(vov, i, value) - (; backend, lengths) = vov +# @inline function pushat_atomic_system!(vov, i, value) +# (; backend, lengths) = vov - @boundscheck checkbounds(vov, i) +# @boundscheck checkbounds(vov, i) - # Increment the column length with an atomic add to avoid race conditions. - # Store the new value since it might be changed immediately afterwards by another - # thread. - # new_length = Atomix.@atomic lengths[i] += 1 - new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 +# # Increment the column length with an atomic add to avoid race conditions. +# # Store the new value since it might be changed immediately afterwards by another +# # thread. +# # new_length = Atomix.@atomic lengths[i] += 1 +# new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 - # We can write here without race conditions, since the atomic add guarantees - # that `new_length` is different for each thread. - backend[new_length, i] = value +# # We can write here without race conditions, since the atomic add guarantees +# # that `new_length` is different for each thread. 
+# backend[new_length, i] = value - return vov -end +# return vov +# end -const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} +# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} -function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) - (; cell_list) = neighborhood_search +# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) +# (; cell_list) = neighborhood_search - cell_list.cells.lengths .= 0 +# cell_list.cells.lengths .= 0 - if neighborhood_search.search_radius < eps() - # Cannot initialize with zero search radius. - # This is used in TrixiParticles when a neighborhood search is not used. - return neighborhood_search - end +# if neighborhood_search.search_radius < eps() +# # Cannot initialize with zero search radius. +# # This is used in TrixiParticles when a neighborhood search is not used. +# return neighborhood_search +# end - # Faster on a single GPU - @threaded CUDABackend() for point in axes(y, 2) - # Get cell index of the point's cell - point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) - cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) +# # Faster on a single GPU +# # CUDA.NVTX.@range "threaded loop" begin +# @threaded y for point in axes(y, 2) +# # Get cell index of the point's cell +# point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) +# cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) - # Add point to corresponding cell - pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) - end +# # Add point to corresponding cell +# pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) +# end +# # end - return neighborhood_search -end +# return neighborhood_search +# end -# This might be slightly faster, but causes problems with synchronization in the interaction -# kernel because we are carrying around device memory. +# function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, +# y::AbstractMatrix; parallelization_backend = y) +# PointNeighbors.initialize_grid!(neighborhood_search, y) +# end + +############################################################################################ +# Second approach: Use a device array to update and copy to unified memory +############################################################################################ # struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}} # vov_unified :: VU # vov_device :: VD @@ -133,9 +145,8 @@ end # # Adapt.@adapt_structure(MultiGPUVectorOfVectors) -# function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors) -# @info "" to -# return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device)) +# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors) +# return Adapt.adapt(to, vov.vov_unified) # end # @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i) @@ -182,7 +193,7 @@ end # # The atomic version of `pushat!` uses atomics to avoid race conditions when # # `pushat!` is used in a parallel loop. 
-# @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point) +# @inbounds PointNeighbors.pushat_atomic!(vov_device, cell_index(cell_list, cell), point) # end # # Copy vector of vectors to unified memory @@ -203,4 +214,191 @@ end # PointNeighbors.initialize_grid!(neighborhood_search, y) # end +############################################################################################ +# Third approach: Like second approach, but a device array for each GPU. + +# This should have the advantage that each GPU updates the part of the grid that should +# already be in its memory and we can avoid moving everything to the first GPU. +############################################################################################ +KernelAbstractions.@kernel function generic_kernel2(f, gpu) + i = KernelAbstractions.@index(Global) + @inline f(i, gpu) +end + +@inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T} + # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` + # and index with `iterator[eachindex(iterator)[i]]`. + # Note that this only works with vector-like iterators that support arbitrary indexing. + indices = eachindex(iterator) + + # Skip empty loops + length(indices) == 0 && return + + # Partition `ndrange` to the GPUs + n_gpus = length(CUDA.devices()) + indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus)) + @assert length(indices_split) <= n_gpus + + backend = CUDABackend() + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Launch kernel on each device + for (i, indices_) in enumerate(indices_split) + # Select the correct device for this partition + CUDA.device!(i - 1) + + # Call the generic kernel, which only calls a function with the global GPU index + generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu + @inbounds @inline f(iterator[indices_[j]], vov_per_gpu) + end + end + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Select first device again + CUDA.device!(0) +end + +struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}} + vov_unified::VOV + vov_per_gpu::V +end + +function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu) + MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu) +end + +# Adapt.@adapt_structure(MultiGPUVectorOfVectors) + +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors) + return Adapt.adapt(to, vov.vov_unified) +end + +function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T} + max_inner_length, max_outer_length = size(vov.backend) + + n_gpus = length(CUDA.devices()) + vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus] + + vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths)) + vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus) + for vov__ in vov_per_gpu_ + vov__.length_[] = vov.length_[] + end + + return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_) +end + +const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}} + +KernelAbstractions.@kernel function kernel3(vov, vov_unified, 
previous_lengths, ::Val{gpu}, ::Val{islast}) where {gpu, islast} + cell = KernelAbstractions.@index(Global) + + length = @inbounds vov.lengths[cell] + if length > 0 || islast + offset = 0 + for length_ in previous_lengths + offset += @inbounds length_[cell] + end + + for i in 1:length + @inbounds vov_unified.backend[offset + i, cell] = vov.backend[i, cell] + end + + if islast + @inbounds vov_unified.lengths[cell] = offset + length + end + end +end + +function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) + (; cell_list) = neighborhood_search + (; cells) = cell_list + (; vov_unified, vov_per_gpu) = cells + + for vov_ in vov_per_gpu + vov_.lengths .= 0 + end + + if neighborhood_search.search_radius < eps() + # Cannot initialize with zero search radius. + # This is used in TrixiParticles when a neighborhood search is not used. + return neighborhood_search + end + + # Fill cell lists per GPU + parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov + # Get cell index of the point's cell + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + + # Add point to corresponding cell + @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell) + + # The atomic version of `pushat!` uses atomics to avoid race conditions when + # `pushat!` is used in a parallel loop. + @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point) + end + + n_gpus = length(CUDA.devices()) + backend = CUDABackend() + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Launch kernel on each device + for gpu in 1:n_gpus + # Select the correct device + CUDA.device!(gpu - 1) + + previous_lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, gpu - 1) + + # Call the generic kernel, which only calls a function with the global GPU index + kernel3(backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val(gpu), Val(gpu == n_gpus), ndrange = length(vov_per_gpu[gpu])) + end + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Select first device again + CUDA.device!(0) + + # # Merge cell lists + # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu + # offset = offsets[gpu][cell] + + # points = vov_per_gpu[gpu][cell] + # for i in eachindex(points) + # vov_unified.backend[offset + i, cell] = points[i] + # end + # end + + return neighborhood_search +end + +function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch, + x::AbstractMatrix, y::AbstractMatrix; + points_moving = (true, true), parallelization_backend = x) + # The coordinates of the first set of points are irrelevant for this NHS. + # Only update when the second set is moving. 
+ points_moving[2] || return neighborhood_search + + PointNeighbors.initialize_grid!(neighborhood_search, y) +end + end # module From 86b465e636f5b35fc1fd741b7457a5529c840be9 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 24 Jan 2025 17:36:47 +0100 Subject: [PATCH 5/7] Define custom type to make broadcasting fast --- ext/PointNeighborsCUDAExt.jl | 125 +++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index fc93d1b2..8772b1be 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -10,6 +10,53 @@ using PointNeighbors.Adapt: Adapt using Base: @propagate_inbounds using PointNeighbors.BenchmarkTools +# Define custom type where broadcasting is split over multiple GPUs. +# So far only works with a single system. +struct MultiSystemMatrix{T, A, S, R} <: AbstractVector{T} + array :: A + sizes :: S + ranges :: R + + function MultiSystemMatrix(array, sizes, ranges) + new{eltype(array), typeof(array), typeof(sizes), typeof(ranges)}(array, sizes, ranges) + end +end + +function Base.similar(m::MultiSystemMatrix, ::Type{T}) where {T} + array = similar(m.array, T) + return MultiSystemMatrix(array, m.sizes, m.ranges) +end + +Base.parent(m::MultiSystemMatrix) = m.array +Base.size(m::MultiSystemMatrix) = size(m.array) +Base.getindex(m::MultiSystemMatrix, i) = getindex(m.array, i) +Base.setindex!(m::MultiSystemMatrix, i, x) = setindex!(m.array, i, x) + +# function Base.copyto!(dest::MultiSystemMatrix, src::MultiSystemMatrix) +# copyto!(dest.array, src.array) +# return dest +# end +function Base.copyto!(dest::MultiSystemMatrix, src::MultiSystemMatrix) + # TODO check bounds + @threaded dest.array for i in eachindex(dest.array) + @inbounds dest.array[i] = src.array[i] + end + + return dest +end +function Base.copyto!(dest::MultiSystemMatrix, src::AbstractVector) + copyto!(dest.array, src) + return dest +end +function Base.copyto!(dest::AbstractVector, src::MultiSystemMatrix) + copyto!(dest, src.array) + return dest +end + +function Base.Broadcast.BroadcastStyle(::Type{MultiSystemMatrix{T, A, S, R}}) where {T, A, S, R} + Broadcast.BroadcastStyle(A) +end + const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} # This is needed because TrixiParticles passes `get_backend(coords)` to distinguish between @@ -68,6 +115,84 @@ end CUDA.device!(0) end +Base.any(f::Function, A::PointNeighbors.MultiSystemMatrix) = mapreduce(f, |, A) + +function Base.mapreduce(f, op, A::PointNeighbors.MultiSystemMatrix; dims=:, init=nothing) + n_gpus = length(CUDA.devices()) + indices = eachindex(A.array) + indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus)) + @assert length(indices_split) <= n_gpus + + backend = CUDABackend() + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + init2 = init === nothing ? 
Base._InitialValue() : init + result = mapreduce(op, enumerate(indices_split), init=init2) do (i, indices_) + # Select the correct device for this partition + CUDA.device!(i - 1) + + # return mapreduce(@inline(i -> @inbounds f(A.array[i])), op, indices_) + return mapreduce(f, op, view(A.array, indices_); dims=dims, init=init) + end + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + return result +end + +@inline function CUDA.GPUArrays._copyto!(dest::PointNeighbors.MultiSystemMatrix, bc::Broadcast.Broadcasted) + axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc)) + isempty(dest) && return dest + bc = Broadcast.preprocess(dest, bc) + + @threaded dest.array for i in eachindex(dest.array) + @inbounds dest.array[i] = bc[i] + end + + # @kernel function broadcast_kernel_linear(dest, bc) + # I = @index(Global, Linear) + # @inbounds dest[I] = bc[I] + # end + + # @kernel function broadcast_kernel_cartesian(dest, bc) + # I = @index(Global, Cartesian) + # @inbounds dest[I] = bc[I] + # end + + # broadcast_kernel = if ndims(dest) == 1 || + # (isa(IndexStyle(dest), IndexLinear) && + # isa(IndexStyle(bc), IndexLinear)) + # broadcast_kernel_linear(get_backend(dest)) + # else + # broadcast_kernel_cartesian(get_backend(dest)) + # end + + # # ndims check for 0D support + # broadcast_kernel(dest, bc; ndrange = ndims(dest) > 0 ? size(dest) : (1,)) + # if eltype(dest) <: BrokenBroadcast + # throw(ArgumentError("Broadcast operation resulting in $(eltype(eltype(dest))) is not GPU compatible")) + # end + + return dest +end + +@inline function PointNeighbors.parallel_foreach(f, iterator, ::PointNeighbors.MultiSystemMatrix) + PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) +end + +PointNeighbors.get_backend(x::PointNeighbors.MultiSystemMatrix) = CUDAMultiGPUBackend() + +Adapt.@adapt_structure PointNeighbors.MultiSystemMatrix + ############################################################################################ # First approach: Use actual system atomics on unified memory ############################################################################################ From cffa33f3fbb234f3d71308ac60a879a8c7cb8ef6 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:52:42 +0100 Subject: [PATCH 6/7] Implement `MultiGPUArray` --- ext/PointNeighborsCUDAExt.jl | 99 ++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index 8772b1be..b5b40780 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -4,75 +4,86 @@ using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVector GridNeighborhoodSearch, FullGridCellList, extract_svector, cell_coords, cell_index, @threaded, generic_kernel using CUDA: CUDA, CuArray, CUDABackend, cu -using UnsafeAtomics: UnsafeAtomics using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index using PointNeighbors.Adapt: Adapt using Base: @propagate_inbounds -using PointNeighbors.BenchmarkTools -# Define custom type where broadcasting is split over multiple GPUs. -# So far only works with a single system. 
-struct MultiSystemMatrix{T, A, S, R} <: AbstractVector{T} - array :: A - sizes :: S - ranges :: R +# Define custom type wrapping a `CuArray` with unified memory to dispatch on it +struct MultiGPUArray{T, N, A} <: AbstractArray{T, N} + array::A - function MultiSystemMatrix(array, sizes, ranges) - new{eltype(array), typeof(array), typeof(sizes), typeof(ranges)}(array, sizes, ranges) + function MultiGPUArray(array::AbstractArray{T, N}) where {T, N} + new{T, N, typeof(array)}(array) end end -function Base.similar(m::MultiSystemMatrix, ::Type{T}) where {T} +Adapt.@adapt_structure MultiGPUArray + +function Base.similar(m::MultiGPUArray, ::Type{T}) where {T} array = similar(m.array, T) - return MultiSystemMatrix(array, m.sizes, m.ranges) + return MultiGPUArray(array) end -Base.parent(m::MultiSystemMatrix) = m.array -Base.size(m::MultiSystemMatrix) = size(m.array) -Base.getindex(m::MultiSystemMatrix, i) = getindex(m.array, i) -Base.setindex!(m::MultiSystemMatrix, i, x) = setindex!(m.array, i, x) +Base.parent(m::MultiGPUArray) = m.array +Base.size(m::MultiGPUArray) = size(m.array) +Base.getindex(m::MultiGPUArray, i...) = getindex(m.array, i...) +Base.setindex!(m::MultiGPUArray, x...) = (setindex!(m.array, x...); m) -# function Base.copyto!(dest::MultiSystemMatrix, src::MultiSystemMatrix) -# copyto!(dest.array, src.array) -# return dest -# end -function Base.copyto!(dest::MultiSystemMatrix, src::MultiSystemMatrix) +@inline function Base.fill!(m::MultiGPUArray, x) # TODO check bounds - @threaded dest.array for i in eachindex(dest.array) + # TODO convert x + @threaded m for i in eachindex(m.array) + @inbounds m.array[i] = x + end + + return m +end + +function Base.copyto!(dest::MultiGPUArray, src::MultiGPUArray) + # TODO check bounds + @threaded dest for i in eachindex(dest.array) @inbounds dest.array[i] = src.array[i] end return dest end -function Base.copyto!(dest::MultiSystemMatrix, src::AbstractVector) + +function Base.copyto!(dest::MultiGPUArray, src::AbstractArray) copyto!(dest.array, src) return dest end -function Base.copyto!(dest::AbstractVector, src::MultiSystemMatrix) + +function Base.copyto!(dest::MultiGPUArray, indices1::CartesianIndices, src::AbstractArray, indices2::CartesianIndices) + copyto!(dest.array, indices1, src, indices2) + return dest +end + +function Base.copyto!(dest::AbstractArray, src::MultiGPUArray) copyto!(dest, src.array) return dest end -function Base.Broadcast.BroadcastStyle(::Type{MultiSystemMatrix{T, A, S, R}}) where {T, A, S, R} +function Base.Broadcast.BroadcastStyle(::Type{MultiGPUArray{T, N, A}}) where {T, N, A} Broadcast.BroadcastStyle(A) end -const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} +Base.view(m::MultiGPUArray, i) = MultiGPUArray(view(m.array, i)) +Base.reshape(m::MultiGPUArray, dims::Base.Dims) = MultiGPUArray(reshape(m.array, dims)) # This is needed because TrixiParticles passes `get_backend(coords)` to distinguish between # `nothing` (Polyester.jl) and `KernelAbstractions.CPU`. 
-PointNeighbors.get_backend(x::UnifiedCuArray) = CUDAMultiGPUBackend() +PointNeighbors.get_backend(::MultiGPUArray) = CUDAMultiGPUBackend() -# Convert input array to `CuArray` with unified memory -function Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array) - return CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array) +# Convert input array to `MultiGPUArray` wrapping a `CuArray` with unified memory +function Adapt.adapt_structure(::CUDAMultiGPUBackend, array::Array) + return MultiGPUArray(CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array)) end -@inline function PointNeighbors.parallel_foreach(f, iterator, x::UnifiedCuArray) +@inline function PointNeighbors.parallel_foreach(f, iterator, ::MultiGPUArray) PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) end -@inline function PointNeighbors.parallel_foreach(f::T, iterator, x::CUDAMultiGPUBackend) where {T} +@inline function PointNeighbors.parallel_foreach(f::T, iterator, ::CUDAMultiGPUBackend) where {T} # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` # and index with `iterator[eachindex(iterator)[i]]`. # Note that this only works with vector-like iterators that support arbitrary indexing. @@ -115,11 +126,18 @@ end CUDA.device!(0) end -Base.any(f::Function, A::PointNeighbors.MultiSystemMatrix) = mapreduce(f, |, A) +Base.any(f::Function, A::MultiGPUArray) = mapreduce(f, |, A, init = false) -function Base.mapreduce(f, op, A::PointNeighbors.MultiSystemMatrix; dims=:, init=nothing) +function Base.mapreduce(f, op, A::MultiGPUArray; dims=:, init=nothing) n_gpus = length(CUDA.devices()) indices = eachindex(A.array) + + # Skip empty loops + if length(indices) == 0 + init !== nothing && return init + throw(ArgumentError("reducing over an empty collection is not allowed; consider supplying `init` to the reducer")) + end + indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus)) @assert length(indices_split) <= n_gpus @@ -149,12 +167,13 @@ function Base.mapreduce(f, op, A::PointNeighbors.MultiSystemMatrix; dims=:, init return result end -@inline function CUDA.GPUArrays._copyto!(dest::PointNeighbors.MultiSystemMatrix, bc::Broadcast.Broadcasted) +# `@..` by FastBroadcast.jl forwards to this function +@inline function CUDA.GPUArrays._copyto!(dest::MultiGPUArray, bc::Broadcast.Broadcasted) axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc)) isempty(dest) && return dest bc = Broadcast.preprocess(dest, bc) - @threaded dest.array for i in eachindex(dest.array) + @threaded dest for i in eachindex(dest.array) @inbounds dest.array[i] = bc[i] end @@ -185,14 +204,6 @@ end return dest end -@inline function PointNeighbors.parallel_foreach(f, iterator, ::PointNeighbors.MultiSystemMatrix) - PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) -end - -PointNeighbors.get_backend(x::PointNeighbors.MultiSystemMatrix) = CUDAMultiGPUBackend() - -Adapt.@adapt_structure PointNeighbors.MultiSystemMatrix - ############################################################################################ # First approach: Use actual system atomics on unified memory ############################################################################################ From 558b8a2986262fcbc931c74b1a576ced0ee38628 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 31 Jan 2025 10:43:59 +0100 Subject: [PATCH 7/7] Make all approaches work --- .gitignore | 1 + ext/PointNeighborsCUDAExt.jl | 373 
++++++++++++++++++----------------- 2 files changed, 188 insertions(+), 186 deletions(-) diff --git a/.gitignore b/.gitignore index 47327812..b24718ac 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ coverage_report/ *.jl.*.cov .vscode/ run +out/ .DS_Store diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index b5b40780..f826da1e 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -207,65 +207,65 @@ end ############################################################################################ # First approach: Use actual system atomics on unified memory ############################################################################################ -# function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) -# CUDA.LLVM.Interop.@asmcall( -# "atom.sys.global.add.u32 \$0, [\$1], \$2;", -# "=r,l,r,~{memory}", -# true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, -# ptr, val -# ) -# end +function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) + CUDA.LLVM.Interop.@asmcall( + "atom.sys.global.add.u32 \$0, [\$1], \$2;", + "=r,l,r,~{memory}", + true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, + ptr, val + ) +end -# @inline function pushat_atomic_system!(vov, i, value) -# (; backend, lengths) = vov +@inline function pushat_atomic_system!(vov, i, value) + (; backend, lengths) = vov -# @boundscheck checkbounds(vov, i) + @boundscheck checkbounds(vov, i) -# # Increment the column length with an atomic add to avoid race conditions. -# # Store the new value since it might be changed immediately afterwards by another -# # thread. -# # new_length = Atomix.@atomic lengths[i] += 1 -# new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 + # Increment the column length with an atomic add to avoid race conditions. + # Store the new value since it might be changed immediately afterwards by another + # thread. + # new_length = Atomix.@atomic lengths[i] += 1 + new_length = atomic_system_add(pointer(lengths.array, i), Int32(1)) + 1 -# # We can write here without race conditions, since the atomic add guarantees -# # that `new_length` is different for each thread. -# backend[new_length, i] = value + # We can write here without race conditions, since the atomic add guarantees + # that `new_length` is different for each thread. + backend[new_length, i] = value -# return vov -# end + return vov +end -# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} +const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:MultiGPUArray}}} -# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) -# (; cell_list) = neighborhood_search +function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) + (; cell_list) = neighborhood_search -# cell_list.cells.lengths .= 0 + cell_list.cells.lengths .= 0 -# if neighborhood_search.search_radius < eps() -# # Cannot initialize with zero search radius. -# # This is used in TrixiParticles when a neighborhood search is not used. 
-# return neighborhood_search -# end - -# # Faster on a single GPU -# # CUDA.NVTX.@range "threaded loop" begin -# @threaded y for point in axes(y, 2) -# # Get cell index of the point's cell -# point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) -# cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) + if neighborhood_search.search_radius < eps() + # Cannot initialize with zero search radius. + # This is used in TrixiParticles when a neighborhood search is not used. + return neighborhood_search + end -# # Add point to corresponding cell -# pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) -# end -# # end + # Faster on a single GPU + # CUDA.NVTX.@range "threaded loop" begin + @threaded y for point in axes(y, 2) + # Get cell index of the point's cell + point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) -# return neighborhood_search + # Add point to corresponding cell + pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) + end # end -# function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, -# y::AbstractMatrix; parallelization_backend = y) -# PointNeighbors.initialize_grid!(neighborhood_search, y) -# end + return neighborhood_search +end + +function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, + y::AbstractMatrix; parallelization_backend = y) + PointNeighbors.initialize_grid!(neighborhood_search, y) +end ############################################################################################ # Second approach: Use a device array to update and copy to unified memory @@ -356,185 +356,186 @@ end # This should have the advantage that each GPU updates the part of the grid that should # already be in its memory and we can avoid moving everything to the first GPU. ############################################################################################ -KernelAbstractions.@kernel function generic_kernel2(f, gpu) - i = KernelAbstractions.@index(Global) - @inline f(i, gpu) -end +# KernelAbstractions.@kernel function generic_kernel2(f, gpu) +# i = KernelAbstractions.@index(Global) +# @inline f(i, gpu) +# end -@inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T} - # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` - # and index with `iterator[eachindex(iterator)[i]]`. - # Note that this only works with vector-like iterators that support arbitrary indexing. - indices = eachindex(iterator) +# @inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T} +# # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` +# # and index with `iterator[eachindex(iterator)[i]]`. +# # Note that this only works with vector-like iterators that support arbitrary indexing. 
+#     indices = eachindex(iterator)

-    # Skip empty loops
-    length(indices) == 0 && return
+#     # Skip empty loops
+#     length(indices) == 0 && return

-    # Partition `ndrange` to the GPUs
-    n_gpus = length(CUDA.devices())
-    indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
-    @assert length(indices_split) <= n_gpus
+#     # Partition `ndrange` to the GPUs
+#     n_gpus = length(CUDA.devices())
+#     indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
+#     @assert length(indices_split) <= n_gpus

-    backend = CUDABackend()
+#     backend = CUDABackend()

-    # Synchronize each device
-    for i in 1:n_gpus
-        CUDA.device!(i - 1)
-        KernelAbstractions.synchronize(backend)
-    end
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end

-    # Launch kernel on each device
-    for (i, indices_) in enumerate(indices_split)
-        # Select the correct device for this partition
-        CUDA.device!(i - 1)
+#     # Launch kernel on each device
+#     for (i, indices_) in enumerate(indices_split)
+#         # Select the correct device for this partition
+#         CUDA.device!(i - 1)

-        # Call the generic kernel, which only calls a function with the global GPU index
-        generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu
-            @inbounds @inline f(iterator[indices_[j]], vov_per_gpu)
-        end
-    end
+#         # Call the generic kernel, which only calls a function with the global GPU index
+#         generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu
+#             @inbounds @inline f(iterator[indices_[j]], vov_per_gpu)
+#         end
+#     end

-    # Synchronize each device
-    for i in 1:n_gpus
-        CUDA.device!(i - 1)
-        KernelAbstractions.synchronize(backend)
-    end
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end

-    # Select first device again
-    CUDA.device!(0)
-end
+#     # Select first device again
+#     CUDA.device!(0)
+# end

-struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
-    vov_unified::VOV
-    vov_per_gpu::V
-end
+# struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
+#     vov_unified::VOV
+#     vov_per_gpu::V
+# end

-function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu)
-    MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu)
-end
+# function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu)
+#     MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu)
+# end

-# Adapt.@adapt_structure(MultiGPUVectorOfVectors)
+# # Adapt.@adapt_structure(MultiGPUVectorOfVectors)

-function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
-    return Adapt.adapt(to, vov.vov_unified)
-end
+# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
+#     return Adapt.adapt(to, vov.vov_unified)
+# end

-function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
-    max_inner_length, max_outer_length = size(vov.backend)
+# function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
+#     max_inner_length, max_outer_length = size(vov.backend)

-    n_gpus = length(CUDA.devices())
-    vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]
+#     n_gpus = length(CUDA.devices())
+#     vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]

-    vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
-    vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus)
-    for vov__ in vov_per_gpu_
-        vov__.length_[] = vov.length_[]
-    end
+#     vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
+#     vov_per_gpu_ = ntuple(i -> (CUDA.device!(i - 1); Adapt.adapt(CuArray, vov_per_gpu[i])), n_gpus)
+#     for vov__ in vov_per_gpu_
+#         vov__.length_[] = vov.length_[]
+#     end
+#     CUDA.device!(0)

-    return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_)
-end
+#     return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_)
+# end

-const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
+# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}

-KernelAbstractions.@kernel function kernel3(vov, vov_unified, previous_lengths, ::Val{gpu}, ::Val{islast}) where {gpu, islast}
-    cell = KernelAbstractions.@index(Global)
+# KernelAbstractions.@kernel function kernel3(vov, vov_unified, previous_lengths, ::Val{gpu}, ::Val{islast}) where {gpu, islast}
+#     cell = KernelAbstractions.@index(Global)

-    length = @inbounds vov.lengths[cell]
-    if length > 0 || islast
-        offset = 0
-        for length_ in previous_lengths
-            offset += @inbounds length_[cell]
-        end
+#     length = @inbounds vov.lengths[cell]
+#     if length > 0 || islast
+#         offset = 0
+#         for length_ in previous_lengths
+#             offset += @inbounds length_[cell]
+#         end

-        for i in 1:length
-            @inbounds vov_unified.backend[offset + i, cell] = vov.backend[i, cell]
-        end
+#         for i in 1:length
+#             @inbounds vov_unified.backend[offset + i, cell] = vov.backend[i, cell]
+#         end

-        if islast
-            @inbounds vov_unified.lengths[cell] = offset + length
-        end
-    end
-end
+#         if islast
+#             @inbounds vov_unified.lengths[cell] = offset + length
+#         end
+#     end
+# end

-function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
-    (; cell_list) = neighborhood_search
-    (; cells) = cell_list
-    (; vov_unified, vov_per_gpu) = cells
+# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
+#     (; cell_list) = neighborhood_search
+#     (; cells) = cell_list
+#     (; vov_unified, vov_per_gpu) = cells

-    for vov_ in vov_per_gpu
-        vov_.lengths .= 0
-    end
+#     for vov_ in vov_per_gpu
+#         vov_.lengths .= 0
+#     end

-    if neighborhood_search.search_radius < eps()
-        # Cannot initialize with zero search radius.
-        # This is used in TrixiParticles when a neighborhood search is not used.
-        return neighborhood_search
-    end
+#     if neighborhood_search.search_radius < eps()
+#         # Cannot initialize with zero search radius.
+#         # This is used in TrixiParticles when a neighborhood search is not used.
+#         return neighborhood_search
+#     end

-    # Fill cell lists per GPU
-    parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov
-        # Get cell index of the point's cell
-        point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
-        cell = cell_coords(point_coords, neighborhood_search)
+#     # Fill cell lists per GPU
+#     parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov
+#         # Get cell index of the point's cell
+#         point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+#         cell = cell_coords(point_coords, neighborhood_search)

-        # Add point to corresponding cell
-        @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)
+#         # Add point to corresponding cell
+#         @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)

-        # The atomic version of `pushat!` uses atomics to avoid race conditions when
-        # `pushat!` is used in a parallel loop.
-        @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point)
-    end
+#         # The atomic version of `pushat!` uses atomics to avoid race conditions when
+#         # `pushat!` is used in a parallel loop.
+#         @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point)
+#     end

-    n_gpus = length(CUDA.devices())
-    backend = CUDABackend()
+#     n_gpus = length(CUDA.devices())
+#     backend = CUDABackend()

-    # Synchronize each device
-    for i in 1:n_gpus
-        CUDA.device!(i - 1)
-        KernelAbstractions.synchronize(backend)
-    end
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end

-    # Launch kernel on each device
-    for gpu in 1:n_gpus
-        # Select the correct device
-        CUDA.device!(gpu - 1)
+#     # Launch kernel on each device
+#     for gpu in 1:n_gpus
+#         # Select the correct device
+#         CUDA.device!(gpu - 1)

-        previous_lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, gpu - 1)
+#         previous_lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, gpu - 1)

-        # Call the generic kernel, which only calls a function with the global GPU index
-        kernel3(backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val(gpu), Val(gpu == n_gpus), ndrange = length(vov_per_gpu[gpu]))
-    end
+#         # Call the generic kernel, which only calls a function with the global GPU index
+#         kernel3(backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val(gpu), Val(gpu == n_gpus), ndrange = length(vov_per_gpu[gpu]))
+#     end

-    # Synchronize each device
-    for i in 1:n_gpus
-        CUDA.device!(i - 1)
-        KernelAbstractions.synchronize(backend)
-    end
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end

-    # Select first device again
-    CUDA.device!(0)
+#     # Select first device again
+#     CUDA.device!(0)

-    # # Merge cell lists
-    # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu
-    #     offset = offsets[gpu][cell]
+#     # # Merge cell lists
+#     # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu
+#     #     offset = offsets[gpu][cell]

-    #     points = vov_per_gpu[gpu][cell]
-    #     for i in eachindex(points)
-    #         vov_unified.backend[offset + i, cell] = points[i]
-    #     end
-    # end
+#     #     points = vov_per_gpu[gpu][cell]
+#     #     for i in eachindex(points)
+#     #         vov_unified.backend[offset + i, cell] = points[i]
+#     #     end
+#     # end

-    return neighborhood_search
-end
+#     return neighborhood_search
+# end

-function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
-                                x::AbstractMatrix, y::AbstractMatrix;
-                                points_moving = (true, true), parallelization_backend = x)
-    # The coordinates of the first set of points are irrelevant for this NHS.
-    # Only update when the second set is moving.
-    points_moving[2] || return neighborhood_search
+# function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
+#                                 x::AbstractMatrix, y::AbstractMatrix;
+#                                 points_moving = (true, true), parallelization_backend = x)
+#     # The coordinates of the first set of points are irrelevant for this NHS.
+#     # Only update when the second set is moving.
+#     points_moving[2] || return neighborhood_search

-    PointNeighbors.initialize_grid!(neighborhood_search, y)
-end
+#     PointNeighbors.initialize_grid!(neighborhood_search, y)
+# end

 end # module
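For reference, the index partitioning that the commented-out `parallel_foreach` performs can be checked on the CPU without any GPU present. The snippet below is illustrative only and not part of the patch:

    indices = 1:10
    n_gpus = 4
    chunk_size = ceil(Int, length(indices) / n_gpus)
    chunks = Iterators.partition(indices, chunk_size)
    collect(chunks)  # 4 chunks: 1:3, 4:6, 7:9 and 10:10 -- at most `n_gpus`, the last possibly shorter

This is why the `@assert length(indices_split) <= n_gpus` in the commented-out code always holds: rounding the chunk size up can only produce fewer chunks than devices, never more.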
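The commented-out `initialize_grid!` and `kernel3` implement a two-step build: each GPU fills its own cell list, and the per-GPU lists are then concatenated per cell into the unified list, each GPU writing at an offset equal to the sum of the previous GPUs' per-cell lengths. A minimal CPU sketch of that merge, using plain vectors instead of `DynamicVectorOfVectors` (the function name and data layout are illustrative, not part of the package API):

    # Merge per-GPU cell lists into one list per cell, mirroring the offset logic of `kernel3`
    function merge_cell_lists(cells_per_gpu)
        n_cells = length(first(cells_per_gpu))
        merged = [Int[] for _ in 1:n_cells]

        for cell in 1:n_cells
            offset = 0
            for gpu_cells in cells_per_gpu
                points = gpu_cells[cell]
                # Write this GPU's entries behind those of the previous GPUs
                resize!(merged[cell], offset + length(points))
                merged[cell][(offset + 1):(offset + length(points))] .= points
                offset += length(points)
            end
        end

        return merged
    end

    # Two "GPUs", three cells each
    gpu1 = [[1, 4], Int[], [7]]
    gpu2 = [[2], [5, 6], Int[]]
    merge_cell_lists([gpu1, gpu2])  # -> [[1, 4, 2], [5, 6], [7]]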