From 54f6a077ca9b5270a36ecbbfb8feceecf0661769 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 10 Jan 2025 23:53:18 +0100 Subject: [PATCH 1/7] Add backend to use unified memory and multiple GPUs --- Project.toml | 7 +++++ ext/PointNeighborsCUDAExt.jl | 59 ++++++++++++++++++++++++++++++++++++ src/gpu.jl | 3 -- src/util.jl | 13 ++++++-- 4 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 ext/PointNeighborsCUDAExt.jl diff --git a/Project.toml b/Project.toml index feab2f25..913f2c87 100644 --- a/Project.toml +++ b/Project.toml @@ -13,9 +13,16 @@ Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[extensions] +PointNeighborsCUDAExt = "CUDA" + [compat] Adapt = "3, 4" Atomix = "0.1, 1" +CUDA = "5.4.3" GPUArraysCore = "0.1, 0.2" KernelAbstractions = "0.9" LinearAlgebra = "1" diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl new file mode 100644 index 00000000..0719305e --- /dev/null +++ b/ext/PointNeighborsCUDAExt.jl @@ -0,0 +1,59 @@ +module PointNeighborsCUDAExt + +using PointNeighbors: PointNeighbors, generic_kernel, CUDAMultiGPUBackend, KernelAbstractions +using CUDA: CUDA, CuArray, CUDABackend + +const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} + +# This is needed because TrixiParticles passes `get_backend(coords)` to distinguish between +# `nothing` (Polyester.jl) and `KernelAbstractions.CPU`. +PointNeighbors.get_backend(x::UnifiedCuArray) = CUDAMultiGPUBackend() + +# Convert input array to `CuArray` with unified memory +function PointNeighbors.Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array) + return CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array) +end + +@inline function PointNeighbors.parallel_foreach(f, iterator, x::UnifiedCuArray) + PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) +end + +# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl +@inline function PointNeighbors.parallel_foreach(f, iterator, x::CUDAMultiGPUBackend) + # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` + # and index with `iterator[eachindex(iterator)[i]]`. + # Note that this only works with vector-like iterators that support arbitrary indexing. 
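+    # For example, for a hypothetical `iterator = 42:51`, we have `eachindex(iterator) == 1:10`,
+    # so the kernel with global index `i` processes the element `iterator[i] == 41 + i`.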
+ indices = eachindex(iterator) + + # Skip empty loops + length(indices) == 0 && return + + # Partition `ndrange` to the GPUs + n_gpus = length(CUDA.devices()) + indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus)) + @assert length(indices_split) <= n_gpus + + backend = CUDABackend() + + # Spawn kernel on each device + for (i, indices_) in enumerate(indices_split) + # Select the correct device for this partition + CUDA.device!(i - 1) + + # Call the generic kernel, which only calls a function with the global GPU index + generic_kernel(backend)(ndrange = length(indices_)) do j + @inbounds @inline f(iterator[indices_[j]]) + end + end + + # Synchronize each device + for i in 1:length(indices_split) + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Select first device again + CUDA.device!(0) +end + +end # module diff --git a/src/gpu.jl b/src/gpu.jl index 9293e19f..cd418c2e 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -31,6 +31,3 @@ function Adapt.adapt_structure(to, nhs::GridNeighborhoodSearch) return GridNeighborhoodSearch(cell_list, search_radius, periodic_box, n_cells, cell_size, update_buffer, nhs.update_strategy) end - -# This is useful to pass the backend directly to `@threaded` -KernelAbstractions.get_backend(backend::KernelAbstractions.Backend) = backend diff --git a/src/util.jl b/src/util.jl index 7cfbfedc..0fdc1176 100644 --- a/src/util.jl +++ b/src/util.jl @@ -60,7 +60,9 @@ with `Threads.@threads :static`. """ struct ThreadsStaticBackend <: AbstractThreadingBackend end -const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend} +struct CUDAMultiGPUBackend end + +const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend, CUDAMultiGPUBackend} """ @threaded x for ... end @@ -140,7 +142,9 @@ end # Skip empty loops ndrange == 0 && return - backend = KernelAbstractions.get_backend(x) + # Use our `get_backend`, which usually forwards to `KernelAbstractions.get_backend`, + # but also works when `x` is already a backend. + backend = get_backend(x) # Call the generic kernel that is defined below, which only calls a function with # the global GPU index. @@ -155,3 +159,8 @@ end i = @index(Global) @inline f(i) end + +get_backend(x) = KernelAbstractions.get_backend(x) + +# This is useful to pass the backend directly to `@threaded` +get_backend(backend::KernelAbstractions.Backend) = backend From fc7180f6c725872ae00c69ed24c6178e331443ef Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 17 Jan 2025 18:01:06 +0100 Subject: [PATCH 2/7] Update on each GPU in device memory and then copy to unified memory --- ext/PointNeighborsCUDAExt.jl | 130 ++++++++++++++++++++++++++++++++--- 1 file changed, 121 insertions(+), 9 deletions(-) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index 0719305e..446b2713 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -1,7 +1,12 @@ module PointNeighborsCUDAExt -using PointNeighbors: PointNeighbors, generic_kernel, CUDAMultiGPUBackend, KernelAbstractions -using CUDA: CUDA, CuArray, CUDABackend +using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVectors, + GridNeighborhoodSearch, FullGridCellList, extract_svector, + cell_coords, cell_index, check_cell_bounds, pushat_atomic! 
+using CUDA: CUDA, CuArray, CUDABackend, cu +using UnsafeAtomics: UnsafeAtomics +using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index +using PointNeighbors.Adapt: Adapt const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} @@ -10,7 +15,7 @@ const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} PointNeighbors.get_backend(x::UnifiedCuArray) = CUDAMultiGPUBackend() # Convert input array to `CuArray` with unified memory -function PointNeighbors.Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array) +function Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array) return CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array) end @@ -18,8 +23,20 @@ end PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) end -# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl -@inline function PointNeighbors.parallel_foreach(f, iterator, x::CUDAMultiGPUBackend) +@inline function PointNeighbors.parallel_foreach(f::T, iterator, x::CUDAMultiGPUBackend) where {T} + parallel_foreach2((i, gpu) -> @inline f(i), iterator, x) +end + +@inline function parallel_foreach2(f, iterator, x::UnifiedCuArray) + parallel_foreach2(f, iterator, CUDAMultiGPUBackend()) +end + +KernelAbstractions.@kernel function generic_kernel2(f, gpu) + i = KernelAbstractions.@index(Global) + @inline f(i, gpu) +end + +@inline function parallel_foreach2(f, iterator, x::CUDAMultiGPUBackend) # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` # and index with `iterator[eachindex(iterator)[i]]`. # Note that this only works with vector-like iterators that support arbitrary indexing. @@ -35,19 +52,25 @@ end backend = CUDABackend() - # Spawn kernel on each device + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Launch kernel on each device for (i, indices_) in enumerate(indices_split) # Select the correct device for this partition CUDA.device!(i - 1) # Call the generic kernel, which only calls a function with the global GPU index - generic_kernel(backend)(ndrange = length(indices_)) do j - @inbounds @inline f(iterator[indices_[j]]) + generic_kernel2(backend)(i, ndrange = length(indices_)) do j, gpu + @inbounds @inline f(iterator[indices_[j]], gpu) end end # Synchronize each device - for i in 1:length(indices_split) + for i in 1:n_gpus CUDA.device!(i - 1) KernelAbstractions.synchronize(backend) end @@ -56,4 +79,93 @@ end CUDA.device!(0) end +struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}} + vov::VOV + vov_per_gpu::V +end + +function MultiGPUVectorOfVectors(vov, vov_per_gpu) + MultiGPUVectorOfVectors{eltype(vov.backend), typeof(vov), typeof(vov_per_gpu)}(vov, vov_per_gpu) +end + +Adapt.@adapt_structure(MultiGPUVectorOfVectors) + +function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T} + max_inner_length, max_outer_length = size(vov.backend) + + n_gpus = length(CUDA.devices()) + vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus] + + vov_ = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths)) + vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus) + for vov__ in vov_per_gpu_ + vov__.length_[] = vov.length_[] + end + + return MultiGPUVectorOfVectors{T, typeof(vov_), typeof(vov_per_gpu_)}(vov_, vov_per_gpu_) +end + +const MultiGPUNeighborhoodSearch = 
GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}} + +function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) + (; cell_list) = neighborhood_search + (; cells) = cell_list + (; vov, vov_per_gpu) = cells + + for vov_ in vov_per_gpu + vov_.lengths .= 0 + end + + if neighborhood_search.search_radius < eps() + # Cannot initialize with zero search radius. + # This is used in TrixiParticles when a neighborhood search is not used. + return neighborhood_search + end + + # Fill cell lists per GPU + # TODO split range into chunks for each GPU + parallel_foreach2(axes(y, 2), y) do point, gpu + # Get cell index of the point's cell + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + + # Add point to corresponding cell + @boundscheck check_cell_bounds(cell_list, cell) + + # The atomic version of `pushat!` uses atomics to avoid race conditions when + # `pushat!` is used in a parallel loop. + @inbounds pushat_atomic!(vov_per_gpu[gpu], cell_index(cell_list, cell), point) + end + + lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, length(vov_per_gpu)) + CUDA.synchronize() + offsets_ = cu(cumsum(lengths), unified = true) + CUDA.synchronize() + vov.lengths .= offsets_[end] + CUDA.synchronize() + offsets = offsets_ .- lengths + + # Merge cell lists + parallel_foreach2(axes(vov, 2), y) do cell, gpu + offset = offsets[gpu][cell] + + points = vov_per_gpu[gpu][cell] + for i in eachindex(points) + vov.backend[offset + i, cell] = points[i] + end + end + + return neighborhood_search +end + +function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch, + x::AbstractMatrix, y::AbstractMatrix; + points_moving = (true, true), parallelization_backend = x) + # The coordinates of the first set of points are irrelevant for this NHS. + # Only update when the second set is moving. + points_moving[2] || return neighborhood_search + + PointNeighbors.initialize_grid!(neighborhood_search, y) +end + end # module From 234dbb573e12f620437eb03cf470202e88269a3d Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 20 Jan 2025 14:37:21 +0100 Subject: [PATCH 3/7] Add workaround for system atomics Co-authored-by: Valentin Churavy --- ext/PointNeighborsCUDAExt.jl | 187 +++++++++++++++++++++-------------- 1 file changed, 111 insertions(+), 76 deletions(-) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index 446b2713..b9d8bc57 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -2,11 +2,12 @@ module PointNeighborsCUDAExt using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVectors, GridNeighborhoodSearch, FullGridCellList, extract_svector, - cell_coords, cell_index, check_cell_bounds, pushat_atomic! 
+ cell_coords, cell_index, @threaded, generic_kernel using CUDA: CUDA, CuArray, CUDABackend, cu using UnsafeAtomics: UnsafeAtomics using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index using PointNeighbors.Adapt: Adapt +using Base: @propagate_inbounds const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} @@ -24,19 +25,6 @@ end end @inline function PointNeighbors.parallel_foreach(f::T, iterator, x::CUDAMultiGPUBackend) where {T} - parallel_foreach2((i, gpu) -> @inline f(i), iterator, x) -end - -@inline function parallel_foreach2(f, iterator, x::UnifiedCuArray) - parallel_foreach2(f, iterator, CUDAMultiGPUBackend()) -end - -KernelAbstractions.@kernel function generic_kernel2(f, gpu) - i = KernelAbstractions.@index(Global) - @inline f(i, gpu) -end - -@inline function parallel_foreach2(f, iterator, x::CUDAMultiGPUBackend) # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` # and index with `iterator[eachindex(iterator)[i]]`. # Note that this only works with vector-like iterators that support arbitrary indexing. @@ -64,8 +52,8 @@ end CUDA.device!(i - 1) # Call the generic kernel, which only calls a function with the global GPU index - generic_kernel2(backend)(i, ndrange = length(indices_)) do j, gpu - @inbounds @inline f(iterator[indices_[j]], gpu) + generic_kernel(backend)(ndrange = length(indices_)) do j + @inbounds @inline f(iterator[indices_[j]]) end end @@ -79,42 +67,39 @@ end CUDA.device!(0) end -struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}} - vov::VOV - vov_per_gpu::V -end - -function MultiGPUVectorOfVectors(vov, vov_per_gpu) - MultiGPUVectorOfVectors{eltype(vov.backend), typeof(vov), typeof(vov_per_gpu)}(vov, vov_per_gpu) +function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) + CUDA.LLVM.Interop.@asmcall( + "atom.sys.global.add.u32 \$0, [\$1], \$2;", + "=r,l,r,~{memory}", + true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, + ptr, val + ) end -Adapt.@adapt_structure(MultiGPUVectorOfVectors) +@inline function pushat_atomic_system!(vov, i, value) + (; backend, lengths) = vov -function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T} - max_inner_length, max_outer_length = size(vov.backend) + @boundscheck checkbounds(vov, i) - n_gpus = length(CUDA.devices()) - vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus] + # Increment the column length with an atomic add to avoid race conditions. + # Store the new value since it might be changed immediately afterwards by another + # thread. + # new_length = Atomix.@atomic lengths[i] += 1 + new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 - vov_ = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths)) - vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus) - for vov__ in vov_per_gpu_ - vov__.length_[] = vov.length_[] - end + # We can write here without race conditions, since the atomic add guarantees + # that `new_length` is different for each thread. 
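+    # (`backend` is the matrix storage behind the vector of vectors: column `i` holds the
+    # points of cell `i`, so this writes the new point into the next free row of that column.)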
+ backend[new_length, i] = value - return MultiGPUVectorOfVectors{T, typeof(vov_), typeof(vov_per_gpu_)}(vov_, vov_per_gpu_) + return vov end -const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}} +const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) (; cell_list) = neighborhood_search - (; cells) = cell_list - (; vov, vov_per_gpu) = cells - for vov_ in vov_per_gpu - vov_.lengths .= 0 - end + cell_list.cells.lengths .= 0 if neighborhood_search.search_radius < eps() # Cannot initialize with zero search radius. @@ -122,50 +107,100 @@ function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborho return neighborhood_search end - # Fill cell lists per GPU - # TODO split range into chunks for each GPU - parallel_foreach2(axes(y, 2), y) do point, gpu + # Faster on a single GPU + @threaded CUDABackend() for point in axes(y, 2) # Get cell index of the point's cell - point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) - cell = cell_coords(point_coords, neighborhood_search) + point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) # Add point to corresponding cell - @boundscheck check_cell_bounds(cell_list, cell) - - # The atomic version of `pushat!` uses atomics to avoid race conditions when - # `pushat!` is used in a parallel loop. - @inbounds pushat_atomic!(vov_per_gpu[gpu], cell_index(cell_list, cell), point) - end - - lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, length(vov_per_gpu)) - CUDA.synchronize() - offsets_ = cu(cumsum(lengths), unified = true) - CUDA.synchronize() - vov.lengths .= offsets_[end] - CUDA.synchronize() - offsets = offsets_ .- lengths - - # Merge cell lists - parallel_foreach2(axes(vov, 2), y) do cell, gpu - offset = offsets[gpu][cell] - - points = vov_per_gpu[gpu][cell] - for i in eachindex(points) - vov.backend[offset + i, cell] = points[i] - end + pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) end return neighborhood_search end -function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch, - x::AbstractMatrix, y::AbstractMatrix; - points_moving = (true, true), parallelization_backend = x) - # The coordinates of the first set of points are irrelevant for this NHS. - # Only update when the second set is moving. - points_moving[2] || return neighborhood_search +# This might be slightly faster, but causes problems with synchronization in the interaction +# kernel because we are carrying around device memory. 
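+# The idea of the sketch below: build the cell list in device memory on a single GPU with
+# the regular atomic `pushat!` (no system-scope atomics on unified memory needed) and then
+# copy `backend`, `lengths` and `length_` over to the unified-memory vector of vectors
+# that the kernels read from.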
+# struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}} +# vov_unified :: VU +# vov_device :: VD +# end - PointNeighbors.initialize_grid!(neighborhood_search, y) -end +# function MultiGPUVectorOfVectors(vov_unified, vov_device) +# MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_device)}(vov_unified, vov_device) +# end + +# # Adapt.@adapt_structure(MultiGPUVectorOfVectors) + +# function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors) +# @info "" to +# return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device)) +# end + +# @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i) +# return getindex(vov.vov_unified, i) +# end + +# function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T} +# max_inner_length, max_outer_length = size(vov.backend) + +# # Make sure the vector of vectors in device memory lives on the first GPU +# CUDA.device!(0) + +# vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths)) +# vov_device = Adapt.adapt(CuArray, vov) + +# return MultiGPUVectorOfVectors(vov_unified, vov_device) +# end + +# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}} + +# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) +# (; cell_list) = neighborhood_search +# (; cells) = cell_list +# (; vov_unified, vov_device) = cells + +# if neighborhood_search.search_radius < eps() +# # Cannot initialize with zero search radius. +# # This is used in TrixiParticles when a neighborhood search is not used. +# return neighborhood_search +# end + +# vov_device.lengths .= 0 + +# CUDA.device!(0) + +# # Fill vector of vectors in device memory (on a single GPU) +# @threaded CUDABackend() for point in axes(y, 2) +# # Get cell index of the point's cell +# point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) +# cell = cell_coords(point_coords, neighborhood_search) + +# # Add point to corresponding cell +# @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell) + +# # The atomic version of `pushat!` uses atomics to avoid race conditions when +# # `pushat!` is used in a parallel loop. +# @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point) +# end + +# # Copy vector of vectors to unified memory +# vov_unified.backend .= vov_device.backend +# vov_unified.lengths .= vov_device.lengths +# vov_unified.length_[] = vov_device.length_[] + +# return neighborhood_search +# end + +# function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch, +# x::AbstractMatrix, y::AbstractMatrix; +# points_moving = (true, true), parallelization_backend = x) +# # The coordinates of the first set of points are irrelevant for this NHS. +# # Only update when the second set is moving. 
+# points_moving[2] || return neighborhood_search + +# PointNeighbors.initialize_grid!(neighborhood_search, y) +# end end # module From 10a37f2359f17d08f6ecb5abbb66e265ff3818ec Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 22 Jan 2025 19:58:22 +0100 Subject: [PATCH 4/7] Add semi-fast second and slower third approach --- ext/PointNeighborsCUDAExt.jl | 290 +++++++++++++++++++++++++++++------ 1 file changed, 244 insertions(+), 46 deletions(-) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index b9d8bc57..fc93d1b2 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -8,6 +8,7 @@ using UnsafeAtomics: UnsafeAtomics using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index using PointNeighbors.Adapt: Adapt using Base: @propagate_inbounds +using PointNeighbors.BenchmarkTools const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} @@ -67,61 +68,72 @@ end CUDA.device!(0) end -function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) - CUDA.LLVM.Interop.@asmcall( - "atom.sys.global.add.u32 \$0, [\$1], \$2;", - "=r,l,r,~{memory}", - true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, - ptr, val - ) -end +############################################################################################ +# First approach: Use actual system atomics on unified memory +############################################################################################ +# function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) +# CUDA.LLVM.Interop.@asmcall( +# "atom.sys.global.add.u32 \$0, [\$1], \$2;", +# "=r,l,r,~{memory}", +# true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, +# ptr, val +# ) +# end -@inline function pushat_atomic_system!(vov, i, value) - (; backend, lengths) = vov +# @inline function pushat_atomic_system!(vov, i, value) +# (; backend, lengths) = vov - @boundscheck checkbounds(vov, i) +# @boundscheck checkbounds(vov, i) - # Increment the column length with an atomic add to avoid race conditions. - # Store the new value since it might be changed immediately afterwards by another - # thread. - # new_length = Atomix.@atomic lengths[i] += 1 - new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 +# # Increment the column length with an atomic add to avoid race conditions. +# # Store the new value since it might be changed immediately afterwards by another +# # thread. +# # new_length = Atomix.@atomic lengths[i] += 1 +# new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 - # We can write here without race conditions, since the atomic add guarantees - # that `new_length` is different for each thread. - backend[new_length, i] = value +# # We can write here without race conditions, since the atomic add guarantees +# # that `new_length` is different for each thread. 
+# backend[new_length, i] = value - return vov -end +# return vov +# end -const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} +# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} -function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) - (; cell_list) = neighborhood_search +# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) +# (; cell_list) = neighborhood_search - cell_list.cells.lengths .= 0 +# cell_list.cells.lengths .= 0 - if neighborhood_search.search_radius < eps() - # Cannot initialize with zero search radius. - # This is used in TrixiParticles when a neighborhood search is not used. - return neighborhood_search - end +# if neighborhood_search.search_radius < eps() +# # Cannot initialize with zero search radius. +# # This is used in TrixiParticles when a neighborhood search is not used. +# return neighborhood_search +# end - # Faster on a single GPU - @threaded CUDABackend() for point in axes(y, 2) - # Get cell index of the point's cell - point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) - cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) +# # Faster on a single GPU +# # CUDA.NVTX.@range "threaded loop" begin +# @threaded y for point in axes(y, 2) +# # Get cell index of the point's cell +# point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) +# cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) - # Add point to corresponding cell - pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) - end +# # Add point to corresponding cell +# pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) +# end +# # end - return neighborhood_search -end +# return neighborhood_search +# end -# This might be slightly faster, but causes problems with synchronization in the interaction -# kernel because we are carrying around device memory. +# function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, +# y::AbstractMatrix; parallelization_backend = y) +# PointNeighbors.initialize_grid!(neighborhood_search, y) +# end + +############################################################################################ +# Second approach: Use a device array to update and copy to unified memory +############################################################################################ # struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}} # vov_unified :: VU # vov_device :: VD @@ -133,9 +145,8 @@ end # # Adapt.@adapt_structure(MultiGPUVectorOfVectors) -# function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors) -# @info "" to -# return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device)) +# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors) +# return Adapt.adapt(to, vov.vov_unified) # end # @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i) @@ -182,7 +193,7 @@ end # # The atomic version of `pushat!` uses atomics to avoid race conditions when # # `pushat!` is used in a parallel loop. 
-# @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point) +# @inbounds PointNeighbors.pushat_atomic!(vov_device, cell_index(cell_list, cell), point) # end # # Copy vector of vectors to unified memory @@ -203,4 +214,191 @@ end # PointNeighbors.initialize_grid!(neighborhood_search, y) # end +############################################################################################ +# Third approach: Like second approach, but a device array for each GPU. + +# This should have the advantage that each GPU updates the part of the grid that should +# already be in its memory and we can avoid moving everything to the first GPU. +############################################################################################ +KernelAbstractions.@kernel function generic_kernel2(f, gpu) + i = KernelAbstractions.@index(Global) + @inline f(i, gpu) +end + +@inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T} + # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` + # and index with `iterator[eachindex(iterator)[i]]`. + # Note that this only works with vector-like iterators that support arbitrary indexing. + indices = eachindex(iterator) + + # Skip empty loops + length(indices) == 0 && return + + # Partition `ndrange` to the GPUs + n_gpus = length(CUDA.devices()) + indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus)) + @assert length(indices_split) <= n_gpus + + backend = CUDABackend() + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Launch kernel on each device + for (i, indices_) in enumerate(indices_split) + # Select the correct device for this partition + CUDA.device!(i - 1) + + # Call the generic kernel, which only calls a function with the global GPU index + generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu + @inbounds @inline f(iterator[indices_[j]], vov_per_gpu) + end + end + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Select first device again + CUDA.device!(0) +end + +struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}} + vov_unified::VOV + vov_per_gpu::V +end + +function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu) + MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu) +end + +# Adapt.@adapt_structure(MultiGPUVectorOfVectors) + +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors) + return Adapt.adapt(to, vov.vov_unified) +end + +function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T} + max_inner_length, max_outer_length = size(vov.backend) + + n_gpus = length(CUDA.devices()) + vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus] + + vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths)) + vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus) + for vov__ in vov_per_gpu_ + vov__.length_[] = vov.length_[] + end + + return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_) +end + +const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}} + +KernelAbstractions.@kernel function kernel3(vov, vov_unified, 
previous_lengths, ::Val{gpu}, ::Val{islast}) where {gpu, islast} + cell = KernelAbstractions.@index(Global) + + length = @inbounds vov.lengths[cell] + if length > 0 || islast + offset = 0 + for length_ in previous_lengths + offset += @inbounds length_[cell] + end + + for i in 1:length + @inbounds vov_unified.backend[offset + i, cell] = vov.backend[i, cell] + end + + if islast + @inbounds vov_unified.lengths[cell] = offset + length + end + end +end + +function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) + (; cell_list) = neighborhood_search + (; cells) = cell_list + (; vov_unified, vov_per_gpu) = cells + + for vov_ in vov_per_gpu + vov_.lengths .= 0 + end + + if neighborhood_search.search_radius < eps() + # Cannot initialize with zero search radius. + # This is used in TrixiParticles when a neighborhood search is not used. + return neighborhood_search + end + + # Fill cell lists per GPU + parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov + # Get cell index of the point's cell + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + + # Add point to corresponding cell + @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell) + + # The atomic version of `pushat!` uses atomics to avoid race conditions when + # `pushat!` is used in a parallel loop. + @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point) + end + + n_gpus = length(CUDA.devices()) + backend = CUDABackend() + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Launch kernel on each device + for gpu in 1:n_gpus + # Select the correct device + CUDA.device!(gpu - 1) + + previous_lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, gpu - 1) + + # Call the generic kernel, which only calls a function with the global GPU index + kernel3(backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val(gpu), Val(gpu == n_gpus), ndrange = length(vov_per_gpu[gpu])) + end + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + # Select first device again + CUDA.device!(0) + + # # Merge cell lists + # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu + # offset = offsets[gpu][cell] + + # points = vov_per_gpu[gpu][cell] + # for i in eachindex(points) + # vov_unified.backend[offset + i, cell] = points[i] + # end + # end + + return neighborhood_search +end + +function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch, + x::AbstractMatrix, y::AbstractMatrix; + points_moving = (true, true), parallelization_backend = x) + # The coordinates of the first set of points are irrelevant for this NHS. + # Only update when the second set is moving. 
+ points_moving[2] || return neighborhood_search + + PointNeighbors.initialize_grid!(neighborhood_search, y) +end + end # module From 86b465e636f5b35fc1fd741b7457a5529c840be9 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 24 Jan 2025 17:36:47 +0100 Subject: [PATCH 5/7] Define custom type to make broadcasting fast --- ext/PointNeighborsCUDAExt.jl | 125 +++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index fc93d1b2..8772b1be 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -10,6 +10,53 @@ using PointNeighbors.Adapt: Adapt using Base: @propagate_inbounds using PointNeighbors.BenchmarkTools +# Define custom type where broadcasting is split over multiple GPUs. +# So far only works with a single system. +struct MultiSystemMatrix{T, A, S, R} <: AbstractVector{T} + array :: A + sizes :: S + ranges :: R + + function MultiSystemMatrix(array, sizes, ranges) + new{eltype(array), typeof(array), typeof(sizes), typeof(ranges)}(array, sizes, ranges) + end +end + +function Base.similar(m::MultiSystemMatrix, ::Type{T}) where {T} + array = similar(m.array, T) + return MultiSystemMatrix(array, m.sizes, m.ranges) +end + +Base.parent(m::MultiSystemMatrix) = m.array +Base.size(m::MultiSystemMatrix) = size(m.array) +Base.getindex(m::MultiSystemMatrix, i) = getindex(m.array, i) +Base.setindex!(m::MultiSystemMatrix, i, x) = setindex!(m.array, i, x) + +# function Base.copyto!(dest::MultiSystemMatrix, src::MultiSystemMatrix) +# copyto!(dest.array, src.array) +# return dest +# end +function Base.copyto!(dest::MultiSystemMatrix, src::MultiSystemMatrix) + # TODO check bounds + @threaded dest.array for i in eachindex(dest.array) + @inbounds dest.array[i] = src.array[i] + end + + return dest +end +function Base.copyto!(dest::MultiSystemMatrix, src::AbstractVector) + copyto!(dest.array, src) + return dest +end +function Base.copyto!(dest::AbstractVector, src::MultiSystemMatrix) + copyto!(dest, src.array) + return dest +end + +function Base.Broadcast.BroadcastStyle(::Type{MultiSystemMatrix{T, A, S, R}}) where {T, A, S, R} + Broadcast.BroadcastStyle(A) +end + const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} # This is needed because TrixiParticles passes `get_backend(coords)` to distinguish between @@ -68,6 +115,84 @@ end CUDA.device!(0) end +Base.any(f::Function, A::PointNeighbors.MultiSystemMatrix) = mapreduce(f, |, A) + +function Base.mapreduce(f, op, A::PointNeighbors.MultiSystemMatrix; dims=:, init=nothing) + n_gpus = length(CUDA.devices()) + indices = eachindex(A.array) + indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus)) + @assert length(indices_split) <= n_gpus + + backend = CUDABackend() + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + init2 = init === nothing ? 
Base._InitialValue() : init + result = mapreduce(op, enumerate(indices_split), init=init2) do (i, indices_) + # Select the correct device for this partition + CUDA.device!(i - 1) + + # return mapreduce(@inline(i -> @inbounds f(A.array[i])), op, indices_) + return mapreduce(f, op, view(A.array, indices_); dims=dims, init=init) + end + + # Synchronize each device + for i in 1:n_gpus + CUDA.device!(i - 1) + KernelAbstractions.synchronize(backend) + end + + return result +end + +@inline function CUDA.GPUArrays._copyto!(dest::PointNeighbors.MultiSystemMatrix, bc::Broadcast.Broadcasted) + axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc)) + isempty(dest) && return dest + bc = Broadcast.preprocess(dest, bc) + + @threaded dest.array for i in eachindex(dest.array) + @inbounds dest.array[i] = bc[i] + end + + # @kernel function broadcast_kernel_linear(dest, bc) + # I = @index(Global, Linear) + # @inbounds dest[I] = bc[I] + # end + + # @kernel function broadcast_kernel_cartesian(dest, bc) + # I = @index(Global, Cartesian) + # @inbounds dest[I] = bc[I] + # end + + # broadcast_kernel = if ndims(dest) == 1 || + # (isa(IndexStyle(dest), IndexLinear) && + # isa(IndexStyle(bc), IndexLinear)) + # broadcast_kernel_linear(get_backend(dest)) + # else + # broadcast_kernel_cartesian(get_backend(dest)) + # end + + # # ndims check for 0D support + # broadcast_kernel(dest, bc; ndrange = ndims(dest) > 0 ? size(dest) : (1,)) + # if eltype(dest) <: BrokenBroadcast + # throw(ArgumentError("Broadcast operation resulting in $(eltype(eltype(dest))) is not GPU compatible")) + # end + + return dest +end + +@inline function PointNeighbors.parallel_foreach(f, iterator, ::PointNeighbors.MultiSystemMatrix) + PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) +end + +PointNeighbors.get_backend(x::PointNeighbors.MultiSystemMatrix) = CUDAMultiGPUBackend() + +Adapt.@adapt_structure PointNeighbors.MultiSystemMatrix + ############################################################################################ # First approach: Use actual system atomics on unified memory ############################################################################################ From cffa33f3fbb234f3d71308ac60a879a8c7cb8ef6 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:52:42 +0100 Subject: [PATCH 6/7] Implement `MultiGPUArray` --- ext/PointNeighborsCUDAExt.jl | 99 ++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index 8772b1be..b5b40780 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -4,75 +4,86 @@ using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVector GridNeighborhoodSearch, FullGridCellList, extract_svector, cell_coords, cell_index, @threaded, generic_kernel using CUDA: CUDA, CuArray, CUDABackend, cu -using UnsafeAtomics: UnsafeAtomics using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index using PointNeighbors.Adapt: Adapt using Base: @propagate_inbounds -using PointNeighbors.BenchmarkTools -# Define custom type where broadcasting is split over multiple GPUs. -# So far only works with a single system. 
-struct MultiSystemMatrix{T, A, S, R} <: AbstractVector{T} - array :: A - sizes :: S - ranges :: R +# Define custom type wrapping a `CuArray` with unified memory to dispatch on it +struct MultiGPUArray{T, N, A} <: AbstractArray{T, N} + array::A - function MultiSystemMatrix(array, sizes, ranges) - new{eltype(array), typeof(array), typeof(sizes), typeof(ranges)}(array, sizes, ranges) + function MultiGPUArray(array::AbstractArray{T, N}) where {T, N} + new{T, N, typeof(array)}(array) end end -function Base.similar(m::MultiSystemMatrix, ::Type{T}) where {T} +Adapt.@adapt_structure MultiGPUArray + +function Base.similar(m::MultiGPUArray, ::Type{T}) where {T} array = similar(m.array, T) - return MultiSystemMatrix(array, m.sizes, m.ranges) + return MultiGPUArray(array) end -Base.parent(m::MultiSystemMatrix) = m.array -Base.size(m::MultiSystemMatrix) = size(m.array) -Base.getindex(m::MultiSystemMatrix, i) = getindex(m.array, i) -Base.setindex!(m::MultiSystemMatrix, i, x) = setindex!(m.array, i, x) +Base.parent(m::MultiGPUArray) = m.array +Base.size(m::MultiGPUArray) = size(m.array) +Base.getindex(m::MultiGPUArray, i...) = getindex(m.array, i...) +Base.setindex!(m::MultiGPUArray, x...) = (setindex!(m.array, x...); m) -# function Base.copyto!(dest::MultiSystemMatrix, src::MultiSystemMatrix) -# copyto!(dest.array, src.array) -# return dest -# end -function Base.copyto!(dest::MultiSystemMatrix, src::MultiSystemMatrix) +@inline function Base.fill!(m::MultiGPUArray, x) # TODO check bounds - @threaded dest.array for i in eachindex(dest.array) + # TODO convert x + @threaded m for i in eachindex(m.array) + @inbounds m.array[i] = x + end + + return m +end + +function Base.copyto!(dest::MultiGPUArray, src::MultiGPUArray) + # TODO check bounds + @threaded dest for i in eachindex(dest.array) @inbounds dest.array[i] = src.array[i] end return dest end -function Base.copyto!(dest::MultiSystemMatrix, src::AbstractVector) + +function Base.copyto!(dest::MultiGPUArray, src::AbstractArray) copyto!(dest.array, src) return dest end -function Base.copyto!(dest::AbstractVector, src::MultiSystemMatrix) + +function Base.copyto!(dest::MultiGPUArray, indices1::CartesianIndices, src::AbstractArray, indices2::CartesianIndices) + copyto!(dest.array, indices1, src, indices2) + return dest +end + +function Base.copyto!(dest::AbstractArray, src::MultiGPUArray) copyto!(dest, src.array) return dest end -function Base.Broadcast.BroadcastStyle(::Type{MultiSystemMatrix{T, A, S, R}}) where {T, A, S, R} +function Base.Broadcast.BroadcastStyle(::Type{MultiGPUArray{T, N, A}}) where {T, N, A} Broadcast.BroadcastStyle(A) end -const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory} +Base.view(m::MultiGPUArray, i) = MultiGPUArray(view(m.array, i)) +Base.reshape(m::MultiGPUArray, dims::Base.Dims) = MultiGPUArray(reshape(m.array, dims)) # This is needed because TrixiParticles passes `get_backend(coords)` to distinguish between # `nothing` (Polyester.jl) and `KernelAbstractions.CPU`. 
-PointNeighbors.get_backend(x::UnifiedCuArray) = CUDAMultiGPUBackend() +PointNeighbors.get_backend(::MultiGPUArray) = CUDAMultiGPUBackend() -# Convert input array to `CuArray` with unified memory -function Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array) - return CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array) +# Convert input array to `MultiGPUArray` wrapping a `CuArray` with unified memory +function Adapt.adapt_structure(::CUDAMultiGPUBackend, array::Array) + return MultiGPUArray(CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array)) end -@inline function PointNeighbors.parallel_foreach(f, iterator, x::UnifiedCuArray) +@inline function PointNeighbors.parallel_foreach(f, iterator, ::MultiGPUArray) PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) end -@inline function PointNeighbors.parallel_foreach(f::T, iterator, x::CUDAMultiGPUBackend) where {T} +@inline function PointNeighbors.parallel_foreach(f::T, iterator, ::CUDAMultiGPUBackend) where {T} # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` # and index with `iterator[eachindex(iterator)[i]]`. # Note that this only works with vector-like iterators that support arbitrary indexing. @@ -115,11 +126,18 @@ end CUDA.device!(0) end -Base.any(f::Function, A::PointNeighbors.MultiSystemMatrix) = mapreduce(f, |, A) +Base.any(f::Function, A::MultiGPUArray) = mapreduce(f, |, A, init = false) -function Base.mapreduce(f, op, A::PointNeighbors.MultiSystemMatrix; dims=:, init=nothing) +function Base.mapreduce(f, op, A::MultiGPUArray; dims=:, init=nothing) n_gpus = length(CUDA.devices()) indices = eachindex(A.array) + + # Skip empty loops + if length(indices) == 0 + init !== nothing && return init + throw(ArgumentError("reducing over an empty collection is not allowed; consider supplying `init` to the reducer")) + end + indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus)) @assert length(indices_split) <= n_gpus @@ -149,12 +167,13 @@ function Base.mapreduce(f, op, A::PointNeighbors.MultiSystemMatrix; dims=:, init return result end -@inline function CUDA.GPUArrays._copyto!(dest::PointNeighbors.MultiSystemMatrix, bc::Broadcast.Broadcasted) +# `@..` by FastBroadcast.jl forwards to this function +@inline function CUDA.GPUArrays._copyto!(dest::MultiGPUArray, bc::Broadcast.Broadcasted) axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc)) isempty(dest) && return dest bc = Broadcast.preprocess(dest, bc) - @threaded dest.array for i in eachindex(dest.array) + @threaded dest for i in eachindex(dest.array) @inbounds dest.array[i] = bc[i] end @@ -185,14 +204,6 @@ end return dest end -@inline function PointNeighbors.parallel_foreach(f, iterator, ::PointNeighbors.MultiSystemMatrix) - PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend()) -end - -PointNeighbors.get_backend(x::PointNeighbors.MultiSystemMatrix) = CUDAMultiGPUBackend() - -Adapt.@adapt_structure PointNeighbors.MultiSystemMatrix - ############################################################################################ # First approach: Use actual system atomics on unified memory ############################################################################################ From 558b8a2986262fcbc931c74b1a576ced0ee38628 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 31 Jan 2025 10:43:59 +0100 Subject: [PATCH 7/7] Make all approaches work --- .gitignore | 1 + ext/PointNeighborsCUDAExt.jl | 373 
++++++++++++++++++----------------- 2 files changed, 188 insertions(+), 186 deletions(-) diff --git a/.gitignore b/.gitignore index 47327812..b24718ac 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ coverage_report/ *.jl.*.cov .vscode/ run +out/ .DS_Store diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl index b5b40780..f826da1e 100644 --- a/ext/PointNeighborsCUDAExt.jl +++ b/ext/PointNeighborsCUDAExt.jl @@ -207,65 +207,65 @@ end ############################################################################################ # First approach: Use actual system atomics on unified memory ############################################################################################ -# function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) -# CUDA.LLVM.Interop.@asmcall( -# "atom.sys.global.add.u32 \$0, [\$1], \$2;", -# "=r,l,r,~{memory}", -# true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, -# ptr, val -# ) -# end +function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32) + CUDA.LLVM.Interop.@asmcall( + "atom.sys.global.add.u32 \$0, [\$1], \$2;", + "=r,l,r,~{memory}", + true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32}, + ptr, val + ) +end -# @inline function pushat_atomic_system!(vov, i, value) -# (; backend, lengths) = vov +@inline function pushat_atomic_system!(vov, i, value) + (; backend, lengths) = vov -# @boundscheck checkbounds(vov, i) + @boundscheck checkbounds(vov, i) -# # Increment the column length with an atomic add to avoid race conditions. -# # Store the new value since it might be changed immediately afterwards by another -# # thread. -# # new_length = Atomix.@atomic lengths[i] += 1 -# new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1 + # Increment the column length with an atomic add to avoid race conditions. + # Store the new value since it might be changed immediately afterwards by another + # thread. + # new_length = Atomix.@atomic lengths[i] += 1 + new_length = atomic_system_add(pointer(lengths.array, i), Int32(1)) + 1 -# # We can write here without race conditions, since the atomic add guarantees -# # that `new_length` is different for each thread. -# backend[new_length, i] = value + # We can write here without race conditions, since the atomic add guarantees + # that `new_length` is different for each thread. + backend[new_length, i] = value -# return vov -# end + return vov +end -# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}} +const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:MultiGPUArray}}} -# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) -# (; cell_list) = neighborhood_search +function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix) + (; cell_list) = neighborhood_search -# cell_list.cells.lengths .= 0 + cell_list.cells.lengths .= 0 -# if neighborhood_search.search_radius < eps() -# # Cannot initialize with zero search radius. -# # This is used in TrixiParticles when a neighborhood search is not used. 
-# return neighborhood_search -# end - -# # Faster on a single GPU -# # CUDA.NVTX.@range "threaded loop" begin -# @threaded y for point in axes(y, 2) -# # Get cell index of the point's cell -# point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) -# cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) + if neighborhood_search.search_radius < eps() + # Cannot initialize with zero search radius. + # This is used in TrixiParticles when a neighborhood search is not used. + return neighborhood_search + end -# # Add point to corresponding cell -# pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) -# end -# # end + # Faster on a single GPU + # CUDA.NVTX.@range "threaded loop" begin + @threaded y for point in axes(y, 2) + # Get cell index of the point's cell + point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = PointNeighbors.cell_coords(point_coords, neighborhood_search) -# return neighborhood_search + # Add point to corresponding cell + pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point) + end # end -# function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, -# y::AbstractMatrix; parallelization_backend = y) -# PointNeighbors.initialize_grid!(neighborhood_search, y) -# end + return neighborhood_search +end + +function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, + y::AbstractMatrix; parallelization_backend = y) + PointNeighbors.initialize_grid!(neighborhood_search, y) +end ############################################################################################ # Second approach: Use a device array to update and copy to unified memory @@ -356,185 +356,186 @@ end # This should have the advantage that each GPU updates the part of the grid that should # already be in its memory and we can avoid moving everything to the first GPU. ############################################################################################ -KernelAbstractions.@kernel function generic_kernel2(f, gpu) - i = KernelAbstractions.@index(Global) - @inline f(i, gpu) -end +# KernelAbstractions.@kernel function generic_kernel2(f, gpu) +# i = KernelAbstractions.@index(Global) +# @inline f(i, gpu) +# end -@inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T} - # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` - # and index with `iterator[eachindex(iterator)[i]]`. - # Note that this only works with vector-like iterators that support arbitrary indexing. - indices = eachindex(iterator) +# @inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T} +# # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` +# # and index with `iterator[eachindex(iterator)[i]]`. +# # Note that this only works with vector-like iterators that support arbitrary indexing. 
+#     indices = eachindex(iterator)

-    # Skip empty loops
-    length(indices) == 0 && return
+#     # Skip empty loops
+#     length(indices) == 0 && return

-    # Partition `ndrange` to the GPUs
-    n_gpus = length(CUDA.devices())
-    indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
-    @assert length(indices_split) <= n_gpus
+#     # Partition `ndrange` to the GPUs
+#     n_gpus = length(CUDA.devices())
+#     indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
+#     @assert length(indices_split) <= n_gpus

-    backend = CUDABackend()
+#     backend = CUDABackend()

-    # Synchronize each device
-    for i in 1:n_gpus
-        CUDA.device!(i - 1)
-        KernelAbstractions.synchronize(backend)
-    end
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end

-    # Launch kernel on each device
-    for (i, indices_) in enumerate(indices_split)
-        # Select the correct device for this partition
-        CUDA.device!(i - 1)
+#     # Launch kernel on each device
+#     for (i, indices_) in enumerate(indices_split)
+#         # Select the correct device for this partition
+#         CUDA.device!(i - 1)

-        # Call the generic kernel, which only calls a function with the global GPU index
-        generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu
-            @inbounds @inline f(iterator[indices_[j]], vov_per_gpu)
-        end
-    end
+#         # Call the generic kernel, which only calls a function with the global GPU index
+#         generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu
+#             @inbounds @inline f(iterator[indices_[j]], vov_per_gpu)
+#         end
+#     end

-    # Synchronize each device
-    for i in 1:n_gpus
-        CUDA.device!(i - 1)
-        KernelAbstractions.synchronize(backend)
-    end
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end

-    # Select first device again
-    CUDA.device!(0)
-end
+#     # Select first device again
+#     CUDA.device!(0)
+# end

-struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
-    vov_unified::VOV
-    vov_per_gpu::V
-end
+# struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
+#     vov_unified::VOV
+#     vov_per_gpu::V
+# end

-function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu)
-    MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu)
-end
+# function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu)
+#     MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu)
+# end

-# Adapt.@adapt_structure(MultiGPUVectorOfVectors)
+# # Adapt.@adapt_structure(MultiGPUVectorOfVectors)

-function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
-    return Adapt.adapt(to, vov.vov_unified)
-end
+# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
+#     return Adapt.adapt(to, vov.vov_unified)
+# end

-function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
-    max_inner_length, max_outer_length = size(vov.backend)
+# function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
+#     max_inner_length, max_outer_length = size(vov.backend)

-    n_gpus = length(CUDA.devices())
-    vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]
+#     n_gpus = length(CUDA.devices())
+#     vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]

-    vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
-    vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus)
-    for vov__ in vov_per_gpu_
-        vov__.length_[] = vov.length_[]
-    end
+#     vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
+#     vov_per_gpu_ = ntuple(i -> (CUDA.device!(i - 1); Adapt.adapt(CuArray, vov_per_gpu[i])), n_gpus)
+#     for vov__ in vov_per_gpu_
+#         vov__.length_[] = vov.length_[]
+#     end
+#     CUDA.device!(0)

-    return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_)
-end
+#     return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_)
+# end

-const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
+# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}

-KernelAbstractions.@kernel function kernel3(vov, vov_unified, previous_lengths, ::Val{gpu}, ::Val{islast}) where {gpu, islast}
-    cell = KernelAbstractions.@index(Global)
+# KernelAbstractions.@kernel function kernel3(vov, vov_unified, previous_lengths, ::Val{gpu}, ::Val{islast}) where {gpu, islast}
+#     cell = KernelAbstractions.@index(Global)

-    length = @inbounds vov.lengths[cell]
-    if length > 0 || islast
-        offset = 0
-        for length_ in previous_lengths
-            offset += @inbounds length_[cell]
-        end
+#     length = @inbounds vov.lengths[cell]
+#     if length > 0 || islast
+#         offset = 0
+#         for length_ in previous_lengths
+#             offset += @inbounds length_[cell]
+#         end

-        for i in 1:length
-            @inbounds vov_unified.backend[offset + i, cell] = vov.backend[i, cell]
-        end
+#         for i in 1:length
+#             @inbounds vov_unified.backend[offset + i, cell] = vov.backend[i, cell]
+#         end

-        if islast
-            @inbounds vov_unified.lengths[cell] = offset + length
-        end
-    end
-end
+#         if islast
+#             @inbounds vov_unified.lengths[cell] = offset + length
+#         end
+#     end
+# end

-function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
-    (; cell_list) = neighborhood_search
-    (; cells) = cell_list
-    (; vov_unified, vov_per_gpu) = cells
+# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
+#     (; cell_list) = neighborhood_search
+#     (; cells) = cell_list
+#     (; vov_unified, vov_per_gpu) = cells

-    for vov_ in vov_per_gpu
-        vov_.lengths .= 0
-    end
+#     for vov_ in vov_per_gpu
+#         vov_.lengths .= 0
+#     end

-    if neighborhood_search.search_radius < eps()
-        # Cannot initialize with zero search radius.
-        # This is used in TrixiParticles when a neighborhood search is not used.
-        return neighborhood_search
-    end
+#     if neighborhood_search.search_radius < eps()
+#         # Cannot initialize with zero search radius.
+#         # This is used in TrixiParticles when a neighborhood search is not used.
+#         return neighborhood_search
+#     end

-    # Fill cell lists per GPU
-    parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov
-        # Get cell index of the point's cell
-        point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
-        cell = cell_coords(point_coords, neighborhood_search)
+#     # Fill cell lists per GPU
+#     parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov
+#         # Get cell index of the point's cell
+#         point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+#         cell = cell_coords(point_coords, neighborhood_search)

-        # Add point to corresponding cell
-        @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)
+#         # Add point to corresponding cell
+#         @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)

-        # The atomic version of `pushat!` uses atomics to avoid race conditions when
-        # `pushat!` is used in a parallel loop.
-        @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point)
-    end
+#         # The atomic version of `pushat!` uses atomics to avoid race conditions when
+#         # `pushat!` is used in a parallel loop.
+#         @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point)
+#     end

-    n_gpus = length(CUDA.devices())
-    backend = CUDABackend()
+#     n_gpus = length(CUDA.devices())
+#     backend = CUDABackend()

-    # Synchronize each device
-    for i in 1:n_gpus
-        CUDA.device!(i - 1)
-        KernelAbstractions.synchronize(backend)
-    end
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end

-    # Launch kernel on each device
-    for gpu in 1:n_gpus
-        # Select the correct device
-        CUDA.device!(gpu - 1)
+#     # Launch kernel on each device
+#     for gpu in 1:n_gpus
+#         # Select the correct device
+#         CUDA.device!(gpu - 1)

-        previous_lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, gpu - 1)
+#         previous_lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, gpu - 1)

-        # Call the generic kernel, which only calls a function with the global GPU index
-        kernel3(backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val(gpu), Val(gpu == n_gpus), ndrange = length(vov_per_gpu[gpu]))
-    end
+#         # Call the generic kernel, which only calls a function with the global GPU index
+#         kernel3(backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val(gpu), Val(gpu == n_gpus), ndrange = length(vov_per_gpu[gpu]))
+#     end

-    # Synchronize each device
-    for i in 1:n_gpus
-        CUDA.device!(i - 1)
-        KernelAbstractions.synchronize(backend)
-    end
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end

-    # Select first device again
-    CUDA.device!(0)
+#     # Select first device again
+#     CUDA.device!(0)

-    # # Merge cell lists
-    # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu
-    #     offset = offsets[gpu][cell]
+#     # # Merge cell lists
+#     # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu
+#     #     offset = offsets[gpu][cell]

-    #     points = vov_per_gpu[gpu][cell]
-    #     for i in eachindex(points)
-    #         vov_unified.backend[offset + i, cell] = points[i]
-    #     end
-    # end
+#     #     points = vov_per_gpu[gpu][cell]
+#     #     for i in eachindex(points)
+#     #         vov_unified.backend[offset + i, cell] = points[i]
+#     #     end
+#     # end

-    return neighborhood_search
-end
+#     return neighborhood_search
+# end

-function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
-                                x::AbstractMatrix, y::AbstractMatrix;
-                                points_moving = (true, true), parallelization_backend = x)
-    # The coordinates of the first set of points are irrelevant for this NHS.
-    # Only update when the second set is moving.
-    points_moving[2] || return neighborhood_search
+# function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
+#                                 x::AbstractMatrix, y::AbstractMatrix;
+#                                 points_moving = (true, true), parallelization_backend = x)
+#     # The coordinates of the first set of points are irrelevant for this NHS.
+#     # Only update when the second set is moving.
+#     points_moving[2] || return neighborhood_search

-    PointNeighbors.initialize_grid!(neighborhood_search, y)
-end
+#     PointNeighbors.initialize_grid!(neighborhood_search, y)
+# end

 end # module
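For reference, the index partitioning that the commented-out `parallel_foreach` performs can be checked on the CPU without any GPU present. The snippet below is illustrative only and not part of the patch:

    indices = 1:10
    n_gpus = 4
    chunk_size = ceil(Int, length(indices) / n_gpus)
    chunks = Iterators.partition(indices, chunk_size)
    collect(chunks)  # 4 chunks: 1:3, 4:6, 7:9 and 10:10 -- at most `n_gpus`, the last possibly shorter

This is why the `@assert length(indices_split) <= n_gpus` in the commented-out code always holds: rounding the chunk size up can only produce fewer chunks than devices, never more.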
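The commented-out `initialize_grid!` and `kernel3` implement a two-step build: each GPU fills its own cell list, and the per-GPU lists are then concatenated per cell into the unified list, each GPU writing at an offset equal to the sum of the previous GPUs' per-cell lengths. A minimal CPU sketch of that merge, using plain vectors instead of `DynamicVectorOfVectors` (the function name and data layout are illustrative, not part of the package API):

    # Merge per-GPU cell lists into one list per cell, mirroring the offset logic of `kernel3`
    function merge_cell_lists(cells_per_gpu)
        n_cells = length(first(cells_per_gpu))
        merged = [Int[] for _ in 1:n_cells]

        for cell in 1:n_cells
            offset = 0
            for gpu_cells in cells_per_gpu
                points = gpu_cells[cell]
                # Write this GPU's entries behind those of the previous GPUs
                resize!(merged[cell], offset + length(points))
                merged[cell][(offset + 1):(offset + length(points))] .= points
                offset += length(points)
            end
        end

        return merged
    end

    # Two "GPUs", three cells each
    gpu1 = [[1, 4], Int[], [7]]
    gpu2 = [[2], [5, 6], Int[]]
    merge_cell_lists([gpu1, gpu2])  # -> [[1, 4, 2], [5, 6], [7]]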