diff --git a/.gitignore b/.gitignore
index 4732781..b24718a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ coverage_report/
 *.jl.*.cov
 .vscode/
 run
+out/
 .DS_Store
diff --git a/Project.toml b/Project.toml
index 12542ae..120137d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,9 +13,16 @@ Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+PointNeighborsCUDAExt = "CUDA"
+
 [compat]
 Adapt = "4"
 Atomix = "1"
+CUDA = "5.4.3"
 GPUArraysCore = "0.2"
 KernelAbstractions = "0.9"
 LinearAlgebra = "1"
diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl
new file mode 100644
index 0000000..f826da1
--- /dev/null
+++ b/ext/PointNeighborsCUDAExt.jl
@@ -0,0 +1,541 @@
+module PointNeighborsCUDAExt
+
+using PointNeighbors: PointNeighbors, CUDAMultiGPUBackend, DynamicVectorOfVectors,
+                      GridNeighborhoodSearch, FullGridCellList, extract_svector,
+                      cell_coords, cell_index, @threaded, generic_kernel
+using CUDA: CUDA, CuArray, CUDABackend, cu
+using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index
+using PointNeighbors.Adapt: Adapt
+using Base: @propagate_inbounds
+
+# Define custom type wrapping a `CuArray` with unified memory to dispatch on it
+struct MultiGPUArray{T, N, A} <: AbstractArray{T, N}
+    array::A
+
+    function MultiGPUArray(array::AbstractArray{T, N}) where {T, N}
+        new{T, N, typeof(array)}(array)
+    end
+end
+
+Adapt.@adapt_structure MultiGPUArray
+
+function Base.similar(m::MultiGPUArray, ::Type{T}) where {T}
+    array = similar(m.array, T)
+    return MultiGPUArray(array)
+end
+
+Base.parent(m::MultiGPUArray) = m.array
+Base.size(m::MultiGPUArray) = size(m.array)
+Base.getindex(m::MultiGPUArray, i...) = getindex(m.array, i...)
+Base.setindex!(m::MultiGPUArray, x...) = (setindex!(m.array, x...); m)
+
+@inline function Base.fill!(m::MultiGPUArray, x)
+    # TODO check bounds
+    # TODO convert x
+    @threaded m for i in eachindex(m.array)
+        @inbounds m.array[i] = x
+    end
+
+    return m
+end
+
+function Base.copyto!(dest::MultiGPUArray, src::MultiGPUArray)
+    # TODO check bounds
+    @threaded dest for i in eachindex(dest.array)
+        @inbounds dest.array[i] = src.array[i]
+    end
+
+    return dest
+end
+
+function Base.copyto!(dest::MultiGPUArray, src::AbstractArray)
+    copyto!(dest.array, src)
+    return dest
+end
+
+function Base.copyto!(dest::MultiGPUArray, indices1::CartesianIndices,
+                      src::AbstractArray, indices2::CartesianIndices)
+    copyto!(dest.array, indices1, src, indices2)
+    return dest
+end
+
+function Base.copyto!(dest::AbstractArray, src::MultiGPUArray)
+    copyto!(dest, src.array)
+    return dest
+end
+
+function Base.Broadcast.BroadcastStyle(::Type{MultiGPUArray{T, N, A}}) where {T, N, A}
+    Broadcast.BroadcastStyle(A)
+end
+
+Base.view(m::MultiGPUArray, i) = MultiGPUArray(view(m.array, i))
+Base.reshape(m::MultiGPUArray, dims::Base.Dims) = MultiGPUArray(reshape(m.array, dims))
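+
+# Usage sketch (illustration only, not part of the public API): `MultiGPUArray`
+# behaves like the wrapped array, but `fill!` and `copyto!` defined above run
+# through `@threaded`, so the work is partitioned across all available GPUs:
+#
+#   a = MultiGPUArray(CuArray{Float32, 1, CUDA.UnifiedMemory}(zeros(Float32, 10)))
+#   fill!(a, 1.0f0)   # multi-GPU fill on unified memory
+#   b = similar(a)    # `similar` preserves the wrapper
+#   copyto!(b, a)     # multi-GPU copy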
+
+# This is needed because TrixiParticles passes `get_backend(coords)` to distinguish between
+# `nothing` (Polyester.jl) and `KernelAbstractions.CPU`.
+PointNeighbors.get_backend(::MultiGPUArray) = CUDAMultiGPUBackend()
+
+# Convert input array to a `MultiGPUArray` wrapping a `CuArray` with unified memory
+function Adapt.adapt_structure(::CUDAMultiGPUBackend, array::Array)
+    return MultiGPUArray(CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array))
+end
+
+@inline function PointNeighbors.parallel_foreach(f, iterator, ::MultiGPUArray)
+    PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend())
+end
+
+@inline function PointNeighbors.parallel_foreach(f::T, iterator, ::CUDAMultiGPUBackend) where {T}
+    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
+    # and index with `iterator[eachindex(iterator)[i]]`.
+    # Note that this only works with vector-like iterators that support arbitrary indexing.
+    indices = eachindex(iterator)
+
+    # Skip empty loops
+    length(indices) == 0 && return
+
+    # Partition the `ndrange` among the GPUs
+    n_gpus = length(CUDA.devices())
+    indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
+    @assert length(indices_split) <= n_gpus
+
+    backend = CUDABackend()
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Launch a kernel on each device
+    for (i, indices_) in enumerate(indices_split)
+        # Select the correct device for this partition
+        CUDA.device!(i - 1)
+
+        # Call the generic kernel, which only calls a function with the global GPU index
+        generic_kernel(backend)(ndrange = length(indices_)) do j
+            @inbounds @inline f(iterator[indices_[j]])
+        end
+    end
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Select the first device again
+    CUDA.device!(0)
+end
+
+Base.any(f::Function, A::MultiGPUArray) = mapreduce(f, |, A, init = false)
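+
+# Partitioning sketch (hypothetical numbers): with 4 GPUs and a length-10 iterator,
+# `Iterators.partition(1:10, ceil(Int, 10 / 4))` yields `1:3`, `4:6`, `7:9`, `10:10`,
+# and each chunk is launched as a separate kernel on its own device:
+#
+#   PointNeighbors.parallel_foreach(1:10, CUDAMultiGPUBackend()) do i
+#       # body runs on the GPU that was assigned this index
+#   end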
+
+function Base.mapreduce(f, op, A::MultiGPUArray; dims = :, init = nothing)
+    n_gpus = length(CUDA.devices())
+    indices = eachindex(A.array)
+
+    # Skip empty collections
+    if length(indices) == 0
+        init !== nothing && return init
+        throw(ArgumentError("reducing over an empty collection is not allowed; consider supplying `init` to the reducer"))
+    end
+
+    indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
+    @assert length(indices_split) <= n_gpus
+
+    backend = CUDABackend()
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    init2 = init === nothing ? Base._InitialValue() : init
+    result = mapreduce(op, enumerate(indices_split), init = init2) do (i, indices_)
+        # Select the correct device for this partition
+        CUDA.device!(i - 1)
+
+        # Reduce this partition on its own GPU. `init` is deliberately not passed on:
+        # it is already applied once in the outer reduction above, and forwarding it
+        # here as well would apply it once per partition.
+        # return mapreduce(@inline(i -> @inbounds f(A.array[i])), op, indices_)
+        return mapreduce(f, op, view(A.array, indices_); dims = dims)
+    end
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    return result
+end
+
+# `@..` by FastBroadcast.jl forwards to this function
+@inline function CUDA.GPUArrays._copyto!(dest::MultiGPUArray, bc::Broadcast.Broadcasted)
+    axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
+    isempty(dest) && return dest
+    bc = Broadcast.preprocess(dest, bc)
+
+    @threaded dest for i in eachindex(dest.array)
+        @inbounds dest.array[i] = bc[i]
+    end
+
+    # @kernel function broadcast_kernel_linear(dest, bc)
+    #     I = @index(Global, Linear)
+    #     @inbounds dest[I] = bc[I]
+    # end
+
+    # @kernel function broadcast_kernel_cartesian(dest, bc)
+    #     I = @index(Global, Cartesian)
+    #     @inbounds dest[I] = bc[I]
+    # end
+
+    # broadcast_kernel = if ndims(dest) == 1 ||
+    #                       (isa(IndexStyle(dest), IndexLinear) &&
+    #                        isa(IndexStyle(bc), IndexLinear))
+    #     broadcast_kernel_linear(get_backend(dest))
+    # else
+    #     broadcast_kernel_cartesian(get_backend(dest))
+    # end
+
+    # # ndims check for 0D support
+    # broadcast_kernel(dest, bc; ndrange = ndims(dest) > 0 ? size(dest) : (1,))
+    # if eltype(dest) <: BrokenBroadcast
+    #     throw(ArgumentError("Broadcast operation resulting in $(eltype(eltype(dest))) is not GPU compatible"))
+    # end
+
+    return dest
+end
+
+############################################################################################
+# First approach: Use actual system atomics on unified memory
+############################################################################################
+function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32)
+    CUDA.LLVM.Interop.@asmcall(
+        "atom.sys.global.add.u32 \$0, [\$1], \$2;",
+        "=r,l,r,~{memory}",
+        true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32},
+        ptr, val
+    )
+end
+
+@inline function pushat_atomic_system!(vov, i, value)
+    (; backend, lengths) = vov
+
+    @boundscheck checkbounds(vov, i)
+
+    # Increment the column length with an atomic add to avoid race conditions.
+    # Store the new value, since it might be changed immediately afterwards by another
+    # thread.
+    # new_length = Atomix.@atomic lengths[i] += 1
+    new_length = atomic_system_add(pointer(lengths.array, i), Int32(1)) + 1
+
+    # We can write here without race conditions, since the atomic add guarantees
+    # that `new_length` is different for each thread.
+    backend[new_length, i] = value
+
+    return vov
+end
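+
+# Sketch of the atomic protocol above (illustration only): if two GPUs append to the
+# same cell concurrently, the system-scoped atomic add returns the old length to each
+# of them exactly once, so the subsequent writes go to disjoint rows:
+#
+#   old = atomic_system_add(ptr, Int32(1))   # GPU A gets `old`, GPU B gets `old + 1`
+#   backend[old + 1, cell] = point           # no write conflict between A and B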
+
+const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:MultiGPUArray}}}
+
+function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
+    (; cell_list) = neighborhood_search
+
+    cell_list.cells.lengths .= 0
+
+    if neighborhood_search.search_radius < eps()
+        # Cannot initialize with zero search radius.
+        # This is used in TrixiParticles when a neighborhood search is not used.
+        return neighborhood_search
+    end
+
+    # Faster on a single GPU
+    # CUDA.NVTX.@range "threaded loop" begin
+    @threaded y for point in axes(y, 2)
+        # Get cell index of the point's cell
+        point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point)
+        cell = PointNeighbors.cell_coords(point_coords, neighborhood_search)
+
+        # Add point to corresponding cell
+        pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point)
+    end
+    # end
+
+    return neighborhood_search
+end
+
+function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch,
+                                     y::AbstractMatrix; parallelization_backend = y)
+    PointNeighbors.initialize_grid!(neighborhood_search, y)
+end
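+
+# End-to-end sketch (hypothetical sizes and keyword arguments, for illustration only):
+# build a grid NHS backed by a `FullGridCellList`, move it to unified memory, and
+# initialize it across all GPUs.
+#
+#   min_corner = (0.0, 0.0, 0.0)
+#   max_corner = (1.0, 1.0, 1.0)
+#   search_radius = 0.05
+#   cell_list = FullGridCellList(; min_corner, max_corner, search_radius)
+#   nhs = GridNeighborhoodSearch{3}(; search_radius, n_points = 10_000, cell_list)
+#
+#   y = rand(3, 10_000)
+#   nhs_gpu = PointNeighbors.Adapt.adapt(CUDAMultiGPUBackend(), nhs)
+#   y_gpu = PointNeighbors.Adapt.adapt(CUDAMultiGPUBackend(), y)
+#   PointNeighbors.initialize_grid!(nhs_gpu, y_gpu)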
+
+############################################################################################
+# Second approach: Use a device array to update and copy to unified memory
+############################################################################################
+# struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}}
+#     vov_unified :: VU
+#     vov_device  :: VD
+# end
+
+# function MultiGPUVectorOfVectors(vov_unified, vov_device)
+#     MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_device)}(vov_unified, vov_device)
+# end
+
+# # Adapt.@adapt_structure(MultiGPUVectorOfVectors)
+
+# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
+#     return Adapt.adapt(to, vov.vov_unified)
+# end
+
+# @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i)
+#     return getindex(vov.vov_unified, i)
+# end
+
+# function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
+#     max_inner_length, max_outer_length = size(vov.backend)
+
+#     # Make sure the vector of vectors in device memory lives on the first GPU
+#     CUDA.device!(0)
+
+#     vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
+#     vov_device = Adapt.adapt(CuArray, vov)
+
+#     return MultiGPUVectorOfVectors(vov_unified, vov_device)
+# end
+
+# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
+
+# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
+#     (; cell_list) = neighborhood_search
+#     (; cells) = cell_list
+#     (; vov_unified, vov_device) = cells
+
+#     if neighborhood_search.search_radius < eps()
+#         # Cannot initialize with zero search radius.
+#         # This is used in TrixiParticles when a neighborhood search is not used.
+#         return neighborhood_search
+#     end
+
+#     vov_device.lengths .= 0
+
+#     CUDA.device!(0)
+
+#     # Fill vector of vectors in device memory (on a single GPU)
+#     @threaded CUDABackend() for point in axes(y, 2)
+#         # Get cell index of the point's cell
+#         point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+#         cell = cell_coords(point_coords, neighborhood_search)
+
+#         # Add point to corresponding cell
+#         @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)
+
+#         # The atomic version of `pushat!` uses atomics to avoid race conditions when
+#         # `pushat!` is used in a parallel loop.
+#         @inbounds PointNeighbors.pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
+#     end
+
+#     # Copy vector of vectors to unified memory
+#     vov_unified.backend .= vov_device.backend
+#     vov_unified.lengths .= vov_device.lengths
+#     vov_unified.length_[] = vov_device.length_[]
+
+#     return neighborhood_search
+# end
+
+# function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
+#                                 x::AbstractMatrix, y::AbstractMatrix;
+#                                 points_moving = (true, true), parallelization_backend = x)
+#     # The coordinates of the first set of points are irrelevant for this NHS.
+#     # Only update when the second set is moving.
+#     points_moving[2] || return neighborhood_search
+
+#     PointNeighbors.initialize_grid!(neighborhood_search, y)
+# end
+
+############################################################################################
+# Third approach: Like second approach, but a device array for each GPU.
+
+# This should have the advantage that each GPU updates the part of the grid that should
+# already be in its memory and we can avoid moving everything to the first GPU.
+############################################################################################
+# KernelAbstractions.@kernel function generic_kernel2(f, gpu)
+#     i = KernelAbstractions.@index(Global)
+#     @inline f(i, gpu)
+# end
+
+# @inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T}
+#     # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
+#     # and index with `iterator[eachindex(iterator)[i]]`.
+#     # Note that this only works with vector-like iterators that support arbitrary indexing.
+#     indices = eachindex(iterator)
+
+#     # Skip empty loops
+#     length(indices) == 0 && return
+
+#     # Partition the `ndrange` among the GPUs
+#     n_gpus = length(CUDA.devices())
+#     indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
+#     @assert length(indices_split) <= n_gpus
+
+#     backend = CUDABackend()
+
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end
+
+#     # Launch kernel on each device
+#     for (i, indices_) in enumerate(indices_split)
+#         # Select the correct device for this partition
+#         CUDA.device!(i - 1)
+
+#         # Call the generic kernel, which only calls a function with the global GPU index
+#         generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu
+#             @inbounds @inline f(iterator[indices_[j]], vov_per_gpu)
+#         end
+#     end
+
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end
+
+#     # Select first device again
+#     CUDA.device!(0)
+# end
+
+# struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
+#     vov_unified::VOV
+#     vov_per_gpu::V
+# end
+
+# function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu)
+#     MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu)
+# end
+
+# # Adapt.@adapt_structure(MultiGPUVectorOfVectors)
+
+# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
+#     return Adapt.adapt(to, vov.vov_unified)
+# end
+
+# function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
+#     max_inner_length, max_outer_length = size(vov.backend)
+
+#     n_gpus = length(CUDA.devices())
+#     vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]
+
+#     vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
+#     vov_per_gpu_ = ntuple(i -> (CUDA.device!(i - 1); Adapt.adapt(CuArray, vov_per_gpu[i])), n_gpus)
+#     for vov__ in vov_per_gpu_
+#         vov__.length_[] = vov.length_[]
+#     end
+#     CUDA.device!(0)
+
+#     return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_)
+# end
+
+# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
+
+# KernelAbstractions.@kernel function kernel3(vov, vov_unified, previous_lengths, ::Val{gpu}, ::Val{islast}) where {gpu, islast}
+#     cell = KernelAbstractions.@index(Global)
+
+#     length = @inbounds vov.lengths[cell]
+#     if length > 0 || islast
+#         offset = 0
+#         for length_ in previous_lengths
+#             offset += @inbounds length_[cell]
+#         end
+
+#         for i in 1:length
+#             @inbounds vov_unified.backend[offset + i, cell] = vov.backend[i, cell]
+#         end
+
+#         if islast
+#             @inbounds vov_unified.lengths[cell] = offset + length
+#         end
+#     end
+# end
+
+# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
+#     (; cell_list) = neighborhood_search
+#     (; cells) = cell_list
+#     (; vov_unified, vov_per_gpu) = cells
+
+#     for vov_ in vov_per_gpu
+#         vov_.lengths .= 0
+#     end
+
+#     if neighborhood_search.search_radius < eps()
+#         # Cannot initialize with zero search radius.
+#         # This is used in TrixiParticles when a neighborhood search is not used.
+#         return neighborhood_search
+#     end
+
+#     # Fill cell lists per GPU
+#     parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov
+#         # Get cell index of the point's cell
+#         point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+#         cell = cell_coords(point_coords, neighborhood_search)
+
+#         # Add point to corresponding cell
+#         @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)
+
+#         # The atomic version of `pushat!` uses atomics to avoid race conditions when
+#         # `pushat!` is used in a parallel loop.
+#         @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point)
+#     end
+
+#     n_gpus = length(CUDA.devices())
+#     backend = CUDABackend()
+
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end
+
+#     # Launch kernel on each device
+#     for gpu in 1:n_gpus
+#         # Select the correct device
+#         CUDA.device!(gpu - 1)
+
+#         previous_lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, gpu - 1)
+
+#         # Call the generic kernel, which only calls a function with the global GPU index
+#         kernel3(backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val(gpu), Val(gpu == n_gpus), ndrange = length(vov_per_gpu[gpu]))
+#     end
+
+#     # Synchronize each device
+#     for i in 1:n_gpus
+#         CUDA.device!(i - 1)
+#         KernelAbstractions.synchronize(backend)
+#     end
+
+#     # Select first device again
+#     CUDA.device!(0)
+
+#     # # Merge cell lists
+#     # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu
+#     #     offset = offsets[gpu][cell]
+
+#     #     points = vov_per_gpu[gpu][cell]
+#     #     for i in eachindex(points)
+#     #         vov_unified.backend[offset + i, cell] = points[i]
+#     #     end
+#     # end
+
+#     return neighborhood_search
+# end
+
+# function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
+#                                 x::AbstractMatrix, y::AbstractMatrix;
+#                                 points_moving = (true, true), parallelization_backend = x)
+#     # The coordinates of the first set of points are irrelevant for this NHS.
+#     # Only update when the second set is moving.
+#     points_moving[2] || return neighborhood_search
+
+#     PointNeighbors.initialize_grid!(neighborhood_search, y)
+# end
+
+end # module
diff --git a/src/gpu.jl b/src/gpu.jl
index 9293e19..cd418c2 100644
--- a/src/gpu.jl
+++ b/src/gpu.jl
@@ -31,6 +31,3 @@ function Adapt.adapt_structure(to, nhs::GridNeighborhoodSearch)
     return GridNeighborhoodSearch(cell_list, search_radius, periodic_box, n_cells,
                                   cell_size, update_buffer, nhs.update_strategy)
 end
-
-# This is useful to pass the backend directly to `@threaded`
-KernelAbstractions.get_backend(backend::KernelAbstractions.Backend) = backend
diff --git a/src/util.jl b/src/util.jl
index 7cfbfed..0fdc117 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -60,7 +60,9 @@ with `Threads.@threads :static`.
 """
 struct ThreadsStaticBackend <: AbstractThreadingBackend end
 
-const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend}
+struct CUDAMultiGPUBackend end
+
+const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend, CUDAMultiGPUBackend}
 
 """
     @threaded x for ... end
@@ -140,7 +142,9 @@ end
     # Skip empty loops
     ndrange == 0 && return
 
-    backend = KernelAbstractions.get_backend(x)
+    # Use our `get_backend`, which usually forwards to `KernelAbstractions.get_backend`,
+    # but also works when `x` is already a backend.
+    backend = get_backend(x)
 
     # Call the generic kernel that is defined below, which only calls a function with
     # the global GPU index.
@@ -155,3 +159,8 @@ end
     i = @index(Global)
     @inline f(i)
 end
+
+get_backend(x) = KernelAbstractions.get_backend(x)
+
+# This is useful to pass the backend directly to `@threaded`
+get_backend(backend::KernelAbstractions.Backend) = backend
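+
+# Usage sketch (illustration only): `@threaded` accepts either an array, whose backend
+# is queried via `get_backend`, or a `KernelAbstractions.Backend` passed directly,
+# e.g. `@threaded CUDABackend() for ... end`:
+#
+#   x = CUDA.rand(3, 100)
+#   @threaded x for i in axes(x, 2)   # backend inferred from `x`
+#       x[1, i] += x[2, i]
+#   end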