From 12cad6293b30c6a4a54de0c1ec9c3593302ddbd5 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Sat, 14 Dec 2024 18:18:58 +0200 Subject: [PATCH 01/28] Implement allocation cache --- src/GPUArrays.jl | 1 + src/host/allocations_cache.jl | 132 ++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 src/host/allocations_cache.jl diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl index 418b87b5..948ff068 100644 --- a/src/GPUArrays.jl +++ b/src/GPUArrays.jl @@ -34,6 +34,7 @@ include("host/random.jl") include("host/quirks.jl") include("host/uniformscaling.jl") include("host/statistics.jl") +include("host/allocations_cache.jl") end # module diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl new file mode 100644 index 00000000..3f45b052 --- /dev/null +++ b/src/host/allocations_cache.jl @@ -0,0 +1,132 @@ +using Base.ScopedValues + +struct CacheAllocator{T <: AbstractGPUArray} + lock::ReentrantLock + busy::Dict{UInt64, Vector{T}} # hash((T, dims)) => GPUArray[] + free::Dict{UInt64, Vector{T}} +end + +CacheAllocator(::Type{T}) where T = CacheAllocator( + ReentrantLock(), + Dict{UInt64, Vector{T}}(), + Dict{UInt64, Vector{T}}(), +) + +function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T + pool = getproperty(cache, pool) + uid_pool = get(pool, uid, nothing) + if uid_pool ≡ nothing + uid_pool = Base.@lock cache.lock pool[uid] = T[] + end + return uid_pool +end + +function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}) where {T, N} + uid = hash((T, dims)) + free_pool = get_pool!(cache, :free, uid) + busy_pool = get_pool!(cache, :busy, uid) + + x = nothing + + # No array available in `free` - call `alloc_f`. + isempty(free_pool) && (x = alloc_f()) + + # Otherwise, try fetching from `free`. + while !isempty(free_pool) && x ≡ nothing + tmp = pop!(free_pool) + # Array was manually freed via `unsafe_free!`. 
+ tmp.buf.freed && continue + x = tmp + end + + # No array in cache - call `alloc_f`. + x ≡ nothing && (x = alloc_f()) + push!(busy_pool, x) + return x +end + +function free_busy!(cache::CacheAllocator) + for uid in cache.busy.keys + busy_pool = get_pool!(cache, :busy, uid) + isempty(busy_pool) && continue + + free_pool = get_pool!(cache, :free, uid) + Base.@lock cache.lock begin + append!(free_pool, busy_pool) + empty!(busy_pool) + end + end +end + +struct PerDeviceCacheAllocator{T <: AbstractGPUArray} + lock::ReentrantLock + caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}} +end + +PerDeviceCacheAllocator(::Type{T}) where T <: AbstractGPUArray = + PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}()) + +function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where T + h = hash(device) + dev_cache = get(pdcache.caches, h, nothing) + if dev_cache ≡ nothing + Base.@lock pdcache.lock begin + named_cache = CacheAllocator(T) + pdcache.caches[h] = Dict{Symbol, CacheAllocator{T}}(name => named_cache) + return named_cache + end + end + + named_cache = get(dev_cache, name, nothing) + if named_cache ≡ nothing + named_cache = CacheAllocator(T) + Base.@lock dev_cache.lock dev_cache[name] = named_cache + end + return named_cache +end + +function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, name::Symbol) + h = hash(device) + dev_cache = get(pdcache.caches, h, nothing) + dev_cache ≡ nothing && return + + named_cache = get(dev_cache, name, nothing) + named_cache ≡ nothing && return + + Base.@lock named_cache.lock begin + for (_, pool) in named_cache.free + map(unsafe_free!, pool) + end + # TODO error when trying to invalidate busy cache? 
+ for (_, pool) in named_cache.busy + map(unsafe_free!, pool) + end + empty!(named_cache.busy) + empty!(named_cache.free) + end + return +end + +macro cache_scope(backend, name, expr) + quote + scope = cache_alloc_scope($(esc(backend))) + res = @with scope => $(esc(name)) $(esc(expr)) + free_busy_cache_alloc!(cache_allocator($(esc(backend))), $(esc(name))) + res + end +end + +macro no_cache_scope(backend, expr) + quote + scope = cache_alloc_scope($(esc(backend))) + @with scope => :none $(esc(expr)) + end +end + +# Interface API. + +cache_alloc_scope(::Backend) = error("Not implemented.") + +cache_allocator(::Backend) = error("Not implemented.") + +free_busy_cache_alloc!(pdcache, name::Symbol) = error("Not implemented.") From c6f128f38c689024462b20b96050425d4642b153 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Sun, 15 Dec 2024 00:15:44 +0200 Subject: [PATCH 02/28] Correctly fetch underlying storage --- src/host/allocations_cache.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl index 3f45b052..44a0aac4 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -35,7 +35,7 @@ function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}) where while !isempty(free_pool) && x ≡ nothing tmp = pop!(free_pool) # Array was manually freed via `unsafe_free!`. 
- tmp.buf.freed && continue + storage(tmp).freed && continue x = tmp end From c2f32e13becfdf3f1733b1358da9b6b484015eae Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Sun, 15 Dec 2024 13:53:26 +0200 Subject: [PATCH 03/28] Add cache sizeof --- src/host/allocations_cache.jl | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl index 44a0aac4..14fd7764 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -85,6 +85,27 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam return named_cache end +function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) + sz = UInt64(0) + h = hash(device) + + dev_cache = get(pdcache.caches, h, nothing) + dev_cache ≡ nothing && return sz + + named_cache = get(dev_cache, name, nothing) + named_cache ≡ nothing && return sz + + Base.@lock named_cache.lock begin + for (_, pool) in named_cache.free + sz += sum(sizeof, pool; init=UInt64(0)) + end + for (_, pool) in named_cache.busy + sz += sum(sizeof, pool; init=UInt64(0)) + end + end + return sz +end + function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, name::Symbol) h = hash(device) dev_cache = get(pdcache.caches, h, nothing) @@ -130,3 +151,5 @@ cache_alloc_scope(::Backend) = error("Not implemented.") cache_allocator(::Backend) = error("Not implemented.") free_busy_cache_alloc!(pdcache, name::Symbol) = error("Not implemented.") + +invalidate_cache_allocator!(pdcache, name::Symbol) = error("Not implemented.") From 44e8990cd713cf806253a9503947ef5bbf788f0f Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 17 Dec 2024 12:49:53 +0200 Subject: [PATCH 04/28] Allow bulk-freeing arrays instead of caching them --- src/host/allocations_cache.jl | 73 +++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl 
index 14fd7764..770485c6 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -1,5 +1,7 @@ using Base.ScopedValues +const CacheAllocatorName = ScopedValue(:none) + struct CacheAllocator{T <: AbstractGPUArray} lock::ReentrantLock busy::Dict{UInt64, Vector{T}} # hash((T, dims)) => GPUArray[] @@ -21,38 +23,42 @@ function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T return uid_pool end -function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}) where {T, N} +function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N} + x = nothing uid = hash((T, dims)) - free_pool = get_pool!(cache, :free, uid) busy_pool = get_pool!(cache, :busy, uid) - x = nothing - - # No array available in `free` - call `alloc_f`. - isempty(free_pool) && (x = alloc_f()) + if skip_free + x = alloc_f() + else + free_pool = get_pool!(cache, :free, uid) + isempty(free_pool) && (x = alloc_f()) - # Otherwise, try fetching from `free`. - while !isempty(free_pool) && x ≡ nothing - tmp = pop!(free_pool) - # Array was manually freed via `unsafe_free!`. - storage(tmp).freed && continue - x = tmp + while !isempty(free_pool) && x ≡ nothing + tmp = Base.@lock cache.lock pop!(free_pool) + # Array was manually freed via `unsafe_free!`. + storage(tmp).freed && continue + x = tmp + end end - # No array in cache - call `alloc_f`. 
x ≡ nothing && (x = alloc_f()) - push!(busy_pool, x) + Base.@lock cache.lock push!(busy_pool, x) return x end -function free_busy!(cache::CacheAllocator) +function free_busy!(cache::CacheAllocator; free_immediately::Bool) for uid in cache.busy.keys busy_pool = get_pool!(cache, :busy, uid) isempty(busy_pool) && continue free_pool = get_pool!(cache, :free, uid) Base.@lock cache.lock begin - append!(free_pool, busy_pool) + if free_immediately + for p in busy_pool unsafe_free!(p) end + else + append!(free_pool, busy_pool) + end empty!(busy_pool) end end @@ -61,10 +67,11 @@ end struct PerDeviceCacheAllocator{T <: AbstractGPUArray} lock::ReentrantLock caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}} + free_immediately::Bool end -PerDeviceCacheAllocator(::Type{T}) where T <: AbstractGPUArray = - PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}()) +PerDeviceCacheAllocator(::Type{T}; free_immediately::Bool) where T <: AbstractGPUArray = + PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}(), free_immediately) function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where T h = hash(device) @@ -85,6 +92,12 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam return named_cache end +function alloc!(alloc_f, kab::Backend, name::Symbol, ::Type{T}, dims::Dims{N}) where {T, N} + pdcache = cache_allocator(kab) + cache = named_cache_allocator!(pdcache, device(kab), name) + alloc!(alloc_f, cache, T, dims; skip_free=pdcache.free_immediately) +end + function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) sz = UInt64(0) h = hash(device) @@ -106,6 +119,9 @@ function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) return sz end +invalidate_cache_allocator!(kab::Backend, name::Symbol) = + invalidate_cache_allocator!(cache_allocator(kab), device(kab), name) + function 
invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, name::Symbol) h = hash(device) dev_cache = get(pdcache.caches, h, nothing) @@ -128,28 +144,27 @@ function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, n return end +function free_busy!(kab::Backend, name::Symbol) + pdcache = cache_allocator(kab) + free_busy!(named_cache_allocator!(pdcache, device(kab), name); pdcache.free_immediately) +end + macro cache_scope(backend, name, expr) quote - scope = cache_alloc_scope($(esc(backend))) - res = @with scope => $(esc(name)) $(esc(expr)) - free_busy_cache_alloc!(cache_allocator($(esc(backend))), $(esc(name))) + res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr)) + free_busy!($(esc(backend)), $(esc(name))) res end end -macro no_cache_scope(backend, expr) +macro no_cache_scope(expr) quote - scope = cache_alloc_scope($(esc(backend))) - @with scope => :none $(esc(expr)) + @with $(esc(CacheAllocatorName)) => :none $(esc(expr)) end end # Interface API. 
-cache_alloc_scope(::Backend) = error("Not implemented.") - cache_allocator(::Backend) = error("Not implemented.") -free_busy_cache_alloc!(pdcache, name::Symbol) = error("Not implemented.") - -invalidate_cache_allocator!(pdcache, name::Symbol) = error("Not implemented.") +device(::Backend) = error("Not implemented.") From 5ce044d76188874bfe4cc5e89b0945505c94c35e Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Wed, 18 Dec 2024 00:19:39 +0200 Subject: [PATCH 05/28] Add docs --- Project.toml | 2 + src/host/allocations_cache.jl | 85 ++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 551c9edc..1582449f 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -23,6 +24,7 @@ LinearAlgebra = "1" Printf = "1" Random = "1" Reexport = "1" +ScopedValues = "1" Serialization = "1" Statistics = "1" julia = "1.10" diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl index 770485c6..aea6a918 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -1,4 +1,8 @@ -using Base.ScopedValues +@static if VERSION < v"1.11" + using ScopedValues +else + using Base.ScopedValues +end const CacheAllocatorName = ScopedValue(:none) @@ -23,6 +27,19 @@ function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T return uid_pool end +""" + alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N} + +Attempt to retrieve cached allocation from `cache` using eltype `T` and `dims` +as keys for searching. 
+If no such allocation is found, execute `alloc_f` that does actual allocation, +store it in cache for future use and return it. + +`skip_free::Bool` is used together with `PerDeviceCacheAllocator.free_immediately`. +When `true` arrays are bulk-freed instead of stored in cache. +In this case `alloc!` will avoid looking into "free" part of `cache` +and execute `alloc_f` immediately, storing allocation for future bulk-freeing. +""" function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N} x = nothing uid = hash((T, dims)) @@ -55,7 +72,7 @@ function free_busy!(cache::CacheAllocator; free_immediately::Bool) free_pool = get_pool!(cache, :free, uid) Base.@lock cache.lock begin if free_immediately - for p in busy_pool unsafe_free!(p) end + map(unsafe_free!, busy_pool) else append!(free_pool, busy_pool) end @@ -119,6 +136,11 @@ function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) return sz end +""" + invalidate_cache_allocator!(kab::Backend, name::Symbol) + +Free all memory held by `name`d cached allocator given KernelAbstractions `backend`. +""" invalidate_cache_allocator!(kab::Backend, name::Symbol) = invalidate_cache_allocator!(cache_allocator(kab), device(kab), name) @@ -149,6 +171,47 @@ function free_busy!(kab::Backend, name::Symbol) free_busy!(named_cache_allocator!(pdcache, device(kab), name); pdcache.free_immediately) end +""" + @cache_scope backend name expr + +Evaluate expression `expr` using `name`d caching allocator +for the given KernelAbstractions `backend`. + +When during execution of `expr` gpu allocation is requested, +allocator will try to find such allocation in "free" parts of cache, +marking them as "busy" and returning allocation to the user. +If no allocation is found in "free" part, an actual allocation is performed, +marking it as "busy" and returned to the user. 
+ +**After** the execution of `expr` all "busy" allocations are marked as "free" +thus they can be re-used next time the program enters this scope. + +This is useful to apply in a repeating block of code to avoid relying on +GC to free gpu memory in time. + +`name` is a `Symbol` that defines which allocator to use +(`:none` is reserved and means no allocator). + +# Example + +In following example we apply caching allocator at every iteration of the for-loop. +Every iteration requires 2 GiB of gpu memory, without caching allocator +GC wouldn't be able to free arrays in time resulting in higher memory usage. +With caching allocator, memory usage stays at exactly 2 GiB. + +After the loop, we free all cached memory if there's any. + +```julia +kab = CUDABackend() +n = 1024^3 +for i in 1:1000 + @cache_scope kab :loop begin + sin.(CUDA.rand(Float32, n)) + end +end +invalidate_cache_allocator!(kab, :loop) +``` +""" macro cache_scope(backend, name, expr) quote res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr)) @@ -157,6 +220,12 @@ macro cache_scope(backend, name, expr) end end +""" + @no_cache_scope expr + +Evaluate expression `expr` without using caching allocator. +This is useful to call from within `@cache_scope` to avoid caching arrays. +""" macro no_cache_scope(expr) quote @with $(esc(CacheAllocatorName)) => :none $(esc(expr)) @@ -165,6 +234,18 @@ end # Interface API. +""" + cache_allocator(::Backend) + +Given KernelAbstractions `backend`, return corresponding `PerDeviceCacheAllocator` for it. +Each GPU backend must implement this. +""" cache_allocator(::Backend) = error("Not implemented.") +""" + device(::Backend) + +Given KernelAbstractions `backend`, return current device. +Each GPU backend must implement this. 
+""" device(::Backend) = error("Not implemented.") From 977657815ba7de21efea97b5e0b319c1b76deda2 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Wed, 18 Dec 2024 18:01:32 +0200 Subject: [PATCH 06/28] Add tests --- lib/JLArrays/src/JLArrays.jl | 26 +++++++-- src/host/allocations_cache.jl | 6 +-- test/runtests.jl | 3 ++ test/testsuite.jl | 1 + test/testsuite/caching_allocator.jl | 82 +++++++++++++++++++++++++++++ 5 files changed, 110 insertions(+), 8 deletions(-) create mode 100644 test/testsuite/caching_allocator.jl diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index dbec3d25..d717e9a3 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -88,12 +88,20 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} function JLArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} check_eltype(T) maxsize = prod(dims) * sizeof(T) - data = Vector{UInt8}(undef, maxsize) - ref = DataRef(data) do data - resize!(data, 0) + + function _alloc_f() + data = Vector{UInt8}(undef, maxsize) + ref = DataRef(data) do data + resize!(data, 0) + end + obj = new{T,N}(ref, 0, dims) + finalizer(unsafe_free!, obj) end - obj = new{T,N}(ref, 0, dims) - finalizer(unsafe_free!, obj) + + name = GPUArrays.CacheAllocatorName[] + return name == :none ? + _alloc_f() : + GPUArrays.alloc!(_alloc_f, JLBackend(), name, T, dims)::JLArray{T, N} end # low-level constructor for wrapping existing data @@ -387,4 +395,12 @@ Adapt.adapt_storage(::JLBackend, a::Array) = Adapt.adapt(JLArrays.JLArray, a) Adapt.adapt_storage(::JLBackend, a::JLArrays.JLArray) = a Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Array, a) +# Caching Allocator. 
+ +const JLACacheAllocator = GPUArrays.PerDeviceCacheAllocator(JLArray; free_immediately=false) + +GPUArrays.cache_allocator(::JLBackend) = JLACacheAllocator + +GPUArrays.device(::JLBackend) = 1 + end diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl index aea6a918..90c5f5e6 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -69,11 +69,11 @@ function free_busy!(cache::CacheAllocator; free_immediately::Bool) busy_pool = get_pool!(cache, :busy, uid) isempty(busy_pool) && continue - free_pool = get_pool!(cache, :free, uid) Base.@lock cache.lock begin if free_immediately map(unsafe_free!, busy_pool) else + free_pool = get_pool!(cache, :free, uid) append!(free_pool, busy_pool) end empty!(busy_pool) @@ -81,7 +81,7 @@ function free_busy!(cache::CacheAllocator; free_immediately::Bool) end end -struct PerDeviceCacheAllocator{T <: AbstractGPUArray} +mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray} lock::ReentrantLock caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}} free_immediately::Bool @@ -104,7 +104,7 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam named_cache = get(dev_cache, name, nothing) if named_cache ≡ nothing named_cache = CacheAllocator(T) - Base.@lock dev_cache.lock dev_cache[name] = named_cache + Base.@lock pdcache.lock dev_cache[name] = named_cache end return named_cache end diff --git a/test/runtests.jl b/test/runtests.jl index 66d6a096..5fe51aec 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -48,6 +48,9 @@ include("setup.jl") # make sure everything is precompiled const tests = [] const test_runners = Dict() for AT in (JLArray, Array), name in keys(TestSuite.tests) + # Disable for now. 
+ name == "Caching Allocator" && continue + push!(tests, "$(AT)/$name") test_runners["$(AT)/$name"] = ()->TestSuite.tests[name](AT) end diff --git a/test/testsuite.jl b/test/testsuite.jl index 179c824b..59bb967b 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -93,6 +93,7 @@ include("testsuite/math.jl") include("testsuite/random.jl") include("testsuite/uniformscaling.jl") include("testsuite/statistics.jl") +include("testsuite/caching_allocator.jl") """ Runs the entire GPUArrays test suite on array type `AT` diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl new file mode 100644 index 00000000..4f63023a --- /dev/null +++ b/test/testsuite/caching_allocator.jl @@ -0,0 +1,82 @@ +@testsuite "Caching Allocator" (AT, eltypes) -> begin + # Hacky way to get KA backend from AT. + kab = KernelAbstractions.get_backend(AT(Array{Int}(undef, 0))) + device = GPUArrays.device(kab) + + @testset "free_immediately=false" begin + pdcache = GPUArrays.cache_allocator(kab) + pdcache.free_immediately = false + named_cache = GPUArrays.named_cache_allocator!(pdcache, device, :cache) + + T = Float32 + dims = (1, 2, 3) + key = hash((T, dims)) + + GPUArrays.@cache_scope kab :cache begin + x1 = AT(zeros(T, dims)) + end + @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) + @test length(named_cache.free[key]) == 1 + @test length(named_cache.busy[key]) == 0 + @test x1 === named_cache.free[key][1] + + # Second allocation does not allocate - cache stays the same in size. + + GPUArrays.@cache_scope kab :cache begin + x2 = AT(zeros(T, dims)) + + # Does not go to cache. + GPUArrays.@no_cache_scope begin + x_free = AT(zeros(T, dims)) + end + end + @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) + @test length(named_cache.free[key]) == 1 + @test length(named_cache.busy[key]) == 0 + @test x2 === x1 + @test x2 === named_cache.free[key][1] + @test x_free !== x2 + + # Third allocation of different type - cache grows. 
+ + T2 = Int32 + key2 = hash((T2, dims)) + GPUArrays.@cache_scope kab :cache begin + x3 = AT(zeros(T2, dims)) + end + @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims) + @test length(named_cache.free[key]) == 1 + @test length(named_cache.free[key2]) == 1 + @test x3 === named_cache.free[key2][1] + + # Freeing all memory held by cache. + + GPUArrays.invalidate_cache_allocator!(kab, :cache) + @test sizeof(pdcache, device, :cache) == 0 + end + + @testset "free_immediately=true" begin + pdcache = GPUArrays.cache_allocator(kab) + pdcache.free_immediately = true + named_cache = GPUArrays.named_cache_allocator!(pdcache, device, :cache2) + + T = Float32 + dims = (1, 2, 3) + key = hash((T, dims)) + + @test sizeof(pdcache, device, :cache2) == 0 + + GPUArrays.@cache_scope kab :cache2 begin + x1 = AT(zeros(T, dims)) + + @test !haskey(named_cache.free, key) + @test length(named_cache.busy[key]) == 1 + @test sizeof(pdcache, device, :cache2) == sizeof(Float32) * prod(dims) + end + + # `free` was never even used with `free_immediately=true`. 
+ @test !haskey(named_cache.free, key) + @test length(named_cache.busy[key]) == 0 + @test sizeof(pdcache, device, :cache2) == 0 + end +end From c5032ad4bd4e69c80c697b5f67fad9ac3f398bd1 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Wed, 18 Dec 2024 18:57:15 +0200 Subject: [PATCH 07/28] Update docs --- docs/.gitignore | 1 + docs/make.jl | 1 + docs/src/interface.md | 12 ++++++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/.gitignore b/docs/.gitignore index 737939a5..026087e8 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,3 +1,4 @@ +Manifest.toml build site Manifest.toml diff --git a/docs/make.jl b/docs/make.jl index 72828e3b..b8ca1f92 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,6 +20,7 @@ function main() "Test suite" => "testsuite.md", ], doctest = true, + warnonly=[:missing_docs], ) deploydocs( diff --git a/docs/src/interface.md b/docs/src/interface.md index 239bef87..f6e3a9ea 100644 --- a/docs/src/interface.md +++ b/docs/src/interface.md @@ -10,7 +10,7 @@ Device functionality is then handled by [KernelAbstractions.jl](https://github.c You should provide an array type that builds on the `AbstractGPUArray` supertype, such as: -``` +```julia mutable struct CustomArray{T, N} <: AbstractGPUArray{T, N} data::DataRef{Vector{UInt8}} offset::Int @@ -23,10 +23,18 @@ end This will allow your defined type (in this case `JLArray`) to use the GPUArrays interface where available. 
To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you need to define the backend, like so: -``` +```julia import KernelAbstractions: Backend struct CustomBackend <: KernelAbstractions.GPU KernelAbstractions.get_backend(a::CA) where CA <: CustomArray = CustomBackend() ``` There are numerous examples of potential interfaces for GPUArrays, such as with [JLArrays](https://github.com/JuliaGPU/GPUArrays.jl/blob/master/lib/JLArrays/src/JLArrays.jl), [CuArrays](https://github.com/JuliaGPU/CUDA.jl/blob/master/src/gpuarrays.jl), and [ROCArrays](https://github.com/JuliaGPU/AMDGPU.jl/blob/master/src/gpuarrays.jl). + +## Caching Allocator + +```@docs +GPUArrays.@cache_scope +GPUArrays.@no_cache_scope +GPUArrays.invalidate_cache_allocator! +``` From 99a81710ac07a4d737640201a495865b7fd1568e Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Sat, 21 Dec 2024 11:58:49 +0200 Subject: [PATCH 08/28] Update docs & disable test for now --- src/host/allocations_cache.jl | 22 +++++++++++----------- test/runtests.jl | 3 --- test/testsuite.jl | 4 +++- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl index 90c5f5e6..4c75b32b 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -177,11 +177,11 @@ end Evaluate expression `expr` using `name`d caching allocator for the given KernelAbstractions `backend`. -When during execution of `expr` gpu allocation is requested, -allocator will try to find such allocation in "free" parts of cache, -marking them as "busy" and returning allocation to the user. -If no allocation is found in "free" part, an actual allocation is performed, -marking it as "busy" and returned to the user. +When gpu allocation is requested during execution of `expr`, +allocator will try to use its "free" cache instead of doing an actual allocation. +If no "free" allocation exists, an actual allocation is performed. 
+Before returning allocation to the user, it is marked as busy and +will not be used by allocation in the scope defined by `@cache_scope`. **After** the execution of `expr` all "busy" allocations are marked as "free" thus they can be re-used next time the program enters this scope. @@ -194,13 +194,13 @@ GC to free gpu memory in time. # Example -In following example we apply caching allocator at every iteration of the for-loop. -Every iteration requires 2 GiB of gpu memory, without caching allocator -GC wouldn't be able to free arrays in time resulting in higher memory usage. -With caching allocator, memory usage stays at exactly 2 GiB. - -After the loop, we free all cached memory if there's any. +In the following example, each iteration of the for-loop requires `2 GiB` +of gpu memory. +Without caching allocator GC wouldn't be able to free arrays in time +resulting in higher memory usage. +With caching allocator, memory usage stays at exactly `2 GiB`. +See [`@no_cache_scope`](@ref), [`invalidate_cache_allocator!`](@ref). ```julia kab = CUDABackend() n = 1024^3 diff --git a/test/runtests.jl b/test/runtests.jl index 5fe51aec..66d6a096 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -48,9 +48,6 @@ include("setup.jl") # make sure everything is precompiled const tests = [] const test_runners = Dict() for AT in (JLArray, Array), name in keys(TestSuite.tests) - # Disable for now. - name == "Caching Allocator" && continue - push!(tests, "$(AT)/$name") test_runners["$(AT)/$name"] = ()->TestSuite.tests[name](AT) end diff --git a/test/testsuite.jl b/test/testsuite.jl index 59bb967b..3ae5573e 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -93,7 +93,9 @@ include("testsuite/math.jl") include("testsuite/random.jl") include("testsuite/uniformscaling.jl") include("testsuite/statistics.jl") -include("testsuite/caching_allocator.jl") + +# TODO re-enable once backends support it. 
+# include("testsuite/caching_allocator.jl") """ Runs the entire GPUArrays test suite on array type `AT` From ad828dfb841a536c0e42363e7ab712e784602155 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Mon, 6 Jan 2025 21:22:02 +0200 Subject: [PATCH 09/28] Use array type instead of KA backend & allow arbitrary keys --- docs/src/interface.md | 6 +-- src/host/allocations_cache.jl | 75 +++++++++++++++-------------- test/testsuite.jl | 1 - test/testsuite/caching_allocator.jl | 24 +++++---- 4 files changed, 54 insertions(+), 52 deletions(-) diff --git a/docs/src/interface.md b/docs/src/interface.md index f6e3a9ea..dadb06c1 100644 --- a/docs/src/interface.md +++ b/docs/src/interface.md @@ -34,7 +34,7 @@ There are numerous examples of potential interfaces for GPUArrays, such as with ## Caching Allocator ```@docs -GPUArrays.@cache_scope -GPUArrays.@no_cache_scope -GPUArrays.invalidate_cache_allocator! +GPUArrays.AllocCache.@enable +GPUArrays.AllocCache.@disable +GPUArrays.AllocCache.invalidate! ``` diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl index 4c75b32b..22f3033f 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -1,3 +1,8 @@ +module AllocCache + +using ..GPUArrays +using KernelAbstractions + @static if VERSION < v"1.11" using ScopedValues else @@ -8,7 +13,7 @@ const CacheAllocatorName = ScopedValue(:none) struct CacheAllocator{T <: AbstractGPUArray} lock::ReentrantLock - busy::Dict{UInt64, Vector{T}} # hash((T, dims)) => GPUArray[] + busy::Dict{UInt64, Vector{T}} # hash(key) => GPUArray[] free::Dict{UInt64, Vector{T}} end @@ -28,10 +33,9 @@ function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T end """ - alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N} + alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool) -Attempt to retrieve cached allocation from `cache` using eltype `T` and `dims` -as keys for searching. 
+Attempt to retrieve cached allocation from `cache` using `key` for searching. If no such allocation is found, execute `alloc_f` that does actual allocation, store it in cache for future use and return it. @@ -40,9 +44,9 @@ When `true` arrays are bulk-freed instead of stored in cache. In this case `alloc!` will avoid looking into "free" part of `cache` and execute `alloc_f` immediately, storing allocation for future bulk-freeing. """ -function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N} +function alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool) x = nothing - uid = hash((T, dims)) + uid = hash(key) busy_pool = get_pool!(cache, :busy, uid) if skip_free @@ -54,7 +58,7 @@ function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_f while !isempty(free_pool) && x ≡ nothing tmp = Base.@lock cache.lock pop!(free_pool) # Array was manually freed via `unsafe_free!`. - storage(tmp).freed && continue + GPUArrays.storage(tmp).freed && continue x = tmp end end @@ -109,10 +113,10 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam return named_cache end -function alloc!(alloc_f, kab::Backend, name::Symbol, ::Type{T}, dims::Dims{N}) where {T, N} - pdcache = cache_allocator(kab) - cache = named_cache_allocator!(pdcache, device(kab), name) - alloc!(alloc_f, cache, T, dims; skip_free=pdcache.free_immediately) +function alloc!(alloc_f, AT::Type{<: AbstractGPUArray}, name::Symbol, key) + pdcache = cache_allocator(AT) + cache = named_cache_allocator!(pdcache, device(AT), name) + alloc!(alloc_f, cache, key; skip_free=pdcache.free_immediately) end function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) @@ -137,14 +141,14 @@ function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) end """ - invalidate_cache_allocator!(kab::Backend, name::Symbol) + invalidate!(AT::Type{AbstractGPUArray}, name::Symbol) -Free all memory held by `name`d 
cached allocator given KernelAbstractions `backend`. +Free all memory held by `name`d cached allocator given array type `AT`. """ -invalidate_cache_allocator!(kab::Backend, name::Symbol) = - invalidate_cache_allocator!(cache_allocator(kab), device(kab), name) +invalidate!(AT::Type{<: AbstractGPUArray}, name::Symbol) = + invalidate!(cache_allocator(AT), device(AT), name) -function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, name::Symbol) +function invalidate!(pdcache::PerDeviceCacheAllocator, device, name::Symbol) h = hash(device) dev_cache = get(pdcache.caches, h, nothing) dev_cache ≡ nothing && return @@ -166,16 +170,16 @@ function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, n return end -function free_busy!(kab::Backend, name::Symbol) - pdcache = cache_allocator(kab) - free_busy!(named_cache_allocator!(pdcache, device(kab), name); pdcache.free_immediately) +function free_busy!(AT::Type{<: AbstractGPUArray}, name::Symbol) + pdcache = cache_allocator(AT) + free_busy!(named_cache_allocator!(pdcache, device(AT), name); pdcache.free_immediately) end """ - @cache_scope backend name expr + @enable AT name expr Evaluate expression `expr` using `name`d caching allocator -for the given KernelAbstractions `backend`. +for the given array type `AT`. When gpu allocation is requested during execution of `expr`, allocator will try to use its "free" cache instead of doing an actual allocation. @@ -202,31 +206,30 @@ With caching allocator, memory usage stays at exactly `2 GiB`. See [`@no_cache_scope`](@ref), [`invalidate_cache_allocator!`](@ref). 
```julia -kab = CUDABackend() n = 1024^3 for i in 1:1000 - @cache_scope kab :loop begin + CUDA.AllocCache.@enable CuArray :loop begin sin.(CUDA.rand(Float32, n)) end end -invalidate_cache_allocator!(kab, :loop) +CUDA.AllocCache.invalidate!(CuArray, :loop) ``` """ -macro cache_scope(backend, name, expr) +macro enable(AT, name, expr) quote res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr)) - free_busy!($(esc(backend)), $(esc(name))) + free_busy!($(esc(AT)), $(esc(name))) res end end """ - @no_cache_scope expr + @disable expr Evaluate expression `expr` without using caching allocator. -This is useful to call from within `@cache_scope` to avoid caching arrays. +This is useful to call from within `@enable` to avoid caching arrays. """ -macro no_cache_scope(expr) +macro disable(expr) quote @with $(esc(CacheAllocatorName)) => :none $(esc(expr)) end @@ -235,17 +238,19 @@ end # Interface API. """ - cache_allocator(::Backend) + cache_allocator(::Type{AbstractGPUArray}) -Given KernelAbstractions `backend`, return corresponding `PerDeviceCacheAllocator` for it. +Given array type, return corresponding `PerDeviceCacheAllocator` for it. Each GPU backend must implement this. """ -cache_allocator(::Backend) = error("Not implemented.") +cache_allocator(::Type{AbstractGPUArray}) = error("Not implemented.") """ - device(::Backend) + device(::Type{AbstractGPUArray}) -Given KernelAbstractions `backend`, return current device. +Given array type, return current device. Each GPU backend must implement this. """ -device(::Backend) = error("Not implemented.") +device(::Type{AbstractGPUArray}) = error("Not implemented.") + +end diff --git a/test/testsuite.jl b/test/testsuite.jl index 3ae5573e..c235b756 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -93,7 +93,6 @@ include("testsuite/math.jl") include("testsuite/random.jl") include("testsuite/uniformscaling.jl") include("testsuite/statistics.jl") - # TODO re-enable once backends support it. 
# include("testsuite/caching_allocator.jl") diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl index 4f63023a..7118e192 100644 --- a/test/testsuite/caching_allocator.jl +++ b/test/testsuite/caching_allocator.jl @@ -1,18 +1,16 @@ @testsuite "Caching Allocator" (AT, eltypes) -> begin - # Hacky way to get KA backend from AT. - kab = KernelAbstractions.get_backend(AT(Array{Int}(undef, 0))) - device = GPUArrays.device(kab) + device = GPUArrays.AllocCache.device(AT) @testset "free_immediately=false" begin - pdcache = GPUArrays.cache_allocator(kab) + pdcache = GPUArrays.AllocCache.cache_allocator(AT) pdcache.free_immediately = false - named_cache = GPUArrays.named_cache_allocator!(pdcache, device, :cache) + named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache) T = Float32 dims = (1, 2, 3) key = hash((T, dims)) - GPUArrays.@cache_scope kab :cache begin + GPUArrays.AllocCache.@enable AT :cache begin x1 = AT(zeros(T, dims)) end @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) @@ -22,11 +20,11 @@ # Second allocation does not allocate - cache stays the same in size. - GPUArrays.@cache_scope kab :cache begin + GPUArrays.AllocCache.@enable AT :cache begin x2 = AT(zeros(T, dims)) # Does not go to cache. - GPUArrays.@no_cache_scope begin + GPUArrays.AllocCache.@disable begin x_free = AT(zeros(T, dims)) end end @@ -41,7 +39,7 @@ T2 = Int32 key2 = hash((T2, dims)) - GPUArrays.@cache_scope kab :cache begin + GPUArrays.AllocCache.@enable AT :cache begin x3 = AT(zeros(T2, dims)) end @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims) @@ -51,14 +49,14 @@ # Freeing all memory held by cache. 
- GPUArrays.invalidate_cache_allocator!(kab, :cache) + GPUArrays.AllocCache.invalidate!(AT, :cache) @test sizeof(pdcache, device, :cache) == 0 end @testset "free_immediately=true" begin - pdcache = GPUArrays.cache_allocator(kab) + pdcache = GPUArrays.AllocCache.cache_allocator(AT) pdcache.free_immediately = true - named_cache = GPUArrays.named_cache_allocator!(pdcache, device, :cache2) + named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache2) T = Float32 dims = (1, 2, 3) @@ -66,7 +64,7 @@ @test sizeof(pdcache, device, :cache2) == 0 - GPUArrays.@cache_scope kab :cache2 begin + GPUArrays.AllocCache.@enable AT :cache2 begin x1 = AT(zeros(T, dims)) @test !haskey(named_cache.free, key) From e601f17b5f9b0560b75a99bec55a69371b2183fd Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Mon, 6 Jan 2025 21:28:14 +0200 Subject: [PATCH 10/28] Minor cleanups --- lib/JLArrays/src/JLArrays.jl | 12 ++++++------ src/host/allocations_cache.jl | 16 +++++++--------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index d717e9a3..8f79a1e3 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -98,10 +98,10 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} finalizer(unsafe_free!, obj) end - name = GPUArrays.CacheAllocatorName[] - return name == :none ? + name = GPUArrays.AllocCache.CacheAllocatorName[] + return name ≡ nothing ? _alloc_f() : - GPUArrays.alloc!(_alloc_f, JLBackend(), name, T, dims)::JLArray{T, N} + GPUArrays.AllocCache.alloc!(_alloc_f, JLArray, name, (T, dims))::JLArray{T, N} end # low-level constructor for wrapping existing data @@ -397,10 +397,10 @@ Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Arr # Caching Allocator. 
-const JLACacheAllocator = GPUArrays.PerDeviceCacheAllocator(JLArray; free_immediately=false) +const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray; free_immediately=false) -GPUArrays.cache_allocator(::JLBackend) = JLACacheAllocator +GPUArrays.AllocCache.cache_allocator(::Type{<: JLArray}) = JLACacheAllocator -GPUArrays.device(::JLBackend) = 1 +GPUArrays.AllocCache.device(::Type{<: JLArray}) = 1 end diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl index 22f3033f..022d2aaa 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -9,7 +9,7 @@ else using Base.ScopedValues end -const CacheAllocatorName = ScopedValue(:none) +const CacheAllocatorName = ScopedValue{Union{Nothing, Symbol}}(nothing) struct CacheAllocator{T <: AbstractGPUArray} lock::ReentrantLock @@ -178,14 +178,13 @@ end """ @enable AT name expr -Evaluate expression `expr` using `name`d caching allocator -for the given array type `AT`. +Evaluate expression `expr` using `name`d caching allocator for the given array type `AT`. When gpu allocation is requested during execution of `expr`, allocator will try to use its "free" cache instead of doing an actual allocation. If no "free" allocation exists, an actual allocation is performed. Before returning allocation to the user, it is marked as busy and -will not be used by allocation in the scope defined by `@cache_scope`. +will not be used by allocation in the scope defined by `@enable`. **After** the execution of `expr` all "busy" allocations are marked as "free" thus they can be re-used next time the program enters this scope. @@ -194,17 +193,16 @@ This is useful to apply in a repeating block of code to avoid relying on GC to free gpu memory in time. `name` is a `Symbol` that defines which allocator to use -(`:none` is reserved and means no allocator). +(`nothing`, which is a default, disables it). 
# Example -In the following example, each iteration of the for-loop requires `2 GiB` -of gpu memory. +In the following example, each iteration of the for-loop requires `2 GiB` of gpu memory. Without caching allocator GC wouldn't be able to free arrays in time resulting in higher memory usage. With caching allocator, memory usage stays at exactly `2 GiB`. -See [`@no_cache_scope`](@ref), [`invalidate_cache_allocator!`](@ref). +See [`@disable`](@ref), [`invalidate!`](@ref). ```julia n = 1024^3 for i in 1:1000 @@ -231,7 +229,7 @@ This is useful to call from within `@enable` to avoid caching arrays. """ macro disable(expr) quote - @with $(esc(CacheAllocatorName)) => :none $(esc(expr)) + @with $(esc(CacheAllocatorName)) => nothing $(esc(expr)) end end From ba1941a7248afe5235b49d5bcca2df3c41d1f1dc Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 7 Jan 2025 13:47:20 +0200 Subject: [PATCH 11/28] Remove 'free_immediately' param --- lib/JLArrays/src/JLArrays.jl | 2 +- src/host/allocations_cache.jl | 49 +++++-------- test/testsuite.jl | 3 +- test/testsuite/caching_allocator.jl | 108 +++++++++++----------------- 4 files changed, 61 insertions(+), 101 deletions(-) diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 8f79a1e3..bf652325 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -397,7 +397,7 @@ Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Arr # Caching Allocator. 
-const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray; free_immediately=false) +const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray) GPUArrays.AllocCache.cache_allocator(::Type{<: JLArray}) = JLACacheAllocator diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl index 022d2aaa..16f53299 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -33,34 +33,24 @@ function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T end """ - alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool) + alloc!(alloc_f, cache::CacheAllocator, key) Attempt to retrieve cached allocation from `cache` using `key` for searching. If no such allocation is found, execute `alloc_f` that does actual allocation, store it in cache for future use and return it. - -`skip_free::Bool` is used together with `PerDeviceCacheAllocator.free_immediately`. -When `true` arrays are bulk-freed instead of stored in cache. -In this case `alloc!` will avoid looking into "free" part of `cache` -and execute `alloc_f` immediately, storing allocation for future bulk-freeing. """ -function alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool) +function alloc!(alloc_f, cache::CacheAllocator, key) x = nothing uid = hash(key) busy_pool = get_pool!(cache, :busy, uid) - - if skip_free - x = alloc_f() - else - free_pool = get_pool!(cache, :free, uid) - isempty(free_pool) && (x = alloc_f()) - - while !isempty(free_pool) && x ≡ nothing - tmp = Base.@lock cache.lock pop!(free_pool) - # Array was manually freed via `unsafe_free!`. - GPUArrays.storage(tmp).freed && continue - x = tmp - end + free_pool = get_pool!(cache, :free, uid) + isempty(free_pool) && (x = alloc_f()) + + while !isempty(free_pool) && x ≡ nothing + tmp = Base.@lock cache.lock pop!(free_pool) + # Array was manually freed via `unsafe_free!`. 
+ GPUArrays.storage(tmp).freed && continue + x = tmp end x ≡ nothing && (x = alloc_f()) @@ -68,18 +58,14 @@ function alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool) return x end -function free_busy!(cache::CacheAllocator; free_immediately::Bool) +function free_busy!(cache::CacheAllocator) for uid in cache.busy.keys busy_pool = get_pool!(cache, :busy, uid) isempty(busy_pool) && continue Base.@lock cache.lock begin - if free_immediately - map(unsafe_free!, busy_pool) - else - free_pool = get_pool!(cache, :free, uid) - append!(free_pool, busy_pool) - end + free_pool = get_pool!(cache, :free, uid) + append!(free_pool, busy_pool) empty!(busy_pool) end end @@ -88,11 +74,10 @@ end mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray} lock::ReentrantLock caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}} - free_immediately::Bool end -PerDeviceCacheAllocator(::Type{T}; free_immediately::Bool) where T <: AbstractGPUArray = - PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}(), free_immediately) +PerDeviceCacheAllocator(::Type{T}) where T <: AbstractGPUArray = + PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}()) function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where T h = hash(device) @@ -116,7 +101,7 @@ end function alloc!(alloc_f, AT::Type{<: AbstractGPUArray}, name::Symbol, key) pdcache = cache_allocator(AT) cache = named_cache_allocator!(pdcache, device(AT), name) - alloc!(alloc_f, cache, key; skip_free=pdcache.free_immediately) + alloc!(alloc_f, cache, key) end function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) @@ -172,7 +157,7 @@ end function free_busy!(AT::Type{<: AbstractGPUArray}, name::Symbol) pdcache = cache_allocator(AT) - free_busy!(named_cache_allocator!(pdcache, device(AT), name); pdcache.free_immediately) + free_busy!(named_cache_allocator!(pdcache, device(AT), name)) end """ diff --git 
a/test/testsuite.jl b/test/testsuite.jl index c235b756..59bb967b 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -93,8 +93,7 @@ include("testsuite/math.jl") include("testsuite/random.jl") include("testsuite/uniformscaling.jl") include("testsuite/statistics.jl") -# TODO re-enable once backends support it. -# include("testsuite/caching_allocator.jl") +include("testsuite/caching_allocator.jl") """ Runs the entire GPUArrays test suite on array type `AT` diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl index 7118e192..9d229186 100644 --- a/test/testsuite/caching_allocator.jl +++ b/test/testsuite/caching_allocator.jl @@ -1,80 +1,56 @@ @testsuite "Caching Allocator" (AT, eltypes) -> begin device = GPUArrays.AllocCache.device(AT) + pdcache = GPUArrays.AllocCache.cache_allocator(AT) + named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache) - @testset "free_immediately=false" begin - pdcache = GPUArrays.AllocCache.cache_allocator(AT) - pdcache.free_immediately = false - named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache) + T = Float32 + dims = (1, 2, 3) - T = Float32 - dims = (1, 2, 3) - key = hash((T, dims)) + GPUArrays.AllocCache.@enable AT :cache begin + x1 = AT(zeros(T, dims)) + end + @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) + @test length(named_cache.free) == 1 - GPUArrays.AllocCache.@enable AT :cache begin - x1 = AT(zeros(T, dims)) - end - @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) - @test length(named_cache.free[key]) == 1 - @test length(named_cache.busy[key]) == 0 - @test x1 === named_cache.free[key][1] + key = first(keys(named_cache.free)) + @test length(named_cache.free[key]) == 1 + @test length(named_cache.busy[key]) == 0 + @test x1 === named_cache.free[key][1] - # Second allocation does not allocate - cache stays the same in size. + # Second allocation does not allocate - cache stays the same in size. 
- GPUArrays.AllocCache.@enable AT :cache begin - x2 = AT(zeros(T, dims)) + GPUArrays.AllocCache.@enable AT :cache begin + x2 = AT(zeros(T, dims)) - # Does not go to cache. - GPUArrays.AllocCache.@disable begin - x_free = AT(zeros(T, dims)) - end + # Does not go to cache. + GPUArrays.AllocCache.@disable begin + x_free = AT(zeros(T, dims)) end - @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) - @test length(named_cache.free[key]) == 1 - @test length(named_cache.busy[key]) == 0 - @test x2 === x1 - @test x2 === named_cache.free[key][1] - @test x_free !== x2 - - # Third allocation of different type - cache grows. - - T2 = Int32 - key2 = hash((T2, dims)) - GPUArrays.AllocCache.@enable AT :cache begin - x3 = AT(zeros(T2, dims)) - end - @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims) - @test length(named_cache.free[key]) == 1 - @test length(named_cache.free[key2]) == 1 - @test x3 === named_cache.free[key2][1] - - # Freeing all memory held by cache. - - GPUArrays.AllocCache.invalidate!(AT, :cache) - @test sizeof(pdcache, device, :cache) == 0 end + @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) + @test length(named_cache.free[key]) == 1 + @test length(named_cache.busy[key]) == 0 + @test x2 === x1 + @test x2 === named_cache.free[key][1] + @test x_free !== x2 + + # Third allocation of different type - cache grows. 
+ + T2 = Int32 + key2 = hash((T2, dims)) + GPUArrays.AllocCache.@enable AT :cache begin + x3 = AT(zeros(T2, dims)) + end + @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims) - @testset "free_immediately=true" begin - pdcache = GPUArrays.AllocCache.cache_allocator(AT) - pdcache.free_immediately = true - named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache2) - - T = Float32 - dims = (1, 2, 3) - key = hash((T, dims)) - - @test sizeof(pdcache, device, :cache2) == 0 - - GPUArrays.AllocCache.@enable AT :cache2 begin - x1 = AT(zeros(T, dims)) + _keys = collect(keys(named_cache.free)) + key2 = _keys[findfirst(i -> i != key, _keys)] + @test length(named_cache.free[key]) == 1 + @test length(named_cache.free[key2]) == 1 + @test x3 === named_cache.free[key2][1] - @test !haskey(named_cache.free, key) - @test length(named_cache.busy[key]) == 1 - @test sizeof(pdcache, device, :cache2) == sizeof(Float32) * prod(dims) - end + # Freeing all memory held by cache. - # `free` was never even used with `free_immediately=true`. 
- @test !haskey(named_cache.free, key) - @test length(named_cache.busy[key]) == 0 - @test sizeof(pdcache, device, :cache2) == 0 - end + GPUArrays.AllocCache.invalidate!(AT, :cache) + @test sizeof(pdcache, device, :cache) == 0 end From 166254f2e6bd336b59d788f8b0a077dde22dc863 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 7 Jan 2025 14:14:28 +0200 Subject: [PATCH 12/28] Limit caching allocator tests to AbstractGPUArray --- test/testsuite/caching_allocator.jl | 90 +++++++++++++++-------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl index 9d229186..e499ff38 100644 --- a/test/testsuite/caching_allocator.jl +++ b/test/testsuite/caching_allocator.jl @@ -1,56 +1,58 @@ @testsuite "Caching Allocator" (AT, eltypes) -> begin - device = GPUArrays.AllocCache.device(AT) - pdcache = GPUArrays.AllocCache.cache_allocator(AT) - named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache) + if AT <: AbstractGPUArray + device = GPUArrays.AllocCache.device(AT) + pdcache = GPUArrays.AllocCache.cache_allocator(AT) + named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache) - T = Float32 - dims = (1, 2, 3) + T = Float32 + dims = (1, 2, 3) - GPUArrays.AllocCache.@enable AT :cache begin - x1 = AT(zeros(T, dims)) - end - @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) - @test length(named_cache.free) == 1 + GPUArrays.AllocCache.@enable AT :cache begin + x1 = AT(zeros(T, dims)) + end + @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) + @test length(named_cache.free) == 1 - key = first(keys(named_cache.free)) - @test length(named_cache.free[key]) == 1 - @test length(named_cache.busy[key]) == 0 - @test x1 === named_cache.free[key][1] + key = first(keys(named_cache.free)) + @test length(named_cache.free[key]) == 1 + @test length(named_cache.busy[key]) == 0 + @test x1 === named_cache.free[key][1] - # 
Second allocation does not allocate - cache stays the same in size. + # Second allocation does not allocate - cache stays the same in size. - GPUArrays.AllocCache.@enable AT :cache begin - x2 = AT(zeros(T, dims)) + GPUArrays.AllocCache.@enable AT :cache begin + x2 = AT(zeros(T, dims)) - # Does not go to cache. - GPUArrays.AllocCache.@disable begin - x_free = AT(zeros(T, dims)) + # Does not go to cache. + GPUArrays.AllocCache.@disable begin + x_free = AT(zeros(T, dims)) + end end - end - @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) - @test length(named_cache.free[key]) == 1 - @test length(named_cache.busy[key]) == 0 - @test x2 === x1 - @test x2 === named_cache.free[key][1] - @test x_free !== x2 - - # Third allocation of different type - cache grows. - - T2 = Int32 - key2 = hash((T2, dims)) - GPUArrays.AllocCache.@enable AT :cache begin - x3 = AT(zeros(T2, dims)) - end - @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims) + @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) + @test length(named_cache.free[key]) == 1 + @test length(named_cache.busy[key]) == 0 + @test x2 === x1 + @test x2 === named_cache.free[key][1] + @test x_free !== x2 + + # Third allocation of different type - cache grows. + + T2 = Int32 + key2 = hash((T2, dims)) + GPUArrays.AllocCache.@enable AT :cache begin + x3 = AT(zeros(T2, dims)) + end + @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims) - _keys = collect(keys(named_cache.free)) - key2 = _keys[findfirst(i -> i != key, _keys)] - @test length(named_cache.free[key]) == 1 - @test length(named_cache.free[key2]) == 1 - @test x3 === named_cache.free[key2][1] + _keys = collect(keys(named_cache.free)) + key2 = _keys[findfirst(i -> i != key, _keys)] + @test length(named_cache.free[key]) == 1 + @test length(named_cache.free[key2]) == 1 + @test x3 === named_cache.free[key2][1] - # Freeing all memory held by cache. 
+ # Freeing all memory held by cache. - GPUArrays.AllocCache.invalidate!(AT, :cache) - @test sizeof(pdcache, device, :cache) == 0 + GPUArrays.AllocCache.invalidate!(AT, :cache) + @test sizeof(pdcache, device, :cache) == 0 + end end From 01d6abc349c84c81be05444797432c2d2f2a5c2f Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 7 Jan 2025 14:41:19 +0200 Subject: [PATCH 13/28] Fix tests for 1.10 --- test/testsuite/caching_allocator.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl index e499ff38..a62d9fdf 100644 --- a/test/testsuite/caching_allocator.jl +++ b/test/testsuite/caching_allocator.jl @@ -7,8 +7,8 @@ T = Float32 dims = (1, 2, 3) - GPUArrays.AllocCache.@enable AT :cache begin - x1 = AT(zeros(T, dims)) + x1 = GPUArrays.AllocCache.@enable AT :cache begin + AT(zeros(T, dims)) end @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) @test length(named_cache.free) == 1 @@ -20,13 +20,14 @@ # Second allocation does not allocate - cache stays the same in size. - GPUArrays.AllocCache.@enable AT :cache begin + x2, x_free = GPUArrays.AllocCache.@enable AT :cache begin x2 = AT(zeros(T, dims)) # Does not go to cache. GPUArrays.AllocCache.@disable begin x_free = AT(zeros(T, dims)) end + x2, x_free end @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) @test length(named_cache.free[key]) == 1 @@ -39,8 +40,8 @@ T2 = Int32 key2 = hash((T2, dims)) - GPUArrays.AllocCache.@enable AT :cache begin - x3 = AT(zeros(T2, dims)) + x3 = GPUArrays.AllocCache.@enable AT :cache begin + AT(zeros(T2, dims)) end @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims) From 41bb06dc14fc7a53ca48da16e9744aa2837f4cff Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 8 Jan 2025 14:58:07 +0100 Subject: [PATCH 14/28] Runic formatting. 
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/make.jl | 2 +- lib/JLArrays/src/JLArrays.jl | 8 ++++---- src/host/allocations_cache.jl | 27 ++++++++++++++------------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index b8ca1f92..a37b0cd9 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,7 +20,7 @@ function main() "Test suite" => "testsuite.md", ], doctest = true, - warnonly=[:missing_docs], + warnonly = [:missing_docs], ) deploydocs( diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index bf652325..38a07476 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -94,8 +94,8 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} ref = DataRef(data) do data resize!(data, 0) end - obj = new{T,N}(ref, 0, dims) - finalizer(unsafe_free!, obj) + obj = new{T, N}(ref, 0, dims) + return finalizer(unsafe_free!, obj) end name = GPUArrays.AllocCache.CacheAllocatorName[] @@ -399,8 +399,8 @@ Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Arr const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray) -GPUArrays.AllocCache.cache_allocator(::Type{<: JLArray}) = JLACacheAllocator +GPUArrays.AllocCache.cache_allocator(::Type{<:JLArray}) = JLACacheAllocator -GPUArrays.AllocCache.device(::Type{<: JLArray}) = 1 +GPUArrays.AllocCache.device(::Type{<:JLArray}) = 1 end diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl index 16f53299..d3c590d2 100644 --- a/src/host/allocations_cache.jl +++ b/src/host/allocations_cache.jl @@ -17,13 +17,13 @@ struct CacheAllocator{T <: AbstractGPUArray} free::Dict{UInt64, Vector{T}} end -CacheAllocator(::Type{T}) where T = CacheAllocator( +CacheAllocator(::Type{T}) where {T} = CacheAllocator( ReentrantLock(), Dict{UInt64, Vector{T}}(), Dict{UInt64, Vector{T}}(), ) -function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) 
where T +function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where {T} pool = getproperty(cache, pool) uid_pool = get(pool, uid, nothing) if uid_pool ≡ nothing @@ -69,6 +69,7 @@ function free_busy!(cache::CacheAllocator) empty!(busy_pool) end end + return end mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray} @@ -76,10 +77,10 @@ mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray} caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}} end -PerDeviceCacheAllocator(::Type{T}) where T <: AbstractGPUArray = +PerDeviceCacheAllocator(::Type{T}) where {T <: AbstractGPUArray} = PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}()) -function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where T +function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where {T} h = hash(device) dev_cache = get(pdcache.caches, h, nothing) if dev_cache ≡ nothing @@ -98,10 +99,10 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam return named_cache end -function alloc!(alloc_f, AT::Type{<: AbstractGPUArray}, name::Symbol, key) +function alloc!(alloc_f, AT::Type{<:AbstractGPUArray}, name::Symbol, key) pdcache = cache_allocator(AT) cache = named_cache_allocator!(pdcache, device(AT), name) - alloc!(alloc_f, cache, key) + return alloc!(alloc_f, cache, key) end function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) @@ -116,10 +117,10 @@ function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) Base.@lock named_cache.lock begin for (_, pool) in named_cache.free - sz += sum(sizeof, pool; init=UInt64(0)) + sz += sum(sizeof, pool; init = UInt64(0)) end for (_, pool) in named_cache.busy - sz += sum(sizeof, pool; init=UInt64(0)) + sz += sum(sizeof, pool; init = UInt64(0)) end end return sz @@ -130,7 +131,7 @@ end Free all memory held by `name`d cached allocator given array type `AT`. 
""" -invalidate!(AT::Type{<: AbstractGPUArray}, name::Symbol) = +invalidate!(AT::Type{<:AbstractGPUArray}, name::Symbol) = invalidate!(cache_allocator(AT), device(AT), name) function invalidate!(pdcache::PerDeviceCacheAllocator, device, name::Symbol) @@ -155,9 +156,9 @@ function invalidate!(pdcache::PerDeviceCacheAllocator, device, name::Symbol) return end -function free_busy!(AT::Type{<: AbstractGPUArray}, name::Symbol) +function free_busy!(AT::Type{<:AbstractGPUArray}, name::Symbol) pdcache = cache_allocator(AT) - free_busy!(named_cache_allocator!(pdcache, device(AT), name)) + return free_busy!(named_cache_allocator!(pdcache, device(AT), name)) end """ @@ -199,7 +200,7 @@ CUDA.AllocCache.invalidate!(CuArray, :loop) ``` """ macro enable(AT, name, expr) - quote + return quote res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr)) free_busy!($(esc(AT)), $(esc(name))) res @@ -213,7 +214,7 @@ Evaluate expression `expr` without using caching allocator. This is useful to call from within `@enable` to avoid caching arrays. 
""" macro disable(expr) - quote + return quote @with $(esc(CacheAllocatorName)) => nothing $(esc(expr)) end end From b2df4c56e5f297e1244952fd6e61867a32e9ffc1 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Wed, 8 Jan 2025 23:43:52 +0200 Subject: [PATCH 15/28] Simplify --- lib/JLArrays/src/JLArrays.jl | 10 +- src/GPUArrays.jl | 2 +- src/host/alloc_cache.jl | 149 +++++++++++++++++ src/host/allocations_cache.jl | 240 ---------------------------- test/testsuite/caching_allocator.jl | 76 ++++----- 5 files changed, 186 insertions(+), 291 deletions(-) create mode 100644 src/host/alloc_cache.jl delete mode 100644 src/host/allocations_cache.jl diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 38a07476..d5b4e9f3 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -98,10 +98,12 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} return finalizer(unsafe_free!, obj) end - name = GPUArrays.AllocCache.CacheAllocatorName[] - return name ≡ nothing ? 
- _alloc_f() : - GPUArrays.AllocCache.alloc!(_alloc_f, JLArray, name, (T, dims))::JLArray{T, N} + cache = GPUArrays.ALLOC_CACHE[] + return if cache ≡ nothing + _alloc_f() + else + GPUArrays.alloc!(_alloc_f, cache, (JLArray, T, dims))::JLArray{T, N} + end end # low-level constructor for wrapping existing data diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl index 948ff068..8c1fc14e 100644 --- a/src/GPUArrays.jl +++ b/src/GPUArrays.jl @@ -34,7 +34,7 @@ include("host/random.jl") include("host/quirks.jl") include("host/uniformscaling.jl") include("host/statistics.jl") -include("host/allocations_cache.jl") +include("host/alloc_cache.jl") end # module diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl new file mode 100644 index 00000000..3c5ecb51 --- /dev/null +++ b/src/host/alloc_cache.jl @@ -0,0 +1,149 @@ +using ..GPUArrays + +@static if VERSION < v"1.11" + using ScopedValues +else + using Base.ScopedValues +end + +mutable struct AllocCache{T <: AbstractGPUArray} + lock::ReentrantLock + busy::Dict{UInt64, Vector{T}} # hash(key) => GPUArray[] + free::Dict{UInt64, Vector{T}} + + function AllocCache(::Type{T}) where {T <: AbstractGPUArray} + cache = new{T}(ReentrantLock(), + Dict{UInt64, Vector{T}}(), + Dict{UInt64, Vector{T}}()) + finalizer(unsafe_free!, cache) + end +end + +function get_pool!(cache::AllocCache{T}, pool::Symbol, uid::UInt64) where {T <: AbstractGPUArray} + pool = getproperty(cache, pool) + uid_pool = get(pool, uid, nothing) + if uid_pool ≡ nothing + uid_pool = Base.@lock cache.lock pool[uid] = T[] + end + return uid_pool +end + +function alloc!(alloc_f, cache::AllocCache, key) + x = nothing + uid = hash(key) + + busy_pool = get_pool!(cache, :busy, uid) + free_pool = get_pool!(cache, :free, uid) + isempty(free_pool) && (x = alloc_f()) + + while !isempty(free_pool) && x ≡ nothing + tmp = Base.@lock cache.lock pop!(free_pool) + # Array was manually freed via `unsafe_free!`. 
+ GPUArrays.storage(tmp).freed && continue + x = tmp + end + + x ≡ nothing && (x = alloc_f()) + Base.@lock cache.lock push!(busy_pool, x) + return x +end + +function free_busy!(cache::AllocCache) + for uid in cache.busy.keys + busy_pool = get_pool!(cache, :busy, uid) + isempty(busy_pool) && continue + + Base.@lock cache.lock begin + free_pool = get_pool!(cache, :free, uid) + append!(free_pool, busy_pool) + empty!(busy_pool) + end + end + return +end + +function unsafe_free!(cache::AllocCache) + Base.@lock cache.lock begin + for (_, pool) in cache.busy + isempty(pool) || error( + "Invalidating allocations cache that's currently in use. " * + "Invalidating inside `@enable` is not allowed.") + end + for (_, pool) in cache.free + map(unsafe_free!, pool) + end + empty!(cache.free) + end + return +end + +function Base.sizeof(cache::AllocCache) + sz = UInt64(0) + Base.@lock cache.lock begin + for kind in (cache.free, cache.busy), (_, pool) in kind + sz += sum(sizeof, pool; init=UInt64(0)) + end + end + return sz +end + +const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing) + +""" + @enable(cache, expr) + +Evaluate expression `expr` using allocations cache `cache`. + +When gpu allocation is requested during execution of `expr`, +it will first check if there's "free" cache instead of performing an actual allocation. +If no "free" allocation exists, an actual allocation is performed. +Before returning allocation to the user, it is marked as busy and +will not be used by allocation in the scope defined by `@enable`. + +**After** the execution of `expr` all "busy" allocations are marked as "free" +thus they can be re-used next time the program enters this scope. + +This is useful to apply in a repeating block of code to avoid relying on +GC to free gpu memory in time. + +# Example + +In the following example, each iteration of the for-loop requires `8 GiB` of gpu memory. 
+Without caching allocator GC wouldn't be able to free arrays in time +resulting in higher memory usage. +With caching allocator, memory usage stays at exactly `8 GiB`. + +```julia +cache = GPUArrays.AllocCache(CuArray) +n = 1024^3 +for i in 1:1000 + GPUArrays.@enable cache begin + sin.(CUDA.rand(Float32, n)) + end +end +# To free immediately. +# Otherwise, it will be freed when collected by GC. +GPUArrays.unsafe_free!(cache) +``` + +See [`@disable`](@ref). +""" +macro enable(cache, expr) + return quote + res = @with $(esc(ALLOC_CACHE)) => $(esc(cache)) $(esc(expr)) + free_busy!($(esc(cache))) + res + end +end + +""" + disable(expr) + +Evaluate expression `expr` without using allocations cache. +This is useful to call from within `@enable` to avoid caching some allocations. +""" +macro disable(expr) + return quote + @with $(esc(ALLOC_CACHE)) => nothing $(esc(expr)) + end +end diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl deleted file mode 100644 index d3c590d2..00000000 --- a/src/host/allocations_cache.jl +++ /dev/null @@ -1,240 +0,0 @@ -module AllocCache - -using ..GPUArrays -using KernelAbstractions - -@static if VERSION < v"1.11" - using ScopedValues -else - using Base.ScopedValues -end - -const CacheAllocatorName = ScopedValue{Union{Nothing, Symbol}}(nothing) - -struct CacheAllocator{T <: AbstractGPUArray} - lock::ReentrantLock - busy::Dict{UInt64, Vector{T}} # hash(key) => GPUArray[] - free::Dict{UInt64, Vector{T}} -end - -CacheAllocator(::Type{T}) where {T} = CacheAllocator( - ReentrantLock(), - Dict{UInt64, Vector{T}}(), - Dict{UInt64, Vector{T}}(), -) - -function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where {T} - pool = getproperty(cache, pool) - uid_pool = get(pool, uid, nothing) - if uid_pool ≡ nothing - uid_pool = Base.@lock cache.lock pool[uid] = T[] - end - return uid_pool -end - -""" - alloc!(alloc_f, cache::CacheAllocator, key) - -Attempt to retrieve cached allocation from `cache` using `key` for 
searching. -If no such allocation is found, execute `alloc_f` that does actual allocation, -store it in cache for future use and return it. -""" -function alloc!(alloc_f, cache::CacheAllocator, key) - x = nothing - uid = hash(key) - busy_pool = get_pool!(cache, :busy, uid) - free_pool = get_pool!(cache, :free, uid) - isempty(free_pool) && (x = alloc_f()) - - while !isempty(free_pool) && x ≡ nothing - tmp = Base.@lock cache.lock pop!(free_pool) - # Array was manually freed via `unsafe_free!`. - GPUArrays.storage(tmp).freed && continue - x = tmp - end - - x ≡ nothing && (x = alloc_f()) - Base.@lock cache.lock push!(busy_pool, x) - return x -end - -function free_busy!(cache::CacheAllocator) - for uid in cache.busy.keys - busy_pool = get_pool!(cache, :busy, uid) - isempty(busy_pool) && continue - - Base.@lock cache.lock begin - free_pool = get_pool!(cache, :free, uid) - append!(free_pool, busy_pool) - empty!(busy_pool) - end - end - return -end - -mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray} - lock::ReentrantLock - caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}} -end - -PerDeviceCacheAllocator(::Type{T}) where {T <: AbstractGPUArray} = - PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}()) - -function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where {T} - h = hash(device) - dev_cache = get(pdcache.caches, h, nothing) - if dev_cache ≡ nothing - Base.@lock pdcache.lock begin - named_cache = CacheAllocator(T) - pdcache.caches[h] = Dict{Symbol, CacheAllocator{T}}(name => named_cache) - return named_cache - end - end - - named_cache = get(dev_cache, name, nothing) - if named_cache ≡ nothing - named_cache = CacheAllocator(T) - Base.@lock pdcache.lock dev_cache[name] = named_cache - end - return named_cache -end - -function alloc!(alloc_f, AT::Type{<:AbstractGPUArray}, name::Symbol, key) - pdcache = cache_allocator(AT) - cache = named_cache_allocator!(pdcache, device(AT), name) 
- return alloc!(alloc_f, cache, key) -end - -function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol) - sz = UInt64(0) - h = hash(device) - - dev_cache = get(pdcache.caches, h, nothing) - dev_cache ≡ nothing && return sz - - named_cache = get(dev_cache, name, nothing) - named_cache ≡ nothing && return sz - - Base.@lock named_cache.lock begin - for (_, pool) in named_cache.free - sz += sum(sizeof, pool; init = UInt64(0)) - end - for (_, pool) in named_cache.busy - sz += sum(sizeof, pool; init = UInt64(0)) - end - end - return sz -end - -""" - invalidate!(AT::Type{AbstractGPUArray}, name::Symbol) - -Free all memory held by `name`d cached allocator given array type `AT`. -""" -invalidate!(AT::Type{<:AbstractGPUArray}, name::Symbol) = - invalidate!(cache_allocator(AT), device(AT), name) - -function invalidate!(pdcache::PerDeviceCacheAllocator, device, name::Symbol) - h = hash(device) - dev_cache = get(pdcache.caches, h, nothing) - dev_cache ≡ nothing && return - - named_cache = get(dev_cache, name, nothing) - named_cache ≡ nothing && return - - Base.@lock named_cache.lock begin - for (_, pool) in named_cache.free - map(unsafe_free!, pool) - end - # TODO error when trying to invalidate busy cache? - for (_, pool) in named_cache.busy - map(unsafe_free!, pool) - end - empty!(named_cache.busy) - empty!(named_cache.free) - end - return -end - -function free_busy!(AT::Type{<:AbstractGPUArray}, name::Symbol) - pdcache = cache_allocator(AT) - return free_busy!(named_cache_allocator!(pdcache, device(AT), name)) -end - -""" - @enable AT name expr - -Evaluate expression `expr` using `name`d caching allocator for the given array type `AT`. - -When gpu allocation is requested during execution of `expr`, -allocator will try to use its "free" cache instead of doing an actual allocation. -If no "free" allocation exists, an actual allocation is performed. 
-Before returning allocation to the user, it is marked as busy and -will not be used by allocation in the scope defined by `@enable`. - -**After** the execution of `expr` all "busy" allocations are marked as "free" -thus they can be re-used next time the program enters this scope. - -This is useful to apply in a repeating block of code to avoid relying on -GC to free gpu memory in time. - -`name` is a `Symbol` that defines which allocator to use -(`nothing`, which is a default, disables it). - -# Example - -In the following example, each iteration of the for-loop requires `2 GiB` of gpu memory. -Without caching allocator GC wouldn't be able to free arrays in time -resulting in higher memory usage. -With caching allocator, memory usage stays at exactly `2 GiB`. - -See [`@disable`](@ref), [`invalidate!`](@ref). -```julia -n = 1024^3 -for i in 1:1000 - CUDA.AllocCache.@enable CuArray :loop begin - sin.(CUDA.rand(Float32, n)) - end -end -CUDA.AllocCache.invalidate!(CuArray, :loop) -``` -""" -macro enable(AT, name, expr) - return quote - res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr)) - free_busy!($(esc(AT)), $(esc(name))) - res - end -end - -""" - @disable expr - -Evaluate expression `expr` without using caching allocator. -This is useful to call from within `@enable` to avoid caching arrays. -""" -macro disable(expr) - return quote - @with $(esc(CacheAllocatorName)) => nothing $(esc(expr)) - end -end - -# Interface API. - -""" - cache_allocator(::Type{AbstractGPUArray}) - -Given array type, return corresponding `PerDeviceCacheAllocator` for it. -Each GPU backend must implement this. -""" -cache_allocator(::Type{AbstractGPUArray}) = error("Not implemented.") - -""" - device(::Type{AbstractGPUArray}) - -Given array type, return current device. -Each GPU backend must implement this. 
-""" -device(::Type{AbstractGPUArray}) = error("Not implemented.") - -end diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl index a62d9fdf..43bf5589 100644 --- a/test/testsuite/caching_allocator.jl +++ b/test/testsuite/caching_allocator.jl @@ -1,59 +1,43 @@ -@testsuite "Caching Allocator" (AT, eltypes) -> begin +@testsuite "alloc cache" (AT, eltypes) -> begin if AT <: AbstractGPUArray - device = GPUArrays.AllocCache.device(AT) - pdcache = GPUArrays.AllocCache.cache_allocator(AT) - named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache) + cache = GPUArrays.AllocCache(AT) - T = Float32 - dims = (1, 2, 3) - - x1 = GPUArrays.AllocCache.@enable AT :cache begin - AT(zeros(T, dims)) + T, dims = Float32, (1, 2, 3) + GPUArrays.@enable cache begin + x1 = AT(zeros(T, dims)) end - @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) - @test length(named_cache.free) == 1 - - key = first(keys(named_cache.free)) - @test length(named_cache.free[key]) == 1 - @test length(named_cache.busy[key]) == 0 - @test x1 === named_cache.free[key][1] - - # Second allocation does not allocate - cache stays the same in size. - - x2, x_free = GPUArrays.AllocCache.@enable AT :cache begin + @test sizeof(cache) == sizeof(T) * prod(dims) + key = first(keys(cache.free)) + @test length(cache.free[key]) == 1 + @test length(cache.busy[key]) == 0 + @test x1 === cache.free[key][1] + + # Second allocation hits cache. + GPUArrays.@enable cache begin x2 = AT(zeros(T, dims)) - - # Does not go to cache. - GPUArrays.AllocCache.@disable begin - x_free = AT(zeros(T, dims)) - end - x2, x_free + # Does not hit the cache. 
+ GPUArrays.@disable x_free = AT(zeros(T, dims)) end - @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims) - @test length(named_cache.free[key]) == 1 - @test length(named_cache.busy[key]) == 0 - @test x2 === x1 - @test x2 === named_cache.free[key][1] + @test sizeof(cache) == sizeof(T) * prod(dims) + key = first(keys(cache.free)) + @test length(cache.free[key]) == 1 + @test length(cache.busy[key]) == 0 + @test x2 === cache.free[key][1] @test x_free !== x2 - # Third allocation of different type - cache grows. - - T2 = Int32 - key2 = hash((T2, dims)) - x3 = GPUArrays.AllocCache.@enable AT :cache begin - AT(zeros(T2, dims)) + # Third allocation is of different shape - allocates. + dims = (2, 2) + GPUArrays.@enable cache begin + x3 = AT(zeros(T, dims)) end - @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims) - - _keys = collect(keys(named_cache.free)) + _keys = collect(keys(cache.free)) key2 = _keys[findfirst(i -> i != key, _keys)] - @test length(named_cache.free[key]) == 1 - @test length(named_cache.free[key2]) == 1 - @test x3 === named_cache.free[key2][1] + @test length(cache.free[key]) == 1 + @test length(cache.free[key2]) == 1 + @test x3 === cache.free[key2][1] # Freeing all memory held by cache. 
- - GPUArrays.AllocCache.invalidate!(AT, :cache) - @test sizeof(pdcache, device, :cache) == 0 + GPUArrays.unsafe_free!(cache) + @test sizeof(cache) == 0 end end From 3ffca034d00810da1b2af66dbd0c651b5dd978fb Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Wed, 8 Jan 2025 23:48:45 +0200 Subject: [PATCH 16/28] Cleanup --- lib/JLArrays/src/JLArrays.jl | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index d5b4e9f3..92ee7de0 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -397,12 +397,4 @@ Adapt.adapt_storage(::JLBackend, a::Array) = Adapt.adapt(JLArrays.JLArray, a) Adapt.adapt_storage(::JLBackend, a::JLArrays.JLArray) = a Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Array, a) -# Caching Allocator. - -const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray) - -GPUArrays.AllocCache.cache_allocator(::Type{<:JLArray}) = JLACacheAllocator - -GPUArrays.AllocCache.device(::Type{<:JLArray}) = 1 - end From 96af44c8925e74fbbfdfec0837eadc3ccb87a2ba Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Wed, 8 Jan 2025 23:50:01 +0200 Subject: [PATCH 17/28] Update src/host/alloc_cache.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/host/alloc_cache.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl index 3c5ecb51..73ef8f37 100644 --- a/src/host/alloc_cache.jl +++ b/src/host/alloc_cache.jl @@ -81,7 +81,7 @@ function Base.sizeof(cache::AllocCache) sz = UInt64(0) Base.@lock cache.lock begin for kind in (cache.free, cache.busy), (_, pool) in kind - sz += sum(sizeof, pool; init=UInt64(0)) + sz += sum(sizeof, pool; init = UInt64(0)) end end return sz From cf5fda2a1d2955957c38b3b768b12b4fd0109356 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Wed, 8 Jan 2025 23:50:08 +0200 Subject: [PATCH 18/28] Update 
src/host/alloc_cache.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/host/alloc_cache.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl index 73ef8f37..d909cdc8 100644 --- a/src/host/alloc_cache.jl +++ b/src/host/alloc_cache.jl @@ -67,7 +67,8 @@ function unsafe_free!(cache::AllocCache) for (_, pool) in cache.busy isempty(pool) || error( "Invalidating allocations cache that's currently in use. " * - "Invalidating inside `@enable` is not allowed.") + "Invalidating inside `@enable` is not allowed." + ) end for (_, pool) in cache.free map(unsafe_free!, pool) From 36ced8346987fd442466adccfc7f333fab75c150 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Wed, 8 Jan 2025 23:50:18 +0200 Subject: [PATCH 19/28] Update src/host/alloc_cache.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/host/alloc_cache.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl index d909cdc8..7ecc437e 100644 --- a/src/host/alloc_cache.jl +++ b/src/host/alloc_cache.jl @@ -12,10 +12,12 @@ mutable struct AllocCache{T <: AbstractGPUArray} free::Dict{UInt64, Vector{T}} function AllocCache(::Type{T}) where {T <: AbstractGPUArray} - cache = new{T}(ReentrantLock(), + cache = new{T}( + ReentrantLock(), Dict{UInt64, Vector{T}}(), - Dict{UInt64, Vector{T}}()) - finalizer(unsafe_free!, cache) + Dict{UInt64, Vector{T}}() + ) + return finalizer(unsafe_free!, cache) end end From c98bfa408082012a8dfcb853f929fd09ff3ddf74 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Wed, 8 Jan 2025 23:55:40 +0200 Subject: [PATCH 20/28] Update docs --- docs/src/interface.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/src/interface.md b/docs/src/interface.md index dadb06c1..0aa33bd3 100644 --- a/docs/src/interface.md +++ b/docs/src/interface.md 
@@ -34,7 +34,6 @@ There are numerous examples of potential interfaces for GPUArrays, such as with ## Caching Allocator ```@docs -GPUArrays.AllocCache.@enable -GPUArrays.AllocCache.@disable -GPUArrays.AllocCache.invalidate! +GPUArrays.@enable +GPUArrays.@disable ``` From 63ffeaef7f3194441ce3a8f4c9215c9f1695a9fb Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 9 Jan 2025 10:38:13 +0200 Subject: [PATCH 21/28] Rename enable to cached --- docs/src/interface.md | 4 ++-- src/host/alloc_cache.jl | 18 +++++++++--------- test/testsuite/caching_allocator.jl | 8 ++++---- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/src/interface.md b/docs/src/interface.md index 0aa33bd3..9e4864ad 100644 --- a/docs/src/interface.md +++ b/docs/src/interface.md @@ -34,6 +34,6 @@ There are numerous examples of potential interfaces for GPUArrays, such as with ## Caching Allocator ```@docs -GPUArrays.@enable -GPUArrays.@disable +GPUArrays.@cached +GPUArrays.@uncached ``` diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl index 7ecc437e..899f2e5d 100644 --- a/src/host/alloc_cache.jl +++ b/src/host/alloc_cache.jl @@ -69,7 +69,7 @@ function unsafe_free!(cache::AllocCache) for (_, pool) in cache.busy isempty(pool) || error( "Invalidating allocations cache that's currently in use. " * - "Invalidating inside `@enable` is not allowed." + "Invalidating inside `@cached` is not allowed." ) end for (_, pool) in cache.free @@ -93,7 +93,7 @@ end const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing) """ - @enable(cache, expr) + @cached(cache, expr) Evaluate expression `expr` using allocations cache `cache`. @@ -101,7 +101,7 @@ When gpu allocation is requested during execution of `expr`, it will first check if there's "free" cache instead of performing an actual allocation. If no "free" allocation exists, an actual allocation is performed. 
Before returning allocation to the user, it is marked as busy and -will not be used by allocation in the scope defined by `@enable`. +will not be used by allocation in the scope defined by `@cached`. **After** the execution of `expr` all "busy" allocations are marked as "free" thus they can be re-used next time the program enters this scope. @@ -120,7 +120,7 @@ With caching allocator, memory usage stays at exactly `8 GiB`. cache = GPUArrays.AllocCache(CuArray) n = 1024^3 for i in 1:1000 - GPUArrays.@enable cache begin + GPUArrays.@cached cache begin sin.(CUDA.rand(Float32, n)) end end @@ -129,9 +129,9 @@ end GPUArrays.unsafe_free!(cache) ``` -See [`@disable`](@ref). +See [`@uncached`](@ref). """ -macro enable(cache, expr) +macro cached(cache, expr) return quote res = @with $(esc(ALLOC_CACHE)) => $(esc(cache)) $(esc(expr)) free_busy!($(esc(cache))) @@ -140,12 +140,12 @@ macro enable(cache, expr) end """ - disable(expr) + uncached(expr) Evaluate expression `expr` without using allocations cache. -This is useful to call from within `@enable` to avoid caching some allocations. +This is useful to call from within `@cached` to avoid caching some allocations. """ -macro disable(expr) +macro uncached(expr) return quote @with $(esc(ALLOC_CACHE)) => nothing $(esc(expr)) end diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl index 43bf5589..9a7201a5 100644 --- a/test/testsuite/caching_allocator.jl +++ b/test/testsuite/caching_allocator.jl @@ -3,7 +3,7 @@ cache = GPUArrays.AllocCache(AT) T, dims = Float32, (1, 2, 3) - GPUArrays.@enable cache begin + GPUArrays.@cached cache begin x1 = AT(zeros(T, dims)) end @test sizeof(cache) == sizeof(T) * prod(dims) @@ -13,10 +13,10 @@ @test x1 === cache.free[key][1] # Second allocation hits cache. - GPUArrays.@enable cache begin + GPUArrays.@cached cache begin x2 = AT(zeros(T, dims)) # Does not hit the cache. 
- GPUArrays.@disable x_free = AT(zeros(T, dims)) + GPUArrays.@uncached x_free = AT(zeros(T, dims)) end @test sizeof(cache) == sizeof(T) * prod(dims) key = first(keys(cache.free)) @@ -27,7 +27,7 @@ # Third allocation is of different shape - allocates. dims = (2, 2) - GPUArrays.@enable cache begin + GPUArrays.@cached cache begin x3 = AT(zeros(T, dims)) end _keys = collect(keys(cache.free)) From 972b386937b10e132b89785f508b3cb064a0cd78 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 9 Jan 2025 11:34:30 +0100 Subject: [PATCH 22/28] Rename. --- test/testsuite.jl | 2 +- test/testsuite/{caching_allocator.jl => alloc_cache.jl} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename test/testsuite/{caching_allocator.jl => alloc_cache.jl} (100%) diff --git a/test/testsuite.jl b/test/testsuite.jl index 59bb967b..e138dabe 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -93,7 +93,7 @@ include("testsuite/math.jl") include("testsuite/random.jl") include("testsuite/uniformscaling.jl") include("testsuite/statistics.jl") -include("testsuite/caching_allocator.jl") +include("testsuite/alloc_cache.jl") """ Runs the entire GPUArrays test suite on array type `AT` diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/alloc_cache.jl similarity index 100% rename from test/testsuite/caching_allocator.jl rename to test/testsuite/alloc_cache.jl From 9960b5225c18a997711219c7b2a7040b9952aa7e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 9 Jan 2025 11:47:43 +0100 Subject: [PATCH 23/28] Simplify back-end interface. 
--- lib/JLArrays/src/JLArrays.jl | 15 +++++---------- src/host/alloc_cache.jl | 11 ++++++++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 92ee7de0..18be1889 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -89,21 +89,15 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} check_eltype(T) maxsize = prod(dims) * sizeof(T) - function _alloc_f() + GPUArrays.cached_alloc((JLArray, T, dims)) do data = Vector{UInt8}(undef, maxsize) ref = DataRef(data) do data resize!(data, 0) end obj = new{T, N}(ref, 0, dims) - return finalizer(unsafe_free!, obj) - end - - cache = GPUArrays.ALLOC_CACHE[] - return if cache ≡ nothing - _alloc_f() - else - GPUArrays.alloc!(_alloc_f, cache, (JLArray, T, dims))::JLArray{T, N} - end + finalizer(unsafe_free!, obj) + return obj + end::JLArray{T,N} end # low-level constructor for wrapping existing data @@ -112,6 +106,7 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} check_eltype(T) obj = new{T,N}(ref, offset, dims) finalizer(unsafe_free!, obj) + return obj end end diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl index 899f2e5d..6c9a1200 100644 --- a/src/host/alloc_cache.jl +++ b/src/host/alloc_cache.jl @@ -30,13 +30,18 @@ function get_pool!(cache::AllocCache{T}, pool::Symbol, uid::UInt64) where {T <: return uid_pool end -function alloc!(alloc_f, cache::AllocCache, key) +function cached_alloc(f, key) + cache = ALLOC_CACHE[] + if cache === nothing + return f() + end + x = nothing uid = hash(key) busy_pool = get_pool!(cache, :busy, uid) free_pool = get_pool!(cache, :free, uid) - isempty(free_pool) && (x = alloc_f()) + isempty(free_pool) && (x = f()) while !isempty(free_pool) && x ≡ nothing tmp = Base.@lock cache.lock pop!(free_pool) @@ -45,7 +50,7 @@ function alloc!(alloc_f, cache::AllocCache, key) x = tmp end - x ≡ nothing && (x = alloc_f()) + x ≡ nothing && (x = f()) Base.@lock cache.lock 
push!(busy_pool, x) return x end From 7e32124a222c10c435ef16846c4bca034045b499 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 9 Jan 2025 11:59:52 +0100 Subject: [PATCH 24/28] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- lib/JLArrays/src/JLArrays.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl index 18be1889..d36e9af2 100644 --- a/lib/JLArrays/src/JLArrays.jl +++ b/lib/JLArrays/src/JLArrays.jl @@ -89,7 +89,7 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} check_eltype(T) maxsize = prod(dims) * sizeof(T) - GPUArrays.cached_alloc((JLArray, T, dims)) do + return GPUArrays.cached_alloc((JLArray, T, dims)) do data = Vector{UInt8}(undef, maxsize) ref = DataRef(data) do data resize!(data, 0) @@ -97,7 +97,7 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N} obj = new{T, N}(ref, 0, dims) finalizer(unsafe_free!, obj) return obj - end::JLArray{T,N} + end::JLArray{T, N} end # low-level constructor for wrapping existing data From e579824b9a1f40252f98ff1f91c1b8d5a216eefd Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 9 Jan 2025 13:25:54 +0200 Subject: [PATCH 25/28] Strip down cache from array type --- src/host/alloc_cache.jl | 30 ++++++++++++++++++++---------- test/testsuite/alloc_cache.jl | 2 +- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl index 6c9a1200..b66e7c40 100644 --- a/src/host/alloc_cache.jl +++ b/src/host/alloc_cache.jl @@ -6,26 +6,26 @@ else using Base.ScopedValues end -mutable struct AllocCache{T <: AbstractGPUArray} +mutable struct AllocCache lock::ReentrantLock - busy::Dict{UInt64, Vector{T}} # hash(key) => GPUArray[] - free::Dict{UInt64, Vector{T}} + busy::Dict{UInt64, Vector{Any}} # hash(key) => GPUArray[] + free::Dict{UInt64, Vector{Any}} - function AllocCache(::Type{T}) where {T <: 
AbstractGPUArray} - cache = new{T}( + function AllocCache() + cache = new( ReentrantLock(), - Dict{UInt64, Vector{T}}(), - Dict{UInt64, Vector{T}}() + Dict{UInt64, Vector{Any}}(), + Dict{UInt64, Vector{Any}}() ) return finalizer(unsafe_free!, cache) end end -function get_pool!(cache::AllocCache{T}, pool::Symbol, uid::UInt64) where {T <: AbstractGPUArray} +function get_pool!(cache::AllocCache, pool::Symbol, uid::UInt64) pool = getproperty(cache, pool) uid_pool = get(pool, uid, nothing) if uid_pool ≡ nothing - uid_pool = Base.@lock cache.lock pool[uid] = T[] + uid_pool = Base.@lock cache.lock pool[uid] = Any[] end return uid_pool end @@ -95,6 +95,16 @@ function Base.sizeof(cache::AllocCache) return sz end +function Base.show(io::IO, cache::AllocCache) + sz, n_free, n_busy = Base.@lock cache.lock begin + sz = sizeof(cache) + n_free = sum(p -> length(p[2]), cache.free; init = 0) + n_busy = sum(p -> length(p[2]), cache.busy; init = 0) + sz, n_free, n_busy + end + print(io, "AllocCache(n_free=$n_free, n_busy=$n_busy, sizeof=$(Base.format_bytes(sz)))") +end + const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing) """ @@ -122,7 +132,7 @@ resulting in higher memory usage. With caching allocator, memory usage stays at exactly `8 GiB`. 
```julia -cache = GPUArrays.AllocCache(CuArray) +cache = GPUArrays.AllocCache() n = 1024^3 for i in 1:1000 GPUArrays.@cached cache begin diff --git a/test/testsuite/alloc_cache.jl b/test/testsuite/alloc_cache.jl index 9a7201a5..b032c8bd 100644 --- a/test/testsuite/alloc_cache.jl +++ b/test/testsuite/alloc_cache.jl @@ -1,6 +1,6 @@ @testsuite "alloc cache" (AT, eltypes) -> begin if AT <: AbstractGPUArray - cache = GPUArrays.AllocCache(AT) + cache = GPUArrays.AllocCache() T, dims = Float32, (1, 2, 3) GPUArrays.@cached cache begin From cdc2543a8ff51d2afc032d86c92e2a10c0755435 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 9 Jan 2025 13:27:10 +0200 Subject: [PATCH 26/28] Add return stmt --- src/host/alloc_cache.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl index b66e7c40..75286a33 100644 --- a/src/host/alloc_cache.jl +++ b/src/host/alloc_cache.jl @@ -102,7 +102,7 @@ function Base.show(io::IO, cache::AllocCache) n_busy = sum(p -> length(p[2]), cache.busy; init = 0) sz, n_free, n_busy end - print(io, "AllocCache(n_free=$n_free, n_busy=$n_busy, sizeof=$(Base.format_bytes(sz)))") + return print(io, "AllocCache(n_free=$n_free, n_busy=$n_busy, sizeof=$(Base.format_bytes(sz)))") end const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing) From 8734a35b4d6c0f1c37890241eeba6ecacb9c5ab6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 9 Jan 2025 12:42:47 +0100 Subject: [PATCH 27/28] Improve docs. --- src/host/alloc_cache.jl | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl index 75286a33..7ed435de 100644 --- a/src/host/alloc_cache.jl +++ b/src/host/alloc_cache.jl @@ -110,37 +110,33 @@ const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing) """ @cached(cache, expr) -Evaluate expression `expr` using allocations cache `cache`. 
+Evaluate `expr` using allocations cache `cache`. -When gpu allocation is requested during execution of `expr`, -it will first check if there's "free" cache instead of performing an actual allocation. -If no "free" allocation exists, an actual allocation is performed. -Before returning allocation to the user, it is marked as busy and -will not be used by allocation in the scope defined by `@cached`. +When GPU memory is allocated during the execution of `expr`, `cache` will first be checked. +If no memory is available in the cache, a new allocation will be requested. -**After** the execution of `expr` all "busy" allocations are marked as "free" -thus they can be re-used next time the program enters this scope. +After the execution of `expr`, all allocations made under the scope of `@cached` will be +cached within `cache` for future use. This is useful to avoid relying on GC to free GPU +memory in time. -This is useful to apply in a repeating block of code to avoid relying on -GC to free gpu memory in time. +Once `cache` goes out of scope, or when the user calls `unsafe_free!` on it, all cached +allocations will be freed. # Example -In the following example, each iteration of the for-loop requires `8 GiB` of gpu memory. -Without caching allocator GC wouldn't be able to free arrays in time -resulting in higher memory usage. -With caching allocator, memory usage stays at exactly `8 GiB`. +In the following example, each iteration of the for-loop requires 8 GiB of GPU memory. +Without caching those allocations, significant pressure would be put on the GC, resulting +in high memory usage and latency. By using the allocator cache, the memory usage is stable: ```julia cache = GPUArrays.AllocCache() -n = 1024^3 for i in 1:1000 GPUArrays.@cached cache begin - sin.(CUDA.rand(Float32, n)) + sin.(CUDA.rand(Float32, 1024^3)) end end -# To free immediately. -# Otherwise, it will be freed when collected by GC. 
+ +# optionally: free the memory now, instead of waiting for the GC to collect `cache` GPUArrays.unsafe_free!(cache) ``` @@ -157,8 +153,9 @@ end """ uncached(expr) -Evaluate expression `expr` without using allocations cache. -This is useful to call from within `@cached` to avoid caching some allocations. +Evaluate expression `expr` without using the allocation cache. This is useful to call from within +`@cached` to avoid caching some allocations, e.g., because they can be returned out of the +`@cached` scope. """ macro uncached(expr) return quote From a83a52718ffed1adf05cee2b463ddf560bc68f6b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 9 Jan 2025 12:54:44 +0100 Subject: [PATCH 28/28] Remove duplicate gitignore. [ci skip] --- docs/.gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/.gitignore b/docs/.gitignore index 026087e8..737939a5 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,4 +1,3 @@ -Manifest.toml build site Manifest.toml