Add caching allocator interface #576

Merged · 28 commits · Jan 9, 2025
2 changes: 2 additions & 0 deletions Project.toml
@@ -11,6 +11,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

@@ -23,6 +24,7 @@ LinearAlgebra = "1"
Printf = "1"
Random = "1"
Reexport = "1"
ScopedValues = "1"
Serialization = "1"
Statistics = "1"
julia = "1.10"
1 change: 1 addition & 0 deletions docs/make.jl
@@ -20,6 +20,7 @@ function main()
"Test suite" => "testsuite.md",
],
doctest = true,
warnonly = [:missing_docs],
)

deploydocs(
11 changes: 9 additions & 2 deletions docs/src/interface.md
@@ -10,7 +10,7 @@ Device functionality is then handled by [KernelAbstractions.jl](https://github.c

You should provide an array type that builds on the `AbstractGPUArray` supertype, such as:

```
```julia
mutable struct CustomArray{T, N} <: AbstractGPUArray{T, N}
data::DataRef{Vector{UInt8}}
offset::Int
@@ -23,10 +23,17 @@ end
This will allow your defined type (in this case `CustomArray`) to use the GPUArrays interface where available.
To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you need to define the backend, like so:

```
```julia
import KernelAbstractions: Backend
struct CustomBackend <: KernelAbstractions.GPU end
KernelAbstractions.get_backend(a::CA) where CA <: CustomArray = CustomBackend()
```

There are numerous examples of potential interfaces for GPUArrays, such as with [JLArrays](https://github.com/JuliaGPU/GPUArrays.jl/blob/master/lib/JLArrays/src/JLArrays.jl), [CuArrays](https://github.com/JuliaGPU/CUDA.jl/blob/master/src/gpuarrays.jl), and [ROCArrays](https://github.com/JuliaGPU/AMDGPU.jl/blob/master/src/gpuarrays.jl).

## Caching Allocator

```@docs
GPUArrays.@cached
GPUArrays.@uncached
```
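
A minimal end-to-end sketch of how the two macros documented here compose (not part of this diff): it assumes the `JLArrays` reference backend shipped in this repository, and the `smooth` helper plus the array sizes are made up for illustration.

```julia
using GPUArrays, JLArrays

function smooth(cache, x)
    GPUArrays.@cached cache begin
        y = x .+ 1f0                        # temporary: reused from `cache` on later calls
        GPUArrays.@uncached out = y .* 2f0  # escapes the `@cached` scope, so not cached
    end
    return out
end

cache = GPUArrays.AllocCache()
x = JLArray(rand(Float32, 256, 256))
for _ in 1:100
    smooth(cache, x)
end

GPUArrays.unsafe_free!(cache)  # free the cached buffers now instead of waiting for the GC
```

Anything allocated under `@uncached` bypasses the scoped cache, so results that outlive the `@cached` block can safely be handed back to the caller.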
17 changes: 11 additions & 6 deletions lib/JLArrays/src/JLArrays.jl
@@ -88,12 +88,16 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
function JLArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N}
check_eltype(T)
maxsize = prod(dims) * sizeof(T)
data = Vector{UInt8}(undef, maxsize)
ref = DataRef(data) do data
resize!(data, 0)
end
obj = new{T,N}(ref, 0, dims)
finalizer(unsafe_free!, obj)

return GPUArrays.cached_alloc((JLArray, T, dims)) do
data = Vector{UInt8}(undef, maxsize)
ref = DataRef(data) do data
resize!(data, 0)
end
obj = new{T, N}(ref, 0, dims)
finalizer(unsafe_free!, obj)
return obj
end::JLArray{T, N}
end

# low-level constructor for wrapping existing data
@@ -102,6 +106,7 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
check_eltype(T)
obj = new{T,N}(ref, offset, dims)
finalizer(unsafe_free!, obj)
return obj
end
end

1 change: 1 addition & 0 deletions src/GPUArrays.jl
@@ -34,6 +34,7 @@ include("host/random.jl")
include("host/quirks.jl")
include("host/uniformscaling.jl")
include("host/statistics.jl")
include("host/alloc_cache.jl")


end # module
164 changes: 164 additions & 0 deletions src/host/alloc_cache.jl
@@ -0,0 +1,164 @@
using ..GPUArrays

@static if VERSION < v"1.11"
using ScopedValues
else
using Base.ScopedValues
end

mutable struct AllocCache
lock::ReentrantLock
busy::Dict{UInt64, Vector{Any}} # hash(key) => GPUArray[]
free::Dict{UInt64, Vector{Any}}

function AllocCache()
cache = new(
ReentrantLock(),
Dict{UInt64, Vector{Any}}(),
Dict{UInt64, Vector{Any}}()
)
return finalizer(unsafe_free!, cache)
end
end

function get_pool!(cache::AllocCache, pool::Symbol, uid::UInt64)
pool = getproperty(cache, pool)
uid_pool = get(pool, uid, nothing)
if uid_pool ≡ nothing
uid_pool = Base.@lock cache.lock pool[uid] = Any[]
end
return uid_pool
end

function cached_alloc(f, key)
cache = ALLOC_CACHE[]
if cache === nothing
return f()
end

x = nothing
uid = hash(key)

busy_pool = get_pool!(cache, :busy, uid)
free_pool = get_pool!(cache, :free, uid)
isempty(free_pool) && (x = f())

while !isempty(free_pool) && x ≡ nothing
tmp = Base.@lock cache.lock pop!(free_pool)
# Array was manually freed via `unsafe_free!`.
GPUArrays.storage(tmp).freed && continue
x = tmp
end

x ≡ nothing && (x = f())
Base.@lock cache.lock push!(busy_pool, x)
return x
end

function free_busy!(cache::AllocCache)
for uid in cache.busy.keys
busy_pool = get_pool!(cache, :busy, uid)
isempty(busy_pool) && continue

Base.@lock cache.lock begin
free_pool = get_pool!(cache, :free, uid)
append!(free_pool, busy_pool)
empty!(busy_pool)
end
end
return
end

function unsafe_free!(cache::AllocCache)
Base.@lock cache.lock begin
for (_, pool) in cache.busy
isempty(pool) || error(
"Invalidating allocations cache that's currently in use. " *
"Invalidating inside `@cached` is not allowed."
)
end
for (_, pool) in cache.free
map(unsafe_free!, pool)
end
empty!(cache.free)
end
return
end

function Base.sizeof(cache::AllocCache)
sz = UInt64(0)
Base.@lock cache.lock begin
for kind in (cache.free, cache.busy), (_, pool) in kind
sz += sum(sizeof, pool; init = UInt64(0))
end
end
return sz
end

function Base.show(io::IO, cache::AllocCache)
sz, n_free, n_busy = Base.@lock cache.lock begin
sz = sizeof(cache)
n_free = sum(p -> length(p[2]), cache.free; init = 0)
n_busy = sum(p -> length(p[2]), cache.busy; init = 0)
sz, n_free, n_busy
end
return print(io, "AllocCache(n_free=$n_free, n_busy=$n_busy, sizeof=$(Base.format_bytes(sz)))")
end

const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing)

"""
@cached(cache, expr)

Evaluate `expr` using allocations cache `cache`.

When GPU memory is allocated during the execution of `expr`, `cache` will first be checked.
If no memory is available in the cache, a new allocation will be requested.

After the execution of `expr`, all allocations made under the scope of `@cached` will be
cached within `cache` for future use. This is useful to avoid relying on GC to free GPU
memory in time.

Once `cache` goes out of scope, or when the user calls `unsafe_free!` on it, all cached
allocations will be freed.

# Example

In the following example, each iteration of the for-loop requires 8 GiB of GPU memory.
Without caching those allocations, significant pressure would be put on the GC, resulting
in high memory usage and latency. By using the allocator cache, the memory usage is stable:

```julia
cache = GPUArrays.AllocCache()
for i in 1:1000
GPUArrays.@cached cache begin
sin.(CUDA.rand(Float32, 1024^3))
end
end

# optionally: free the memory now, instead of waiting for the GC to collect `cache`
GPUArrays.unsafe_free!(cache)
```

See [`@uncached`](@ref).
"""
macro cached(cache, expr)
return quote
res = @with $(esc(ALLOC_CACHE)) => $(esc(cache)) $(esc(expr))
free_busy!($(esc(cache)))
res
end
end

"""
@uncached(expr)

Evaluate expression `expr` without using the allocation cache. This is useful to call from
within `@cached` to avoid caching some allocations, e.g. those that are returned out of the
`@cached` scope.
"""
macro uncached(expr)
return quote
@with $(esc(ALLOC_CACHE)) => nothing $(esc(expr))
end
end
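
For backend authors, the `JLArrays` hunk above shows the intended integration point: route the allocating path of the `undef` constructor through `GPUArrays.cached_alloc`, keyed by whatever determines whether a buffer can be reused. Below is a schematic sketch using the hypothetical `CustomArray` from the interface docs; `device_alloc` is a made-up stand-in for the backend's real allocator, not an actual API.

```julia
function CustomArray{T, N}(::UndefInitializer, dims::Dims{N}) where {T, N}
    # Key on everything that makes a buffer interchangeable: the array type,
    # the element type and the shape.
    return GPUArrays.cached_alloc((CustomArray, T, dims)) do
        data = DataRef(device_alloc(prod(dims) * sizeof(T)))  # hypothetical backend allocator
        obj = CustomArray{T, N}(data, 0, dims)
        finalizer(GPUArrays.unsafe_free!, obj)
        return obj
    end::CustomArray{T, N}
end
```

When no cache is active (`ALLOC_CACHE[] === nothing`), `cached_alloc` simply calls the closure, so the constructor behaves exactly as it would without caching.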
1 change: 1 addition & 0 deletions test/testsuite.jl
@@ -93,6 +93,7 @@ include("testsuite/math.jl")
include("testsuite/random.jl")
include("testsuite/uniformscaling.jl")
include("testsuite/statistics.jl")
include("testsuite/alloc_cache.jl")

"""
Runs the entire GPUArrays test suite on array type `AT`
43 changes: 43 additions & 0 deletions test/testsuite/alloc_cache.jl
@@ -0,0 +1,43 @@
@testsuite "alloc cache" (AT, eltypes) -> begin
if AT <: AbstractGPUArray
cache = GPUArrays.AllocCache()

T, dims = Float32, (1, 2, 3)
GPUArrays.@cached cache begin
x1 = AT(zeros(T, dims))
end
@test sizeof(cache) == sizeof(T) * prod(dims)
key = first(keys(cache.free))
@test length(cache.free[key]) == 1
@test length(cache.busy[key]) == 0
@test x1 === cache.free[key][1]

# Second allocation hits cache.
GPUArrays.@cached cache begin
x2 = AT(zeros(T, dims))
# Does not hit the cache.
GPUArrays.@uncached x_free = AT(zeros(T, dims))
end
@test sizeof(cache) == sizeof(T) * prod(dims)
key = first(keys(cache.free))
@test length(cache.free[key]) == 1
@test length(cache.busy[key]) == 0
@test x2 === cache.free[key][1]
@test x_free !== x2

# Third allocation is of different shape - allocates.
dims = (2, 2)
GPUArrays.@cached cache begin
x3 = AT(zeros(T, dims))
end
_keys = collect(keys(cache.free))
key2 = _keys[findfirst(i -> i != key, _keys)]
@test length(cache.free[key]) == 1
@test length(cache.free[key2]) == 1
@test x3 === cache.free[key2][1]

# Freeing all memory held by cache.
GPUArrays.unsafe_free!(cache)
@test sizeof(cache) == 0
end
end