From 12cad6293b30c6a4a54de0c1ec9c3593302ddbd5 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Sat, 14 Dec 2024 18:18:58 +0200
Subject: [PATCH 01/28] Implement allocation cache

---
 src/GPUArrays.jl              |   1 +
 src/host/allocations_cache.jl | 132 ++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)
 create mode 100644 src/host/allocations_cache.jl

diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 418b87b5..948ff068 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -34,6 +34,7 @@ include("host/random.jl")
 include("host/quirks.jl")
 include("host/uniformscaling.jl")
 include("host/statistics.jl")
+include("host/allocations_cache.jl")
 
 
 end # module
diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
new file mode 100644
index 00000000..3f45b052
--- /dev/null
+++ b/src/host/allocations_cache.jl
@@ -0,0 +1,132 @@
+using Base.ScopedValues
+
+struct CacheAllocator{T <: AbstractGPUArray}
+    lock::ReentrantLock
+    busy::Dict{UInt64, Vector{T}} # hash((T, dims)) => GPUArray[]
+    free::Dict{UInt64, Vector{T}}
+end
+
+CacheAllocator(::Type{T}) where T = CacheAllocator(
+    ReentrantLock(),
+    Dict{UInt64, Vector{T}}(),
+    Dict{UInt64, Vector{T}}(),
+)
+
+function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T
+    pool = getproperty(cache, pool)
+    uid_pool = get(pool, uid, nothing)
+    if uid_pool ≡ nothing
+        uid_pool = Base.@lock cache.lock pool[uid] = T[]
+    end
+    return uid_pool
+end
+
+function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}) where {T, N}
+    uid = hash((T, dims))
+    free_pool = get_pool!(cache, :free, uid)
+    busy_pool = get_pool!(cache, :busy, uid)
+
+    x = nothing
+
+    # No array available in `free` - call `alloc_f`.
+    isempty(free_pool) && (x = alloc_f())
+
+    # Otherwise, try fetching from `free`.
+    while !isempty(free_pool) && x ≡ nothing
+        tmp = pop!(free_pool)
+        # Array was manually freed via `unsafe_free!`.
+        tmp.buf.freed && continue
+        x = tmp
+    end
+
+    # No array in cache - call `alloc_f`.
+    x ≡ nothing && (x = alloc_f())
+    push!(busy_pool, x)
+    return x
+end
+
+function free_busy!(cache::CacheAllocator)
+    for uid in cache.busy.keys
+        busy_pool = get_pool!(cache, :busy, uid)
+        isempty(busy_pool) && continue
+
+        free_pool = get_pool!(cache, :free, uid)
+        Base.@lock cache.lock begin
+            append!(free_pool, busy_pool)
+            empty!(busy_pool)
+        end
+    end
+end
+
+struct PerDeviceCacheAllocator{T <: AbstractGPUArray}
+    lock::ReentrantLock
+    caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}
+end
+
+PerDeviceCacheAllocator(::Type{T}) where T <: AbstractGPUArray =
+    PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}())
+
+function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where T
+    h = hash(device)
+    dev_cache = get(pdcache.caches, h, nothing)
+    if dev_cache ≡ nothing
+        Base.@lock pdcache.lock begin
+            named_cache = CacheAllocator(T)
+            pdcache.caches[h] = Dict{Symbol, CacheAllocator{T}}(name => named_cache)
+            return named_cache
+        end
+    end
+
+    named_cache = get(dev_cache, name, nothing)
+    if named_cache ≡ nothing
+        named_cache = CacheAllocator(T)
+        Base.@lock dev_cache.lock dev_cache[name] = named_cache
+    end
+    return named_cache
+end
+
+function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
+    h = hash(device)
+    dev_cache = get(pdcache.caches, h, nothing)
+    dev_cache ≡ nothing && return
+
+    named_cache = get(dev_cache, name, nothing)
+    named_cache ≡ nothing && return
+
+    Base.@lock named_cache.lock begin
+        for (_, pool) in named_cache.free
+            map(unsafe_free!, pool)
+        end
+        # TODO error when trying to invalidate busy cache?
+        for (_, pool) in named_cache.busy
+            map(unsafe_free!, pool)
+        end
+        empty!(named_cache.busy)
+        empty!(named_cache.free)
+    end
+    return
+end
+
+macro cache_scope(backend, name, expr)
+    quote
+        scope = cache_alloc_scope($(esc(backend)))
+        res = @with scope => $(esc(name)) $(esc(expr))
+        free_busy_cache_alloc!(cache_allocator($(esc(backend))), $(esc(name)))
+        res
+    end
+end
+
+macro no_cache_scope(backend, expr)
+    quote
+        scope = cache_alloc_scope($(esc(backend)))
+        @with scope => :none $(esc(expr))
+    end
+end
+
+# Interface API.
+
+cache_alloc_scope(::Backend) = error("Not implemented.")
+
+cache_allocator(::Backend) = error("Not implemented.")
+
+free_busy_cache_alloc!(pdcache, name::Symbol) = error("Not implemented.")

From c6f128f38c689024462b20b96050425d4642b153 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Sun, 15 Dec 2024 00:15:44 +0200
Subject: [PATCH 02/28] Correctly fetch underlying storage

---
 src/host/allocations_cache.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index 3f45b052..44a0aac4 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -35,7 +35,7 @@ function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}) where
     while !isempty(free_pool) && x ≡ nothing
         tmp = pop!(free_pool)
         # Array was manually freed via `unsafe_free!`.
-        tmp.buf.freed && continue
+        storage(tmp).freed && continue
         x = tmp
     end
 

From c2f32e13becfdf3f1733b1358da9b6b484015eae Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Sun, 15 Dec 2024 13:53:26 +0200
Subject: [PATCH 03/28] Add cache sizeof

---
 src/host/allocations_cache.jl | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index 44a0aac4..14fd7764 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -85,6 +85,27 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam
     return named_cache
 end
 
+function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
+    sz = UInt64(0)
+    h = hash(device)
+
+    dev_cache = get(pdcache.caches, h, nothing)
+    dev_cache ≡ nothing && return sz
+
+    named_cache = get(dev_cache, name, nothing)
+    named_cache ≡ nothing && return sz
+
+    Base.@lock named_cache.lock begin
+        for (_, pool) in named_cache.free
+            sz += sum(sizeof, pool; init=UInt64(0))
+        end
+        for (_, pool) in named_cache.busy
+            sz += sum(sizeof, pool; init=UInt64(0))
+        end
+    end
+    return sz
+end
+
 function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
     h = hash(device)
     dev_cache = get(pdcache.caches, h, nothing)
@@ -130,3 +151,5 @@ cache_alloc_scope(::Backend) = error("Not implemented.")
 cache_allocator(::Backend) = error("Not implemented.")
 
 free_busy_cache_alloc!(pdcache, name::Symbol) = error("Not implemented.")
+
+invalidate_cache_allocator!(pdcache, name::Symbol) = error("Not implemented.")

From 44e8990cd713cf806253a9503947ef5bbf788f0f Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Tue, 17 Dec 2024 12:49:53 +0200
Subject: [PATCH 04/28] Allow bulk-freeing arrays instead of caching them

---
 src/host/allocations_cache.jl | 73 +++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index 14fd7764..770485c6 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -1,5 +1,7 @@
 using Base.ScopedValues
 
+const CacheAllocatorName = ScopedValue(:none)
+
 struct CacheAllocator{T <: AbstractGPUArray}
     lock::ReentrantLock
     busy::Dict{UInt64, Vector{T}} # hash((T, dims)) => GPUArray[]
@@ -21,38 +23,42 @@ function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T
     return uid_pool
 end
 
-function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}) where {T, N}
+function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N}
+    x = nothing
     uid = hash((T, dims))
-    free_pool = get_pool!(cache, :free, uid)
     busy_pool = get_pool!(cache, :busy, uid)
 
-    x = nothing
-
-    # No array available in `free` - call `alloc_f`.
-    isempty(free_pool) && (x = alloc_f())
+    if skip_free
+        x = alloc_f()
+    else
+        free_pool = get_pool!(cache, :free, uid)
+        isempty(free_pool) && (x = alloc_f())
 
-    # Otherwise, try fetching from `free`.
-    while !isempty(free_pool) && x ≡ nothing
-        tmp = pop!(free_pool)
-        # Array was manually freed via `unsafe_free!`.
-        storage(tmp).freed && continue
-        x = tmp
+        while !isempty(free_pool) && x ≡ nothing
+            tmp = Base.@lock cache.lock pop!(free_pool)
+            # Array was manually freed via `unsafe_free!`.
+            storage(tmp).freed && continue
+            x = tmp
+        end
     end
 
-    # No array in cache - call `alloc_f`.
     x ≡ nothing && (x = alloc_f())
-    push!(busy_pool, x)
+    Base.@lock cache.lock push!(busy_pool, x)
     return x
 end
 
-function free_busy!(cache::CacheAllocator)
+function free_busy!(cache::CacheAllocator; free_immediately::Bool)
     for uid in cache.busy.keys
         busy_pool = get_pool!(cache, :busy, uid)
         isempty(busy_pool) && continue
 
         free_pool = get_pool!(cache, :free, uid)
         Base.@lock cache.lock begin
-            append!(free_pool, busy_pool)
+            if free_immediately
+                for p in busy_pool unsafe_free!(p) end
+            else
+                append!(free_pool, busy_pool)
+            end
             empty!(busy_pool)
         end
     end
@@ -61,10 +67,11 @@ end
 struct PerDeviceCacheAllocator{T <: AbstractGPUArray}
     lock::ReentrantLock
     caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}
+    free_immediately::Bool
 end
 
-PerDeviceCacheAllocator(::Type{T}) where T <: AbstractGPUArray =
-    PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}())
+PerDeviceCacheAllocator(::Type{T}; free_immediately::Bool) where T <: AbstractGPUArray =
+    PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}(), free_immediately)
 
 function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where T
     h = hash(device)
@@ -85,6 +92,12 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam
     return named_cache
 end
 
+function alloc!(alloc_f, kab::Backend, name::Symbol, ::Type{T}, dims::Dims{N}) where {T, N}
+    pdcache = cache_allocator(kab)
+    cache = named_cache_allocator!(pdcache, device(kab), name)
+    alloc!(alloc_f, cache, T, dims; skip_free=pdcache.free_immediately)
+end
+
 function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
     sz = UInt64(0)
     h = hash(device)
@@ -106,6 +119,9 @@ function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
     return sz
 end
 
+invalidate_cache_allocator!(kab::Backend, name::Symbol) =
+    invalidate_cache_allocator!(cache_allocator(kab), device(kab), name)
+
 function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
     h = hash(device)
     dev_cache = get(pdcache.caches, h, nothing)
@@ -128,28 +144,27 @@ function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, n
     return
 end
 
+function free_busy!(kab::Backend, name::Symbol)
+    pdcache = cache_allocator(kab)
+    free_busy!(named_cache_allocator!(pdcache, device(kab), name); pdcache.free_immediately)
+end
+
 macro cache_scope(backend, name, expr)
     quote
-        scope = cache_alloc_scope($(esc(backend)))
-        res = @with scope => $(esc(name)) $(esc(expr))
-        free_busy_cache_alloc!(cache_allocator($(esc(backend))), $(esc(name)))
+        res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr))
+        free_busy!($(esc(backend)), $(esc(name)))
         res
     end
 end
 
-macro no_cache_scope(backend, expr)
+macro no_cache_scope(expr)
     quote
-        scope = cache_alloc_scope($(esc(backend)))
-        @with scope => :none $(esc(expr))
+        @with $(esc(CacheAllocatorName)) => :none $(esc(expr))
     end
 end
 
 # Interface API.
 
-cache_alloc_scope(::Backend) = error("Not implemented.")
-
 cache_allocator(::Backend) = error("Not implemented.")
 
-free_busy_cache_alloc!(pdcache, name::Symbol) = error("Not implemented.")
-
-invalidate_cache_allocator!(pdcache, name::Symbol) = error("Not implemented.")
+device(::Backend) = error("Not implemented.")

From 5ce044d76188874bfe4cc5e89b0945505c94c35e Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Wed, 18 Dec 2024 00:19:39 +0200
Subject: [PATCH 05/28] Add docs

---
 Project.toml                  |  2 +
 src/host/allocations_cache.jl | 85 ++++++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 551c9edc..1582449f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,6 +11,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
@@ -23,6 +24,7 @@ LinearAlgebra = "1"
 Printf = "1"
 Random = "1"
 Reexport = "1"
+ScopedValues = "1"
 Serialization = "1"
 Statistics = "1"
 julia = "1.10"
diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index 770485c6..aea6a918 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -1,4 +1,8 @@
-using Base.ScopedValues
+@static if VERSION < v"1.11"
+    using ScopedValues
+else
+    using Base.ScopedValues
+end
 
 const CacheAllocatorName = ScopedValue(:none)
 
@@ -23,6 +27,19 @@ function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T
     return uid_pool
 end
 
+"""
+    alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N}
+
+Attempt to retrieve cached allocation from `cache` using eltype `T` and `dims`
+as keys for searching.
+If no such allocation is found, execute `alloc_f` that does actual allocation,
+store it in cache for future use and return it.
+
+`skip_free::Bool` is used together with `PerDeviceCacheAllocator.free_immediately`.
+When `true` arrays are bulk-freed instead of stored in cache.
+In this case `alloc!` will avoid looking into "free" part of `cache`
+and execute `alloc_f` immediately, storing allocation for future bulk-freeing.
+"""
 function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N}
     x = nothing
     uid = hash((T, dims))
@@ -55,7 +72,7 @@ function free_busy!(cache::CacheAllocator; free_immediately::Bool)
         free_pool = get_pool!(cache, :free, uid)
         Base.@lock cache.lock begin
             if free_immediately
-                for p in busy_pool unsafe_free!(p) end
+                map(unsafe_free!, busy_pool)
             else
                 append!(free_pool, busy_pool)
             end
@@ -119,6 +136,11 @@ function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
     return sz
 end
 
+"""
+    invalidate_cache_allocator!(kab::Backend, name::Symbol)
+
+Free all memory held by `name`d cached allocator given KernelAbstractions `backend`.
+"""
 invalidate_cache_allocator!(kab::Backend, name::Symbol) =
     invalidate_cache_allocator!(cache_allocator(kab), device(kab), name)
 
@@ -149,6 +171,47 @@ function free_busy!(kab::Backend, name::Symbol)
     free_busy!(named_cache_allocator!(pdcache, device(kab), name); pdcache.free_immediately)
 end
 
+"""
+    @cache_scope backend name expr
+
+Evaluate expression `expr` using `name`d caching allocator
+for the given KernelAbstractions `backend`.
+
+When during execution of `expr` gpu allocation is requested,
+allocator will try to find such allocation in "free" parts of cache,
+marking them as "busy" and returning allocation to the user.
+If no allocation is found in "free" part, an actual allocation is performed,
+marking it as "busy" and returned to the user.
+
+**After** the execution of `expr` all "busy" allocations are marked as "free"
+thus they can be re-used next time the program enters this scope.
+
+This is useful to apply in a repeating block of code to avoid relying on
+GC to free gpu memory in time.
+
+`name` is a `Symbol` that defines which allocator to use
+(`:none` is reserved and means no allocator).
+
+# Example
+
+In following example we apply caching allocator at every iteration of the for-loop.
+Every iteration requires 2 GiB of gpu memory, without caching allocator
+GC wouldn't be able to free arrays in time resulting in higher memory usage.
+With caching allocator, memory usage stays at exactly 2 GiB.
+
+After the loop, we free all cached memory if there's any.
+
+```julia
+kab = CUDABackend()
+n = 1024^3
+for i in 1:1000
+    @cache_scope kab :loop begin
+        sin.(CUDA.rand(Float32, n))
+    end
+end
+invalidate_cache_allocator!(kab, :loop)
+```
+"""
 macro cache_scope(backend, name, expr)
     quote
         res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr))
@@ -157,6 +220,12 @@ macro cache_scope(backend, name, expr)
     end
 end
 
+"""
+    @no_cache_scope expr
+
+Evaluate expression `expr` without using caching allocator.
+This is useful to call from within `@cache_scope` to avoid caching arrays.
+"""
 macro no_cache_scope(expr)
     quote
         @with $(esc(CacheAllocatorName)) => :none $(esc(expr))
@@ -165,6 +234,18 @@ end
 
 # Interface API.
 
+"""
+    cache_allocator(::Backend)
+
+Given KernelAbstractions `backend`, return corresponding `PerDeviceCacheAllocator` for it.
+Each GPU backend must implement this.
+"""
 cache_allocator(::Backend) = error("Not implemented.")
 
+"""
+    device(::Backend)
+
+Given KernelAbstractions `backend`, return current device.
+Each GPU backend must implement this.
+"""
 device(::Backend) = error("Not implemented.")

From 977657815ba7de21efea97b5e0b319c1b76deda2 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Wed, 18 Dec 2024 18:01:32 +0200
Subject: [PATCH 06/28] Add tests

---
 lib/JLArrays/src/JLArrays.jl        | 26 +++++++--
 src/host/allocations_cache.jl       |  6 +--
 test/runtests.jl                    |  3 ++
 test/testsuite.jl                   |  1 +
 test/testsuite/caching_allocator.jl | 82 +++++++++++++++++++++++++++++
 5 files changed, 110 insertions(+), 8 deletions(-)
 create mode 100644 test/testsuite/caching_allocator.jl

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index dbec3d25..d717e9a3 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -88,12 +88,20 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
     function JLArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N}
         check_eltype(T)
         maxsize = prod(dims) * sizeof(T)
-        data = Vector{UInt8}(undef, maxsize)
-        ref = DataRef(data) do data
-            resize!(data, 0)
+
+        function _alloc_f()
+            data = Vector{UInt8}(undef, maxsize)
+            ref = DataRef(data) do data
+                resize!(data, 0)
+            end
+            obj = new{T,N}(ref, 0, dims)
+            finalizer(unsafe_free!, obj)
         end
-        obj = new{T,N}(ref, 0, dims)
-        finalizer(unsafe_free!, obj)
+
+        name = GPUArrays.CacheAllocatorName[]
+        return name == :none ?
+            _alloc_f() :
+            GPUArrays.alloc!(_alloc_f, JLBackend(), name, T, dims)::JLArray{T, N}
     end
 
     # low-level constructor for wrapping existing data
@@ -387,4 +395,12 @@ Adapt.adapt_storage(::JLBackend, a::Array) = Adapt.adapt(JLArrays.JLArray, a)
 Adapt.adapt_storage(::JLBackend, a::JLArrays.JLArray) = a
 Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Array, a)
 
+# Caching Allocator.
+
+const JLACacheAllocator = GPUArrays.PerDeviceCacheAllocator(JLArray; free_immediately=false)
+
+GPUArrays.cache_allocator(::JLBackend) = JLACacheAllocator
+
+GPUArrays.device(::JLBackend) = 1
+
 end
diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index aea6a918..90c5f5e6 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -69,11 +69,11 @@ function free_busy!(cache::CacheAllocator; free_immediately::Bool)
         busy_pool = get_pool!(cache, :busy, uid)
         isempty(busy_pool) && continue
 
-        free_pool = get_pool!(cache, :free, uid)
         Base.@lock cache.lock begin
             if free_immediately
                 map(unsafe_free!, busy_pool)
             else
+                free_pool = get_pool!(cache, :free, uid)
                 append!(free_pool, busy_pool)
             end
             empty!(busy_pool)
@@ -81,7 +81,7 @@ function free_busy!(cache::CacheAllocator; free_immediately::Bool)
     end
 end
 
-struct PerDeviceCacheAllocator{T <: AbstractGPUArray}
+mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray}
     lock::ReentrantLock
     caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}
     free_immediately::Bool
@@ -104,7 +104,7 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam
     named_cache = get(dev_cache, name, nothing)
     if named_cache ≡ nothing
         named_cache = CacheAllocator(T)
-        Base.@lock dev_cache.lock dev_cache[name] = named_cache
+        Base.@lock pdcache.lock dev_cache[name] = named_cache
     end
     return named_cache
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 66d6a096..5fe51aec 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -48,6 +48,9 @@ include("setup.jl")     # make sure everything is precompiled
 const tests = []
 const test_runners = Dict()
 for AT in (JLArray, Array), name in keys(TestSuite.tests)
+    # Disable for now.
+    name == "Caching Allocator" && continue
+
     push!(tests, "$(AT)/$name")
     test_runners["$(AT)/$name"] = ()->TestSuite.tests[name](AT)
 end
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 179c824b..59bb967b 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -93,6 +93,7 @@ include("testsuite/math.jl")
 include("testsuite/random.jl")
 include("testsuite/uniformscaling.jl")
 include("testsuite/statistics.jl")
+include("testsuite/caching_allocator.jl")
 
 """
 Runs the entire GPUArrays test suite on array type `AT`
diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl
new file mode 100644
index 00000000..4f63023a
--- /dev/null
+++ b/test/testsuite/caching_allocator.jl
@@ -0,0 +1,82 @@
+@testsuite "Caching Allocator" (AT, eltypes) -> begin
+    # Hacky way to get KA backend from AT.
+    kab = KernelAbstractions.get_backend(AT(Array{Int}(undef, 0)))
+    device = GPUArrays.device(kab)
+
+    @testset "free_immediately=false" begin
+        pdcache = GPUArrays.cache_allocator(kab)
+        pdcache.free_immediately = false
+        named_cache = GPUArrays.named_cache_allocator!(pdcache, device, :cache)
+
+        T = Float32
+        dims = (1, 2, 3)
+        key = hash((T, dims))
+
+        GPUArrays.@cache_scope kab :cache begin
+            x1 = AT(zeros(T, dims))
+        end
+        @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
+        @test length(named_cache.free[key]) == 1
+        @test length(named_cache.busy[key]) == 0
+        @test x1 === named_cache.free[key][1]
+
+        # Second allocation does not allocate - cache stays the same in size.
+
+        GPUArrays.@cache_scope kab :cache begin
+            x2 = AT(zeros(T, dims))
+
+            # Does not go to cache.
+            GPUArrays.@no_cache_scope begin
+                x_free = AT(zeros(T, dims))
+            end
+        end
+        @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
+        @test length(named_cache.free[key]) == 1
+        @test length(named_cache.busy[key]) == 0
+        @test x2 === x1
+        @test x2 === named_cache.free[key][1]
+        @test x_free !== x2
+
+        # Third allocation of different type - cache grows.
+
+        T2 = Int32
+        key2 = hash((T2, dims))
+        GPUArrays.@cache_scope kab :cache begin
+            x3 = AT(zeros(T2, dims))
+        end
+        @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims)
+        @test length(named_cache.free[key]) == 1
+        @test length(named_cache.free[key2]) == 1
+        @test x3 === named_cache.free[key2][1]
+
+        # Freeing all memory held by cache.
+
+        GPUArrays.invalidate_cache_allocator!(kab, :cache)
+        @test sizeof(pdcache, device, :cache) == 0
+    end
+
+    @testset "free_immediately=true" begin
+        pdcache = GPUArrays.cache_allocator(kab)
+        pdcache.free_immediately = true
+        named_cache = GPUArrays.named_cache_allocator!(pdcache, device, :cache2)
+
+        T = Float32
+        dims = (1, 2, 3)
+        key = hash((T, dims))
+
+        @test sizeof(pdcache, device, :cache2) == 0
+
+        GPUArrays.@cache_scope kab :cache2 begin
+            x1 = AT(zeros(T, dims))
+
+            @test !haskey(named_cache.free, key)
+            @test length(named_cache.busy[key]) == 1
+            @test sizeof(pdcache, device, :cache2) == sizeof(Float32) * prod(dims)
+        end
+
+        # `free` was never even used with `free_immediately=true`.
+        @test !haskey(named_cache.free, key)
+        @test length(named_cache.busy[key]) == 0
+        @test sizeof(pdcache, device, :cache2) == 0
+    end
+end

From c5032ad4bd4e69c80c697b5f67fad9ac3f398bd1 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Wed, 18 Dec 2024 18:57:15 +0200
Subject: [PATCH 07/28] Update docs

---
 docs/.gitignore       |  1 +
 docs/make.jl          |  1 +
 docs/src/interface.md | 12 ++++++++++--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/docs/.gitignore b/docs/.gitignore
index 737939a5..026087e8 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,3 +1,4 @@
+Manifest.toml
 build
 site
 Manifest.toml
diff --git a/docs/make.jl b/docs/make.jl
index 72828e3b..b8ca1f92 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -20,6 +20,7 @@ function main()
             "Test suite"    => "testsuite.md",
         ],
         doctest = true,
+        warnonly=[:missing_docs],
     )
 
     deploydocs(
diff --git a/docs/src/interface.md b/docs/src/interface.md
index 239bef87..f6e3a9ea 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -10,7 +10,7 @@ Device functionality is then handled by [KernelAbstractions.jl](https://github.c
 
 You should provide an array type that builds on the `AbstractGPUArray` supertype, such as:
 
-```
+```julia
 mutable struct CustomArray{T, N} <: AbstractGPUArray{T, N}
     data::DataRef{Vector{UInt8}}
     offset::Int
@@ -23,10 +23,18 @@ end
 This will allow your defined type (in this case `JLArray`) to use the GPUArrays interface where available.
 To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you need to define the backend, like so:
 
-```
+```julia
 import KernelAbstractions: Backend
 struct CustomBackend <: KernelAbstractions.GPU
 KernelAbstractions.get_backend(a::CA) where CA <: CustomArray = CustomBackend()
 ```
 
 There are numerous examples of potential interfaces for GPUArrays, such as with [JLArrays](https://github.com/JuliaGPU/GPUArrays.jl/blob/master/lib/JLArrays/src/JLArrays.jl), [CuArrays](https://github.com/JuliaGPU/CUDA.jl/blob/master/src/gpuarrays.jl), and [ROCArrays](https://github.com/JuliaGPU/AMDGPU.jl/blob/master/src/gpuarrays.jl).
+
+## Caching Allocator
+
+```@docs
+GPUArrays.@cache_scope
+GPUArrays.@no_cache_scope
+GPUArrays.invalidate_cache_allocator!
+```

From 99a81710ac07a4d737640201a495865b7fd1568e Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Sat, 21 Dec 2024 11:58:49 +0200
Subject: [PATCH 08/28] Update docs & disable test for now

---
 src/host/allocations_cache.jl | 22 +++++++++++-----------
 test/runtests.jl              |  3 ---
 test/testsuite.jl             |  4 +++-
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index 90c5f5e6..4c75b32b 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -177,11 +177,11 @@ end
 Evaluate expression `expr` using `name`d caching allocator
 for the given KernelAbstractions `backend`.
 
-When during execution of `expr` gpu allocation is requested,
-allocator will try to find such allocation in "free" parts of cache,
-marking them as "busy" and returning allocation to the user.
-If no allocation is found in "free" part, an actual allocation is performed,
-marking it as "busy" and returned to the user.
+When gpu allocation is requested during execution of `expr`,
+allocator will try to use its "free" cache instead of doing an actual allocation.
+If no "free" allocation exists, an actual allocation is performed.
+Before returning allocation to the user, it is marked as busy and
+will not be used by allocation in the scope defined by `@cache_scope`.
 
 **After** the execution of `expr` all "busy" allocations are marked as "free"
 thus they can be re-used next time the program enters this scope.
@@ -194,13 +194,13 @@ GC to free gpu memory in time.
 
 # Example
 
-In following example we apply caching allocator at every iteration of the for-loop.
-Every iteration requires 2 GiB of gpu memory, without caching allocator
-GC wouldn't be able to free arrays in time resulting in higher memory usage.
-With caching allocator, memory usage stays at exactly 2 GiB.
-
-After the loop, we free all cached memory if there's any.
+In the following example, each iteration of the for-loop requires `2 GiB`
+of gpu memory.
+Without caching allocator GC wouldn't be able to free arrays in time
+resulting in higher memory usage.
+With caching allocator, memory usage stays at exactly `2 GiB`.
 
+See [`@no_cache_scope`](@ref), [`invalidate_cache_allocator!`](@ref).
 ```julia
 kab = CUDABackend()
 n = 1024^3
diff --git a/test/runtests.jl b/test/runtests.jl
index 5fe51aec..66d6a096 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -48,9 +48,6 @@ include("setup.jl")     # make sure everything is precompiled
 const tests = []
 const test_runners = Dict()
 for AT in (JLArray, Array), name in keys(TestSuite.tests)
-    # Disable for now.
-    name == "Caching Allocator" && continue
-
     push!(tests, "$(AT)/$name")
     test_runners["$(AT)/$name"] = ()->TestSuite.tests[name](AT)
 end
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 59bb967b..3ae5573e 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -93,7 +93,9 @@ include("testsuite/math.jl")
 include("testsuite/random.jl")
 include("testsuite/uniformscaling.jl")
 include("testsuite/statistics.jl")
-include("testsuite/caching_allocator.jl")
+
+# TODO re-enable once backends support it.
+# include("testsuite/caching_allocator.jl")
 
 """
 Runs the entire GPUArrays test suite on array type `AT`

From ad828dfb841a536c0e42363e7ab712e784602155 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Mon, 6 Jan 2025 21:22:02 +0200
Subject: [PATCH 09/28] Use array type instead of KA backend & allow arbitrary
 keys

---
 docs/src/interface.md               |  6 +--
 src/host/allocations_cache.jl       | 75 +++++++++++++++--------------
 test/testsuite.jl                   |  1 -
 test/testsuite/caching_allocator.jl | 24 +++++----
 4 files changed, 54 insertions(+), 52 deletions(-)

diff --git a/docs/src/interface.md b/docs/src/interface.md
index f6e3a9ea..dadb06c1 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -34,7 +34,7 @@ There are numerous examples of potential interfaces for GPUArrays, such as with
 ## Caching Allocator
 
 ```@docs
-GPUArrays.@cache_scope
-GPUArrays.@no_cache_scope
-GPUArrays.invalidate_cache_allocator!
+GPUArrays.AllocCache.@enable
+GPUArrays.AllocCache.@disable
+GPUArrays.AllocCache.invalidate!
 ```
diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index 4c75b32b..22f3033f 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -1,3 +1,8 @@
+module AllocCache
+
+using ..GPUArrays
+using KernelAbstractions
+
 @static if VERSION < v"1.11"
     using ScopedValues
 else
@@ -8,7 +13,7 @@ const CacheAllocatorName = ScopedValue(:none)
 
 struct CacheAllocator{T <: AbstractGPUArray}
     lock::ReentrantLock
-    busy::Dict{UInt64, Vector{T}} # hash((T, dims)) => GPUArray[]
+    busy::Dict{UInt64, Vector{T}} # hash(key) => GPUArray[]
     free::Dict{UInt64, Vector{T}}
 end
 
@@ -28,10 +33,9 @@ function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T
 end
 
 """
-    alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N}
+    alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool)
 
-Attempt to retrieve cached allocation from `cache` using eltype `T` and `dims`
-as keys for searching.
+Attempt to retrieve cached allocation from `cache` using `key` for searching.
 If no such allocation is found, execute `alloc_f` that does actual allocation,
 store it in cache for future use and return it.
 
@@ -40,9 +44,9 @@ When `true` arrays are bulk-freed instead of stored in cache.
 In this case `alloc!` will avoid looking into "free" part of `cache`
 and execute `alloc_f` immediately, storing allocation for future bulk-freeing.
 """
-function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_free::Bool) where {T, N}
+function alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool)
     x = nothing
-    uid = hash((T, dims))
+    uid = hash(key)
     busy_pool = get_pool!(cache, :busy, uid)
 
     if skip_free
@@ -54,7 +58,7 @@ function alloc!(alloc_f, cache::CacheAllocator, ::Type{T}, dims::Dims{N}; skip_f
         while !isempty(free_pool) && x ≡ nothing
             tmp = Base.@lock cache.lock pop!(free_pool)
             # Array was manually freed via `unsafe_free!`.
-            storage(tmp).freed && continue
+            GPUArrays.storage(tmp).freed && continue
             x = tmp
         end
     end
@@ -109,10 +113,10 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam
     return named_cache
 end
 
-function alloc!(alloc_f, kab::Backend, name::Symbol, ::Type{T}, dims::Dims{N}) where {T, N}
-    pdcache = cache_allocator(kab)
-    cache = named_cache_allocator!(pdcache, device(kab), name)
-    alloc!(alloc_f, cache, T, dims; skip_free=pdcache.free_immediately)
+function alloc!(alloc_f, AT::Type{<: AbstractGPUArray}, name::Symbol, key)
+    pdcache = cache_allocator(AT)
+    cache = named_cache_allocator!(pdcache, device(AT), name)
+    alloc!(alloc_f, cache, key; skip_free=pdcache.free_immediately)
 end
 
 function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
@@ -137,14 +141,14 @@ function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
 end
 
 """
-    invalidate_cache_allocator!(kab::Backend, name::Symbol)
+    invalidate!(AT::Type{<: AbstractGPUArray}, name::Symbol)
 
-Free all memory held by `name`d cached allocator given KernelAbstractions `backend`.
+Free all memory held by `name`d cached allocator given array type `AT`.
 """
-invalidate_cache_allocator!(kab::Backend, name::Symbol) =
-    invalidate_cache_allocator!(cache_allocator(kab), device(kab), name)
+invalidate!(AT::Type{<: AbstractGPUArray}, name::Symbol) =
+    invalidate!(cache_allocator(AT), device(AT), name)
 
-function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
+function invalidate!(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
     h = hash(device)
     dev_cache = get(pdcache.caches, h, nothing)
     dev_cache ≡ nothing && return
@@ -166,16 +170,16 @@ function invalidate_cache_allocator!(pdcache::PerDeviceCacheAllocator, device, n
     return
 end
 
-function free_busy!(kab::Backend, name::Symbol)
-    pdcache = cache_allocator(kab)
-    free_busy!(named_cache_allocator!(pdcache, device(kab), name); pdcache.free_immediately)
+function free_busy!(AT::Type{<: AbstractGPUArray}, name::Symbol)
+    pdcache = cache_allocator(AT)
+    free_busy!(named_cache_allocator!(pdcache, device(AT), name); pdcache.free_immediately)
 end
 
 """
-    @cache_scope backend name expr
+    @enable AT name expr
 
 Evaluate expression `expr` using `name`d caching allocator
-for the given KernelAbstractions `backend`.
+for the given array type `AT`.
 
 When gpu allocation is requested during execution of `expr`,
 allocator will try to use its "free" cache instead of doing an actual allocation.
@@ -202,31 +206,30 @@ With caching allocator, memory usage stays at exactly `2 GiB`.
 
 See [`@no_cache_scope`](@ref), [`invalidate_cache_allocator!`](@ref).
 ```julia
-kab = CUDABackend()
 n = 1024^3
 for i in 1:1000
-    @cache_scope kab :loop begin
+    CUDA.AllocCache.@enable CuArray :loop begin
         sin.(CUDA.rand(Float32, n))
     end
 end
-invalidate_cache_allocator!(kab, :loop)
+CUDA.AllocCache.invalidate!(CuArray, :loop)
 ```
 """
-macro cache_scope(backend, name, expr)
+macro enable(AT, name, expr)
     quote
         res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr))
-        free_busy!($(esc(backend)), $(esc(name)))
+        free_busy!($(esc(AT)), $(esc(name)))
         res
     end
 end
 
 """
-    @no_cache_scope expr
+    @disable expr
 
 Evaluate expression `expr` without using caching allocator.
-This is useful to call from within `@cache_scope` to avoid caching arrays.
+This is useful to call from within `@enable` to avoid caching arrays.
 """
-macro no_cache_scope(expr)
+macro disable(expr)
     quote
         @with $(esc(CacheAllocatorName)) => :none $(esc(expr))
     end
@@ -235,17 +238,19 @@ end
 # Interface API.
 
 """
-    cache_allocator(::Backend)
+    cache_allocator(::Type{<: AbstractGPUArray})
 
-Given KernelAbstractions `backend`, return corresponding `PerDeviceCacheAllocator` for it.
+Given an array type, return the corresponding `PerDeviceCacheAllocator` for it.
 Each GPU backend must implement this.
 """
-cache_allocator(::Backend) = error("Not implemented.")
+cache_allocator(::Type{<: AbstractGPUArray}) = error("Not implemented.")
 
 """
-    device(::Backend)
+    device(::Type{<: AbstractGPUArray})
 
-Given KernelAbstractions `backend`, return current device.
+Given an array type, return the current device for it.
 Each GPU backend must implement this.
 """
-device(::Backend) = error("Not implemented.")
+device(::Type{<: AbstractGPUArray}) = error("Not implemented.")
+
+end
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 3ae5573e..c235b756 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -93,7 +93,6 @@ include("testsuite/math.jl")
 include("testsuite/random.jl")
 include("testsuite/uniformscaling.jl")
 include("testsuite/statistics.jl")
-
 # TODO re-enable once backends support it.
 # include("testsuite/caching_allocator.jl")
 
diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl
index 4f63023a..7118e192 100644
--- a/test/testsuite/caching_allocator.jl
+++ b/test/testsuite/caching_allocator.jl
@@ -1,18 +1,16 @@
 @testsuite "Caching Allocator" (AT, eltypes) -> begin
-    # Hacky way to get KA backend from AT.
-    kab = KernelAbstractions.get_backend(AT(Array{Int}(undef, 0)))
-    device = GPUArrays.device(kab)
+    device = GPUArrays.AllocCache.device(AT)
 
     @testset "free_immediately=false" begin
-        pdcache = GPUArrays.cache_allocator(kab)
+        pdcache = GPUArrays.AllocCache.cache_allocator(AT)
         pdcache.free_immediately = false
-        named_cache = GPUArrays.named_cache_allocator!(pdcache, device, :cache)
+        named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache)
 
         T = Float32
         dims = (1, 2, 3)
         key = hash((T, dims))
 
-        GPUArrays.@cache_scope kab :cache begin
+        GPUArrays.AllocCache.@enable AT :cache begin
             x1 = AT(zeros(T, dims))
         end
         @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
@@ -22,11 +20,11 @@
 
         # Second allocation does not allocate - cache stays the same in size.
 
-        GPUArrays.@cache_scope kab :cache begin
+        GPUArrays.AllocCache.@enable AT :cache begin
             x2 = AT(zeros(T, dims))
 
             # Does not go to cache.
-            GPUArrays.@no_cache_scope begin
+            GPUArrays.AllocCache.@disable begin
                 x_free = AT(zeros(T, dims))
             end
         end
@@ -41,7 +39,7 @@
 
         T2 = Int32
         key2 = hash((T2, dims))
-        GPUArrays.@cache_scope kab :cache begin
+        GPUArrays.AllocCache.@enable AT :cache begin
             x3 = AT(zeros(T2, dims))
         end
         @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims)
@@ -51,14 +49,14 @@
 
         # Freeing all memory held by cache.
 
-        GPUArrays.invalidate_cache_allocator!(kab, :cache)
+        GPUArrays.AllocCache.invalidate!(AT, :cache)
         @test sizeof(pdcache, device, :cache) == 0
     end
 
     @testset "free_immediately=true" begin
-        pdcache = GPUArrays.cache_allocator(kab)
+        pdcache = GPUArrays.AllocCache.cache_allocator(AT)
         pdcache.free_immediately = true
-        named_cache = GPUArrays.named_cache_allocator!(pdcache, device, :cache2)
+        named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache2)
 
         T = Float32
         dims = (1, 2, 3)
@@ -66,7 +64,7 @@
 
         @test sizeof(pdcache, device, :cache2) == 0
 
-        GPUArrays.@cache_scope kab :cache2 begin
+        GPUArrays.AllocCache.@enable AT :cache2 begin
             x1 = AT(zeros(T, dims))
 
             @test !haskey(named_cache.free, key)

From e601f17b5f9b0560b75a99bec55a69371b2183fd Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Mon, 6 Jan 2025 21:28:14 +0200
Subject: [PATCH 10/28] Minor cleanups

---
 lib/JLArrays/src/JLArrays.jl  | 12 ++++++------
 src/host/allocations_cache.jl | 16 +++++++---------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index d717e9a3..8f79a1e3 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -98,10 +98,10 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
             finalizer(unsafe_free!, obj)
         end
 
-        name = GPUArrays.CacheAllocatorName[]
-        return name == :none ?
+        name = GPUArrays.AllocCache.CacheAllocatorName[]
+        return name ≡ nothing ?
             _alloc_f() :
-            GPUArrays.alloc!(_alloc_f, JLBackend(), name, T, dims)::JLArray{T, N}
+            GPUArrays.AllocCache.alloc!(_alloc_f, JLArray, name, (T, dims))::JLArray{T, N}
     end
 
     # low-level constructor for wrapping existing data
@@ -397,10 +397,10 @@ Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Arr
 
 # Caching Allocator.
 
-const JLACacheAllocator = GPUArrays.PerDeviceCacheAllocator(JLArray; free_immediately=false)
+const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray; free_immediately=false)
 
-GPUArrays.cache_allocator(::JLBackend) = JLACacheAllocator
+GPUArrays.AllocCache.cache_allocator(::Type{<: JLArray}) = JLACacheAllocator
 
-GPUArrays.device(::JLBackend) = 1
+GPUArrays.AllocCache.device(::Type{<: JLArray}) = 1
 
 end
diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index 22f3033f..022d2aaa 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -9,7 +9,7 @@ else
     using Base.ScopedValues
 end
 
-const CacheAllocatorName = ScopedValue(:none)
+const CacheAllocatorName = ScopedValue{Union{Nothing, Symbol}}(nothing)
 
 struct CacheAllocator{T <: AbstractGPUArray}
     lock::ReentrantLock
@@ -178,14 +178,13 @@ end
 """
     @enable AT name expr
 
-Evaluate expression `expr` using `name`d caching allocator
-for the given array type `AT`.
+Evaluate expression `expr` using `name`d caching allocator for the given array type `AT`.
 
 When gpu allocation is requested during execution of `expr`,
 allocator will try to use its "free" cache instead of doing an actual allocation.
 If no "free" allocation exists, an actual allocation is performed.
 Before returning allocation to the user, it is marked as busy and
-will not be used by allocation in the scope defined by `@cache_scope`.
+will not be used by allocation in the scope defined by `@enable`.
 
 **After** the execution of `expr` all "busy" allocations are marked as "free"
 thus they can be re-used next time the program enters this scope.
@@ -194,17 +193,16 @@ This is useful to apply in a repeating block of code to avoid relying on
 GC to free gpu memory in time.
 
 `name` is a `Symbol` that defines which allocator to use
-(`:none` is reserved and means no allocator).
+(`nothing`, which is the default, disables it).
 
 # Example
 
-In the following example, each iteration of the for-loop requires `2 GiB`
-of gpu memory.
+In the following example, each iteration of the for-loop requires `2 GiB` of gpu memory.
 Without caching allocator GC wouldn't be able to free arrays in time
 resulting in higher memory usage.
 With caching allocator, memory usage stays at exactly `2 GiB`.
 
-See [`@no_cache_scope`](@ref), [`invalidate_cache_allocator!`](@ref).
+See [`@disable`](@ref), [`invalidate!`](@ref).
 ```julia
 n = 1024^3
 for i in 1:1000
@@ -231,7 +229,7 @@ This is useful to call from within `@enable` to avoid caching arrays.
 """
 macro disable(expr)
     quote
-        @with $(esc(CacheAllocatorName)) => :none $(esc(expr))
+        @with $(esc(CacheAllocatorName)) => nothing $(esc(expr))
     end
 end
 

From ba1941a7248afe5235b49d5bcca2df3c41d1f1dc Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Tue, 7 Jan 2025 13:47:20 +0200
Subject: [PATCH 11/28] Remove 'free_immediately' param

---
 lib/JLArrays/src/JLArrays.jl        |   2 +-
 src/host/allocations_cache.jl       |  49 +++++--------
 test/testsuite.jl                   |   3 +-
 test/testsuite/caching_allocator.jl | 108 +++++++++++-----------------
 4 files changed, 61 insertions(+), 101 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 8f79a1e3..bf652325 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -397,7 +397,7 @@ Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Arr
 
 # Caching Allocator.
 
-const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray; free_immediately=false)
+const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray)
 
 GPUArrays.AllocCache.cache_allocator(::Type{<: JLArray}) = JLACacheAllocator
 
diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index 022d2aaa..16f53299 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -33,34 +33,24 @@ function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T
 end
 
 """
-    alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool)
+    alloc!(alloc_f, cache::CacheAllocator, key)
 
 Attempt to retrieve cached allocation from `cache` using `key` for searching.
 If no such allocation is found, execute `alloc_f` that does actual allocation,
 store it in cache for future use and return it.
-
-`skip_free::Bool` is used together with `PerDeviceCacheAllocator.free_immediately`.
-When `true` arrays are bulk-freed instead of stored in cache.
-In this case `alloc!` will avoid looking into "free" part of `cache`
-and execute `alloc_f` immediately, storing allocation for future bulk-freeing.
 """
-function alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool)
+function alloc!(alloc_f, cache::CacheAllocator, key)
     x = nothing
     uid = hash(key)
     busy_pool = get_pool!(cache, :busy, uid)
-
-    if skip_free
-        x = alloc_f()
-    else
-        free_pool = get_pool!(cache, :free, uid)
-        isempty(free_pool) && (x = alloc_f())
-
-        while !isempty(free_pool) && x ≡ nothing
-            tmp = Base.@lock cache.lock pop!(free_pool)
-            # Array was manually freed via `unsafe_free!`.
-            GPUArrays.storage(tmp).freed && continue
-            x = tmp
-        end
+    free_pool = get_pool!(cache, :free, uid)
+    isempty(free_pool) && (x = alloc_f())
+
+    while !isempty(free_pool) && x ≡ nothing
+        tmp = Base.@lock cache.lock pop!(free_pool)
+        # Array was manually freed via `unsafe_free!`.
+        GPUArrays.storage(tmp).freed && continue
+        x = tmp
     end
 
     x ≡ nothing && (x = alloc_f())
@@ -68,18 +58,14 @@ function alloc!(alloc_f, cache::CacheAllocator, key; skip_free::Bool)
     return x
 end
 
-function free_busy!(cache::CacheAllocator; free_immediately::Bool)
+function free_busy!(cache::CacheAllocator)
     for uid in cache.busy.keys
         busy_pool = get_pool!(cache, :busy, uid)
         isempty(busy_pool) && continue
 
         Base.@lock cache.lock begin
-            if free_immediately
-                map(unsafe_free!, busy_pool)
-            else
-                free_pool = get_pool!(cache, :free, uid)
-                append!(free_pool, busy_pool)
-            end
+            free_pool = get_pool!(cache, :free, uid)
+            append!(free_pool, busy_pool)
             empty!(busy_pool)
         end
     end
@@ -88,11 +74,10 @@ end
 mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray}
     lock::ReentrantLock
     caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}
-    free_immediately::Bool
 end
 
-PerDeviceCacheAllocator(::Type{T}; free_immediately::Bool) where T <: AbstractGPUArray =
-    PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}(), free_immediately)
+PerDeviceCacheAllocator(::Type{T}) where T <: AbstractGPUArray =
+    PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}())
 
 function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where T
     h = hash(device)
@@ -116,7 +101,7 @@ end
 function alloc!(alloc_f, AT::Type{<: AbstractGPUArray}, name::Symbol, key)
     pdcache = cache_allocator(AT)
     cache = named_cache_allocator!(pdcache, device(AT), name)
-    alloc!(alloc_f, cache, key; skip_free=pdcache.free_immediately)
+    alloc!(alloc_f, cache, key)
 end
 
 function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
@@ -172,7 +157,7 @@ end
 
 function free_busy!(AT::Type{<: AbstractGPUArray}, name::Symbol)
     pdcache = cache_allocator(AT)
-    free_busy!(named_cache_allocator!(pdcache, device(AT), name); pdcache.free_immediately)
+    free_busy!(named_cache_allocator!(pdcache, device(AT), name))
 end
 
 """
diff --git a/test/testsuite.jl b/test/testsuite.jl
index c235b756..59bb967b 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -93,8 +93,7 @@ include("testsuite/math.jl")
 include("testsuite/random.jl")
 include("testsuite/uniformscaling.jl")
 include("testsuite/statistics.jl")
-# TODO re-enable once backends support it.
-# include("testsuite/caching_allocator.jl")
+include("testsuite/caching_allocator.jl")
 
 """
 Runs the entire GPUArrays test suite on array type `AT`
diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl
index 7118e192..9d229186 100644
--- a/test/testsuite/caching_allocator.jl
+++ b/test/testsuite/caching_allocator.jl
@@ -1,80 +1,56 @@
 @testsuite "Caching Allocator" (AT, eltypes) -> begin
     device = GPUArrays.AllocCache.device(AT)
+    pdcache = GPUArrays.AllocCache.cache_allocator(AT)
+    named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache)
 
-    @testset "free_immediately=false" begin
-        pdcache = GPUArrays.AllocCache.cache_allocator(AT)
-        pdcache.free_immediately = false
-        named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache)
+    T = Float32
+    dims = (1, 2, 3)
 
-        T = Float32
-        dims = (1, 2, 3)
-        key = hash((T, dims))
+    GPUArrays.AllocCache.@enable AT :cache begin
+        x1 = AT(zeros(T, dims))
+    end
+    @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
+    @test length(named_cache.free) == 1
 
-        GPUArrays.AllocCache.@enable AT :cache begin
-            x1 = AT(zeros(T, dims))
-        end
-        @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
-        @test length(named_cache.free[key]) == 1
-        @test length(named_cache.busy[key]) == 0
-        @test x1 === named_cache.free[key][1]
+    key = first(keys(named_cache.free))
+    @test length(named_cache.free[key]) == 1
+    @test length(named_cache.busy[key]) == 0
+    @test x1 === named_cache.free[key][1]
 
-        # Second allocation does not allocate - cache stays the same in size.
+    # Second allocation does not allocate - cache stays the same in size.
 
-        GPUArrays.AllocCache.@enable AT :cache begin
-            x2 = AT(zeros(T, dims))
+    GPUArrays.AllocCache.@enable AT :cache begin
+        x2 = AT(zeros(T, dims))
 
-            # Does not go to cache.
-            GPUArrays.AllocCache.@disable begin
-                x_free = AT(zeros(T, dims))
-            end
+        # Does not go to cache.
+        GPUArrays.AllocCache.@disable begin
+            x_free = AT(zeros(T, dims))
         end
-        @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
-        @test length(named_cache.free[key]) == 1
-        @test length(named_cache.busy[key]) == 0
-        @test x2 === x1
-        @test x2 === named_cache.free[key][1]
-        @test x_free !== x2
-
-        # Third allocation of different type - cache grows.
-
-        T2 = Int32
-        key2 = hash((T2, dims))
-        GPUArrays.AllocCache.@enable AT :cache begin
-            x3 = AT(zeros(T2, dims))
-        end
-        @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims)
-        @test length(named_cache.free[key]) == 1
-        @test length(named_cache.free[key2]) == 1
-        @test x3 === named_cache.free[key2][1]
-
-        # Freeing all memory held by cache.
-
-        GPUArrays.AllocCache.invalidate!(AT, :cache)
-        @test sizeof(pdcache, device, :cache) == 0
     end
+    @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
+    @test length(named_cache.free[key]) == 1
+    @test length(named_cache.busy[key]) == 0
+    @test x2 === x1
+    @test x2 === named_cache.free[key][1]
+    @test x_free !== x2
+
+    # Third allocation of different type - cache grows.
+
+    T2 = Int32
+    key2 = hash((T2, dims))
+    GPUArrays.AllocCache.@enable AT :cache begin
+        x3 = AT(zeros(T2, dims))
+    end
+    @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims)
 
-    @testset "free_immediately=true" begin
-        pdcache = GPUArrays.AllocCache.cache_allocator(AT)
-        pdcache.free_immediately = true
-        named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache2)
-
-        T = Float32
-        dims = (1, 2, 3)
-        key = hash((T, dims))
-
-        @test sizeof(pdcache, device, :cache2) == 0
-
-        GPUArrays.AllocCache.@enable AT :cache2 begin
-            x1 = AT(zeros(T, dims))
+    _keys = collect(keys(named_cache.free))
+    key2 = _keys[findfirst(i -> i != key, _keys)]
+    @test length(named_cache.free[key]) == 1
+    @test length(named_cache.free[key2]) == 1
+    @test x3 === named_cache.free[key2][1]
 
-            @test !haskey(named_cache.free, key)
-            @test length(named_cache.busy[key]) == 1
-            @test sizeof(pdcache, device, :cache2) == sizeof(Float32) * prod(dims)
-        end
+    # Freeing all memory held by cache.
 
-        # `free` was never even used with `free_immediately=true`.
-        @test !haskey(named_cache.free, key)
-        @test length(named_cache.busy[key]) == 0
-        @test sizeof(pdcache, device, :cache2) == 0
-    end
+    GPUArrays.AllocCache.invalidate!(AT, :cache)
+    @test sizeof(pdcache, device, :cache) == 0
 end

From 166254f2e6bd336b59d788f8b0a077dde22dc863 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Tue, 7 Jan 2025 14:14:28 +0200
Subject: [PATCH 12/28] Limit caching allocator tests to AbstractGPUArray

---
 test/testsuite/caching_allocator.jl | 90 +++++++++++++++--------------
 1 file changed, 46 insertions(+), 44 deletions(-)

diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl
index 9d229186..e499ff38 100644
--- a/test/testsuite/caching_allocator.jl
+++ b/test/testsuite/caching_allocator.jl
@@ -1,56 +1,58 @@
 @testsuite "Caching Allocator" (AT, eltypes) -> begin
-    device = GPUArrays.AllocCache.device(AT)
-    pdcache = GPUArrays.AllocCache.cache_allocator(AT)
-    named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache)
+    if AT <: AbstractGPUArray
+        device = GPUArrays.AllocCache.device(AT)
+        pdcache = GPUArrays.AllocCache.cache_allocator(AT)
+        named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache)
 
-    T = Float32
-    dims = (1, 2, 3)
+        T = Float32
+        dims = (1, 2, 3)
 
-    GPUArrays.AllocCache.@enable AT :cache begin
-        x1 = AT(zeros(T, dims))
-    end
-    @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
-    @test length(named_cache.free) == 1
+        GPUArrays.AllocCache.@enable AT :cache begin
+            x1 = AT(zeros(T, dims))
+        end
+        @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
+        @test length(named_cache.free) == 1
 
-    key = first(keys(named_cache.free))
-    @test length(named_cache.free[key]) == 1
-    @test length(named_cache.busy[key]) == 0
-    @test x1 === named_cache.free[key][1]
+        key = first(keys(named_cache.free))
+        @test length(named_cache.free[key]) == 1
+        @test length(named_cache.busy[key]) == 0
+        @test x1 === named_cache.free[key][1]
 
-    # Second allocation does not allocate - cache stays the same in size.
+        # Second allocation does not allocate - cache stays the same in size.
 
-    GPUArrays.AllocCache.@enable AT :cache begin
-        x2 = AT(zeros(T, dims))
+        GPUArrays.AllocCache.@enable AT :cache begin
+            x2 = AT(zeros(T, dims))
 
-        # Does not go to cache.
-        GPUArrays.AllocCache.@disable begin
-            x_free = AT(zeros(T, dims))
+            # Does not go to cache.
+            GPUArrays.AllocCache.@disable begin
+                x_free = AT(zeros(T, dims))
+            end
         end
-    end
-    @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
-    @test length(named_cache.free[key]) == 1
-    @test length(named_cache.busy[key]) == 0
-    @test x2 === x1
-    @test x2 === named_cache.free[key][1]
-    @test x_free !== x2
-
-    # Third allocation of different type - cache grows.
-
-    T2 = Int32
-    key2 = hash((T2, dims))
-    GPUArrays.AllocCache.@enable AT :cache begin
-        x3 = AT(zeros(T2, dims))
-    end
-    @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims)
+        @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
+        @test length(named_cache.free[key]) == 1
+        @test length(named_cache.busy[key]) == 0
+        @test x2 === x1
+        @test x2 === named_cache.free[key][1]
+        @test x_free !== x2
+
+        # Third allocation of different type - cache grows.
+
+        T2 = Int32
+        key2 = hash((T2, dims))
+        GPUArrays.AllocCache.@enable AT :cache begin
+            x3 = AT(zeros(T2, dims))
+        end
+        @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims)
 
-    _keys = collect(keys(named_cache.free))
-    key2 = _keys[findfirst(i -> i != key, _keys)]
-    @test length(named_cache.free[key]) == 1
-    @test length(named_cache.free[key2]) == 1
-    @test x3 === named_cache.free[key2][1]
+        _keys = collect(keys(named_cache.free))
+        key2 = _keys[findfirst(i -> i != key, _keys)]
+        @test length(named_cache.free[key]) == 1
+        @test length(named_cache.free[key2]) == 1
+        @test x3 === named_cache.free[key2][1]
 
-    # Freeing all memory held by cache.
+        # Freeing all memory held by cache.
 
-    GPUArrays.AllocCache.invalidate!(AT, :cache)
-    @test sizeof(pdcache, device, :cache) == 0
+        GPUArrays.AllocCache.invalidate!(AT, :cache)
+        @test sizeof(pdcache, device, :cache) == 0
+    end
 end

From 01d6abc349c84c81be05444797432c2d2f2a5c2f Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Tue, 7 Jan 2025 14:41:19 +0200
Subject: [PATCH 13/28] Fix tests for 1.10

---
 test/testsuite/caching_allocator.jl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl
index e499ff38..a62d9fdf 100644
--- a/test/testsuite/caching_allocator.jl
+++ b/test/testsuite/caching_allocator.jl
@@ -7,8 +7,8 @@
         T = Float32
         dims = (1, 2, 3)
 
-        GPUArrays.AllocCache.@enable AT :cache begin
-            x1 = AT(zeros(T, dims))
+        x1 = GPUArrays.AllocCache.@enable AT :cache begin
+            AT(zeros(T, dims))
         end
         @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
         @test length(named_cache.free) == 1
@@ -20,13 +20,14 @@
 
         # Second allocation does not allocate - cache stays the same in size.
 
-        GPUArrays.AllocCache.@enable AT :cache begin
+        x2, x_free = GPUArrays.AllocCache.@enable AT :cache begin
             x2 = AT(zeros(T, dims))
 
             # Does not go to cache.
             GPUArrays.AllocCache.@disable begin
                 x_free = AT(zeros(T, dims))
             end
+            x2, x_free
         end
         @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
         @test length(named_cache.free[key]) == 1
@@ -39,8 +40,8 @@
 
         T2 = Int32
         key2 = hash((T2, dims))
-        GPUArrays.AllocCache.@enable AT :cache begin
-            x3 = AT(zeros(T2, dims))
+        x3 = GPUArrays.AllocCache.@enable AT :cache begin
+            AT(zeros(T2, dims))
         end
         @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims)
 

From 41bb06dc14fc7a53ca48da16e9744aa2837f4cff Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 8 Jan 2025 14:58:07 +0100
Subject: [PATCH 14/28] Runic formatting.

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 docs/make.jl                  |  2 +-
 lib/JLArrays/src/JLArrays.jl  |  8 ++++----
 src/host/allocations_cache.jl | 27 ++++++++++++++-------------
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/docs/make.jl b/docs/make.jl
index b8ca1f92..a37b0cd9 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -20,7 +20,7 @@ function main()
             "Test suite"    => "testsuite.md",
         ],
         doctest = true,
-        warnonly=[:missing_docs],
+        warnonly = [:missing_docs],
     )
 
     deploydocs(
diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index bf652325..38a07476 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -94,8 +94,8 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
             ref = DataRef(data) do data
                 resize!(data, 0)
             end
-            obj = new{T,N}(ref, 0, dims)
-            finalizer(unsafe_free!, obj)
+            obj = new{T, N}(ref, 0, dims)
+            return finalizer(unsafe_free!, obj)
         end
 
         name = GPUArrays.AllocCache.CacheAllocatorName[]
@@ -399,8 +399,8 @@ Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Arr
 
 const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray)
 
-GPUArrays.AllocCache.cache_allocator(::Type{<: JLArray}) = JLACacheAllocator
+GPUArrays.AllocCache.cache_allocator(::Type{<:JLArray}) = JLACacheAllocator
 
-GPUArrays.AllocCache.device(::Type{<: JLArray}) = 1
+GPUArrays.AllocCache.device(::Type{<:JLArray}) = 1
 
 end
diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
index 16f53299..d3c590d2 100644
--- a/src/host/allocations_cache.jl
+++ b/src/host/allocations_cache.jl
@@ -17,13 +17,13 @@ struct CacheAllocator{T <: AbstractGPUArray}
     free::Dict{UInt64, Vector{T}}
 end
 
-CacheAllocator(::Type{T}) where T = CacheAllocator(
+CacheAllocator(::Type{T}) where {T} = CacheAllocator(
     ReentrantLock(),
     Dict{UInt64, Vector{T}}(),
     Dict{UInt64, Vector{T}}(),
 )
 
-function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where T
+function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where {T}
     pool = getproperty(cache, pool)
     uid_pool = get(pool, uid, nothing)
     if uid_pool ≡ nothing
@@ -69,6 +69,7 @@ function free_busy!(cache::CacheAllocator)
             empty!(busy_pool)
         end
     end
+    return
 end
 
 mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray}
@@ -76,10 +77,10 @@ mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray}
     caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}
 end
 
-PerDeviceCacheAllocator(::Type{T}) where T <: AbstractGPUArray =
+PerDeviceCacheAllocator(::Type{T}) where {T <: AbstractGPUArray} =
     PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}())
 
-function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where T
+function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where {T}
     h = hash(device)
     dev_cache = get(pdcache.caches, h, nothing)
     if dev_cache ≡ nothing
@@ -98,10 +99,10 @@ function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, nam
     return named_cache
 end
 
-function alloc!(alloc_f, AT::Type{<: AbstractGPUArray}, name::Symbol, key)
+function alloc!(alloc_f, AT::Type{<:AbstractGPUArray}, name::Symbol, key)
     pdcache = cache_allocator(AT)
     cache = named_cache_allocator!(pdcache, device(AT), name)
-    alloc!(alloc_f, cache, key)
+    return alloc!(alloc_f, cache, key)
 end
 
 function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
@@ -116,10 +117,10 @@ function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
 
     Base.@lock named_cache.lock begin
         for (_, pool) in named_cache.free
-            sz += sum(sizeof, pool; init=UInt64(0))
+            sz += sum(sizeof, pool; init = UInt64(0))
         end
         for (_, pool) in named_cache.busy
-            sz += sum(sizeof, pool; init=UInt64(0))
+            sz += sum(sizeof, pool; init = UInt64(0))
         end
     end
     return sz
@@ -130,7 +131,7 @@ end
 
 Free all memory held by `name`d cached allocator given array type `AT`.
 """
-invalidate!(AT::Type{<: AbstractGPUArray}, name::Symbol) =
+invalidate!(AT::Type{<:AbstractGPUArray}, name::Symbol) =
     invalidate!(cache_allocator(AT), device(AT), name)
 
 function invalidate!(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
@@ -155,9 +156,9 @@ function invalidate!(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
     return
 end
 
-function free_busy!(AT::Type{<: AbstractGPUArray}, name::Symbol)
+function free_busy!(AT::Type{<:AbstractGPUArray}, name::Symbol)
     pdcache = cache_allocator(AT)
-    free_busy!(named_cache_allocator!(pdcache, device(AT), name))
+    return free_busy!(named_cache_allocator!(pdcache, device(AT), name))
 end
 
 """
@@ -199,7 +200,7 @@ CUDA.AllocCache.invalidate!(CuArray, :loop)
 ```
 """
 macro enable(AT, name, expr)
-    quote
+    return quote
         res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr))
         free_busy!($(esc(AT)), $(esc(name)))
         res
@@ -213,7 +214,7 @@ Evaluate expression `expr` without using caching allocator.
 This is useful to call from within `@enable` to avoid caching arrays.
 """
 macro disable(expr)
-    quote
+    return quote
         @with $(esc(CacheAllocatorName)) => nothing $(esc(expr))
     end
 end

From b2df4c56e5f297e1244952fd6e61867a32e9ffc1 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Wed, 8 Jan 2025 23:43:52 +0200
Subject: [PATCH 15/28] Simplify

---
 lib/JLArrays/src/JLArrays.jl        |  10 +-
 src/GPUArrays.jl                    |   2 +-
 src/host/alloc_cache.jl             | 149 +++++++++++++++++
 src/host/allocations_cache.jl       | 240 ----------------------------
 test/testsuite/caching_allocator.jl |  76 ++++-----
 5 files changed, 186 insertions(+), 291 deletions(-)
 create mode 100644 src/host/alloc_cache.jl
 delete mode 100644 src/host/allocations_cache.jl

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 38a07476..d5b4e9f3 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -98,10 +98,12 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
             return finalizer(unsafe_free!, obj)
         end
 
-        name = GPUArrays.AllocCache.CacheAllocatorName[]
-        return name ≡ nothing ?
-            _alloc_f() :
-            GPUArrays.AllocCache.alloc!(_alloc_f, JLArray, name, (T, dims))::JLArray{T, N}
+        cache = GPUArrays.ALLOC_CACHE[]
+        return if cache ≡ nothing
+            _alloc_f()
+        else
+            GPUArrays.alloc!(_alloc_f, cache, (JLArray, T, dims))::JLArray{T, N}
+        end
     end
 
     # low-level constructor for wrapping existing data
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 948ff068..8c1fc14e 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -34,7 +34,7 @@ include("host/random.jl")
 include("host/quirks.jl")
 include("host/uniformscaling.jl")
 include("host/statistics.jl")
-include("host/allocations_cache.jl")
+include("host/alloc_cache.jl")
 
 
 end # module
diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
new file mode 100644
index 00000000..3c5ecb51
--- /dev/null
+++ b/src/host/alloc_cache.jl
@@ -0,0 +1,149 @@
+using ..GPUArrays
+
+@static if VERSION < v"1.11"
+    using ScopedValues
+else
+    using Base.ScopedValues
+end
+
+mutable struct AllocCache{T <: AbstractGPUArray}
+    lock::ReentrantLock
+    busy::Dict{UInt64, Vector{T}} # hash(key) => GPUArray[]
+    free::Dict{UInt64, Vector{T}}
+
+    function AllocCache(::Type{T}) where {T <: AbstractGPUArray}
+        cache = new{T}(ReentrantLock(),
+            Dict{UInt64, Vector{T}}(),
+            Dict{UInt64, Vector{T}}())
+        finalizer(unsafe_free!, cache)
+    end
+end
+
+function get_pool!(cache::AllocCache{T}, pool::Symbol, uid::UInt64) where {T <: AbstractGPUArray}
+    pool = getproperty(cache, pool)
+    uid_pool = get(pool, uid, nothing)
+    if uid_pool ≡ nothing
+        uid_pool = Base.@lock cache.lock pool[uid] = T[]
+    end
+    return uid_pool
+end
+
+function alloc!(alloc_f, cache::AllocCache, key)
+    x = nothing
+    uid = hash(key)
+
+    busy_pool = get_pool!(cache, :busy, uid)
+    free_pool = get_pool!(cache, :free, uid)
+    isempty(free_pool) && (x = alloc_f())
+
+    while !isempty(free_pool) && x ≡ nothing
+        tmp = Base.@lock cache.lock pop!(free_pool)
+        # Array was manually freed via `unsafe_free!`.
+        GPUArrays.storage(tmp).freed && continue
+        x = tmp
+    end
+
+    x ≡ nothing && (x = alloc_f())
+    Base.@lock cache.lock push!(busy_pool, x)
+    return x
+end
+
+function free_busy!(cache::AllocCache)
+    for uid in cache.busy.keys
+        busy_pool = get_pool!(cache, :busy, uid)
+        isempty(busy_pool) && continue
+
+        Base.@lock cache.lock begin
+            free_pool = get_pool!(cache, :free, uid)
+            append!(free_pool, busy_pool)
+            empty!(busy_pool)
+        end
+    end
+    return
+end
+
+function unsafe_free!(cache::AllocCache)
+    Base.@lock cache.lock begin
+        for (_, pool) in cache.busy
+            isempty(pool) || error(
+                "Invalidating allocations cache that's currently in use. " *
+                "Invalidating inside `@enable` is not allowed.")
+        end
+        for (_, pool) in cache.free
+            map(unsafe_free!, pool)
+        end
+        empty!(cache.free)
+    end
+    return
+end
+
+function Base.sizeof(cache::AllocCache)
+    sz = UInt64(0)
+    Base.@lock cache.lock begin
+        for kind in (cache.free, cache.busy), (_, pool) in kind
+            sz += sum(sizeof, pool; init=UInt64(0))
+        end
+    end
+    return sz
+end
+
+const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing)
+
+"""
+    @enable(cache, expr)
+
+Evaluate expression `expr` using allocations cache `cache`.
+
+When gpu allocation is requested during execution of `expr`,
+it will first check if there's "free" cache instead of performing an actual allocation.
+If no "free" allocation exists, an actual allocation is performed.
+Before returning allocation to the user, it is marked as busy and
+will not be used by allocation in the scope defined by `@enable`.
+
+**After** the execution of `expr` all "busy" allocations are marked as "free"
+thus they can be re-used next time the program enters this scope.
+
+This is useful to apply in a repeating block of code to avoid relying on
+GC to free gpu memory in time.
+
+# Example
+
+In the following example, each iteration of the for-loop requires `8 GiB` of gpu memory.
+Without caching allocator GC wouldn't be able to free arrays in time
+resulting in higher memory usage.
+With caching allocator, memory usage stays at exactly `8 GiB`.
+
+```julia
+cache = GPUArrays.AllocCache(CuArray)
+n = 1024^3
+for i in 1:1000
+    GPUArrays.@enable cache begin
+        sin.(CUDA.rand(Float32, n))
+    end
+end
+# To free immediately.
+# Otherwise, it will be freed when collected by GC.
+GPUArrays.unsafe_free!(cache)
+```
+
+See [`@disable`](@ref).
+"""
+macro enable(cache, expr)
+    return quote
+        res = @with $(esc(ALLOC_CACHE)) => $(esc(cache)) $(esc(expr))
+        free_busy!($(esc(cache)))
+        res
+    end
+end
+
+"""
+    disable(expr)
+
+Evaluate expression `expr` without using allocations cache.
+This is useful to call from within `@enable` to avoid caching some allocations.
+"""
+macro disable(expr)
+    return quote
+        @with $(esc(ALLOC_CACHE)) => nothing $(esc(expr))
+    end
+end
diff --git a/src/host/allocations_cache.jl b/src/host/allocations_cache.jl
deleted file mode 100644
index d3c590d2..00000000
--- a/src/host/allocations_cache.jl
+++ /dev/null
@@ -1,240 +0,0 @@
-module AllocCache
-
-using ..GPUArrays
-using KernelAbstractions
-
-@static if VERSION < v"1.11"
-    using ScopedValues
-else
-    using Base.ScopedValues
-end
-
-const CacheAllocatorName = ScopedValue{Union{Nothing, Symbol}}(nothing)
-
-struct CacheAllocator{T <: AbstractGPUArray}
-    lock::ReentrantLock
-    busy::Dict{UInt64, Vector{T}} # hash(key) => GPUArray[]
-    free::Dict{UInt64, Vector{T}}
-end
-
-CacheAllocator(::Type{T}) where {T} = CacheAllocator(
-    ReentrantLock(),
-    Dict{UInt64, Vector{T}}(),
-    Dict{UInt64, Vector{T}}(),
-)
-
-function get_pool!(cache::CacheAllocator{T}, pool::Symbol, uid::UInt64) where {T}
-    pool = getproperty(cache, pool)
-    uid_pool = get(pool, uid, nothing)
-    if uid_pool ≡ nothing
-        uid_pool = Base.@lock cache.lock pool[uid] = T[]
-    end
-    return uid_pool
-end
-
-"""
-    alloc!(alloc_f, cache::CacheAllocator, key)
-
-Attempt to retrieve cached allocation from `cache` using `key` for searching.
-If no such allocation is found, execute `alloc_f` that does actual allocation,
-store it in cache for future use and return it.
-"""
-function alloc!(alloc_f, cache::CacheAllocator, key)
-    x = nothing
-    uid = hash(key)
-    busy_pool = get_pool!(cache, :busy, uid)
-    free_pool = get_pool!(cache, :free, uid)
-    isempty(free_pool) && (x = alloc_f())
-
-    while !isempty(free_pool) && x ≡ nothing
-        tmp = Base.@lock cache.lock pop!(free_pool)
-        # Array was manually freed via `unsafe_free!`.
-        GPUArrays.storage(tmp).freed && continue
-        x = tmp
-    end
-
-    x ≡ nothing && (x = alloc_f())
-    Base.@lock cache.lock push!(busy_pool, x)
-    return x
-end
-
-function free_busy!(cache::CacheAllocator)
-    for uid in cache.busy.keys
-        busy_pool = get_pool!(cache, :busy, uid)
-        isempty(busy_pool) && continue
-
-        Base.@lock cache.lock begin
-            free_pool = get_pool!(cache, :free, uid)
-            append!(free_pool, busy_pool)
-            empty!(busy_pool)
-        end
-    end
-    return
-end
-
-mutable struct PerDeviceCacheAllocator{T <: AbstractGPUArray}
-    lock::ReentrantLock
-    caches::Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}
-end
-
-PerDeviceCacheAllocator(::Type{T}) where {T <: AbstractGPUArray} =
-    PerDeviceCacheAllocator(ReentrantLock(), Dict{UInt64, Dict{Symbol, CacheAllocator{T}}}())
-
-function named_cache_allocator!(pdcache::PerDeviceCacheAllocator{T}, device, name::Symbol) where {T}
-    h = hash(device)
-    dev_cache = get(pdcache.caches, h, nothing)
-    if dev_cache ≡ nothing
-        Base.@lock pdcache.lock begin
-            named_cache = CacheAllocator(T)
-            pdcache.caches[h] = Dict{Symbol, CacheAllocator{T}}(name => named_cache)
-            return named_cache
-        end
-    end
-
-    named_cache = get(dev_cache, name, nothing)
-    if named_cache ≡ nothing
-        named_cache = CacheAllocator(T)
-        Base.@lock pdcache.lock dev_cache[name] = named_cache
-    end
-    return named_cache
-end
-
-function alloc!(alloc_f, AT::Type{<:AbstractGPUArray}, name::Symbol, key)
-    pdcache = cache_allocator(AT)
-    cache = named_cache_allocator!(pdcache, device(AT), name)
-    return alloc!(alloc_f, cache, key)
-end
-
-function Base.sizeof(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
-    sz = UInt64(0)
-    h = hash(device)
-
-    dev_cache = get(pdcache.caches, h, nothing)
-    dev_cache ≡ nothing && return sz
-
-    named_cache = get(dev_cache, name, nothing)
-    named_cache ≡ nothing && return sz
-
-    Base.@lock named_cache.lock begin
-        for (_, pool) in named_cache.free
-            sz += sum(sizeof, pool; init = UInt64(0))
-        end
-        for (_, pool) in named_cache.busy
-            sz += sum(sizeof, pool; init = UInt64(0))
-        end
-    end
-    return sz
-end
-
-"""
-    invalidate!(AT::Type{AbstractGPUArray}, name::Symbol)
-
-Free all memory held by `name`d cached allocator given array type `AT`.
-"""
-invalidate!(AT::Type{<:AbstractGPUArray}, name::Symbol) =
-    invalidate!(cache_allocator(AT), device(AT), name)
-
-function invalidate!(pdcache::PerDeviceCacheAllocator, device, name::Symbol)
-    h = hash(device)
-    dev_cache = get(pdcache.caches, h, nothing)
-    dev_cache ≡ nothing && return
-
-    named_cache = get(dev_cache, name, nothing)
-    named_cache ≡ nothing && return
-
-    Base.@lock named_cache.lock begin
-        for (_, pool) in named_cache.free
-            map(unsafe_free!, pool)
-        end
-        # TODO error when trying to invalidate busy cache?
-        for (_, pool) in named_cache.busy
-            map(unsafe_free!, pool)
-        end
-        empty!(named_cache.busy)
-        empty!(named_cache.free)
-    end
-    return
-end
-
-function free_busy!(AT::Type{<:AbstractGPUArray}, name::Symbol)
-    pdcache = cache_allocator(AT)
-    return free_busy!(named_cache_allocator!(pdcache, device(AT), name))
-end
-
-"""
-    @enable AT name expr
-
-Evaluate expression `expr` using `name`d caching allocator for the given array type `AT`.
-
-When gpu allocation is requested during execution of `expr`,
-allocator will try to use its "free" cache instead of doing an actual allocation.
-If no "free" allocation exists, an actual allocation is performed.
-Before returning allocation to the user, it is marked as busy and
-will not be used by allocation in the scope defined by `@enable`.
-
-**After** the execution of `expr` all "busy" allocations are marked as "free"
-thus they can be re-used next time the program enters this scope.
-
-This is useful to apply in a repeating block of code to avoid relying on
-GC to free gpu memory in time.
-
-`name` is a `Symbol` that defines which allocator to use
-(`nothing`, which is a default, disables it).
-
-# Example
-
-In the following example, each iteration of the for-loop requires `2 GiB` of gpu memory.
-Without caching allocator GC wouldn't be able to free arrays in time
-resulting in higher memory usage.
-With caching allocator, memory usage stays at exactly `2 GiB`.
-
-See [`@disable`](@ref), [`invalidate!`](@ref).
-```julia
-n = 1024^3
-for i in 1:1000
-    CUDA.AllocCache.@enable CuArray :loop begin
-        sin.(CUDA.rand(Float32, n))
-    end
-end
-CUDA.AllocCache.invalidate!(CuArray, :loop)
-```
-"""
-macro enable(AT, name, expr)
-    return quote
-        res = @with $(esc(CacheAllocatorName)) => $(esc(name)) $(esc(expr))
-        free_busy!($(esc(AT)), $(esc(name)))
-        res
-    end
-end
-
-"""
-    @disable expr
-
-Evaluate expression `expr` without using caching allocator.
-This is useful to call from within `@enable` to avoid caching arrays.
-"""
-macro disable(expr)
-    return quote
-        @with $(esc(CacheAllocatorName)) => nothing $(esc(expr))
-    end
-end
-
-# Interface API.
-
-"""
-    cache_allocator(::Type{AbstractGPUArray})
-
-Given array type, return corresponding `PerDeviceCacheAllocator` for it.
-Each GPU backend must implement this.
-"""
-cache_allocator(::Type{AbstractGPUArray}) = error("Not implemented.")
-
-"""
-    device(::Type{AbstractGPUArray})
-
-Given array type, return current device.
-Each GPU backend must implement this.
-"""
-device(::Type{AbstractGPUArray}) = error("Not implemented.")
-
-end
diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl
index a62d9fdf..43bf5589 100644
--- a/test/testsuite/caching_allocator.jl
+++ b/test/testsuite/caching_allocator.jl
@@ -1,59 +1,43 @@
-@testsuite "Caching Allocator" (AT, eltypes) -> begin
+@testsuite "alloc cache" (AT, eltypes) -> begin
     if AT <: AbstractGPUArray
-        device = GPUArrays.AllocCache.device(AT)
-        pdcache = GPUArrays.AllocCache.cache_allocator(AT)
-        named_cache = GPUArrays.AllocCache.named_cache_allocator!(pdcache, device, :cache)
+        cache = GPUArrays.AllocCache(AT)
 
-        T = Float32
-        dims = (1, 2, 3)
-
-        x1 = GPUArrays.AllocCache.@enable AT :cache begin
-            AT(zeros(T, dims))
+        T, dims = Float32, (1, 2, 3)
+        GPUArrays.@enable cache begin
+            x1 = AT(zeros(T, dims))
         end
-        @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
-        @test length(named_cache.free) == 1
-
-        key = first(keys(named_cache.free))
-        @test length(named_cache.free[key]) == 1
-        @test length(named_cache.busy[key]) == 0
-        @test x1 === named_cache.free[key][1]
-
-        # Second allocation does not allocate - cache stays the same in size.
-
-        x2, x_free = GPUArrays.AllocCache.@enable AT :cache begin
+        @test sizeof(cache) == sizeof(T) * prod(dims)
+        key = first(keys(cache.free))
+        @test length(cache.free[key]) == 1
+        @test length(cache.busy[key]) == 0
+        @test x1 === cache.free[key][1]
+
+        # Second allocation hits cache.
+        GPUArrays.@enable cache begin
             x2 = AT(zeros(T, dims))
-
-            # Does not go to cache.
-            GPUArrays.AllocCache.@disable begin
-                x_free = AT(zeros(T, dims))
-            end
-            x2, x_free
+            # Does not hit the cache.
+            GPUArrays.@disable x_free = AT(zeros(T, dims))
         end
-        @test sizeof(pdcache, device, :cache) == sizeof(Float32) * prod(dims)
-        @test length(named_cache.free[key]) == 1
-        @test length(named_cache.busy[key]) == 0
-        @test x2 === x1
-        @test x2 === named_cache.free[key][1]
+        @test sizeof(cache) == sizeof(T) * prod(dims)
+        key = first(keys(cache.free))
+        @test length(cache.free[key]) == 1
+        @test length(cache.busy[key]) == 0
+        @test x2 === cache.free[key][1]
         @test x_free !== x2
 
-        # Third allocation of different type - cache grows.
-
-        T2 = Int32
-        key2 = hash((T2, dims))
-        x3 = GPUArrays.AllocCache.@enable AT :cache begin
-            AT(zeros(T2, dims))
+        # Third allocation is of different shape - allocates.
+        dims = (2, 2)
+        GPUArrays.@enable cache begin
+            x3 = AT(zeros(T, dims))
         end
-        @test sizeof(pdcache, device, :cache) == (sizeof(Float32) + sizeof(Int32)) * prod(dims)
-
-        _keys = collect(keys(named_cache.free))
+        _keys = collect(keys(cache.free))
         key2 = _keys[findfirst(i -> i != key, _keys)]
-        @test length(named_cache.free[key]) == 1
-        @test length(named_cache.free[key2]) == 1
-        @test x3 === named_cache.free[key2][1]
+        @test length(cache.free[key]) == 1
+        @test length(cache.free[key2]) == 1
+        @test x3 === cache.free[key2][1]
 
         # Freeing all memory held by cache.
-
-        GPUArrays.AllocCache.invalidate!(AT, :cache)
-        @test sizeof(pdcache, device, :cache) == 0
+        GPUArrays.unsafe_free!(cache)
+        @test sizeof(cache) == 0
     end
 end

From 3ffca034d00810da1b2af66dbd0c651b5dd978fb Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Wed, 8 Jan 2025 23:48:45 +0200
Subject: [PATCH 16/28] Cleanup

---
 lib/JLArrays/src/JLArrays.jl | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index d5b4e9f3..92ee7de0 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -397,12 +397,4 @@ Adapt.adapt_storage(::JLBackend, a::Array) = Adapt.adapt(JLArrays.JLArray, a)
 Adapt.adapt_storage(::JLBackend, a::JLArrays.JLArray) = a
 Adapt.adapt_storage(::KernelAbstractions.CPU, a::JLArrays.JLArray) = convert(Array, a)
 
-# Caching Allocator.
-
-const JLACacheAllocator = GPUArrays.AllocCache.PerDeviceCacheAllocator(JLArray)
-
-GPUArrays.AllocCache.cache_allocator(::Type{<:JLArray}) = JLACacheAllocator
-
-GPUArrays.AllocCache.device(::Type{<:JLArray}) = 1
-
 end

From 96af44c8925e74fbbfdfec0837eadc3ccb87a2ba Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Wed, 8 Jan 2025 23:50:01 +0200
Subject: [PATCH 17/28] Update src/host/alloc_cache.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 src/host/alloc_cache.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
index 3c5ecb51..73ef8f37 100644
--- a/src/host/alloc_cache.jl
+++ b/src/host/alloc_cache.jl
@@ -81,7 +81,7 @@ function Base.sizeof(cache::AllocCache)
     sz = UInt64(0)
     Base.@lock cache.lock begin
         for kind in (cache.free, cache.busy), (_, pool) in kind
-            sz += sum(sizeof, pool; init=UInt64(0))
+            sz += sum(sizeof, pool; init = UInt64(0))
         end
     end
     return sz

From cf5fda2a1d2955957c38b3b768b12b4fd0109356 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Wed, 8 Jan 2025 23:50:08 +0200
Subject: [PATCH 18/28] Update src/host/alloc_cache.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 src/host/alloc_cache.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
index 73ef8f37..d909cdc8 100644
--- a/src/host/alloc_cache.jl
+++ b/src/host/alloc_cache.jl
@@ -67,7 +67,8 @@ function unsafe_free!(cache::AllocCache)
         for (_, pool) in cache.busy
             isempty(pool) || error(
                 "Invalidating allocations cache that's currently in use. " *
-                "Invalidating inside `@enable` is not allowed.")
+                    "Invalidating inside `@enable` is not allowed."
+            )
         end
         for (_, pool) in cache.free
             map(unsafe_free!, pool)

From 36ced8346987fd442466adccfc7f333fab75c150 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Wed, 8 Jan 2025 23:50:18 +0200
Subject: [PATCH 19/28] Update src/host/alloc_cache.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 src/host/alloc_cache.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
index d909cdc8..7ecc437e 100644
--- a/src/host/alloc_cache.jl
+++ b/src/host/alloc_cache.jl
@@ -12,10 +12,12 @@ mutable struct AllocCache{T <: AbstractGPUArray}
     free::Dict{UInt64, Vector{T}}
 
     function AllocCache(::Type{T}) where {T <: AbstractGPUArray}
-        cache = new{T}(ReentrantLock(),
+        cache = new{T}(
+            ReentrantLock(),
             Dict{UInt64, Vector{T}}(),
-            Dict{UInt64, Vector{T}}())
-        finalizer(unsafe_free!, cache)
+            Dict{UInt64, Vector{T}}()
+        )
+        return finalizer(unsafe_free!, cache)
     end
 end
 

From c98bfa408082012a8dfcb853f929fd09ff3ddf74 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Wed, 8 Jan 2025 23:55:40 +0200
Subject: [PATCH 20/28] Update docs

---
 docs/src/interface.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/src/interface.md b/docs/src/interface.md
index dadb06c1..0aa33bd3 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -34,7 +34,6 @@ There are numerous examples of potential interfaces for GPUArrays, such as with
 ## Caching Allocator
 
 ```@docs
-GPUArrays.AllocCache.@enable
-GPUArrays.AllocCache.@disable
-GPUArrays.AllocCache.invalidate!
+GPUArrays.@enable
+GPUArrays.@disable
 ```

From 63ffeaef7f3194441ce3a8f4c9215c9f1695a9fb Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Thu, 9 Jan 2025 10:38:13 +0200
Subject: [PATCH 21/28] Rename enable to cached

---
 docs/src/interface.md               |  4 ++--
 src/host/alloc_cache.jl             | 18 +++++++++---------
 test/testsuite/caching_allocator.jl |  8 ++++----
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/src/interface.md b/docs/src/interface.md
index 0aa33bd3..9e4864ad 100644
--- a/docs/src/interface.md
+++ b/docs/src/interface.md
@@ -34,6 +34,6 @@ There are numerous examples of potential interfaces for GPUArrays, such as with
 ## Caching Allocator
 
 ```@docs
-GPUArrays.@enable
-GPUArrays.@disable
+GPUArrays.@cached
+GPUArrays.@uncached
 ```
diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
index 7ecc437e..899f2e5d 100644
--- a/src/host/alloc_cache.jl
+++ b/src/host/alloc_cache.jl
@@ -69,7 +69,7 @@ function unsafe_free!(cache::AllocCache)
         for (_, pool) in cache.busy
             isempty(pool) || error(
                 "Invalidating allocations cache that's currently in use. " *
-                    "Invalidating inside `@enable` is not allowed."
+                    "Invalidating inside `@cached` is not allowed."
             )
         end
         for (_, pool) in cache.free
@@ -93,7 +93,7 @@ end
 const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing)
 
 """
-    @enable(cache, expr)
+    @cached(cache, expr)
 
 Evaluate expression `expr` using allocations cache `cache`.
 
@@ -101,7 +101,7 @@ When gpu allocation is requested during execution of `expr`,
 it will first check if there's "free" cache instead of performing an actual allocation.
 If no "free" allocation exists, an actual allocation is performed.
 Before returning allocation to the user, it is marked as busy and
-will not be used by allocation in the scope defined by `@enable`.
+will not be used by allocation in the scope defined by `@cached`.
 
 **After** the execution of `expr` all "busy" allocations are marked as "free"
 thus they can be re-used next time the program enters this scope.
@@ -120,7 +120,7 @@ With caching allocator, memory usage stays at exactly `8 GiB`.
 cache = GPUArrays.AllocCache(CuArray)
 n = 1024^3
 for i in 1:1000
-    GPUArrays.@enable cache begin
+    GPUArrays.@cached cache begin
         sin.(CUDA.rand(Float32, n))
     end
 end
@@ -129,9 +129,9 @@ end
 GPUArrays.unsafe_free!(cache)
 ```
 
-See [`@disable`](@ref).
+See [`@uncached`](@ref).
 """
-macro enable(cache, expr)
+macro cached(cache, expr)
     return quote
         res = @with $(esc(ALLOC_CACHE)) => $(esc(cache)) $(esc(expr))
         free_busy!($(esc(cache)))
@@ -140,12 +140,12 @@ macro enable(cache, expr)
 end
 
 """
-    disable(expr)
+    uncached(expr)
 
 Evaluate expression `expr` without using allocations cache.
-This is useful to call from within `@enable` to avoid caching some allocations.
+This is useful to call from within `@cached` to avoid caching some allocations.
 """
-macro disable(expr)
+macro uncached(expr)
     return quote
         @with $(esc(ALLOC_CACHE)) => nothing $(esc(expr))
     end
diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/caching_allocator.jl
index 43bf5589..9a7201a5 100644
--- a/test/testsuite/caching_allocator.jl
+++ b/test/testsuite/caching_allocator.jl
@@ -3,7 +3,7 @@
         cache = GPUArrays.AllocCache(AT)
 
         T, dims = Float32, (1, 2, 3)
-        GPUArrays.@enable cache begin
+        GPUArrays.@cached cache begin
             x1 = AT(zeros(T, dims))
         end
         @test sizeof(cache) == sizeof(T) * prod(dims)
@@ -13,10 +13,10 @@
         @test x1 === cache.free[key][1]
 
         # Second allocation hits cache.
-        GPUArrays.@enable cache begin
+        GPUArrays.@cached cache begin
             x2 = AT(zeros(T, dims))
             # Does not hit the cache.
-            GPUArrays.@disable x_free = AT(zeros(T, dims))
+            GPUArrays.@uncached x_free = AT(zeros(T, dims))
         end
         @test sizeof(cache) == sizeof(T) * prod(dims)
         key = first(keys(cache.free))
@@ -27,7 +27,7 @@
 
         # Third allocation is of different shape - allocates.
         dims = (2, 2)
-        GPUArrays.@enable cache begin
+        GPUArrays.@cached cache begin
             x3 = AT(zeros(T, dims))
         end
         _keys = collect(keys(cache.free))

From 972b386937b10e132b89785f508b3cb064a0cd78 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 9 Jan 2025 11:34:30 +0100
Subject: [PATCH 22/28] Rename.

---
 test/testsuite.jl                                       | 2 +-
 test/testsuite/{caching_allocator.jl => alloc_cache.jl} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename test/testsuite/{caching_allocator.jl => alloc_cache.jl} (100%)

diff --git a/test/testsuite.jl b/test/testsuite.jl
index 59bb967b..e138dabe 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -93,7 +93,7 @@ include("testsuite/math.jl")
 include("testsuite/random.jl")
 include("testsuite/uniformscaling.jl")
 include("testsuite/statistics.jl")
-include("testsuite/caching_allocator.jl")
+include("testsuite/alloc_cache.jl")
 
 """
 Runs the entire GPUArrays test suite on array type `AT`
diff --git a/test/testsuite/caching_allocator.jl b/test/testsuite/alloc_cache.jl
similarity index 100%
rename from test/testsuite/caching_allocator.jl
rename to test/testsuite/alloc_cache.jl

From 9960b5225c18a997711219c7b2a7040b9952aa7e Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 9 Jan 2025 11:47:43 +0100
Subject: [PATCH 23/28] Simplify back-end interface.

---
 lib/JLArrays/src/JLArrays.jl | 15 +++++----------
 src/host/alloc_cache.jl      | 11 ++++++++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 92ee7de0..18be1889 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -89,21 +89,15 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
         check_eltype(T)
         maxsize = prod(dims) * sizeof(T)
 
-        function _alloc_f()
+        GPUArrays.cached_alloc((JLArray, T, dims)) do
             data = Vector{UInt8}(undef, maxsize)
             ref = DataRef(data) do data
                 resize!(data, 0)
             end
             obj = new{T, N}(ref, 0, dims)
-            return finalizer(unsafe_free!, obj)
-        end
-
-        cache = GPUArrays.ALLOC_CACHE[]
-        return if cache ≡ nothing
-            _alloc_f()
-        else
-            GPUArrays.alloc!(_alloc_f, cache, (JLArray, T, dims))::JLArray{T, N}
-        end
+            finalizer(unsafe_free!, obj)
+            return obj
+        end::JLArray{T,N}
     end
 
     # low-level constructor for wrapping existing data
@@ -112,6 +106,7 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
         check_eltype(T)
         obj = new{T,N}(ref, offset, dims)
         finalizer(unsafe_free!, obj)
+        return obj
     end
 end
 
diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
index 899f2e5d..6c9a1200 100644
--- a/src/host/alloc_cache.jl
+++ b/src/host/alloc_cache.jl
@@ -30,13 +30,18 @@ function get_pool!(cache::AllocCache{T}, pool::Symbol, uid::UInt64) where {T <:
     return uid_pool
 end
 
-function alloc!(alloc_f, cache::AllocCache, key)
+function cached_alloc(f, key)
+    cache = ALLOC_CACHE[]
+    if cache === nothing
+        return f()
+    end
+
     x = nothing
     uid = hash(key)
 
     busy_pool = get_pool!(cache, :busy, uid)
     free_pool = get_pool!(cache, :free, uid)
-    isempty(free_pool) && (x = alloc_f())
+    isempty(free_pool) && (x = f())
 
     while !isempty(free_pool) && x ≡ nothing
         tmp = Base.@lock cache.lock pop!(free_pool)
@@ -45,7 +50,7 @@ function alloc!(alloc_f, cache::AllocCache, key)
         x = tmp
     end
 
-    x ≡ nothing && (x = alloc_f())
+    x ≡ nothing && (x = f())
     Base.@lock cache.lock push!(busy_pool, x)
     return x
 end

From 7e32124a222c10c435ef16846c4bca034045b499 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 9 Jan 2025 11:59:52 +0100
Subject: [PATCH 24/28] Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 lib/JLArrays/src/JLArrays.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/JLArrays/src/JLArrays.jl b/lib/JLArrays/src/JLArrays.jl
index 18be1889..d36e9af2 100644
--- a/lib/JLArrays/src/JLArrays.jl
+++ b/lib/JLArrays/src/JLArrays.jl
@@ -89,7 +89,7 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
         check_eltype(T)
         maxsize = prod(dims) * sizeof(T)
 
-        GPUArrays.cached_alloc((JLArray, T, dims)) do
+        return GPUArrays.cached_alloc((JLArray, T, dims)) do
             data = Vector{UInt8}(undef, maxsize)
             ref = DataRef(data) do data
                 resize!(data, 0)
@@ -97,7 +97,7 @@ mutable struct JLArray{T, N} <: AbstractGPUArray{T, N}
             obj = new{T, N}(ref, 0, dims)
             finalizer(unsafe_free!, obj)
             return obj
-        end::JLArray{T,N}
+        end::JLArray{T, N}
     end
 
     # low-level constructor for wrapping existing data

From e579824b9a1f40252f98ff1f91c1b8d5a216eefd Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Thu, 9 Jan 2025 13:25:54 +0200
Subject: [PATCH 25/28] Strip down cache from array type

---
 src/host/alloc_cache.jl       | 30 ++++++++++++++++++++----------
 test/testsuite/alloc_cache.jl |  2 +-
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
index 6c9a1200..b66e7c40 100644
--- a/src/host/alloc_cache.jl
+++ b/src/host/alloc_cache.jl
@@ -6,26 +6,26 @@ else
     using Base.ScopedValues
 end
 
-mutable struct AllocCache{T <: AbstractGPUArray}
+mutable struct AllocCache
     lock::ReentrantLock
-    busy::Dict{UInt64, Vector{T}} # hash(key) => GPUArray[]
-    free::Dict{UInt64, Vector{T}}
+    busy::Dict{UInt64, Vector{Any}} # hash(key) => GPUArray[]
+    free::Dict{UInt64, Vector{Any}}
 
-    function AllocCache(::Type{T}) where {T <: AbstractGPUArray}
-        cache = new{T}(
+    function AllocCache()
+        cache = new(
             ReentrantLock(),
-            Dict{UInt64, Vector{T}}(),
-            Dict{UInt64, Vector{T}}()
+            Dict{UInt64, Vector{Any}}(),
+            Dict{UInt64, Vector{Any}}()
         )
         return finalizer(unsafe_free!, cache)
     end
 end
 
-function get_pool!(cache::AllocCache{T}, pool::Symbol, uid::UInt64) where {T <: AbstractGPUArray}
+function get_pool!(cache::AllocCache, pool::Symbol, uid::UInt64)
     pool = getproperty(cache, pool)
     uid_pool = get(pool, uid, nothing)
     if uid_pool ≡ nothing
-        uid_pool = Base.@lock cache.lock pool[uid] = T[]
+        uid_pool = Base.@lock cache.lock pool[uid] = Any[]
     end
     return uid_pool
 end
@@ -95,6 +95,16 @@ function Base.sizeof(cache::AllocCache)
     return sz
 end
 
+function Base.show(io::IO, cache::AllocCache)
+    sz, n_free, n_busy = Base.@lock cache.lock begin
+        sz = sizeof(cache)
+        n_free = sum(p -> length(p[2]), cache.free; init = 0)
+        n_busy = sum(p -> length(p[2]), cache.busy; init = 0)
+        sz, n_free, n_busy
+    end
+    print(io, "AllocCache(n_free=$n_free, n_busy=$n_busy, sizeof=$(Base.format_bytes(sz)))")
+end
+
 const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing)
 
 """
@@ -122,7 +132,7 @@ resulting in higher memory usage.
 With caching allocator, memory usage stays at exactly `8 GiB`.
 
 ```julia
-cache = GPUArrays.AllocCache(CuArray)
+cache = GPUArrays.AllocCache()
 n = 1024^3
 for i in 1:1000
     GPUArrays.@cached cache begin
diff --git a/test/testsuite/alloc_cache.jl b/test/testsuite/alloc_cache.jl
index 9a7201a5..b032c8bd 100644
--- a/test/testsuite/alloc_cache.jl
+++ b/test/testsuite/alloc_cache.jl
@@ -1,6 +1,6 @@
 @testsuite "alloc cache" (AT, eltypes) -> begin
     if AT <: AbstractGPUArray
-        cache = GPUArrays.AllocCache(AT)
+        cache = GPUArrays.AllocCache()
 
         T, dims = Float32, (1, 2, 3)
         GPUArrays.@cached cache begin

From cdc2543a8ff51d2afc032d86c92e2a10c0755435 Mon Sep 17 00:00:00 2001
From: Anton Smirnov <tonysmn97@gmail.com>
Date: Thu, 9 Jan 2025 13:27:10 +0200
Subject: [PATCH 26/28] Add return stmt

---
 src/host/alloc_cache.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
index b66e7c40..75286a33 100644
--- a/src/host/alloc_cache.jl
+++ b/src/host/alloc_cache.jl
@@ -102,7 +102,7 @@ function Base.show(io::IO, cache::AllocCache)
         n_busy = sum(p -> length(p[2]), cache.busy; init = 0)
         sz, n_free, n_busy
     end
-    print(io, "AllocCache(n_free=$n_free, n_busy=$n_busy, sizeof=$(Base.format_bytes(sz)))")
+    return print(io, "AllocCache(n_free=$n_free, n_busy=$n_busy, sizeof=$(Base.format_bytes(sz)))")
 end
 
 const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing)

From 8734a35b4d6c0f1c37890241eeba6ecacb9c5ab6 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 9 Jan 2025 12:42:47 +0100
Subject: [PATCH 27/28] Improve docs.

---
 src/host/alloc_cache.jl | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/host/alloc_cache.jl b/src/host/alloc_cache.jl
index 75286a33..7ed435de 100644
--- a/src/host/alloc_cache.jl
+++ b/src/host/alloc_cache.jl
@@ -110,37 +110,33 @@ const ALLOC_CACHE = ScopedValue{Union{Nothing, AllocCache}}(nothing)
 """
     @cached(cache, expr)
 
-Evaluate expression `expr` using allocations cache `cache`.
+Evaluate `expr` using allocations cache `cache`.
 
-When gpu allocation is requested during execution of `expr`,
-it will first check if there's "free" cache instead of performing an actual allocation.
-If no "free" allocation exists, an actual allocation is performed.
-Before returning allocation to the user, it is marked as busy and
-will not be used by allocation in the scope defined by `@cached`.
+When GPU memory is allocated during the execution of `expr`, `cache` will first be checked.
+If no memory is available in the cache, a new allocation will be requested.
 
-**After** the execution of `expr` all "busy" allocations are marked as "free"
-thus they can be re-used next time the program enters this scope.
+After the execution of `expr`, all allocations made under the scope of `@cached` will be
+cached within `cache` for future use. This is useful to avoid relying on GC to free GPU
+memory in time.
 
-This is useful to apply in a repeating block of code to avoid relying on
-GC to free gpu memory in time.
+Once `cache` goes out of scope, or when the user calls `unsafe_free!` on it, all cached
+allocations will be freed.
 
 # Example
 
-In the following example, each iteration of the for-loop requires `8 GiB` of gpu memory.
-Without caching allocator GC wouldn't be able to free arrays in time
-resulting in higher memory usage.
-With caching allocator, memory usage stays at exactly `8 GiB`.
+In the following example, each iteration of the for-loop requires 8 GiB of GPU memory.
+Without caching those allocations, significant pressure would be put on the GC, resulting
+in high memory usage and latency. By using the allocator cache, the memory usage is stable:
 
 ```julia
 cache = GPUArrays.AllocCache()
-n = 1024^3
 for i in 1:1000
     GPUArrays.@cached cache begin
-        sin.(CUDA.rand(Float32, n))
+        sin.(CUDA.rand(Float32, 1024^3))
     end
 end
-# To free immediately.
-# Otherwise, it will be freed when collected by GC.
+
+# optionally: free the memory now, instead of waiting for the GC to collect `cache`
 GPUArrays.unsafe_free!(cache)
 ```
 
@@ -157,8 +153,9 @@ end
 """
     uncached(expr)
 
-Evaluate expression `expr` without using allocations cache.
-This is useful to call from within `@cached` to avoid caching some allocations.
+Evaluate expression `expr` without using the allocation cache. This is useful to call from
+within `@cached` to avoid caching some allocations, e.g., because they can be returned out
+of the `@cached` scope.
 """
 macro uncached(expr)
     return quote

From a83a52718ffed1adf05cee2b463ddf560bc68f6b Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 9 Jan 2025 12:54:44 +0100
Subject: [PATCH 28/28] Remove duplicate gitignore.

[ci skip]
---
 docs/.gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/.gitignore b/docs/.gitignore
index 026087e8..737939a5 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,4 +1,3 @@
-Manifest.toml
 build
 site
 Manifest.toml