From 270553d52bd662ed9ef8a1cbf5ff52cf8c17c37f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 15:35:34 +0200 Subject: [PATCH 1/7] Add a 'malloc' keyword argument to the @cuda macro --- src/compiler/common.jl | 10 ++++++++-- src/execution.jl | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 04c9f0a5..6a7de08e 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -13,11 +13,17 @@ struct CompilerJob blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} name::Union{Nothing,String} + # The name of the 'malloc' function to use when allocating memory. + # A transform will rewrite all calls to 'malloc' to use this function + # instead. The 'malloc' signature must be 'void* malloc(size_t)' or + # compatible. + malloc::String CompilerJob(f, tt, cap, kernel; name=nothing, minthreads=nothing, maxthreads=nothing, - blocks_per_sm=nothing, maxregs=nothing) = - new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name) + blocks_per_sm=nothing, maxregs=nothing, + malloc="malloc") = + new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name, malloc) end # global job reference diff --git a/src/execution.jl b/src/execution.jl index 1783669a..d805dec7 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -9,7 +9,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # the code it generates, or the execution function split_kwargs(kwargs) macro_kws = [:dynamic] - compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name] + compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :malloc] call_kws = [:cooperative, :blocks, :threads, :config, :shmem, :stream] macro_kwargs = [] compiler_kwargs = [] From 23cfe53919b829aa8b74b63ff1de077cb4501b5b Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 17 Jun 2019 10:30:23 +0200 Subject: [PATCH 2/7] Introduce a special 'managed_malloc' runtime function Add a pass that rewrites calls to 'managed_malloc' --- src/compiler/common.jl | 8 ++++---- src/compiler/optim.jl | 41 +++++++++++++++++++++++++++++++++++++++++ src/device/runtime.jl | 32 ++++++++++++++++++++++++++++---- 3 files changed, 73 insertions(+), 8 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 6a7de08e..ec463ca3 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -13,10 +13,10 @@ struct CompilerJob blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} name::Union{Nothing,String} - # The name of the 'malloc' function to use when allocating memory. - # A transform will rewrite all calls to 'malloc' to use this function - # instead. The 'malloc' signature must be 'void* malloc(size_t)' or - # compatible. + # The name of the memory allocation function to use when allocating + # managed memory. A transform will rewrite all managed memory allocations + # to use this function instead. The 'malloc' signature must be + # 'void* malloc(size_t)' or compatible. 
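+    # For example, a (hypothetical) device allocator named 'my_malloc' can
+    # be selected with '@cuda malloc="my_malloc" ...', which forwards the
+    # kwarg to this field through the CompilerJob constructor below.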
     malloc::String
 
     CompilerJob(f, tt, cap, kernel; name=nothing,
diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl
index 1e76f146..f7e6f8f6 100644
--- a/src/compiler/optim.jl
+++ b/src/compiler/optim.jl
@@ -63,6 +63,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function)
         run!(pm, mod)
     end
 
+    replace_malloc!(mod, job.malloc)
 end
 
 # PTX-specific optimizations
@@ -355,6 +356,46 @@ function lower_gc_frame!(fun::LLVM.Function)
     return changed
 end
 
+# Replaces all uses of a function in a particular module with
+# a compatible function.
+function replace_function!(mod::LLVM.Module, old_name::String, new_name::String)
+    if new_name == old_name
+        # There's nothing to replace if the new function is the same as
+        # the old function.
+        return false
+    end
+
+    # Otherwise, we'll try to find the old function.
+    if !haskey(functions(mod), old_name)
+        # If the old function doesn't even appear in the module, then it's not in
+        # use and we can stop right here.
+        return false
+    end
+
+    old_function = functions(mod)[old_name]
+
+    if haskey(functions(mod), new_name)
+        new_function = functions(mod)[new_name]
+    else
+        # Create a declaration with the old function's type.
+        new_function = LLVM.Function(
+            mod,
+            new_name,
+            eltype(llvmtype(old_function)::LLVM.PointerType)::LLVM.FunctionType)
+    end
+
+    # Replace all uses of the old function with the new function.
+    replace_uses!(old_function, new_function)
+
+    return true
+end
+
+# Replaces all uses of the managed memory allocation function in a
+# particular module with a compatible function with the specified name.
+function replace_malloc!(mod::LLVM.Module, malloc_name::String)
+    return replace_function!(mod, "julia.managed_malloc", malloc_name)
+end
+
 # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible.
 #
 # this assumes and checks that the TLS is unused, which should be the case for most GPU code
diff --git a/src/device/runtime.jl b/src/device/runtime.jl
index b3addaf0..d4db2e73 100644
--- a/src/device/runtime.jl
+++ b/src/device/runtime.jl
@@ -127,18 +127,43 @@ function T_prjlvalue()
     LLVM.PointerType(eltype(T_pjlvalue), Tracked)
 end
 
+# A placeholder function for allocating managed memory. A compiler pass
+# ('replace_malloc!') rewrites every call to this function into a call to
+# the configured allocator, which is plain 'malloc' by default.
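+#
+# As a sketch (with 'my_malloc' standing in for whatever name was passed
+# through the 'malloc' kwarg), the pass turns calls like
+#
+#     %ptr = call i8* @julia.managed_malloc(i64 %sz)
+#
+# into
+#
+#     %ptr = call i8* @my_malloc(i64 %sz)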
+@generated function managed_malloc(sz::Csize_t) + T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) + T_size = convert(LLVMType, Csize_t) + T_ptr = convert(LLVMType, Ptr{UInt8}) + + # create function + llvm_f, _ = create_function(T_ptr, [T_size]) + mod = LLVM.parent(llvm_f) + + intr = LLVM.Function(mod, "julia.managed_malloc", LLVM.FunctionType(T_pint8, [T_size])) + + # generate IR + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + ptr = call!(builder, intr, [parameters(llvm_f)[1]]) + jlptr = ptrtoint!(builder, ptr, T_ptr) + ret!(builder, jlptr) + end + + call_function(llvm_f, Ptr{UInt8}, Tuple{Csize_t}, :((sz,))) +end + function gc_pool_alloc(sz::Csize_t) - ptr = malloc(sz) + ptr = managed_malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) end - return unsafe_pointer_to_objref(ptr) + return end compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) - ## boxing and unboxing const tag_type = UInt @@ -226,5 +251,4 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end - end From a03f4aec53ee6c2960e968fa9b01e1e5f76018d6 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 17 Jun 2019 10:21:15 +0200 Subject: [PATCH 3/7] Recompile runtime library for different allocators --- src/compiler/driver.jl | 2 +- src/compiler/rtlib.jl | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index ce3d7382..3575c9c7 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -91,7 +91,7 @@ function codegen(target::Symbol, job::CompilerJob; # always preload the runtime, and do so early; it cannot be part of any timing block # because it recurses into the compiler if libraries - runtime = load_runtime(job.cap) + runtime = load_runtime(job.cap, job.malloc) runtime_fns = LLVM.name.(defs(runtime)) end diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 3d5f33ac..f2e37c57 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -122,26 +122,26 @@ end ## functionality to build the runtime library -function emit_function!(mod, cap, f, types, name) +function emit_function!(mod, cap, f, types, name, malloc) tt = Base.to_tuple_type(types) - new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false); + new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false; malloc=malloc); libraries=false, strict=false) LLVM.name!(entry, name) link!(mod, new_mod) end -function build_runtime(cap) +function build_runtime(cap, malloc) mod = LLVM.Module("CUDAnative run-time library", JuliaContext()) for method in values(Runtime.methods) - emit_function!(mod, cap, method.def, method.types, method.llvm_name) + emit_function!(mod, cap, method.def, method.types, method.llvm_name, malloc) end mod end -function load_runtime(cap) - name = "cudanative.$(cap.major)$(cap.minor).bc" +function load_runtime(cap, malloc) + name = "cudanative.$(malloc).$(cap.major)$(cap.minor).bc" path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) mkpath(dirname(path)) @@ -151,8 +151,8 @@ function load_runtime(cap) parse(LLVM.Module, read(io), JuliaContext()) end else - @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device, this might take a while..." 
-        lib = build_runtime(cap)
+        @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device (allocating with '$malloc'), this might take a while..."
+        lib = build_runtime(cap, malloc)
         open(path, "w") do io
             write(io, lib)
         end

From 81558e1dd595d468f5b5b507350b0626d510f192 Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Thu, 25 Apr 2019 11:20:01 +0200
Subject: [PATCH 4/7] Consider custom malloc during IR checking

---
 src/compiler/validation.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl
index af629ac7..60bf4a7a 100644
--- a/src/compiler/validation.jl
+++ b/src/compiler/validation.jl
@@ -231,7 +231,7 @@ function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst)
     end
 
     # detect calls to undefined functions
-    if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns)
+    if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) && fn != job.malloc
         # figure out if the function lives in the Julia runtime library
         if libjulia[] == C_NULL
             paths = filter(Libdl.dllist()) do path

From d65a9fc5af3600a1bef7dc3af48dc426c48d6582 Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Mon, 17 Jun 2019 11:01:31 +0200
Subject: [PATCH 5/7] Add a kwarg to '@cuda' that serves as a hook for kernel setup

The 'init' kwarg to '@cuda' lets users define custom kernel
initialization logic, which is run just prior to launching the kernel.
The main use case for this kwarg right now is setting up globals.
---
 src/execution.jl | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/execution.jl b/src/execution.jl
index d805dec7..5bd142ac 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -8,7 +8,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize
 # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and
 # the code it generates, or the execution
 function split_kwargs(kwargs)
-    macro_kws = [:dynamic]
+    macro_kws = [:dynamic, :init]
     compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :malloc]
     call_kws = [:cooperative, :blocks, :threads, :config, :shmem, :stream]
     macro_kwargs = []
     compiler_kwargs = []
@@ -137,13 +137,14 @@ macro cuda(ex...)
 
     # handle keyword arguments that influence the macro's behavior
     dynamic = false
+    env_kwargs = []
     for kwarg in macro_kwargs
         key,val = kwarg.args
         if key == :dynamic
             isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value"))
             dynamic = val::Bool
         else
-            throw(ArgumentError("Unsupported keyword argument '$key'"))
+            push!(env_kwargs, kwarg)
         end
     end
 
@@ -159,6 +160,7 @@ macro cuda(ex...)
             # we're in kernel land already, so no need to cudaconvert arguments
             local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)}
             local kernel = dynamic_cufunction($(esc(f)), kernel_tt)
+            prepare_kernel(kernel; $(map(esc, env_kwargs)...))
             kernel($(var_exprs...); $(map(esc, call_kwargs)...))
         end)
     else
@@ -173,6 +175,7 @@ macro cuda(ex...)
                 local kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
                 local kernel = cufunction($(esc(f)), kernel_tt;
                                           $(map(esc, compiler_kwargs)...))
+                prepare_kernel(kernel; $(map(esc, env_kwargs)...))
                 kernel(kernel_args...; $(map(esc, call_kwargs)...))
             end
         end)
@@ -447,9 +450,25 @@ end
     return ex
 end
 
+"""
+    prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel)
+
+Prepares a kernel for execution by setting up an environment for that kernel.
+This function should be invoked just prior to launching the kernel;
+[`@cuda`](@ref) does so automatically.
+
+The 'init' keyword argument is a function that takes the kernel as its only
+argument and sets up an environment for that kernel.
+"""
+function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel) where {F,TT}
+    # Just call the 'init' function for now.
+    init(kernel)
+end
 
 ## device-side API
 
+# There doesn't seem to be a way to document the call syntax itself,
+# so attach the docstring to the type instead.
 """
     dynamic_cufunction(f, tt=Tuple{})
 
@@ -503,3 +522,8 @@ function nearest_warpsize(dev::CuDevice, threads::Integer)
     ws = CUDAdrv.warpsize(dev)
     return threads + (ws - threads % ws) % ws
 end
+
+function nop_init_kernel(kernel::AbstractKernel{F,TT}) where {F,TT}
+    # Do nothing.
+    return
+end
\ No newline at end of file

From ede33da7fcb52aa8d6e39beb0304287496232c32 Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Mon, 17 Jun 2019 10:56:30 +0200
Subject: [PATCH 6/7] Implement a bump allocator for kernels

Change how bump allocators are initialized

Define 'cuda_global_ptr' in runtime library
---
 src/device/runtime.jl | 82 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/src/device/runtime.jl b/src/device/runtime.jl
index d4db2e73..5da85e8b 100644
--- a/src/device/runtime.jl
+++ b/src/device/runtime.jl
@@ -12,6 +12,7 @@ module Runtime
 using ..CUDAnative
 using LLVM
 using LLVM.Interop
+using CUDAdrv
 
 
 ## representation of a runtime method instance
@@ -251,4 +252,85 @@
 end
 
+## Bump allocator.
+
+# Gets a pointer to a global with a particular name. If the global
+# does not exist yet, then it is declared in the global memory address
+# space.
+@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T}
+    T_global = convert(LLVMType, T)
+    T_result = convert(LLVMType, Ptr{T})
+
+    # Create a thunk that computes a pointer to the global.
+    llvm_f, _ = create_function(T_result)
+    mod = LLVM.parent(llvm_f)
+
+    # Figure out if the global has been defined already.
+    global_set = LLVM.globals(mod)
+    global_name_string = String(global_name)
+    if haskey(global_set, global_name_string)
+        global_var = global_set[global_name_string]
+    else
+        # If the global hasn't been defined already, then we'll define
+        # it in the global address space, i.e., address space one.
+        global_var = GlobalVariable(mod, T_global, global_name_string, 1)
+        linkage!(global_var, LLVM.API.LLVMLinkOnceODRLinkage)
+        initializer!(global_var, LLVM.null(T_global))
+    end
+
+    # Generate IR that computes the global's address.
+    Builder(JuliaContext()) do builder
+        entry = BasicBlock(llvm_f, "entry", JuliaContext())
+        position!(builder, entry)
+
+        # Cast the global variable's address to the result type.
+        result = ptrtoint!(builder, global_var, T_result)
+        ret!(builder, result)
+    end
+
+    # Call the function.
+    call_function(llvm_f, Ptr{T})
+end
+
+# Converts a pointer produced by 'get_global_pointer' to a device pointer
+# with the requested element type.
+macro cuda_global_ptr(name, type)
+    return :(convert(
+        DevicePtr{$(esc(type))},
+        get_global_pointer(
+            $(Val(Symbol(name))),
+            $(esc(type)))))
+end
+
+# Allocates `bytesize` bytes of storage by bumping the global bump
+# allocator pointer.
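+#
+# Conceptually (leaving the atomics aside), an allocation boils down to the
+# following sketch:
+#
+#     chunk = bump_alloc_ptr
+#     bump_alloc_ptr += bytesize
+#     chunk + bytesize <= bump_alloc_end ? chunk : C_NULL
+#
+# where 'bump_alloc_ptr' and 'bump_alloc_end' are device globals that the
+# host configures through 'bump_alloc_init!' before launching the kernel.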
+function bump_alloc(bytesize::Csize_t)::Ptr{UInt8}
+    ptr = @cuda_global_ptr("bump_alloc_ptr", Csize_t)
+    # 'atomic_add!' returns the old value of the allocation pointer,
+    # i.e., the start address of the newly carved-out chunk.
+    chunk_address = atomic_add!(ptr, bytesize)
+    end_ptr = unsafe_load(@cuda_global_ptr("bump_alloc_end", Csize_t))
+    # Only hand out the chunk if it fits in the buffer entirely.
+    if chunk_address + bytesize <= end_ptr
+        return convert(Ptr{UInt8}, chunk_address)
+    else
+        return C_NULL
+    end
+end
+
+compile(bump_alloc, Ptr{UInt8}, (Csize_t,))
+
+# Sets a global in a kernel's module to a value, doing nothing if the
+# global was never declared.
+function maybe_set_global(kernel, name, value::T) where T
+    try
+        global_handle = CuGlobal{T}(kernel.mod, name)
+        set(global_handle, value)
+    catch exception
+        # The global may not have been declared (because it is unused).
+        # In that case, we should do nothing.
+        if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code
+            rethrow()
+        end
+    end
+end
+
+# Points the bump allocator to a buffer of 'buffer_size' bytes, starting
+# at address 'buffer_start'.
+function bump_alloc_init!(kernel, buffer_start, buffer_size)
+    maybe_set_global(kernel, "bump_alloc_ptr", buffer_start)
+    maybe_set_global(kernel, "bump_alloc_end", buffer_start + buffer_size)
+end
+
 end

From 560daa03e6877c76cc187d1dcef5cac990a33ee7 Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Mon, 17 Jun 2019 11:08:40 +0200
Subject: [PATCH 7/7] Include a bump allocator example

---
 examples/bump-allocator.jl | 40 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 examples/bump-allocator.jl

diff --git a/examples/bump-allocator.jl b/examples/bump-allocator.jl
new file mode 100644
index 00000000..93f539a8
--- /dev/null
+++ b/examples/bump-allocator.jl
@@ -0,0 +1,40 @@
+using Test
+
+using CUDAdrv, CUDAnative
+include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+mutable struct Box{T}
+    value::T
+end
+
+function vcopy(a, b)
+    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
+    # Allocate a mutable box to exercise the GPU heap allocator.
+    box = Box(a[i])
+    b[i] = box.value
+    return
+end
+
+dims = (3,4)
+a = round.(rand(Float32, dims) * 100)
+b = similar(a)
+
+d_a = CuTestArray(a)
+d_b = CuTestArray(b)
+
+len = prod(dims)
+
+# Allocate a 1 MiB heap for the bump allocator.
+heap_capacity = 1024 * 1024
+heap = Mem.alloc(Mem.DeviceBuffer, heap_capacity)
+heap_start_address = pointer(heap)
+# Create an initialization callback for the bump allocator.
+function init(kernel)
+    CUDAnative.Runtime.bump_alloc_init!(kernel, heap_start_address, heap_capacity)
+end
+# Run the kernel.
+@cuda threads=len init=init malloc="ptx_bump_alloc" vcopy(d_a, d_b)
+# Free the heap.
+Mem.free(heap)
+
+b = Array(d_b)
+@test a ≈ b
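+
+# For comparison: without the 'init' and 'malloc' kwargs, the same kernel
+# would allocate through the default 'malloc'-based runtime instead:
+#
+#     @cuda threads=len vcopy(d_a, d_b)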