From 270553d52bd662ed9ef8a1cbf5ff52cf8c17c37f Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Fri, 19 Apr 2019 15:35:34 +0200 Subject: [PATCH 1/7] Add a 'malloc' keyword argument to the @cuda macro --- src/compiler/common.jl | 10 ++++++++-- src/execution.jl | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 04c9f0a5..6a7de08e 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -13,11 +13,17 @@ struct CompilerJob blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} name::Union{Nothing,String} + # The name of the 'malloc' function to use when allocating memory. + # A transform will rewrite all calls to 'malloc' to use this function + # instead. The 'malloc' signature must be 'void* malloc(size_t)' or + # compatible. + malloc::String CompilerJob(f, tt, cap, kernel; name=nothing, minthreads=nothing, maxthreads=nothing, - blocks_per_sm=nothing, maxregs=nothing) = - new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name) + blocks_per_sm=nothing, maxregs=nothing, + malloc="malloc") = + new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name, malloc) end # global job reference diff --git a/src/execution.jl b/src/execution.jl index 1783669a..d805dec7 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -9,7 +9,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize # the code it generates, or the execution function split_kwargs(kwargs) macro_kws = [:dynamic] - compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name] + compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :malloc] call_kws = [:cooperative, :blocks, :threads, :config, :shmem, :stream] macro_kwargs = [] compiler_kwargs = [] From 23cfe53919b829aa8b74b63ff1de077cb4501b5b Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 17 Jun 2019 10:30:23 +0200 Subject: [PATCH 2/7] Introduce a special 'managed_malloc' runtime function Add a pass that rewrites calls to 'managed_malloc' --- src/compiler/common.jl | 8 ++++---- src/compiler/optim.jl | 41 +++++++++++++++++++++++++++++++++++++++++ src/device/runtime.jl | 32 ++++++++++++++++++++++++++++---- 3 files changed, 73 insertions(+), 8 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 6a7de08e..ec463ca3 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -13,10 +13,10 @@ struct CompilerJob blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} name::Union{Nothing,String} - # The name of the 'malloc' function to use when allocating memory. - # A transform will rewrite all calls to 'malloc' to use this function - # instead. The 'malloc' signature must be 'void* malloc(size_t)' or - # compatible. + # The name of the memory allocation function to use when allocating + # managed memory. A transform will rewrite all managed memory allocations + # to use this function instead. The 'malloc' signature must be + # 'void* malloc(size_t)' or compatible. 
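+    # For example, a (hypothetical) device allocator named 'my_malloc' can
+    # be selected with '@cuda malloc="my_malloc" ...', which forwards the
+    # kwarg to this field through the CompilerJob constructor below.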
     malloc::String
 
     CompilerJob(f, tt, cap, kernel; name=nothing,
diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl
index 1e76f146..f7e6f8f6 100644
--- a/src/compiler/optim.jl
+++ b/src/compiler/optim.jl
@@ -63,6 +63,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function)
         run!(pm, mod)
     end
 
+    replace_malloc!(mod, job.malloc)
 end
 
 # PTX-specific optimizations
@@ -355,6 +356,46 @@ function lower_gc_frame!(fun::LLVM.Function)
     return changed
 end
 
+# Replaces all uses of a function in a particular module with
+# a compatible function.
+function replace_function!(mod::LLVM.Module, old_name::String, new_name::String)
+    if new_name == old_name
+        # There's nothing to replace if the new function is the same as
+        # the old function.
+        return false
+    end
+
+    # Otherwise, we'll try to find the old function.
+    if !haskey(functions(mod), old_name)
+        # If the old function doesn't even appear in the module, then it's not in
+        # use and we can stop right here.
+        return false
+    end
+
+    old_function = functions(mod)[old_name]
+
+    if haskey(functions(mod), new_name)
+        new_function = functions(mod)[new_name]
+    else
+        # Create a declaration with the old function's type.
+        new_function = LLVM.Function(
+            mod,
+            new_name,
+            eltype(llvmtype(old_function)::LLVM.PointerType)::LLVM.FunctionType)
+    end
+
+    # Replace all uses of the old function with the new function.
+    replace_uses!(old_function, new_function)
+
+    return true
+end
+
+# Replaces all uses of the managed memory allocation function in a
+# particular module with a compatible function with the specified name.
+function replace_malloc!(mod::LLVM.Module, malloc_name::String)
+    return replace_function!(mod, "julia.managed_malloc", malloc_name)
+end
+
 # lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible.
 #
 # this assumes and checks that the TLS is unused, which should be the case for most GPU code
diff --git a/src/device/runtime.jl b/src/device/runtime.jl
index b3addaf0..d4db2e73 100644
--- a/src/device/runtime.jl
+++ b/src/device/runtime.jl
@@ -127,18 +127,43 @@ function T_prjlvalue()
     LLVM.PointerType(eltype(T_pjlvalue), Tracked)
 end
 
+# A placeholder function for allocating managed memory. A compiler pass
+# ('replace_malloc!') rewrites every call to this function into a call to
+# the configured allocator, which is plain 'malloc' by default.
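+#
+# As a sketch (with 'my_malloc' standing in for whatever name was passed
+# through the 'malloc' kwarg), the pass turns calls like
+#
+#     %ptr = call i8* @julia.managed_malloc(i64 %sz)
+#
+# into
+#
+#     %ptr = call i8* @my_malloc(i64 %sz)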
+@generated function managed_malloc(sz::Csize_t) + T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext())) + T_size = convert(LLVMType, Csize_t) + T_ptr = convert(LLVMType, Ptr{UInt8}) + + # create function + llvm_f, _ = create_function(T_ptr, [T_size]) + mod = LLVM.parent(llvm_f) + + intr = LLVM.Function(mod, "julia.managed_malloc", LLVM.FunctionType(T_pint8, [T_size])) + + # generate IR + Builder(JuliaContext()) do builder + entry = BasicBlock(llvm_f, "entry", JuliaContext()) + position!(builder, entry) + ptr = call!(builder, intr, [parameters(llvm_f)[1]]) + jlptr = ptrtoint!(builder, ptr, T_ptr) + ret!(builder, jlptr) + end + + call_function(llvm_f, Ptr{UInt8}, Tuple{Csize_t}, :((sz,))) +end + function gc_pool_alloc(sz::Csize_t) - ptr = malloc(sz) + ptr = managed_malloc(sz) if ptr == C_NULL @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) throw(OutOfMemoryError()) end - return unsafe_pointer_to_objref(ptr) + return end compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue) - ## boxing and unboxing const tag_type = UInt @@ -226,5 +251,4 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 => end end - end From a03f4aec53ee6c2960e968fa9b01e1e5f76018d6 Mon Sep 17 00:00:00 2001 From: jonathanvdc Date: Mon, 17 Jun 2019 10:21:15 +0200 Subject: [PATCH 3/7] Recompile runtime library for different allocators --- src/compiler/driver.jl | 2 +- src/compiler/rtlib.jl | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index ce3d7382..3575c9c7 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -91,7 +91,7 @@ function codegen(target::Symbol, job::CompilerJob; # always preload the runtime, and do so early; it cannot be part of any timing block # because it recurses into the compiler if libraries - runtime = load_runtime(job.cap) + runtime = load_runtime(job.cap, job.malloc) runtime_fns = LLVM.name.(defs(runtime)) end diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 3d5f33ac..f2e37c57 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -122,26 +122,26 @@ end ## functionality to build the runtime library -function emit_function!(mod, cap, f, types, name) +function emit_function!(mod, cap, f, types, name, malloc) tt = Base.to_tuple_type(types) - new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false); + new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false; malloc=malloc); libraries=false, strict=false) LLVM.name!(entry, name) link!(mod, new_mod) end -function build_runtime(cap) +function build_runtime(cap, malloc) mod = LLVM.Module("CUDAnative run-time library", JuliaContext()) for method in values(Runtime.methods) - emit_function!(mod, cap, method.def, method.types, method.llvm_name) + emit_function!(mod, cap, method.def, method.types, method.llvm_name, malloc) end mod end -function load_runtime(cap) - name = "cudanative.$(cap.major)$(cap.minor).bc" +function load_runtime(cap, malloc) + name = "cudanative.$(malloc).$(cap.major)$(cap.minor).bc" path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) mkpath(dirname(path)) @@ -151,8 +151,8 @@ function load_runtime(cap) parse(LLVM.Module, read(io), JuliaContext()) end else - @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device, this might take a while..." 
-        lib = build_runtime(cap)
+        @info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device (allocating with '$malloc'), this might take a while..."
+        lib = build_runtime(cap, malloc)
         open(path, "w") do io
             write(io, lib)
         end

From 81558e1dd595d468f5b5b507350b0626d510f192 Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Thu, 25 Apr 2019 11:20:01 +0200
Subject: [PATCH 4/7] Consider custom malloc during IR checking

---
 src/compiler/validation.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl
index af629ac7..60bf4a7a 100644
--- a/src/compiler/validation.jl
+++ b/src/compiler/validation.jl
@@ -231,7 +231,7 @@ function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst)
     end
 
     # detect calls to undefined functions
-    if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns)
+    if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) && fn != job.malloc
         # figure out if the function lives in the Julia runtime library
         if libjulia[] == C_NULL
             paths = filter(Libdl.dllist()) do path

From d65a9fc5af3600a1bef7dc3af48dc426c48d6582 Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Mon, 17 Jun 2019 11:01:31 +0200
Subject: [PATCH 5/7] Add a kwarg to '@cuda' that serves as a hook for kernel setup

The 'init' kwarg to '@cuda' lets users define custom kernel
initialization logic, which is run just prior to launching the kernel.
The main use case for this kwarg right now is setting up globals.
---
 src/execution.jl | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/execution.jl b/src/execution.jl
index d805dec7..5bd142ac 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -8,7 +8,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize
 # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and
 # the code it generates, or the execution
 function split_kwargs(kwargs)
-    macro_kws = [:dynamic]
+    macro_kws = [:dynamic, :init]
     compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :malloc]
     call_kws = [:cooperative, :blocks, :threads, :config, :shmem, :stream]
     macro_kwargs = []
     compiler_kwargs = []
@@ -137,13 +137,14 @@ macro cuda(ex...)
 
     # handle keyword arguments that influence the macro's behavior
     dynamic = false
+    env_kwargs = []
     for kwarg in macro_kwargs
         key,val = kwarg.args
         if key == :dynamic
             isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value"))
             dynamic = val::Bool
         else
-            throw(ArgumentError("Unsupported keyword argument '$key'"))
+            push!(env_kwargs, kwarg)
         end
     end
 
@@ -159,6 +160,7 @@ macro cuda(ex...)
             # we're in kernel land already, so no need to cudaconvert arguments
             local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)}
             local kernel = dynamic_cufunction($(esc(f)), kernel_tt)
+            prepare_kernel(kernel; $(map(esc, env_kwargs)...))
             kernel($(var_exprs...); $(map(esc, call_kwargs)...))
         end)
     else
@@ -173,6 +175,7 @@ macro cuda(ex...)
                 local kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
                 local kernel = cufunction($(esc(f)), kernel_tt;
                                           $(map(esc, compiler_kwargs)...))
+                prepare_kernel(kernel; $(map(esc, env_kwargs)...))
                 kernel(kernel_args...; $(map(esc, call_kwargs)...))
             end
         end)
@@ -447,9 +450,25 @@ end
     return ex
 end
 
+"""
+    prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel)
+
+Prepares a kernel for execution by setting up an environment for that kernel.
+This function should be invoked just prior to launching the kernel;
+[`@cuda`](@ref) does so automatically.
+
+The 'init' keyword argument is a function that takes the kernel as its only
+argument and sets up an environment for that kernel.
+"""
+function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel) where {F,TT}
+    # Just call the 'init' function for now.
+    init(kernel)
+end
 
 ## device-side API
 
+# There doesn't seem to be a way to document the call syntax itself,
+# so attach the docstring to the type instead.
 """
     dynamic_cufunction(f, tt=Tuple{})
 
@@ -503,3 +522,8 @@ function nearest_warpsize(dev::CuDevice, threads::Integer)
     ws = CUDAdrv.warpsize(dev)
     return threads + (ws - threads % ws) % ws
 end
+
+function nop_init_kernel(kernel::AbstractKernel{F,TT}) where {F,TT}
+    # Do nothing.
+    return
+end
\ No newline at end of file

From ede33da7fcb52aa8d6e39beb0304287496232c32 Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Mon, 17 Jun 2019 10:56:30 +0200
Subject: [PATCH 6/7] Implement a bump allocator for kernels

Change how bump allocators are initialized

Define 'cuda_global_ptr' in runtime library
---
 src/device/runtime.jl | 82 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/src/device/runtime.jl b/src/device/runtime.jl
index d4db2e73..5da85e8b 100644
--- a/src/device/runtime.jl
+++ b/src/device/runtime.jl
@@ -12,6 +12,7 @@ module Runtime
 using ..CUDAnative
 using LLVM
 using LLVM.Interop
+using CUDAdrv
 
 
 ## representation of a runtime method instance
@@ -251,4 +252,85 @@
 end
 
+## Bump allocator.
+
+# Gets a pointer to a global with a particular name. If the global
+# does not exist yet, then it is declared in the global memory address
+# space.
+@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T}
+    T_global = convert(LLVMType, T)
+    T_result = convert(LLVMType, Ptr{T})
+
+    # Create a thunk that computes a pointer to the global.
+    llvm_f, _ = create_function(T_result)
+    mod = LLVM.parent(llvm_f)
+
+    # Figure out if the global has been defined already.
+    global_set = LLVM.globals(mod)
+    global_name_string = String(global_name)
+    if haskey(global_set, global_name_string)
+        global_var = global_set[global_name_string]
+    else
+        # If the global hasn't been defined already, then we'll define
+        # it in the global address space, i.e., address space one.
+        global_var = GlobalVariable(mod, T_global, global_name_string, 1)
+        linkage!(global_var, LLVM.API.LLVMLinkOnceODRLinkage)
+        initializer!(global_var, LLVM.null(T_global))
+    end
+
+    # Generate IR that computes the global's address.
+    Builder(JuliaContext()) do builder
+        entry = BasicBlock(llvm_f, "entry", JuliaContext())
+        position!(builder, entry)
+
+        # Cast the global variable's address to the result type.
+        result = ptrtoint!(builder, global_var, T_result)
+        ret!(builder, result)
+    end
+
+    # Call the function.
+    call_function(llvm_f, Ptr{T})
+end
+
+# Converts a pointer produced by 'get_global_pointer' to a device pointer
+# with the requested element type.
+macro cuda_global_ptr(name, type)
+    return :(convert(
+        DevicePtr{$(esc(type))},
+        get_global_pointer(
+            $(Val(Symbol(name))),
+            $(esc(type)))))
+end
+
+# Allocates `bytesize` bytes of storage by bumping the global bump
+# allocator pointer.
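+#
+# Conceptually (leaving the atomics aside), an allocation boils down to the
+# following sketch:
+#
+#     chunk = bump_alloc_ptr
+#     bump_alloc_ptr += bytesize
+#     chunk + bytesize <= bump_alloc_end ? chunk : C_NULL
+#
+# where 'bump_alloc_ptr' and 'bump_alloc_end' are device globals that the
+# host configures through 'bump_alloc_init!' before launching the kernel.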
+function bump_alloc(bytesize::Csize_t)::Ptr{UInt8}
+    ptr = @cuda_global_ptr("bump_alloc_ptr", Csize_t)
+    # 'atomic_add!' returns the old value of the allocation pointer,
+    # i.e., the start address of the newly carved-out chunk.
+    chunk_address = atomic_add!(ptr, bytesize)
+    end_ptr = unsafe_load(@cuda_global_ptr("bump_alloc_end", Csize_t))
+    # Only hand out the chunk if it fits in the buffer entirely.
+    if chunk_address + bytesize <= end_ptr
+        return convert(Ptr{UInt8}, chunk_address)
+    else
+        return C_NULL
+    end
+end
+
+compile(bump_alloc, Ptr{UInt8}, (Csize_t,))
+
+# Sets a global in a kernel's module to a value, doing nothing if the
+# global was never declared.
+function maybe_set_global(kernel, name, value::T) where T
+    try
+        global_handle = CuGlobal{T}(kernel.mod, name)
+        set(global_handle, value)
+    catch exception
+        # The global may not have been declared (because it is unused).
+        # In that case, we should do nothing.
+        if !isa(exception, CUDAdrv.CuError) || exception.code != CUDAdrv.ERROR_NOT_FOUND.code
+            rethrow()
+        end
+    end
+end
+
+# Points the bump allocator to a buffer of 'buffer_size' bytes, starting
+# at address 'buffer_start'.
+function bump_alloc_init!(kernel, buffer_start, buffer_size)
+    maybe_set_global(kernel, "bump_alloc_ptr", buffer_start)
+    maybe_set_global(kernel, "bump_alloc_end", buffer_start + buffer_size)
+end
+
 end

From 560daa03e6877c76cc187d1dcef5cac990a33ee7 Mon Sep 17 00:00:00 2001
From: jonathanvdc
Date: Mon, 17 Jun 2019 11:08:40 +0200
Subject: [PATCH 7/7] Include a bump allocator example

---
 examples/bump-allocator.jl | 40 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 examples/bump-allocator.jl

diff --git a/examples/bump-allocator.jl b/examples/bump-allocator.jl
new file mode 100644
index 00000000..93f539a8
--- /dev/null
+++ b/examples/bump-allocator.jl
@@ -0,0 +1,40 @@
+using Test
+
+using CUDAdrv, CUDAnative
+include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+mutable struct Box{T}
+    value::T
+end
+
+function vcopy(a, b)
+    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
+    # Allocate a mutable box to exercise the GPU heap allocator.
+    box = Box(a[i])
+    b[i] = box.value
+    return
+end
+
+dims = (3,4)
+a = round.(rand(Float32, dims) * 100)
+b = similar(a)
+
+d_a = CuTestArray(a)
+d_b = CuTestArray(b)
+
+len = prod(dims)
+
+# Allocate a 1 MiB heap for the bump allocator.
+heap_capacity = 1024 * 1024
+heap = Mem.alloc(Mem.DeviceBuffer, heap_capacity)
+heap_start_address = pointer(heap)
+# Create an initialization callback for the bump allocator.
+function init(kernel)
+    CUDAnative.Runtime.bump_alloc_init!(kernel, heap_start_address, heap_capacity)
+end
+# Run the kernel.
+@cuda threads=len init=init malloc="ptx_bump_alloc" vcopy(d_a, d_b)
+# Free the heap.
+Mem.free(heap)
+
+b = Array(d_b)
+@test a ≈ b
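+
+# For comparison: without the 'init' and 'malloc' kwargs, the same kernel
+# would allocate through the default 'malloc'-based runtime instead:
+#
+#     @cuda threads=len vcopy(d_a, d_b)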