Skip to content
This repository has been archived by the owner on May 27, 2021. It is now read-only.

Commit

Permalink
Try #422:
Browse files Browse the repository at this point in the history
  • Loading branch information
bors[bot] committed Jun 17, 2019
2 parents be62124 + 560daa0 commit 8ccd979
Show file tree
Hide file tree
Showing 8 changed files with 235 additions and 18 deletions.
40 changes: 40 additions & 0 deletions examples/bump-allocator.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
using Test

using CUDAdrv, CUDAnative
include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl

# Mutable wrapper holding a single value of type `T`. `vcopy` constructs one
# per element and immediately reads its `value` field back.
# NOTE(review): presumably declared `mutable` so that constructing it on the
# device requires a managed allocation (exercising the bump allocator) — confirm.
mutable struct Box{T}
# The wrapped value.
value::T
end

# Kernel: each thread copies one element of `a` into `b`, routing the value
# through a freshly constructed `Box`.
function vcopy(a, b)
    # Linear index of this thread across all blocks (1-based).
    idx = threadIdx().x + blockDim().x * (blockIdx().x - 1)
    boxed = Box(a[idx])
    b[idx] = boxed.value
    return
end

# Host-side input (random whole numbers stored as Float32) and an output buffer
# of the same shape.
dims = (3,4)
a = round.(100 .* rand(Float32, dims))
b = similar(a)

# Device-side copies of both arrays.
d_a = CuTestArray(a)
d_b = CuTestArray(b)

# One thread per element.
len = length(a)

# Allocate a 1 MiB heap for the bump allocator.
heap_capacity = 1024^2
heap = Mem.alloc(Mem.DeviceBuffer, heap_capacity)
heap_start_address = pointer(heap)
# Initialization callback handed to `@cuda`: points the bump allocator of the
# given kernel at the heap allocated above.
init(kernel) =
    CUDAnative.Runtime.bump_alloc_init!(kernel, heap_start_address, heap_capacity)
# Run the kernel with one thread per element. `init` sets up the bump
# allocator's globals before launch, and `malloc` selects the allocation
# function that managed allocations are rewritten to.
@cuda threads=len init=init malloc="ptx_bump_alloc" vcopy(d_a, d_b)
# Free the heap.
Mem.free(heap)

# Copy the result back to the host and check that it matches the input.
b = Array(d_b)
@test a ≈ b
10 changes: 8 additions & 2 deletions src/compiler/common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,17 @@ struct CompilerJob
blocks_per_sm::Union{Nothing,Integer}
maxregs::Union{Nothing,Integer}
name::Union{Nothing,String}
# The name of the memory allocation function to use when allocating
# managed memory. A transform will rewrite all managed memory allocations
# to use this function instead. The 'malloc' signature must be
# 'void* malloc(size_t)' or compatible.
malloc::String

CompilerJob(f, tt, cap, kernel; name=nothing,
minthreads=nothing, maxthreads=nothing,
blocks_per_sm=nothing, maxregs=nothing) =
new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name)
blocks_per_sm=nothing, maxregs=nothing,
malloc="malloc") =
new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name, malloc)
end

# global job reference
Expand Down
2 changes: 1 addition & 1 deletion src/compiler/driver.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ function codegen(target::Symbol, job::CompilerJob;
# always preload the runtime, and do so early; it cannot be part of any timing block
# because it recurses into the compiler
if libraries
runtime = load_runtime(job.cap)
runtime = load_runtime(job.cap, job.malloc)
runtime_fns = LLVM.name.(defs(runtime))
end

Expand Down
41 changes: 41 additions & 0 deletions src/compiler/optim.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function)

run!(pm, mod)
end
replace_malloc!(mod, job.malloc)
end

# PTX-specific optimizations
Expand Down Expand Up @@ -355,6 +356,46 @@ function lower_gc_frame!(fun::LLVM.Function)
return changed
end

# Redirects every use of the function named `old_name` in `mod` to the function
# named `new_name`. When `new_name` is not present in the module yet, it is
# declared with the same function type as the old function.
#
# Returns `true` when uses were redirected, and `false` when there was nothing
# to do (identical names, or `old_name` absent from the module).
function replace_function!(mod::LLVM.Module, old_name::String, new_name::String)
    # Replacing a function by itself is a no-op.
    old_name == new_name && return false

    # A function that doesn't appear in the module cannot be in use.
    fns = functions(mod)
    haskey(fns, old_name) || return false

    old_function = fns[old_name]

    # Look up the replacement, declaring it on demand with a matching signature.
    new_function = if haskey(fns, new_name)
        fns[new_name]
    else
        fn_type = eltype(llvmtype(old_function)::LLVM.PointerType)::LLVM.FunctionType
        LLVM.Function(mod, new_name, fn_type)
    end

    replace_uses!(old_function, new_function)

    return true
end

# Redirects all uses of the "julia.managed_malloc" placeholder in `mod` to the
# function called `malloc_name`. Returns whatever `replace_function!` returns.
replace_malloc!(mod::LLVM.Module, malloc_name::String) =
    replace_function!(mod, "julia.managed_malloc", malloc_name)

# lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible.
#
# this assumes and checks that the TLS is unused, which should be the case for most GPU code
Expand Down
16 changes: 8 additions & 8 deletions src/compiler/rtlib.jl
Original file line number Diff line number Diff line change
Expand Up @@ -122,26 +122,26 @@ end

## functionality to build the runtime library

function emit_function!(mod, cap, f, types, name)
function emit_function!(mod, cap, f, types, name, malloc)
tt = Base.to_tuple_type(types)
new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false);
new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false; malloc=malloc);
libraries=false, strict=false)
LLVM.name!(entry, name)
link!(mod, new_mod)
end

function build_runtime(cap)
function build_runtime(cap, malloc)
mod = LLVM.Module("CUDAnative run-time library", JuliaContext())

for method in values(Runtime.methods)
emit_function!(mod, cap, method.def, method.types, method.llvm_name)
emit_function!(mod, cap, method.def, method.types, method.llvm_name, malloc)
end

mod
end

function load_runtime(cap)
name = "cudanative.$(cap.major)$(cap.minor).bc"
function load_runtime(cap, malloc)
name = "cudanative.$(malloc).$(cap.major)$(cap.minor).bc"
path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name)
mkpath(dirname(path))

Expand All @@ -151,8 +151,8 @@ function load_runtime(cap)
parse(LLVM.Module, read(io), JuliaContext())
end
else
@info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device, this might take a while..."
lib = build_runtime(cap)
@info "Building the CUDAnative run-time library for your sm_$(cap.major)$(cap.minor) device (allocating with '$malloc'), this might take a while..."
lib = build_runtime(cap, malloc)
open(path, "w") do io
write(io, lib)
end
Expand Down
2 changes: 1 addition & 1 deletion src/compiler/validation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst)
end

# detect calls to undefined functions
if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns)
if isdeclaration(dest) && intrinsic_id(dest) == 0 && !(fn in special_fns) && fn != job.malloc
# figure out if the function lives in the Julia runtime library
if libjulia[] == C_NULL
paths = filter(Libdl.dllist()) do path
Expand Down
112 changes: 109 additions & 3 deletions src/device/runtime.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module Runtime
using ..CUDAnative
using LLVM
using LLVM.Interop
using CUDAdrv


## representation of a runtime method instance
Expand Down Expand Up @@ -127,18 +128,43 @@ function T_prjlvalue()
LLVM.PointerType(eltype(T_pjlvalue), Tracked)
end

# Placeholder allocation function: allocates `sz` bytes through whichever
# allocator the compiler job selects. The generated code does not call an
# allocator directly; it calls a function declared under the name
# "julia.managed_malloc", so that the call can later be redirected to the
# actual allocator (probably 'malloc').
#
# Takes the requested size as a `Csize_t` and returns a `Ptr{UInt8}`.
@generated function managed_malloc(sz::Csize_t)
# LLVM types: the placeholder's return type (i8*), the size argument, and the
# LLVM representation of the Julia-side `Ptr{UInt8}` result.
T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext()))
T_size = convert(LLVMType, Csize_t)
T_ptr = convert(LLVMType, Ptr{UInt8})

# create function
llvm_f, _ = create_function(T_ptr, [T_size])
mod = LLVM.parent(llvm_f)

# declare the placeholder in the same module, with signature (size) -> i8*
intr = LLVM.Function(mod, "julia.managed_malloc", LLVM.FunctionType(T_pint8, [T_size]))

# generate IR
Builder(JuliaContext()) do builder
entry = BasicBlock(llvm_f, "entry", JuliaContext())
position!(builder, entry)
# forward the size argument to the placeholder
ptr = call!(builder, intr, [parameters(llvm_f)[1]])
# convert the returned i8* to the representation used for `Ptr{UInt8}`
jlptr = ptrtoint!(builder, ptr, T_ptr)
ret!(builder, jlptr)
end

# invoke the wrapper built above with `sz` as its only argument
call_function(llvm_f, Ptr{UInt8}, Tuple{Csize_t}, :((sz,)))
end

function gc_pool_alloc(sz::Csize_t)
ptr = malloc(sz)
ptr = managed_malloc(sz)
if ptr == C_NULL
@cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz)
throw(OutOfMemoryError())
end
return unsafe_pointer_to_objref(ptr)
return
end

compile(gc_pool_alloc, Any, (Csize_t,), T_prjlvalue)


## boxing and unboxing

const tag_type = UInt
Expand Down Expand Up @@ -226,5 +252,85 @@ for (T, t) in [Int8 => :int8, Int16 => :int16, Int32 => :int32, Int64 =>
end
end

## Bump allocator.

# Gets a pointer to a global with a particular name. If the global
# does not exist yet, then it is declared in the global memory address
# space.
#
# The name is passed as a `Val` and the element type as a type argument, so
# both are available while the function body is being generated. The result
# is a `Ptr{T}` holding the global's address.
@generated function get_global_pointer(::Val{global_name}, ::Type{T})::Ptr{T} where {global_name, T}
# LLVM types of the global itself and of the returned pointer value.
T_global = convert(LLVMType, T)
T_result = convert(LLVMType, Ptr{T})

# Create a thunk that computes a pointer to the global.
llvm_f, _ = create_function(T_result)
mod = LLVM.parent(llvm_f)

# Figure out if the global has been defined already.
global_set = LLVM.globals(mod)
global_name_string = String(global_name)
if haskey(global_set, global_name_string)
global_var = global_set[global_name_string]
else
# If the global hasn't been defined already, then we'll define
# it in the global address space, i.e., address space one.
global_var = GlobalVariable(mod, T_global, global_name_string, 1)
# NOTE(review): linkonce_odr presumably lets identical definitions from
# several modules be merged at link time — confirm.
linkage!(global_var, LLVM.API.LLVMLinkOnceODRLinkage)
# Give the global a null (all-zero) initial value.
initializer!(global_var, LLVM.null(T_global))
end

# Generate IR that computes the global's address.
Builder(JuliaContext()) do builder
entry = BasicBlock(llvm_f, "entry", JuliaContext())
position!(builder, entry)

# Cast the global variable's type to the result type.
result = ptrtoint!(builder, global_var, T_result)
ret!(builder, result)
end

# Call the function.
call_function(llvm_f, Ptr{T})
end

# Expands to an expression that yields a `DevicePtr` to the global variable
# called `name`, whose element type is `type`.
#
# `name` must be a literal (it is turned into a `Val` at expansion time);
# `type` is evaluated in the caller's scope. The element type of the resulting
# `DevicePtr` is taken from `type`: the macro has no type variable of its own,
# so it must be spliced in explicitly rather than referred to by a bare name.
macro cuda_global_ptr(name, type)
    return :(convert(
        DevicePtr{$(esc(type))},
        get_global_pointer(
            $(Val(Symbol(name))),
            $(esc(type)))))
end

# Allocates `bytesize` bytes of storage by bumping the global bump
# allocator pointer.
#
# The "bump_alloc_ptr" global holds the address of the next free byte and
# "bump_alloc_end" the address one past the end of the heap. The pointer is
# advanced atomically, so concurrent callers receive distinct chunks.
#
# Returns the address of the chunk, or `C_NULL` when the chunk does not fit
# entirely inside the heap.
function bump_alloc(bytesize::Csize_t)::Ptr{UInt8}
    ptr = @cuda_global_ptr("bump_alloc_ptr", Csize_t)
    # `atomic_add!` yields the value before the addition: the start of our chunk.
    chunk_address = atomic_add!(ptr, bytesize)
    end_ptr = unsafe_load(@cuda_global_ptr("bump_alloc_end", Csize_t))
    # The whole chunk [chunk_address, chunk_address + bytesize) must lie within
    # the heap, not just its first byte. Comparing against the remaining space
    # avoids overflowing the unsigned sum for very large requests.
    if chunk_address <= end_ptr && bytesize <= end_ptr - chunk_address
        return convert(Ptr{UInt8}, chunk_address)
    else
        return C_NULL
    end
end

compile(bump_alloc, Ptr{UInt8}, (Csize_t,))

# Tries to assign `value` to the global variable called `name` in the module of
# `kernel`. A global that cannot be found is skipped silently, since it may not
# have been declared (because it is unused). Any other error is propagated.
function maybe_set_global(kernel, name, value::T) where T
    try
        handle = CuGlobal{T}(kernel.mod, name)
        set(handle, value)
    catch err
        # Only a CUDA "not found" error is tolerated.
        not_found = isa(err, CUDAdrv.CuError) &&
                    err.code == CUDAdrv.ERROR_NOT_FOUND.code
        not_found || rethrow()
        nothing
    end
end

# Points the bump allocator of `kernel` at a heap that starts at `buffer_start`
# and spans `buffer_size` bytes, by setting the "bump_alloc_ptr" and
# "bump_alloc_end" globals (each only if it exists in the kernel's module).
function bump_alloc_init!(kernel, buffer_start, buffer_size)
    maybe_set_global(kernel, "bump_alloc_ptr", buffer_start)
    buffer_end = buffer_start + buffer_size
    return maybe_set_global(kernel, "bump_alloc_end", buffer_end)
end

end
30 changes: 27 additions & 3 deletions src/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize
# split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and
# the code it generates, or the execution
function split_kwargs(kwargs)
macro_kws = [:dynamic]
compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name]
macro_kws = [:dynamic, :init]
compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :malloc]
call_kws = [:cooperative, :blocks, :threads, :config, :shmem, :stream]
macro_kwargs = []
compiler_kwargs = []
Expand Down Expand Up @@ -137,13 +137,14 @@ macro cuda(ex...)

# handle keyword arguments that influence the macro's behavior
dynamic = false
env_kwargs = []
for kwarg in macro_kwargs
key,val = kwarg.args
if key == :dynamic
isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value"))
dynamic = val::Bool
else
throw(ArgumentError("Unsupported keyword argument '$key'"))
push!(env_kwargs, kwarg)
end
end

Expand All @@ -159,6 +160,7 @@ macro cuda(ex...)
# we're in kernel land already, so no need to cudaconvert arguments
local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)}
local kernel = dynamic_cufunction($(esc(f)), kernel_tt)
prepare_kernel(kernel; $(map(esc, env_kwargs)...))
kernel($(var_exprs...); $(map(esc, call_kwargs)...))
end)
else
Expand All @@ -173,6 +175,7 @@ macro cuda(ex...)
local kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
local kernel = cufunction($(esc(f)), kernel_tt;
$(map(esc, compiler_kwargs)...))
prepare_kernel(kernel; $(map(esc, env_kwargs)...))
kernel(kernel_args...; $(map(esc, call_kwargs)...))
end
end)
Expand Down Expand Up @@ -447,9 +450,25 @@ end
return ex
end

"""
    prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel)

Set up the environment of `kernel` right before it is launched. [`@cuda`](@ref)
performs this step itself; call this function directly only when launching a
kernel without the macro.

# Keywords
- `init`: a function that receives the kernel as its only argument and sets up
  the kernel's environment. Defaults to a function that does nothing.
"""
function prepare_kernel(kernel::AbstractKernel{F,TT}; init::Function=nop_init_kernel) where {F,TT}
    # Preparing a kernel currently consists solely of running its initializer.
    return init(kernel)
end

## device-side API

# There doesn't seem to be a way to access the documentation for the call-syntax,
# so attach it to the type
"""
dynamic_cufunction(f, tt=Tuple{})
Expand Down Expand Up @@ -503,3 +522,8 @@ function nearest_warpsize(dev::CuDevice, threads::Integer)
ws = CUDAdrv.warpsize(dev)
return threads + (ws - threads % ws) % ws
end

# Kernel initializer that does nothing and returns `nothing`; the default for
# kernels that need no environment set up.
nop_init_kernel(kernel::AbstractKernel{F,TT}) where {F,TT} = nothing

0 comments on commit 8ccd979

Please sign in to comment.