diff --git a/src/device/execution.jl b/src/device/execution.jl index e6a22204..51b5bbb8 100644 --- a/src/device/execution.jl +++ b/src/device/execution.jl @@ -95,7 +95,8 @@ end function launch_configuration(backend::AbstractGPUBackend, heuristic; elements::Int, elements_per_thread::Int) threads = clamp(elements, 1, heuristic.threads) - blocks = max(cld(elements, threads), 1) + blocks = clamp(cld(elements, threads), elements, heuristic.blocks) + threads = cld(elements, blocks) if elements_per_thread > 1 && blocks > heuristic.blocks # we want to launch more blocks than required, so prefer a grid-stride loop instead diff --git a/test/testsuite/gpuinterface.jl b/test/testsuite/gpuinterface.jl index 1455c732..0d74fe8d 100644 --- a/test/testsuite/gpuinterface.jl +++ b/test/testsuite/gpuinterface.jl @@ -5,24 +5,28 @@ x = AT(Vector{Int}(undef, N)) x .= 0 gpu_call(x) do ctx, x - x[linear_index(ctx)] = 2 + i = @linearidx x + x[i] = 2 return end @test all(x-> x == 2, Array(x)) gpu_call(x; elements=N) do ctx, x - x[linear_index(ctx)] = 2 + i = @linearidx x + x[i] = 2 return end @test all(x-> x == 2, Array(x)) gpu_call(x; threads=2, blocks=(N ÷ 2)) do ctx, x - x[linear_index(ctx)] = threadidx(ctx) + i = @linearidx x + x[i] = threadidx(ctx) return end @test Array(x) == [1,2,1,2,1,2,1,2,1,2] gpu_call(x; threads=2, blocks=(N ÷ 2)) do ctx, x - x[linear_index(ctx)] = blockidx(ctx) + i = @linearidx x + x[i] = blockidx(ctx) return end @test Array(x) == [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]