diff --git a/src/device/execution.jl b/src/device/execution.jl
index e6a22204..51b5bbb8 100644
--- a/src/device/execution.jl
+++ b/src/device/execution.jl
@@ -95,7 +95,8 @@ end
 function launch_configuration(backend::AbstractGPUBackend, heuristic;
                               elements::Int, elements_per_thread::Int)
     threads = clamp(elements, 1, heuristic.threads)
-    blocks = max(cld(elements, threads), 1)
+    blocks = clamp(cld(elements, threads), elements, heuristic.blocks)
+    threads = cld(elements, blocks)
 
     if elements_per_thread > 1 && blocks > heuristic.blocks
         # we want to launch more blocks than required, so prefer a grid-stride loop instead
diff --git a/test/testsuite/gpuinterface.jl b/test/testsuite/gpuinterface.jl
index 1455c732..0d74fe8d 100644
--- a/test/testsuite/gpuinterface.jl
+++ b/test/testsuite/gpuinterface.jl
@@ -5,24 +5,28 @@
     x = AT(Vector{Int}(undef, N))
     x .= 0
     gpu_call(x) do ctx, x
-        x[linear_index(ctx)] = 2
+        i = @linearidx x
+        x[i] = 2
         return
     end
     @test all(x-> x == 2, Array(x))
 
     gpu_call(x; elements=N) do ctx, x
-        x[linear_index(ctx)] = 2
+        i = @linearidx x
+        x[i] = 2
         return
     end
     @test all(x-> x == 2, Array(x))
     gpu_call(x; threads=2, blocks=(N ÷ 2)) do ctx, x
-        x[linear_index(ctx)] = threadidx(ctx)
+        i = @linearidx x
+        x[i] = threadidx(ctx)
         return
     end
     @test Array(x) == [1,2,1,2,1,2,1,2,1,2]
 
     gpu_call(x; threads=2, blocks=(N ÷ 2)) do ctx, x
-        x[linear_index(ctx)] = blockidx(ctx)
+        i = @linearidx x
+        x[i] = blockidx(ctx)
         return
     end
     @test Array(x) == [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]