Add index select backward #359

Open

wants to merge 34 commits into base: master

Changes from 28 of 34 commits

Commits:
51e16fd init-flaggems (ph0375, Dec 11, 2024)
f5dd979 init-ops (ph0375, Dec 11, 2024)
0afb6bc index_select.py (ph0375, Dec 11, 2024)
00acd52 Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 11, 2024)
d347788 update (ph0375, Dec 11, 2024)
f47272b init_update (ph0375, Dec 11, 2024)
0caa0a8 Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 11, 2024)
414fe76 Update __init__.py (ph0375, Dec 11, 2024)
3aa39b3 autotune (ph0375, Dec 11, 2024)
4b6ad02 Update tune_configs.yaml (ph0375, Dec 11, 2024)
fa976f0 index_select_backward unit test (shuailong616, Dec 12, 2024)
a2e6e01 Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 12, 2024)
3ef71f4 Update test_select_and_slice_perf.py (ph0375, Dec 12, 2024)
1ac8878 Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 12, 2024)
1376320 change autotune config (shuailong616, Dec 13, 2024)
b685f3c code format change (shuailong616, Dec 13, 2024)
6dfb8b7 Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 16, 2024)
378cecf Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 17, 2024)
084e409 Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 17, 2024)
acb1e59 add_index_select_backward (henghengxiedaima, Dec 17, 2024)
1f94774 Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 18, 2024)
a19634d add_index_select_backward (henghengxiedaima, Dec 18, 2024)
80777b3 add_index_select_backward (henghengxiedaima, Dec 18, 2024)
d8b1441 Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 18, 2024)
3d59b87 solve conflict (shuailong616, Dec 19, 2024)
3a27f33 add_index_select_backward (henghengxiedaima, Dec 20, 2024)
c3fd06f add_index_select_backward (henghengxiedaima, Dec 20, 2024)
3c64858 add_index_select_backward (henghengxiedaima, Dec 20, 2024)
7770b1a add_index_select (henghengxiedaima, Dec 24, 2024)
bafa7ef Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Dec 30, 2024)
6515c14 add_index_select_backward (henghengxiedaima, Dec 30, 2024)
815360e add_index_select_backward (henghengxiedaima, Dec 31, 2024)
08992ec add_index_select_backward (henghengxiedaima, Jan 14, 2025)
386fbdd Merge branch 'FlagOpen:master' into add_index_select_backward (AdvancedCompiler, Jan 22, 2025)
24 changes: 24 additions & 0 deletions benchmark/test_select_and_slice_perf.py
@@ -223,6 +223,30 @@ def select_scatter_input_fn(shape, dtype, device):
    bench.run()


@pytest.mark.index_select_backward
def test_perf_index_select_backward():
    def index_select_backward_input_fn(shape, dtype, device):
        inp = generate_tensor_input(shape, dtype, device)
        threshold = 0.1
        dim = 0
        index_size = inp.size(dim)
        from math import floor

        index = torch.randint(
            0, index_size, [floor(index_size * threshold)], device=device
        )
        yield inp, dim, index

    bench = TensorSelectBenchmark(
        input_fn=index_select_backward_input_fn,
        op_name="index_select_backward",
        torch_op=torch.index_select,
        dtypes=FLOAT_DTYPES,
        is_backward=True,
    )
    bench.run()


@pytest.mark.index_add
def test_index_add_perf():
    def index_add_input_fn(shape, dtype, device):
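For context, the benchmarked workload can be reproduced without the harness. Below is a minimal standalone sketch in plain PyTorch (the 1024x1024 shape is a hypothetical example; the 10% index density mirrors threshold = 0.1 in index_select_backward_input_fn above):

import torch

# Forward index_select followed by a backward pass; the backward pass is what this PR accelerates.
inp = torch.randn(1024, 1024, device="cuda", dtype=torch.float16, requires_grad=True)
dim = 0
index = torch.randint(0, inp.size(dim), [inp.size(dim) // 10], device="cuda")

out = torch.index_select(inp, dim, index)
out.backward(torch.ones_like(out))  # grad w.r.t. inp is produced by index_select's backward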
1 change: 1 addition & 0 deletions src/flag_gems/__init__.py
@@ -166,6 +166,7 @@ def enable(lib=aten_lib, unused=None, registrar=registrar):
        ("slice_scatter", slice_scatter, Autograd.disable),
        ("select_scatter", select_scatter, Autograd.disable),
        ("index_select", index_select, Autograd.disable),
        ("index_select_backward", index_select_backward, Autograd.disable),
        ("tile", tile, Autograd.disable),
        ("masked_fill.Tensor", masked_fill, Autograd.disable),
        ("masked_fill.Scalar", masked_fill, Autograd.disable),
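With this registration, the backward of index_select is routed to the Triton implementation whenever flag_gems is active. A small usage sketch with hypothetical toy shapes, mirroring the pattern used in the accuracy test below:

import torch
import flag_gems

x = torch.randn(64, 32, device="cuda", requires_grad=True)
idx = torch.tensor([0, 3, 3, 7], device="cuda")
with flag_gems.use_gems():
    y = torch.index_select(x, 0, idx)
    # The repeated index 3 exercises gradient accumulation in the backward kernel.
    (grad_x,) = torch.autograd.grad(y, x, torch.ones_like(y))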
3 changes: 2 additions & 1 deletion src/flag_gems/ops/__init__.py
@@ -45,7 +45,7 @@
from .gt import gt, gt_scalar
from .hstack import hstack
from .index_add import index_add
from .index_select import index_select
from .index_select import index_select, index_select_backward
from .instancenorm import instance_norm
from .isclose import allclose, isclose
from .isfinite import isfinite
@@ -186,6 +186,7 @@
    "gt",
    "gt_scalar",
    "index_select",
    "index_select_backward",
    "instance_norm",
    "isclose",
    "isfinite",
81 changes: 81 additions & 0 deletions src/flag_gems/ops/index_select.py
@@ -1,9 +1,11 @@
import logging
import math

import torch
import triton
import triton.language as tl

from .. import runtime
from ..utils import dim_compress, libentry
from ..utils import triton_lang_extension as tle

@@ -75,3 +77,82 @@ def index_select(inp, dim, index):
        return out.permute(order)
    else:
        return out


def dim_compress_backward(inp, dims):
    if isinstance(dims, int):
        dims = [dims]
    dim = inp.ndim
    stride = inp.stride()
    batch_dim = [i for i in range(dim) if i not in dims]
    sorted_reduction_dim = sorted(dims, key=lambda x: stride[x], reverse=True)
    order = sorted_reduction_dim + batch_dim
    return inp.permute(order).contiguous()


# kernel
@libentry()
@triton.autotune(
    configs=runtime.get_triton_config("index_select_backward"), key=["M", "N"]
)
@triton.jit
def index_select_backward_kernel(
    grad,
    out,
    M,
    N,
    num_blocks_per_CTA,
    index,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    pid_x = tle.program_id(axis=0)
    pid_y = tle.program_id(axis=1)
    rows_offsets = pid_x * BLOCK_M + tl.arange(0, BLOCK_M)
    cols_offsets = pid_y * BLOCK_N + tl.arange(0, BLOCK_N)

    grad_mask = (rows_offsets < M) and (cols_offsets < N)
    indices = tl.load(index + rows_offsets, mask=(rows_offsets < M), other=0)

    for i in range(0, num_blocks_per_CTA):
        grad_off = (pid_x * num_blocks_per_CTA + i) * N + cols_offsets
        out_off = (indices * num_blocks_per_CTA + i) * N + cols_offsets
        selected = tl.load(grad + grad_off, mask=grad_mask, other=0.0)
        tl.atomic_add(out + out_off, selected, mask=grad_mask)


# function
def index_select_backward(grad, self_sizes, dim, index):
    logging.debug("GEMS INDEX SELECT BACKWARD")
    assert dim >= -len(self_sizes) and dim < len(self_sizes), "Invalid dim"
    assert index.ndim <= 1, "Index should have dimension 1 or 0"
    if index.ndim == 0:
        index = index.unsqueeze(0)
    index_shape = list(index.shape)
    dim = dim % len(self_sizes)
    grad_shape = list(grad.shape)
    assert grad_shape[dim] == index_shape[0], "Index out of range"
    grad = dim_compress_backward(grad, dim)
    grad_shape = list(grad.shape)
    out_shape = list(grad.shape)
    shape_for_block_counting = list(grad.shape[1:])
    shape_for_block_counting[-1] = 1
    num_blocks_per_CTA = math.prod(shape_for_block_counting)
    N = grad_shape[grad.ndim - 1]
    M = grad.numel() // N // num_blocks_per_CTA
    out_shape[0] = self_sizes[dim]
    grad_type = grad.dtype
    grad = grad.to(torch.float32)
    out = torch.zeros(out_shape, dtype=torch.float32, device=grad.device)
    grid = lambda meta: (
        triton.cdiv(M, meta["BLOCK_M"]),
        triton.cdiv(N, meta["BLOCK_N"]),
    )
    index_select_backward_kernel[grid](grad, out, M, N, num_blocks_per_CTA, index)
    out = out.to(grad_type)
    if dim != 0:
        order = [i for i in range(1, out.ndim)]
        order.insert(dim, 0)
        return out.permute(order)
    else:
        return out
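In eager-mode terms, the kernel's contract is: each gradient slice is scattered back to the input row it was selected from, and rows selected more than once accumulate, which is why the kernel uses tl.atomic_add and accumulates in float32 before casting back. A minimal reference sketch (not part of the PR, using only public PyTorch ops):

import torch

def index_select_backward_reference(grad, self_sizes, dim, index):
    # Zero tensor with the original input's shape; index_add scatters the gradient
    # slices back along `dim` and accumulates over repeated indices.
    out = torch.zeros(self_sizes, dtype=grad.dtype, device=grad.device)
    return out.index_add(dim, index, grad)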
11 changes: 11 additions & 0 deletions src/flag_gems/runtime/backend/_nvidia/tune_configs.yaml
@@ -924,3 +924,14 @@ var_mean:
    - 4
    - 8
    - 16
index_select_backward:
- gen: true
  param_map:
    META:
      BLOCK_M: block_m
      BLOCK_N: block_n
    num_warps: 4
  block_m:
    - 1
  block_n:
    - 1024
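With a single block_m and block_n value the generated search space contains just one candidate; assuming the usual expansion performed by runtime.get_triton_config, the equivalent hand-written autotune list would look roughly like this sketch:

import triton

# One candidate: 1 index row per program, 1024 columns per tile, 4 warps.
configs = [
    triton.Config({"BLOCK_M": 1, "BLOCK_N": 1024}, num_warps=4),
]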
29 changes: 29 additions & 0 deletions tests/test_reduction_ops.py
@@ -689,6 +689,35 @@ def test_accuracy_index_select(shape, dim, dtype):
    gems_assert_equal(res_out, ref_out)


@pytest.mark.index_select_backward
@pytest.mark.parametrize("shape", REDUCTION_SMALL_SHAPES)
@pytest.mark.parametrize("dim", DIM_LIST)
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
def test_accuracy_index_select_backward(shape, dim, dtype):
    inp = torch.randn(shape, dtype=dtype, device="cuda", requires_grad=True)
    ref_inp = to_reference(inp)
    from math import floor

    index_size = inp.size(dim)
    index = torch.randint(0, index_size, [floor(index_size * 0.1)], device="cuda")
    if len(index) == 0:
        pass
    else:
        ref_index = to_reference(index)
        ref_out = torch.index_select(ref_inp, dim, ref_index)
        with flag_gems.use_gems():
            res_out = torch.index_select(inp, dim, index)
        out_grad = torch.randn_like(res_out)
        ref_grad = to_reference(out_grad)
        (ref_in_grad,) = torch.autograd.grad(ref_out, ref_inp, ref_grad)
        with flag_gems.use_gems():
            (res_in_grad,) = torch.autograd.grad(res_out, inp, out_grad)
        res_out = to_reference(res_out)
        res_in_grad = to_reference(res_in_grad)
        gems_assert_close(res_out, ref_out, dtype)
        gems_assert_close(res_in_grad, ref_in_grad, dtype)


@pytest.mark.masked_select
@pytest.mark.parametrize("threshold, shape", THRESHOLD_SHAPE)
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
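Note that the len(index) == 0 branch silently skips shapes whose selected dimension has fewer than 10 elements (floor(index_size * 0.1) == 0). The expected behavior for an empty index, checked here with plain PyTorch rather than the Gems op, is an all-zero gradient:

import torch

x = torch.randn(8, 4, device="cuda", requires_grad=True)
empty_idx = torch.empty(0, dtype=torch.long, device="cuda")
y = torch.index_select(x, 0, empty_idx)  # shape (0, 4)
y.sum().backward()
assert torch.equal(x.grad, torch.zeros_like(x))  # no rows selected, so the gradient is zero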