From 6cb90370006226c23ef55b786a89283d8fcc886a Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Thu, 9 Jan 2025 10:09:35 -0600
Subject: [PATCH] [ET-VK] Adding a common utility function to calculate 3d
 output position based on unique index. (#7564)

Pull Request resolved: https://github.com/pytorch/executorch/pull/7522

This diff adds an indexing utils header file used in the Vulkan backend of ExecuTorch. The header file provides a function for converting a global (linear) index to a u16 3D position based on the input sizes.
ghstack-source-id: 260707858
@exported-using-ghexport

Differential Revision: [D67821941](https://our.internmc.facebook.com/intern/diff/D67821941/)

Co-authored-by: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
---
 .../runtime/graph/ops/glsl/conv2d_dw.glsl     |  7 ++-----
 .../graph/ops/glsl/conv2d_dw_output_tile.glsl |  9 +++------
 .../runtime/graph/ops/glsl/conv2d_pw.glsl     | 10 +++-------
 .../graph/ops/glsl/indexing_utils_u16.h       | 19 +++++++++++++++++++
 4 files changed, 27 insertions(+), 18 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
index 43a4f7c8dc..5d7c69ab65 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -14,7 +14,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils.h"
+#include "indexing_utils_u16.h"
 
 layout(std430) buffer;
 
@@ -35,10 +35,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * output at a single output location.
  */
 void main() {
-  const ivec3 pos = ivec3(
-    gl_GlobalInvocationID.x % out_limits.x,
-    (gl_GlobalInvocationID.x / out_limits.x) % out_limits.y,
-    gl_GlobalInvocationID.x / (out_limits.x * out_limits.y));
+  const ivec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits.y);
 
   if (any(greaterThanEqual(pos, out_limits))) {
     return;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index b2ae4953a7..20fb9374be 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -18,7 +18,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils.h"
+#include "indexing_utils_u16.h"
 
 layout(std430) buffer;
 
@@ -43,12 +43,9 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 void main() {
   // y divided up by batch size is used to determine 3d position
   // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z
-  const uint out_limits_y_scaled = (out_limits.y + BATCH_SIZE_Y - 1) / BATCH_SIZE_Y;
+  const int out_limits_y_scaled = (out_limits.y + BATCH_SIZE_Y - 1) / BATCH_SIZE_Y;
 
-  u16vec3 pos = u16vec3(
-    gl_GlobalInvocationID.x % out_limits.x,
-    ((gl_GlobalInvocationID.x / out_limits.x) % out_limits_y_scaled),
-    gl_GlobalInvocationID.x / (out_limits.x * out_limits_y_scaled));
+  u16vec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits_y_scaled);
 
   // scale pos.y by batch size, because that's the top pixel to be processed
   pos.y *= uint16_t(BATCH_SIZE_Y);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
index 23ad912c11..ad5d4adb13 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -16,7 +16,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils.h"
+#include "indexing_utils_u16.h"
 
 layout(std430) buffer;
 
@@ -43,13 +43,10 @@ shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroup
  * size is only 1x1, making it easier to re-use loaded texels from t_kernel.
  */
 void main() {
-  const uvec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
+  const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
   const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
 
-  const u16vec3 gpos = u16vec3(
-    gl_GlobalInvocationID.x % out_limits_scaled.x,
-    (gl_GlobalInvocationID.x / out_limits_scaled.x) % out_limits_scaled.y,
-    gl_GlobalInvocationID.x / (out_limits_scaled.x * out_limits_scaled.y));
+  const u16vec3 gpos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
 
   // Output position for TILE_SIZE = 2
   // +--------+--------+
@@ -98,7 +95,6 @@ void main() {
     const vec4 ktex_2 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(2, 0));
     const vec4 ktex_3 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(3, 0));
 
-
 #pragma unroll
     for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
       const vec4 in_tex = texelFetch(t_in, u16vec3(ipos[i], z4), 0);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h
new file mode 100644
index 0000000000..6dc59b6303
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef INDEXING_UTILS_U16_H
+#define INDEXING_UTILS_U16_H
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+
+u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) { // linear idx -> (x, y, z) position; x varies fastest, then y
+  const uint div_by_x = idx / size_x; // count of whole x-rows preceding idx; split into y and z below
+  return u16vec3(idx % size_x, div_by_x % size_y, div_by_x / size_y); // z is not bounded here; NOTE(review): result is 16-bit per component, confirm limits fit
+}
+
+#endif // INDEXING_UTILS_U16_H