pytorch · trivedivivek · Jan 6, 2025 · Jan 6, 2025 · Jan 6, 2025 · Jan 7, 2025
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -14,7 +14,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils.h"
+#include "indexing_utils_u16.h"
 
 layout(std430) buffer;
 
@@ -35,10 +35,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * output at a single output location.
  */
 void main() {
-  const ivec3 pos = ivec3(
-    gl_GlobalInvocationID.x % out_limits.x,
-    (gl_GlobalInvocationID.x / out_limits.x) % out_limits.y,
-    gl_GlobalInvocationID.x / (out_limits.x * out_limits.y));
+  const ivec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits.y);
 
   if (any(greaterThanEqual(pos, out_limits))) {
     return;

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -18,7 +18,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils.h"
+#include "indexing_utils_u16.h"
 
 layout(std430) buffer;
 
@@ -43,12 +43,9 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 void main() {
   // y divided up by batch size is used to determine 3d position
   // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z
-  const uint out_limits_y_scaled = (out_limits.y + BATCH_SIZE_Y - 1) / BATCH_SIZE_Y;
+  const int out_limits_y_scaled = (out_limits.y + BATCH_SIZE_Y - 1) / BATCH_SIZE_Y;
 
-  u16vec3 pos = u16vec3(
-    gl_GlobalInvocationID.x % out_limits.x,
-    ((gl_GlobalInvocationID.x / out_limits.x) % out_limits_y_scaled),
-    gl_GlobalInvocationID.x / (out_limits.x * out_limits_y_scaled));
+  u16vec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits_y_scaled);
 
   // scale pos.y by batch size, because that's the top pixel to be processed
   pos.y *= uint16_t(BATCH_SIZE_Y);

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -16,7 +16,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils.h"
+#include "indexing_utils_u16.h"
 
 layout(std430) buffer;
 
@@ -43,13 +43,10 @@ shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroup
  * size is only 1x1, making it easier to re-use loaded texels from t_kernel.
  */
 void main() {
-  const uvec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
+  const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
   const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
 
-  const u16vec3 gpos = u16vec3(
-    gl_GlobalInvocationID.x % out_limits_scaled.x,
-    (gl_GlobalInvocationID.x / out_limits_scaled.x) % out_limits_scaled.y,
-    gl_GlobalInvocationID.x / (out_limits_scaled.x * out_limits_scaled.y));
+  const u16vec3 gpos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
 
   // Output position for TILE_SIZE = 2
   // +--------+--------+
@@ -98,7 +95,6 @@ void main() {
     const vec4 ktex_2 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(2, 0));
     const vec4 ktex_3 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(3, 0));
 
-
 #pragma unroll
     for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
       const vec4 in_tex = texelFetch(t_in, u16vec3(ipos[i], z4), 0);

diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef INDEXING_UTILS_U16_H
+#define INDEXING_UTILS_U16_H
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+
+/*
+ * Decomposes a flat index into a 3-component position, x varying fastest:
+ *   x = idx % size_x
+ *   y = (idx / size_x) % size_y
+ *   z = (idx / size_x) / size_y
+ *
+ * Both extents are used as divisors and are converted to uint first.
+ * NOTE(review): the u16vec3 constructor narrows each component to 16 bits;
+ * presumably callers keep all extents within the uint16 range - confirm.
+ */
+u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) {
+  const uint extent_x = uint(size_x);
+  const uint extent_y = uint(size_y);
+  // What remains of idx once the x component has been divided out.
+  const uint row = idx / extent_x;
+  return u16vec3(idx % extent_x, row % extent_y, row / extent_y);
+}
+
+#endif // INDEXING_UTILS_U16_H