From e9674d146ce424d3ea44f8b2ffd9e9f92dfa15f7 Mon Sep 17 00:00:00 2001
From: Jack Khuu
Date: Tue, 13 Jun 2023 14:10:54 -0700
Subject: [PATCH] [Specialized Kernel] Propagate Specialized Kernel Support
 through ComputeCodegenUnboxedKernels (#103113)

Update ComputeCodegenUnboxedKernels to accept kernel information and write it
out to RegisterCodegenUnboxedKernels.cpp.

Differential Revision: [D46486195](https://our.internmc.facebook.com/intern/diff/D46486195/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/103113
Approved by: https://github.com/larryliu0820, https://github.com/kirklandsign
---
 test/edge/operator_registry.cpp           | 40 ++++-----
 test/edge/operator_registry.h             | 44 ++++-----
 .../RegisterCodegenUnboxedKernels.cpp     | 18 ++--
 test/edge/test_operator_registration.cpp  |  8 +-
 tools/test/test_executorch_gen.py         | 89 ++++++++++++++++++-
 torchgen/context.py                       | 15 +++-
 torchgen/executorch/model.py              |  7 +-
 torchgen/executorch/parse.py              |  2 +-
 torchgen/gen_executorch.py                | 69 +++++++++++---
 9 files changed, 217 insertions(+), 75 deletions(-)

diff --git a/test/edge/operator_registry.cpp b/test/edge/operator_registry.cpp
index 924a9078fed7c..765afa66e7a19 100644
--- a/test/edge/operator_registry.cpp
+++ b/test/edge/operator_registry.cpp
@@ -4,40 +4,40 @@
 namespace torch {
 namespace executor {
 
-OperatorRegistry& getOperatorRegistry() {
-  static OperatorRegistry operator_registry;
-  return operator_registry;
+KernelRegistry& getKernelRegistry() {
+  static KernelRegistry kernel_registry;
+  return kernel_registry;
 }
 
-bool register_operators(const ArrayRef<Operator>& operators) {
-  return getOperatorRegistry().register_operators(operators);
+bool register_kernels(const ArrayRef<Kernel>& kernels) {
+  return getKernelRegistry().register_kernels(kernels);
 }
 
-bool OperatorRegistry::register_operators(
-    const ArrayRef<Operator>& operators) {
-  for (const auto& op : operators) {
-    this->operators_map_[op.name_] = op.op_;
+bool KernelRegistry::register_kernels(
+    const ArrayRef<Kernel>& kernels) {
+  for (const auto& kernel : kernels) {
+    this->kernels_map_[kernel.name_] = kernel.kernel_;
   }
   return true;
 }
 
-bool hasOpsFn(const char* name) {
-  return getOperatorRegistry().hasOpsFn(name);
+bool hasKernelFn(const char* name) {
+  return getKernelRegistry().hasKernelFn(name);
 }
 
-bool OperatorRegistry::hasOpsFn(const char* name) {
-  auto op = this->operators_map_.find(name);
-  return op != this->operators_map_.end();
+bool KernelRegistry::hasKernelFn(const char* name) {
+  auto kernel = this->kernels_map_.find(name);
+  return kernel != this->kernels_map_.end();
 }
 
-OpFunction& getOpsFn(const char* name) {
-  return getOperatorRegistry().getOpsFn(name);
+KernelFunction& getKernelFn(const char* name) {
+  return getKernelRegistry().getKernelFn(name);
 }
 
-OpFunction& OperatorRegistry::getOpsFn(const char* name) {
-  auto op = this->operators_map_.find(name);
-  TORCH_CHECK_MSG(op != this->operators_map_.end(), "Operator not found!");
-  return op->second;
+KernelFunction& KernelRegistry::getKernelFn(const char* name) {
+  auto kernel = this->kernels_map_.find(name);
+  TORCH_CHECK_MSG(kernel != this->kernels_map_.end(), "Kernel not found!");
+  return kernel->second;
 }
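For orientation, the registry renamed above is just a name-to-function map where a later registration under the same name overwrites an earlier one. A minimal Python stand-in for the KernelRegistry semantics (the names here are illustrative, not part of this patch):

# Python stand-in for KernelRegistry: register_kernels fills a name -> function
# map; hasKernelFn is a membership check; a repeated name overwrites silently.
kernels_map = {}

def register_kernels(kernels):
    for name, fn in kernels:
        kernels_map[name] = fn
    return True

register_kernels([("aten::add.out", lambda context, stack: None)])
print("aten::add.out" in kernels_map)  # True, mirrors hasKernelFn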
diff --git a/test/edge/operator_registry.h b/test/edge/operator_registry.h
index 01b8d2374bccf..7ab6065cc0c17 100644
--- a/test/edge/operator_registry.h
+++ b/test/edge/operator_registry.h
@@ -11,60 +11,60 @@
 namespace torch {
 namespace executor {
 
-using OpFunction = std::function<void(RuntimeContext&, EValue**)>;
+using KernelFunction = std::function<void(RuntimeContext&, EValue**)>;
 
 template <typename T> using ArrayRef = at::ArrayRef<T>;
 
 #define EXECUTORCH_SCOPE_PROF(x)
 
-struct Operator {
+struct Kernel {
   const char* name_;
-  OpFunction op_;
+  KernelFunction kernel_;
 
-  Operator() = default;
+  Kernel() = default;
 
   /**
    * We are doing a copy of the string pointer instead of duplicating the string
-   * itself, we require the lifetime of the operator name to be at least as long
-   * as the operator registry.
+   * itself, so we require the lifetime of the kernel name to be at least as
+   * long as the kernel registry.
   */
-  explicit Operator(const char* name, OpFunction func)
-      : name_(name), op_(func) {}
+  explicit Kernel(const char* name, KernelFunction func)
+      : name_(name), kernel_(func) {}
 };
 
 /**
- * See OperatorRegistry::hasOpsFn()
+ * See KernelRegistry::hasKernelFn()
 */
-bool hasOpsFn(const char* name);
+bool hasKernelFn(const char* name);
 
 /**
- * See OperatorRegistry::getOpsFn()
+ * See KernelRegistry::getKernelFn()
 */
-OpFunction& getOpsFn(const char* name);
+KernelFunction& getKernelFn(const char* name);
 
-[[nodiscard]] bool register_operators(const ArrayRef<Operator>&);
+[[nodiscard]] bool register_kernels(const ArrayRef<Kernel>&);
 
-struct OperatorRegistry {
+struct KernelRegistry {
  public:
-  OperatorRegistry() : operatorRegSize_(0) {}
+  KernelRegistry() : kernelRegSize_(0) {}
 
-  bool register_operators(const ArrayRef<Operator>&);
+  bool register_kernels(const ArrayRef<Kernel>&);
 
   /**
-   * Checks whether an operator with a given name is registered
+   * Checks whether a kernel with a given name is registered
   */
-  bool hasOpsFn(const char* name);
+  bool hasKernelFn(const char* name);
 
   /**
-   * Checks whether an operator with a given name is registered
+   * Returns the kernel function registered under a given name
   */
-  OpFunction& getOpsFn(const char* name);
+  KernelFunction& getKernelFn(const char* name);
 
 private:
-  std::map<const char*, OpFunction> operators_map_;
-  uint32_t operatorRegSize_;
+  std::map<const char*, KernelFunction> kernels_map_;
+  uint32_t kernelRegSize_;
 };
 
 } // namespace executor
diff --git a/test/edge/templates/RegisterCodegenUnboxedKernels.cpp b/test/edge/templates/RegisterCodegenUnboxedKernels.cpp
index c96a9979a6272..b2ae23684b42a 100644
--- a/test/edge/templates/RegisterCodegenUnboxedKernels.cpp
+++ b/test/edge/templates/RegisterCodegenUnboxedKernels.cpp
@@ -1,25 +1,25 @@
 #include <operator_registry.h>
-#include "Functions.h"
+#include "${fn_header}" // Generated Function import headers
 
 namespace torch {
 namespace executor {
 namespace {
 
-using OpArrayRef = ::at::ArrayRef<::torch::executor::Operator>;
+using KernelArrayRef = ::at::ArrayRef<::torch::executor::Kernel>;
 
-static Operator operators_to_register[] = {
-    ${unboxed_ops} // Generated operators
+static Kernel kernels_to_register[] = {
+    ${unboxed_kernels} // Generated kernels
 };
 
 // Explicitly convert to ArrayRef, so that the API can take an empty C array of
-// Operators.
-static OpArrayRef op_array_ref(
-    operators_to_register,
-    operators_to_register + sizeof(operators_to_register) / sizeof(Operator));
+// Kernels.
+static KernelArrayRef kernel_array_ref(
+    kernels_to_register,
+    kernels_to_register + sizeof(kernels_to_register) / sizeof(Kernel));
 
 // Return value not used. Keep the static variable assignment to register
 // operators in static initialization time.
-static auto success_with_op_reg = register_operators(op_array_ref);
+static auto success_with_kernel_reg = register_kernels(kernel_array_ref);
 } // namespace
 } // namespace executor
 } // namespace torch
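In the template above, ${fn_header} and ${unboxed_kernels} are placeholders that the codegen fills in per output shard. A minimal sketch of that substitution, using Python's string.Template as a stand-in for torchgen's own template machinery (the env values shown are illustrative, mirroring the tests later in this patch):

from string import Template  # stand-in for torchgen's template machinery

template = Template(
    '#include "${fn_header}" // Generated Function import headers\n'
    "static Kernel kernels_to_register[] = {\n"
    "    ${unboxed_kernels} // Generated kernels\n"
    "};\n"
)

env = {
    "fn_header": "NativeFunctions.h",  # "Functions.h" when use_aten_lib=True
    "unboxed_kernels": 'Kernel("custom_1::op_1", ...),',  # one entry per kernel key
}
print(template.substitute(env))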
diff --git a/test/edge/test_operator_registration.cpp b/test/edge/test_operator_registration.cpp
index 905c5de4c8fc5..8155f9297b3a4 100644
--- a/test/edge/test_operator_registration.cpp
+++ b/test/edge/test_operator_registration.cpp
@@ -11,8 +11,8 @@ TEST(OperatorRegistrationTest, Add) {
   values[1] = EValue(at::ones({2, 3}));
   values[2] = EValue(int64_t(1));
   values[3] = EValue(at::zeros({2, 3}));
-  ASSERT_TRUE(hasOpsFn("aten::add.out"));
-  auto op = getOpsFn("aten::add.out");
+  ASSERT_TRUE(hasKernelFn("aten::add.out"));
+  auto op = getKernelFn("aten::add.out");
 
   EValue* kernel_values[4];
   for (size_t i = 0; i < 4; i++) {
@@ -33,8 +33,8 @@ TEST(OperatorRegistrationTest, CustomAdd3) {
   values[1] = EValue(at::ones({2, 3}));
   values[2] = EValue(at::ones({2, 3}));
   values[3] = EValue(at::zeros({2, 3}));
-  ASSERT_TRUE(hasOpsFn("custom::add_3.out"));
-  auto op = getOpsFn("custom::add_3.out");
+  ASSERT_TRUE(hasKernelFn("custom::add_3.out"));
+  auto op = getKernelFn("custom::add_3.out");
 
   EValue* kernel_values[4];
   for (size_t i = 0; i < 4; i++) {
diff --git a/tools/test/test_executorch_gen.py b/tools/test/test_executorch_gen.py
index e9554f5f31fd0..7711299eb7937 100644
--- a/tools/test/test_executorch_gen.py
+++ b/tools/test/test_executorch_gen.py
@@ -5,10 +5,11 @@
 
 import yaml
 
-from torchgen.executorch.model import ETKernelIndex
+from torchgen.executorch.model import ETKernelIndex, ETKernelKey
 from torchgen.gen import LineLoader
 
 from torchgen.gen_executorch import (
+    ComputeCodegenUnboxedKernels,
     gen_functions_declarations,
     parse_yaml_files,
     translate_native_yaml,
@@ -397,7 +398,6 @@ def test_aten_lib_has_context_arg(self) -> None:
             selector=SelectiveBuilder.get_nop_selector(),
             use_aten_lib=True,
         )
-        print(declarations)
         self.assertTrue(
             """
 namespace custom_1 {
@@ -411,3 +411,88 @@ def test_aten_lib_has_context_arg(self) -> None:
 """
             in declarations
         )
+
+
+class TestComputeCodegenUnboxedKernels(unittest.TestCase):
+    def setUp(self) -> None:
+        (
+            self.native_function_no_kern,
+            _,
+        ) = NativeFunction.from_yaml(
+            {
+                "func": "custom_1::op_1() -> bool",
+                "dispatch": {"CPU": "unused_kernel_1"},
+            },
+            loc=Location(__file__, 1),
+            valid_tags=set(),
+        )
+
+        self.default_kernel_key = ETKernelKey(default=True)
+        self.default_backend_metadata = BackendMetadata(
+            "default_kernel", False, "at::native"
+        )
+        self.default_kernel_entry = (
+            [self.default_kernel_key],
+            self.default_backend_metadata,
+        )
+
+    def test_codegen_unboxed_specialized(self) -> None:
+        specialized_kernel_key = ETKernelKey.gen_from_yaml(
+            {"self": ("T0", "D0"), "other": ("T0", "D0"), "out": ("T0", "D0")},
+            {"T0": ["Double"]},
+            {"D0": [0, 1, 2, 3]},
+        )
+        selector = SelectiveBuilder.get_nop_selector()
+        use_aten_lib = False
+        entry = (
+            self.native_function_no_kern,
+            (specialized_kernel_key, self.default_backend_metadata),
+        )
+
+        result = ComputeCodegenUnboxedKernels(selector, use_aten_lib)(entry)
+        # Concat used to prevent whitespace stripping
+        expected_str = (
+            """
+Kernel(
+    "custom_1::op_1",
+    "v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3",
+    [](torch::executor::RuntimeContext & context, EValue** stack) {
+        """
+            + """
+
+        EXECUTORCH_SCOPE_PROF("native_call_op_1");
+        bool result_ = at::native::default_kernel(context, );
+
+        *stack[0] = EValue(result_);
+    }
+),
+"""
+        )
+
+        self.assertEqual(expected_str, result)
+
+    def test_codegen_unboxed_default(self) -> None:
+        selector = SelectiveBuilder.get_nop_selector()
+        use_aten_lib = False
+        entry = (self.native_function_no_kern, self.default_kernel_entry)
+
+        result = ComputeCodegenUnboxedKernels(selector, use_aten_lib)(entry)
+        # Concat used to prevent whitespace stripping
+        expected_str = (
+            """
+Kernel(
+    "custom_1::op_1",
+    [](torch::executor::RuntimeContext & context, EValue** stack) {
+        """
+            + """
+
+        EXECUTORCH_SCOPE_PROF("native_call_op_1");
+        bool result_ = at::native::default_kernel(context, );
+
+        *stack[0] = EValue(result_);
+    }
+),
+"""
+        )
+
+        self.assertEqual(expected_str, result)
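A note on the expected kernel key string asserted above: "v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3" is readable once the layout is known. Assuming (this is an inference from the test inputs, not stated in the patch) that "v1" is a format version, 7 is the ScalarType index of Double, and each "|"-separated field is one tensor argument's "<dtype>;<dim order>" spec, a small decoder looks like:

from typing import List, Tuple

def decode_kernel_key(key: str) -> List[Tuple[int, List[int]]]:
    # Assumed layout: "v1/<dtype>;<dim order>|<dtype>;<dim order>|..."
    version, _, body = key.partition("/")
    assert version == "v1", "only the v1 format is handled here"
    specs = []
    for field in body.split("|"):
        dtype, _, dims = field.partition(";")
        specs.append((int(dtype), [int(d) for d in dims.split(",")]))
    return specs

# Three tensor args (self, other, out), all Double (ScalarType 7), dim order 0,1,2,3:
print(decode_kernel_key("v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3"))
# [(7, [0, 1, 2, 3]), (7, [0, 1, 2, 3]), (7, [0, 1, 2, 3])]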
diff --git a/torchgen/context.py b/torchgen/context.py
index b643890d97992..f79bde17367e8 100644
--- a/torchgen/context.py
+++ b/torchgen/context.py
@@ -1,7 +1,7 @@
 import contextlib
 import functools
 
-from typing import Callable, Dict, Iterator, Optional, TypeVar, Union
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union
 
 import torchgen.local as local
 from torchgen.model import (
@@ -33,6 +33,8 @@
     str,
 )
 
+F3 = TypeVar("F3", Tuple[NativeFunction, Any], List[NativeFunction])
+
 
 @contextlib.contextmanager
 def native_function_manager(
@@ -90,6 +92,17 @@ def wrapper(slf: S, f: F) -> T:
     return wrapper
 
 
+def method_with_nested_native_function(
+    func: Callable[[S, F3], T]
+) -> Callable[[S, F3], T]:
+    @functools.wraps(func)
+    def wrapper(slf: S, f: F3) -> T:
+        with native_function_manager(f[0]):
+            return func(slf, f)
+
+    return wrapper
+
+
 # Convenience decorator for functions that explicitly take in a BackendIndex,
 # instead of indirectly taking one in as a closure
 def with_native_function_and_index(
diff --git a/torchgen/executorch/model.py b/torchgen/executorch/model.py
index 5a3409fb53bce..cec9251a3187c 100644
--- a/torchgen/executorch/model.py
+++ b/torchgen/executorch/model.py
@@ -62,7 +62,7 @@ class ETKernelKey:
     def gen_from_yaml(
         args: Dict[str, Tuple[str, str]],
         type_alias_map: Dict[str, List[str]],  # TODO: Support unwrapped str val
-        dim_order_alias_map: Dict[str, List[str]],
+        dim_order_alias_map: Dict[str, List[int]],
     ) -> List["ETKernelKey"]:
         """Generate ETKernelKeys from arg kernel specs
 
         Multiple ETKernelKeys are returned due to dtype permutations from utilizing
@@ -194,7 +194,10 @@ def _to_backend_index(self) -> BackendIndex:
             assert (
                 len(kernel_dict.values()) == 1
             ), f"Can't convert ETKernelIndex to BackendIndex because {op} has more than one kernel. Got {kernel_dict}"
-            index[op] = kernel_dict[ETKernelKey(default=True)]
+            index[op] = kernel_dict.get(
+                ETKernelKey(default=True),
+                BackendMetadata(kernel="", structured=False, cpp_namespace=""),
+            )
         return BackendIndex(
             dispatch_key=DispatchKey.CPU,
             use_out_as_primary=False,
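The switch from kernel_dict[...] to kernel_dict.get(...) above changes behavior for an operator whose only kernel is a specialized one: the single-kernel assert still passes, but the lookup of the default key would previously raise KeyError. A minimal illustration of the fallback (plain strings stand in for ETKernelKey and BackendMetadata):

# Plain strings stand in for ETKernelKey and BackendMetadata in this sketch.
kernel_dict = {"v1/7;0,1,2,3": "specialized_metadata"}  # one kernel, not the default key

# Old: kernel_dict["default"] raises KeyError for this dict.
# New: fall back to an empty metadata placeholder instead of raising.
metadata = kernel_dict.get("default", "empty_metadata_placeholder")
print(metadata)  # empty_metadata_placeholder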
Got {kernel_dict}" - index[op] = kernel_dict[ETKernelKey(default=True)] + index[op] = kernel_dict.get( + ETKernelKey(default=True), + BackendMetadata(kernel="", structured=False, cpp_namespace=""), + ) return BackendIndex( dispatch_key=DispatchKey.CPU, use_out_as_primary=False, diff --git a/torchgen/executorch/parse.py b/torchgen/executorch/parse.py index 308d689c29129..663f6f930c54c 100644 --- a/torchgen/executorch/parse.py +++ b/torchgen/executorch/parse.py @@ -62,7 +62,7 @@ def parse_from_yaml(ei: Dict[str, object]) -> Dict[ETKernelKey, BackendMetadata] kernel_keys = ( [ETKernelKey((), default=True)] if arg_meta is None - else ETKernelKey.gen_from_yaml(arg_meta, type_alias, dim_order_alias) + else ETKernelKey.gen_from_yaml(arg_meta, type_alias, dim_order_alias) # type: ignore[arg-type] ) for kernel_key in kernel_keys: diff --git a/torchgen/gen_executorch.py b/torchgen/gen_executorch.py index e089ae7f00397..c5fc6b3c89d24 100644 --- a/torchgen/gen_executorch.py +++ b/torchgen/gen_executorch.py @@ -11,7 +11,11 @@ from torchgen import dest from torchgen.api import cpp as aten_cpp from torchgen.api.types import CppSignature, CppSignatureGroup, CType, NamedCType -from torchgen.context import method_with_native_function, with_native_function_and_index +from torchgen.context import ( + method_with_native_function, + method_with_nested_native_function, + with_native_function_and_index, +) from torchgen.executorch.api import et_cpp from torchgen.executorch.api.custom_ops import ( ComputeNativeFunctionStub, @@ -19,7 +23,7 @@ ) from torchgen.executorch.api.types import contextArg, ExecutorchCppSignature from torchgen.executorch.api.unboxing import Unboxing -from torchgen.executorch.model import ETKernelIndex, ETParsedYaml +from torchgen.executorch.model import ETKernelIndex, ETKernelKey, ETParsedYaml from torchgen.executorch.parse import ET_FIELDS, parse_et_yaml, parse_et_yaml_struct from torchgen.gen import ( get_custom_build_selector, @@ -155,8 +159,16 @@ class ComputeCodegenUnboxedKernels: use_aten_lib: bool - @method_with_native_function - def __call__(self, f: NativeFunction) -> str: + @method_with_nested_native_function + def __call__( + self, + unbox_kernel_entry: Tuple[NativeFunction, Tuple[ETKernelKey, BackendMetadata]], + ) -> str: + f: NativeFunction = unbox_kernel_entry[0] + kernel_key: Union[ETKernelKey, List[ETKernelKey]] = unbox_kernel_entry[1][0] + kernel_meta: BackendMetadata = unbox_kernel_entry[1][1] + + # TODO: Update to use Kernel Selector if not self.selector.is_root_operator(f"{f.namespace}::{f.func.name}"): return "" sig: Union[CppSignature, ExecutorchCppSignature] @@ -169,11 +181,13 @@ def __call__(self, f: NativeFunction) -> str: argument_type_gen = aten_cpp.argumenttype_type return_type_gen = aten_cpp.returns_type arguments = sig.arguments() + kernel_call = f"torch::executor::{f.namespace}::{sig.name()}" else: sig = ExecutorchCppSignature.from_native_function(f) argument_type_gen = et_cpp.argumenttype_type return_type_gen = et_cpp.returns_type arguments = sig.arguments(include_context=False) + kernel_call = f"{kernel_meta.cpp_namespace}::{kernel_meta.kernel}" # parse arguments into C++ code binding_list, code_list = Unboxing( argument_type_gen=argument_type_gen @@ -203,19 +217,28 @@ def __call__(self, f: NativeFunction) -> str: return_assignment = "" ret_prefix = "" - return f""" -Operator( - "{f.namespace}::{f.func.name}", + if not isinstance(kernel_key, list): + kernel_key = [kernel_key] + + newline = "\n " + return "\n".join( + [ + f""" +Kernel( + 
"{f.namespace}::{f.func.name}",{newline + '"' + (k.to_native_string() + '",') if k.to_native_string() != 'default' else ''} []({contextArg.defn()}, EValue** stack) {{ {code_connector.join(code_list)} EXECUTORCH_SCOPE_PROF("native_call_{f.func.name}"); - {ret_prefix}torch::executor::{f.namespace}::{sig.name()}({"context, "}{args_str}); + {ret_prefix}{kernel_call}(context, {args_str}); {return_assignment} }} ), """ + for k in kernel_key + ] + ) def gen_unboxing( @@ -224,19 +247,36 @@ def gen_unboxing( cpu_fm: FileManager, selector: SelectiveBuilder, use_aten_lib: bool, + kernel_index: ETKernelIndex, ) -> None: - def key_func(fn: Union[NativeFunction, NativeFunctionsGroup]) -> str: - return fn.root_name + # Iterable type for write_sharded is a Tuple of (native_function, (kernel_key, metadata)) + def key_func( + item: Tuple[NativeFunction, Tuple[ETKernelKey, BackendMetadata]] + ) -> str: + return item[0].root_name + ":" + item[1][0].to_native_string() + + items: List[Tuple[NativeFunction, Tuple[ETKernelKey, BackendMetadata]]] = [ + (native_function, (kernel_key, metadata)) + for native_function in native_functions + for kernel_key, metadata in kernel_index.get_kernels(native_function).items() + ] + + header = ["Functions.h" if use_aten_lib else "NativeFunctions.h"] cpu_fm.write_sharded( "RegisterCodegenUnboxedKernels.cpp", - native_functions, + items, key_fn=key_func, - env_callable=lambda fn: { - "unboxed_ops": [ComputeCodegenUnboxedKernels(selector, use_aten_lib)(fn)], + env_callable=lambda unbox_kernel_entry: { + "unboxed_kernels": [ + ComputeCodegenUnboxedKernels(selector, use_aten_lib)(unbox_kernel_entry) + ], + "fn_header": header + if unbox_kernel_entry == items[0] + else [], # Only write header once }, num_shards=1, - sharded_keys={"unboxed_ops"}, + sharded_keys={"unboxed_kernels", "fn_header"}, ) @@ -853,6 +893,7 @@ def main() -> None: cpu_fm=cpu_fm, selector=selector, use_aten_lib=options.use_aten_lib, + kernel_index=kernel_index, ) if custom_ops_native_functions: gen_custom_ops(