From e9674d146ce424d3ea44f8b2ffd9e9f92dfa15f7 Mon Sep 17 00:00:00 2001
From: Jack Khuu
Date: Tue, 13 Jun 2023 14:10:54 -0700
Subject: [PATCH] [Specialized Kernel] Propagate Specialized Kernel Support
 through ComputeCodegenUnboxedKernels (#103113)

Update ComputeCodegenUnboxedKernels to accept kernel information and write it
out to RegisterCodegenUnboxedKernels.cpp.

Differential Revision: [D46486195](https://our.internmc.facebook.com/intern/diff/D46486195/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/103113
Approved by: https://github.com/larryliu0820, https://github.com/kirklandsign
---
 test/edge/operator_registry.cpp           | 40 ++++-----
 test/edge/operator_registry.h             | 44 ++++-----
 .../RegisterCodegenUnboxedKernels.cpp     | 18 ++--
 test/edge/test_operator_registration.cpp  |  8 +-
 tools/test/test_executorch_gen.py         | 89 ++++++++++++++++++-
 torchgen/context.py                       | 15 +++-
 torchgen/executorch/model.py              |  7 +-
 torchgen/executorch/parse.py              |  2 +-
 torchgen/gen_executorch.py                | 69 +++++++++++---
 9 files changed, 217 insertions(+), 75 deletions(-)

diff --git a/test/edge/operator_registry.cpp b/test/edge/operator_registry.cpp
index 924a9078fed7c..765afa66e7a19 100644
--- a/test/edge/operator_registry.cpp
+++ b/test/edge/operator_registry.cpp
@@ -4,40 +4,40 @@
 namespace torch {
 namespace executor {
 
-OperatorRegistry& getOperatorRegistry() {
-  static OperatorRegistry operator_registry;
-  return operator_registry;
+KernelRegistry& getKernelRegistry() {
+  static KernelRegistry kernel_registry;
+  return kernel_registry;
 }
 
-bool register_operators(const ArrayRef<Operator>& operators) {
-  return getOperatorRegistry().register_operators(operators);
+bool register_kernels(const ArrayRef<Kernel>& kernels) {
+  return getKernelRegistry().register_kernels(kernels);
 }
 
-bool OperatorRegistry::register_operators(
-    const ArrayRef<Operator>& operators) {
-  for (const auto& op : operators) {
-    this->operators_map_[op.name_] = op.op_;
+bool KernelRegistry::register_kernels(
+    const ArrayRef<Kernel>& kernels) {
+  for (const auto& kernel : kernels) {
+    this->kernels_map_[kernel.name_] = kernel.kernel_;
   }
   return true;
 }
 
-bool hasOpsFn(const char* name) {
-  return getOperatorRegistry().hasOpsFn(name);
+bool hasKernelFn(const char* name) {
+  return getKernelRegistry().hasKernelFn(name);
 }
 
-bool OperatorRegistry::hasOpsFn(const char* name) {
-  auto op = this->operators_map_.find(name);
-  return op != this->operators_map_.end();
+bool KernelRegistry::hasKernelFn(const char* name) {
+  auto kernel = this->kernels_map_.find(name);
+  return kernel != this->kernels_map_.end();
 }
 
-OpFunction& getOpsFn(const char* name) {
-  return getOperatorRegistry().getOpsFn(name);
+KernelFunction& getKernelFn(const char* name) {
+  return getKernelRegistry().getKernelFn(name);
 }
 
-OpFunction& OperatorRegistry::getOpsFn(const char* name) {
-  auto op = this->operators_map_.find(name);
-  TORCH_CHECK_MSG(op != this->operators_map_.end(), "Operator not found!");
-  return op->second;
+KernelFunction& KernelRegistry::getKernelFn(const char* name) {
+  auto kernel = this->kernels_map_.find(name);
+  TORCH_CHECK_MSG(kernel != this->kernels_map_.end(), "Kernel not found!");
+  return kernel->second;
 }
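For orientation, the registry renamed above is just a name-to-function map where a later registration under the same name overwrites an earlier one. A minimal Python stand-in for the KernelRegistry semantics (the names here are illustrative, not part of this patch):

# Python stand-in for KernelRegistry: register_kernels fills a name -> function
# map; hasKernelFn is a membership check; a repeated name overwrites silently.
kernels_map = {}

def register_kernels(kernels):
    for name, fn in kernels:
        kernels_map[name] = fn
    return True

register_kernels([("aten::add.out", lambda context, stack: None)])
print("aten::add.out" in kernels_map)  # True, mirrors hasKernelFn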
diff --git a/test/edge/operator_registry.h b/test/edge/operator_registry.h
index 01b8d2374bccf..7ab6065cc0c17 100644
--- a/test/edge/operator_registry.h
+++ b/test/edge/operator_registry.h
@@ -11,60 +11,60 @@
 namespace torch {
 namespace executor {
 
-using OpFunction = std::function<void(RuntimeContext&, EValue**)>;
+using KernelFunction = std::function<void(RuntimeContext&, EValue**)>;
 
 template <typename T> using ArrayRef = at::ArrayRef<T>;
 
 #define EXECUTORCH_SCOPE_PROF(x)
 
-struct Operator {
+struct Kernel {
   const char* name_;
-  OpFunction op_;
+  KernelFunction kernel_;
 
-  Operator() = default;
+  Kernel() = default;
 
   /**
    * We are doing a copy of the string pointer instead of duplicating the string
-   * itself, we require the lifetime of the operator name to be at least as long
-   * as the operator registry.
+   * itself, so we require the lifetime of the kernel name to be at least as
+   * long as the kernel registry.
   */
-  explicit Operator(const char* name, OpFunction func)
-      : name_(name), op_(func) {}
+  explicit Kernel(const char* name, KernelFunction func)
+      : name_(name), kernel_(func) {}
 };
 
 /**
- * See OperatorRegistry::hasOpsFn()
+ * See KernelRegistry::hasKernelFn()
 */
-bool hasOpsFn(const char* name);
+bool hasKernelFn(const char* name);
 
 /**
- * See OperatorRegistry::getOpsFn()
+ * See KernelRegistry::getKernelFn()
 */
-OpFunction& getOpsFn(const char* name);
+KernelFunction& getKernelFn(const char* name);
 
-[[nodiscard]] bool register_operators(const ArrayRef<Operator>&);
+[[nodiscard]] bool register_kernels(const ArrayRef<Kernel>&);
 
-struct OperatorRegistry {
+struct KernelRegistry {
  public:
-  OperatorRegistry() : operatorRegSize_(0) {}
+  KernelRegistry() : kernelRegSize_(0) {}
 
-  bool register_operators(const ArrayRef<Operator>&);
+  bool register_kernels(const ArrayRef<Kernel>&);
 
   /**
-   * Checks whether an operator with a given name is registered
+   * Checks whether a kernel with a given name is registered
   */
-  bool hasOpsFn(const char* name);
+  bool hasKernelFn(const char* name);
 
   /**
-   * Checks whether an operator with a given name is registered
+   * Returns the kernel function registered under a given name
   */
-  OpFunction& getOpsFn(const char* name);
+  KernelFunction& getKernelFn(const char* name);
 
 private:
-  std::map<const char*, OpFunction> operators_map_;
-  uint32_t operatorRegSize_;
+  std::map<const char*, KernelFunction> kernels_map_;
+  uint32_t kernelRegSize_;
 };
 
 } // namespace executor
diff --git a/test/edge/templates/RegisterCodegenUnboxedKernels.cpp b/test/edge/templates/RegisterCodegenUnboxedKernels.cpp
index c96a9979a6272..b2ae23684b42a 100644
--- a/test/edge/templates/RegisterCodegenUnboxedKernels.cpp
+++ b/test/edge/templates/RegisterCodegenUnboxedKernels.cpp
@@ -1,25 +1,25 @@
 #include <operator_registry.h>
-#include "Functions.h"
+#include "${fn_header}" // Generated Function import headers
 
 namespace torch {
 namespace executor {
 namespace {
 
-using OpArrayRef = ::at::ArrayRef<::torch::executor::Operator>;
+using KernelArrayRef = ::at::ArrayRef<::torch::executor::Kernel>;
 
-static Operator operators_to_register[] = {
-    ${unboxed_ops} // Generated operators
+static Kernel kernels_to_register[] = {
+    ${unboxed_kernels} // Generated kernels
 };
 
 // Explicitly convert to ArrayRef, so that the API can take an empty C array of
-// Operators.
-static OpArrayRef op_array_ref(
-    operators_to_register,
-    operators_to_register + sizeof(operators_to_register) / sizeof(Operator));
+// Kernels.
+static KernelArrayRef kernel_array_ref(
+    kernels_to_register,
+    kernels_to_register + sizeof(kernels_to_register) / sizeof(Kernel));
 
 // Return value not used. Keep the static variable assignment to register
 // operators in static initialization time.
-static auto success_with_op_reg = register_operators(op_array_ref);
+static auto success_with_kernel_reg = register_kernels(kernel_array_ref);
 } // namespace
 } // namespace executor
 } // namespace torch
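In the template above, ${fn_header} and ${unboxed_kernels} are placeholders that the codegen fills in per output shard. A minimal sketch of that substitution, using Python's string.Template as a stand-in for torchgen's own template machinery (the env values shown are illustrative, mirroring the tests later in this patch):

from string import Template  # stand-in for torchgen's template machinery

template = Template(
    '#include "${fn_header}" // Generated Function import headers\n'
    "static Kernel kernels_to_register[] = {\n"
    "    ${unboxed_kernels} // Generated kernels\n"
    "};\n"
)

env = {
    "fn_header": "NativeFunctions.h",  # "Functions.h" when use_aten_lib=True
    "unboxed_kernels": 'Kernel("custom_1::op_1", ...),',  # one entry per kernel key
}
print(template.substitute(env))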
diff --git a/test/edge/test_operator_registration.cpp b/test/edge/test_operator_registration.cpp
index 905c5de4c8fc5..8155f9297b3a4 100644
--- a/test/edge/test_operator_registration.cpp
+++ b/test/edge/test_operator_registration.cpp
@@ -11,8 +11,8 @@ TEST(OperatorRegistrationTest, Add) {
   values[1] = EValue(at::ones({2, 3}));
   values[2] = EValue(int64_t(1));
   values[3] = EValue(at::zeros({2, 3}));
-  ASSERT_TRUE(hasOpsFn("aten::add.out"));
-  auto op = getOpsFn("aten::add.out");
+  ASSERT_TRUE(hasKernelFn("aten::add.out"));
+  auto op = getKernelFn("aten::add.out");
 
   EValue* kernel_values[4];
   for (size_t i = 0; i < 4; i++) {
@@ -33,8 +33,8 @@ TEST(OperatorRegistrationTest, CustomAdd3) {
   values[1] = EValue(at::ones({2, 3}));
   values[2] = EValue(at::ones({2, 3}));
   values[3] = EValue(at::zeros({2, 3}));
-  ASSERT_TRUE(hasOpsFn("custom::add_3.out"));
-  auto op = getOpsFn("custom::add_3.out");
+  ASSERT_TRUE(hasKernelFn("custom::add_3.out"));
+  auto op = getKernelFn("custom::add_3.out");
 
   EValue* kernel_values[4];
   for (size_t i = 0; i < 4; i++) {
diff --git a/tools/test/test_executorch_gen.py b/tools/test/test_executorch_gen.py
index e9554f5f31fd0..7711299eb7937 100644
--- a/tools/test/test_executorch_gen.py
+++ b/tools/test/test_executorch_gen.py
@@ -5,10 +5,11 @@
 
 import yaml
 
-from torchgen.executorch.model import ETKernelIndex
+from torchgen.executorch.model import ETKernelIndex, ETKernelKey
 from torchgen.gen import LineLoader
 
 from torchgen.gen_executorch import (
+    ComputeCodegenUnboxedKernels,
     gen_functions_declarations,
     parse_yaml_files,
     translate_native_yaml,
@@ -397,7 +398,6 @@ def test_aten_lib_has_context_arg(self) -> None:
             selector=SelectiveBuilder.get_nop_selector(),
             use_aten_lib=True,
         )
-        print(declarations)
         self.assertTrue(
             """
 namespace custom_1 {
@@ -411,3 +411,88 @@ def test_aten_lib_has_context_arg(self) -> None:
 """
             in declarations
         )
+
+
+class TestComputeCodegenUnboxedKernels(unittest.TestCase):
+    def setUp(self) -> None:
+        (
+            self.native_function_no_kern,
+            _,
+        ) = NativeFunction.from_yaml(
+            {
+                "func": "custom_1::op_1() -> bool",
+                "dispatch": {"CPU": "unused_kernel_1"},
+            },
+            loc=Location(__file__, 1),
+            valid_tags=set(),
+        )
+
+        self.default_kernel_key = ETKernelKey(default=True)
+        self.default_backend_metadata = BackendMetadata(
+            "default_kernel", False, "at::native"
+        )
+        self.default_kernel_entry = (
+            [self.default_kernel_key],
+            self.default_backend_metadata,
+        )
+
+    def test_codegen_unboxed_specialized(self) -> None:
+        specialized_kernel_key = ETKernelKey.gen_from_yaml(
+            {"self": ("T0", "D0"), "other": ("T0", "D0"), "out": ("T0", "D0")},
+            {"T0": ["Double"]},
+            {"D0": [0, 1, 2, 3]},
+        )
+        selector = SelectiveBuilder.get_nop_selector()
+        use_aten_lib = False
+        entry = (
+            self.native_function_no_kern,
+            (specialized_kernel_key, self.default_backend_metadata),
+        )
+
+        result = ComputeCodegenUnboxedKernels(selector, use_aten_lib)(entry)
+        # Concat used to prevent whitespace stripping
+        expected_str = (
+            """
+Kernel(
+    "custom_1::op_1",
+    "v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3",
+    [](torch::executor::RuntimeContext & context, EValue** stack) {
+        """
+            + """
+
+        EXECUTORCH_SCOPE_PROF("native_call_op_1");
+        bool result_ = at::native::default_kernel(context, );
+
+        *stack[0] = EValue(result_);
+    }
+),
+"""
+        )
+
+        self.assertEqual(expected_str, result)
+
+    def test_codegen_unboxed_default(self) -> None:
+        selector = SelectiveBuilder.get_nop_selector()
+        use_aten_lib = False
+        entry = (self.native_function_no_kern, self.default_kernel_entry)
+
+        result = ComputeCodegenUnboxedKernels(selector, use_aten_lib)(entry)
+        # Concat used to prevent whitespace stripping
+        expected_str = (
+            """
+Kernel(
+    "custom_1::op_1",
+    [](torch::executor::RuntimeContext & context, EValue** stack) {
+        """
+            + """
+
+        EXECUTORCH_SCOPE_PROF("native_call_op_1");
+        bool result_ = at::native::default_kernel(context, );
+
+        *stack[0] = EValue(result_);
+    }
+),
+"""
+        )
+
+        self.assertEqual(expected_str, result)
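A note on the expected kernel key string asserted above: "v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3" is readable once the layout is known. Assuming (this is an inference from the test inputs, not stated in the patch) that "v1" is a format version, 7 is the ScalarType index of Double, and each "|"-separated field is one tensor argument's "<dtype>;<dim order>" spec, a small decoder looks like:

from typing import List, Tuple

def decode_kernel_key(key: str) -> List[Tuple[int, List[int]]]:
    # Assumed layout: "v1/<dtype>;<dim order>|<dtype>;<dim order>|..."
    version, _, body = key.partition("/")
    assert version == "v1", "only the v1 format is handled here"
    specs = []
    for field in body.split("|"):
        dtype, _, dims = field.partition(";")
        specs.append((int(dtype), [int(d) for d in dims.split(",")]))
    return specs

# Three tensor args (self, other, out), all Double (ScalarType 7), dim order 0,1,2,3:
print(decode_kernel_key("v1/7;0,1,2,3|7;0,1,2,3|7;0,1,2,3"))
# [(7, [0, 1, 2, 3]), (7, [0, 1, 2, 3]), (7, [0, 1, 2, 3])]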
diff --git a/torchgen/context.py b/torchgen/context.py
index b643890d97992..f79bde17367e8 100644
--- a/torchgen/context.py
+++ b/torchgen/context.py
@@ -1,7 +1,7 @@
 import contextlib
 import functools
 
-from typing import Callable, Dict, Iterator, Optional, TypeVar, Union
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union
 
 import torchgen.local as local
 from torchgen.model import (
@@ -33,6 +33,8 @@
     str,
 )
 
+F3 = TypeVar("F3", Tuple[NativeFunction, Any], List[NativeFunction])
+
 
 @contextlib.contextmanager
 def native_function_manager(
@@ -90,6 +92,17 @@ def wrapper(slf: S, f: F) -> T:
     return wrapper
 
 
+def method_with_nested_native_function(
+    func: Callable[[S, F3], T]
+) -> Callable[[S, F3], T]:
+    @functools.wraps(func)
+    def wrapper(slf: S, f: F3) -> T:
+        with native_function_manager(f[0]):
+            return func(slf, f)
+
+    return wrapper
+
+
 # Convenience decorator for functions that explicitly take in a BackendIndex,
 # instead of indirectly taking one in as a closure
 def with_native_function_and_index(
diff --git a/torchgen/executorch/model.py b/torchgen/executorch/model.py
index 5a3409fb53bce..cec9251a3187c 100644
--- a/torchgen/executorch/model.py
+++ b/torchgen/executorch/model.py
@@ -62,7 +62,7 @@ class ETKernelKey:
     def gen_from_yaml(
         args: Dict[str, Tuple[str, str]],
         type_alias_map: Dict[str, List[str]],  # TODO: Support unwrapped str val
-        dim_order_alias_map: Dict[str, List[str]],
+        dim_order_alias_map: Dict[str, List[int]],
     ) -> List["ETKernelKey"]:
         """Generate ETKernelKeys from arg kernel specs
 
         Multiple ETKernelKeys are returned due to dtype permutations from utilizing
@@ -194,7 +194,10 @@ def _to_backend_index(self) -> BackendIndex:
             assert (
                 len(kernel_dict.values()) == 1
             ), f"Can't convert ETKernelIndex to BackendIndex because {op} has more than one kernel. Got {kernel_dict}"
-            index[op] = kernel_dict[ETKernelKey(default=True)]
+            index[op] = kernel_dict.get(
+                ETKernelKey(default=True),
+                BackendMetadata(kernel="", structured=False, cpp_namespace=""),
+            )
         return BackendIndex(
             dispatch_key=DispatchKey.CPU,
             use_out_as_primary=False,
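The switch from kernel_dict[...] to kernel_dict.get(...) above changes behavior for an operator whose only kernel is a specialized one: the single-kernel assert still passes, but the lookup of the default key would previously raise KeyError. A minimal illustration of the fallback (plain strings stand in for ETKernelKey and BackendMetadata):

# Plain strings stand in for ETKernelKey and BackendMetadata in this sketch.
kernel_dict = {"v1/7;0,1,2,3": "specialized_metadata"}  # one kernel, not the default key

# Old: kernel_dict["default"] raises KeyError for this dict.
# New: fall back to an empty metadata placeholder instead of raising.
metadata = kernel_dict.get("default", "empty_metadata_placeholder")
print(metadata)  # empty_metadata_placeholder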
Got {kernel_dict}" - index[op] = kernel_dict[ETKernelKey(default=True)] + index[op] = kernel_dict.get( + ETKernelKey(default=True), + BackendMetadata(kernel="", structured=False, cpp_namespace=""), + ) return BackendIndex( dispatch_key=DispatchKey.CPU, use_out_as_primary=False, diff --git a/torchgen/executorch/parse.py b/torchgen/executorch/parse.py index 308d689c29129..663f6f930c54c 100644 --- a/torchgen/executorch/parse.py +++ b/torchgen/executorch/parse.py @@ -62,7 +62,7 @@ def parse_from_yaml(ei: Dict[str, object]) -> Dict[ETKernelKey, BackendMetadata] kernel_keys = ( [ETKernelKey((), default=True)] if arg_meta is None - else ETKernelKey.gen_from_yaml(arg_meta, type_alias, dim_order_alias) + else ETKernelKey.gen_from_yaml(arg_meta, type_alias, dim_order_alias) # type: ignore[arg-type] ) for kernel_key in kernel_keys: diff --git a/torchgen/gen_executorch.py b/torchgen/gen_executorch.py index e089ae7f00397..c5fc6b3c89d24 100644 --- a/torchgen/gen_executorch.py +++ b/torchgen/gen_executorch.py @@ -11,7 +11,11 @@ from torchgen import dest from torchgen.api import cpp as aten_cpp from torchgen.api.types import CppSignature, CppSignatureGroup, CType, NamedCType -from torchgen.context import method_with_native_function, with_native_function_and_index +from torchgen.context import ( + method_with_native_function, + method_with_nested_native_function, + with_native_function_and_index, +) from torchgen.executorch.api import et_cpp from torchgen.executorch.api.custom_ops import ( ComputeNativeFunctionStub, @@ -19,7 +23,7 @@ ) from torchgen.executorch.api.types import contextArg, ExecutorchCppSignature from torchgen.executorch.api.unboxing import Unboxing -from torchgen.executorch.model import ETKernelIndex, ETParsedYaml +from torchgen.executorch.model import ETKernelIndex, ETKernelKey, ETParsedYaml from torchgen.executorch.parse import ET_FIELDS, parse_et_yaml, parse_et_yaml_struct from torchgen.gen import ( get_custom_build_selector, @@ -155,8 +159,16 @@ class ComputeCodegenUnboxedKernels: use_aten_lib: bool - @method_with_native_function - def __call__(self, f: NativeFunction) -> str: + @method_with_nested_native_function + def __call__( + self, + unbox_kernel_entry: Tuple[NativeFunction, Tuple[ETKernelKey, BackendMetadata]], + ) -> str: + f: NativeFunction = unbox_kernel_entry[0] + kernel_key: Union[ETKernelKey, List[ETKernelKey]] = unbox_kernel_entry[1][0] + kernel_meta: BackendMetadata = unbox_kernel_entry[1][1] + + # TODO: Update to use Kernel Selector if not self.selector.is_root_operator(f"{f.namespace}::{f.func.name}"): return "" sig: Union[CppSignature, ExecutorchCppSignature] @@ -169,11 +181,13 @@ def __call__(self, f: NativeFunction) -> str: argument_type_gen = aten_cpp.argumenttype_type return_type_gen = aten_cpp.returns_type arguments = sig.arguments() + kernel_call = f"torch::executor::{f.namespace}::{sig.name()}" else: sig = ExecutorchCppSignature.from_native_function(f) argument_type_gen = et_cpp.argumenttype_type return_type_gen = et_cpp.returns_type arguments = sig.arguments(include_context=False) + kernel_call = f"{kernel_meta.cpp_namespace}::{kernel_meta.kernel}" # parse arguments into C++ code binding_list, code_list = Unboxing( argument_type_gen=argument_type_gen @@ -203,19 +217,28 @@ def __call__(self, f: NativeFunction) -> str: return_assignment = "" ret_prefix = "" - return f""" -Operator( - "{f.namespace}::{f.func.name}", + if not isinstance(kernel_key, list): + kernel_key = [kernel_key] + + newline = "\n " + return "\n".join( + [ + f""" +Kernel( + 
"{f.namespace}::{f.func.name}",{newline + '"' + (k.to_native_string() + '",') if k.to_native_string() != 'default' else ''} []({contextArg.defn()}, EValue** stack) {{ {code_connector.join(code_list)} EXECUTORCH_SCOPE_PROF("native_call_{f.func.name}"); - {ret_prefix}torch::executor::{f.namespace}::{sig.name()}({"context, "}{args_str}); + {ret_prefix}{kernel_call}(context, {args_str}); {return_assignment} }} ), """ + for k in kernel_key + ] + ) def gen_unboxing( @@ -224,19 +247,36 @@ def gen_unboxing( cpu_fm: FileManager, selector: SelectiveBuilder, use_aten_lib: bool, + kernel_index: ETKernelIndex, ) -> None: - def key_func(fn: Union[NativeFunction, NativeFunctionsGroup]) -> str: - return fn.root_name + # Iterable type for write_sharded is a Tuple of (native_function, (kernel_key, metadata)) + def key_func( + item: Tuple[NativeFunction, Tuple[ETKernelKey, BackendMetadata]] + ) -> str: + return item[0].root_name + ":" + item[1][0].to_native_string() + + items: List[Tuple[NativeFunction, Tuple[ETKernelKey, BackendMetadata]]] = [ + (native_function, (kernel_key, metadata)) + for native_function in native_functions + for kernel_key, metadata in kernel_index.get_kernels(native_function).items() + ] + + header = ["Functions.h" if use_aten_lib else "NativeFunctions.h"] cpu_fm.write_sharded( "RegisterCodegenUnboxedKernels.cpp", - native_functions, + items, key_fn=key_func, - env_callable=lambda fn: { - "unboxed_ops": [ComputeCodegenUnboxedKernels(selector, use_aten_lib)(fn)], + env_callable=lambda unbox_kernel_entry: { + "unboxed_kernels": [ + ComputeCodegenUnboxedKernels(selector, use_aten_lib)(unbox_kernel_entry) + ], + "fn_header": header + if unbox_kernel_entry == items[0] + else [], # Only write header once }, num_shards=1, - sharded_keys={"unboxed_ops"}, + sharded_keys={"unboxed_kernels", "fn_header"}, ) @@ -853,6 +893,7 @@ def main() -> None: cpu_fm=cpu_fm, selector=selector, use_aten_lib=options.use_aten_lib, + kernel_index=kernel_index, ) if custom_ops_native_functions: gen_custom_ops(