From 9cfb0c8240ef0790c101a8acccdeeee82bd9a323 Mon Sep 17 00:00:00 2001
From: Hyeongseok Oh
Date: Mon, 9 Dec 2024 19:03:09 +0900
Subject: [PATCH 1/3] [onert/doc] Add onert on-device compiler document (#14425)

This commit adds the onert on-device compiler document.

ONE-DCO-1.0-Signed-off-by: Hyeongseok Oh
---
 docs/runtime/on-device-compilation.md | 202 ++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 docs/runtime/on-device-compilation.md

diff --git a/docs/runtime/on-device-compilation.md b/docs/runtime/on-device-compilation.md
new file mode 100644
index 00000000000..abd908dfc7b
--- /dev/null
+++ b/docs/runtime/on-device-compilation.md
@@ -0,0 +1,202 @@
+# On-Device Compilation
+
+ONERT supports on-device compilation: on-device quantization and on-device code generation.
+
+## On-device quantization
+
+ONERT supports on-device quantization of **float32** models. On-device quantization has two modes: full quantization and weight-only quantization.
+
+### Weight-only quantization
+
+Weight-only quantization quantizes only the weights of the model; activations stay in float32 precision. This mode is useful when reducing model size matters more than speeding up inference.
+
+For weight-only quantization, follow the steps below:
+- Load the float32 model
+- Set the quantization type to weight-only quantization
+- Set the path to save the quantized model
+- Call the quantize API to perform quantization
+
+```cpp
+// Load float32 model
+nnfw_load_model_from_file(session, pkg_path);
+
+// Set quantization type: weight-only, symmetric, int8
+nnfw_set_quantization_type(session, NNFW_QUANTIZE_TYPE_WO_I8_SYM);
+
+// Set path to save quantized model
+nnfw_set_quantized_model_path(session, quantized_model_path);
+
+// Quantize model
+nnfw_quantize(session);
+
+// Run model for inference with quantized model
+nnfw_run(session);
+```
+
+After quantization, the quantized model is loaded automatically, so you can use it right away; you don't need to load it explicitly.
+
+### Full quantization
+
+Full quantization quantizes both the weights and the activations of the model. This mode is useful when a specific runtime backend requires a quantized model. To quantize activations, the runtime has to gather activation range information while the model is executed, so it needs to run the model enough times to obtain accurate activation ranges.
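+
+The snippets in this document omit session creation, status checking, and session teardown for brevity. Before walking through the full-quantization flow below, here is a minimal sketch of that surrounding boilerplate, assuming only the public `nnfw.h` C API; it is illustrative rather than part of the original examples, and `model.nnpkg` is just a placeholder package path. Every `nnfw_*` call returns an `NNFW_STATUS` that should be checked as shown:
+
+```cpp
+#include "nnfw.h" // assumed to be available on the include path
+
+#include <cstdio>
+
+int main()
+{
+  // Create a session; every nnfw_* API call reports errors through NNFW_STATUS
+  nnfw_session *session = nullptr;
+  if (nnfw_create_session(&session) != NNFW_STATUS_NO_ERROR)
+  {
+    std::fprintf(stderr, "failed to create session\n");
+    return 1;
+  }
+
+  // "model.nnpkg" is a placeholder path used only for illustration
+  if (nnfw_load_model_from_file(session, "model.nnpkg") != NNFW_STATUS_NO_ERROR)
+  {
+    std::fprintf(stderr, "failed to load model\n");
+    nnfw_close_session(session);
+    return 1;
+  }
+
+  // ... quantization or code generation calls from this document go here ...
+
+  // Release the session when done
+  nnfw_close_session(session);
+  return 0;
+}
+```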
+
+For full quantization, follow the steps below:
+
+- Load the float32 model
+- Gather activation ranges by running the model multiple times
+  - Prepare the model to run
+  - Set input and output buffer(s)
+  - Set the execution configuration to gather activation ranges
+  - Run the model multiple times for inference while gathering activation ranges
+- Quantize the model once enough activation range data has been gathered
+  - Set the quantization type to full quantization
+  - Set the path to save the quantized model
+  - Call the quantize API to perform quantization
+
+```cpp
+// Load float32 model
+nnfw_load_model_from_file(session, pkg_path);
+
+// Prepare model to run
+nnfw_prepare(session);
+
+// Set input and output buffer(s)
+nnfw_set_input(session, input_index, input_type, input_buffer, input_element_size);
+nnfw_set_output(session, output_index, output_type, output_buffer, output_element_size);
+
+// Set execution configuration to gather activation range
+nnfw_set_execute_config(session, NNFW_RUN_CONFIG_DUMP_MINMAX, nullptr);
+
+// Run model multiple times for inference while gathering activation range
+for (int i = 0; i < num_of_inference; ++i)
+{
+  nnfw_run(session);
+}
+
+// Set quantization type: full, asymmetric, uint8
+nnfw_set_quantization_type(session, NNFW_QUANTIZE_TYPE_U8_ASYM);
+
+// Set path to save quantized model
+nnfw_set_quantized_model_path(session, quantized_model_path);
+
+// Quantize model
+nnfw_quantize(session);
+
+// Reset execution configuration to normal execution
+nnfw_reset_execute_config(session);
+
+// Run model for inference with quantized model
+nnfw_run(session);
+```
+
+After quantization, the quantized model is loaded automatically, so you can use it right away without loading it explicitly. You also don't need to reset the input and output buffers to the quantized data type, because the runtime automatically casts input and output data between float32 and the quantized data type. However, you can set input and output buffers with the quantized data type after full quantization if you want to use them directly without data casting.
+
+## On-device code generation
+
+ONE supports on-device code generation. On-device code generation generates backend-specific code from the model and saves it in a supported file format. This feature is useful when the backend requires a specific precompiled model file format.
+
+### Prerequisites
+
+To use on-device code generation, you need to install a plugin that supports it. An on-device code generation plugin must implement the interface defined in `ICodegen.h`.
+
+The plugin should be installed in `{libdir}/nnfw/codegen` and follow the `lib<target>-gen.so` naming pattern, where `<target>` is the extension of the generated file. For example, if your plugin generates files with the `.abc` extension, the plugin library should be named `libabc-gen.so`.
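+
+As a quick sanity check, the plugin library for a hypothetical `abc` target should be visible in the codegen plugin directory before `nnfw_codegen` is called. The listing below assumes `/usr/lib` as `{libdir}`; the actual directory depends on how the runtime was installed:
+
+```sh
+# Assumed libdir: /usr/lib -- adjust to your installation
+$ ls /usr/lib/nnfw/codegen/
+libabc-gen.so
+```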
+
+### Usage
+
+To generate code, follow the steps below:
+
+- Load the model
+- (Optional) Set the path to save the generated code
+  - If the path is not set, the generated code is saved in the same directory as the model, with the same name but with the target extension
+- Call the code generation API (`nnfw_codegen`) to perform code generation
+
+```cpp
+// Load model
+nnfw_load_model_from_file(session, pkg_path);
+
+// (Optional) Set path to save generated code
+// nnfw_set_codegen_model_path(session, codegen_model_path);
+
+// Generate code for the target backend: the target codegen plugin name is "abc" (installed as `libabc-gen.so`)
+nnfw_codegen(session, "abc-gen", NNFW_CODEGEN_PREF_DEFAULT);
+
+// Prepare model to run
+nnfw_prepare(session);
+
+// Set backend to use generated code on specific target backend if needed
+nnfw_set_available_backend(session, "abc");
+
+// Set input and output buffer(s)
+nnfw_set_input(session, input_index, input_type, input_buffer, input_element_size);
+nnfw_set_output(session, output_index, output_type, output_buffer, output_element_size);
+
+// Run model
+nnfw_run(session);
+```
+
+## Combining on-device quantization and code generation
+
+On-device quantization and code generation can be used together when the target backend requires a quantized model in a specific precompiled file format.
+
+## Test tool support
+
+On-device compilation is supported by the test tool `onert_run`.
+
+### Quantization
+
+Example: weight-only quantization
+- Input file: `test.circle`
+- Quantization type: weight-only, symmetric, int8
+- Output file: `test.q.circle`
+
+```sh
+$ onert_run --quantize int8_wo \
+    --qpath test.q.circle \
+    test.circle
+```
+
+Example: full quantization
+- Input file: `test.circle`
+- Quantization type: full, asymmetric, uint8
+- Output file: `test.q.circle`
+- Number of inferences to gather activation range: 10
+
+```sh
+$ onert_run --quantize uint8 \
+    --qpath test.q.circle \
+    --minmax_run 10 \
+    test.circle
+```
+
+### Code generation
+
+Example
+- Input file: `test.circle`
+- Target backend: `abc_back`
+- Target plugin name: `abc`
+- Output file: `test.abc`
+
+```sh
+$ BACKENDS='abc_back' onert_run --codegen abc-gen \
+    --cpath test.abc \
+    test.circle
+```
+
+### Quantization and code generation
+
+Example
+- Input file: `test.circle`
+- Quantization type: full, asymmetric, uint8
+- Number of inferences to gather activation range: 10
+- Quantized model file: `test.q.circle`
+- Target backend: `abc_back`
+- Target plugin name: `abc`
+- Codegen output file: `test.abc`
+
+```sh
+$ BACKENDS='abc_back' onert_run --quantize uint8 \
+    --qpath test.q.circle \
+    --minmax_run 10 \
+    --codegen abc-gen \
+    --cpath test.abc \
+    test.circle
+```

From 329c5e233726388f2efc17849c352e6602c5af4b Mon Sep 17 00:00:00 2001
From: Tomasz Dolbniak
Date: Thu, 9 Jan 2025 13:47:26 +0100
Subject: [PATCH 2/3] [infra] Update protobuf to 3.20.1

This commit updates the Protobuf library to a newer version due to the fixes
this version contains.
In particular, this version fixes some compilation errors reported by GCC on
Ubuntu 24.04 when the build is performed with the following flags enabled:
-Werror -Wall

The removed .patch file is no longer required since the js_embed binary has
been removed from protobuf in the following pull request:
protocolbuffers/protobuf#4709

ONE-DCO-1.0-Signed-off-by: Tomasz Dolbniak
---
 infra/cmake/packages/ProtobufConfig.cmake     | 4 ++--
 infra/cmake/packages/ProtobufSource.patch     | 18 ------------------
 .../cmake/packages/ProtobufSourceConfig.cmake |  5 ++---
 3 files changed, 4 insertions(+), 23 deletions(-)
 delete mode 100644 infra/cmake/packages/ProtobufSource.patch

diff --git a/infra/cmake/packages/ProtobufConfig.cmake b/infra/cmake/packages/ProtobufConfig.cmake
index f8e9ff1f951..8020e731b3c 100644
--- a/infra/cmake/packages/ProtobufConfig.cmake
+++ b/infra/cmake/packages/ProtobufConfig.cmake
@@ -24,7 +24,7 @@ endfunction(_Protobuf_module_import)
 function(_Protobuf_import)
   # Let's use find_package here not to export unnecessary definitions
   # NOTE Here we use "exact" match to avoid possible infinite loop
-  find_package(protobuf EXACT 3.5.2 QUIET)
+  find_package(protobuf EXACT 3.20.1.0 QUIET)
 
   if(NOT protobuf_FOUND)
     set(Protobuf_FOUND FALSE PARENT_SCOPE)
@@ -65,7 +65,7 @@ function(_Protobuf_build)
                     INSTALL_DIR ${EXT_OVERLAY_DIR}
                     BUILD_FLAGS -fPIC
                     EXTRA_OPTS -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_WITH_ZLIB=OFF
-                    IDENTIFIER "3.5.2-fix2"
+                    IDENTIFIER "3.20.1.0"
                     PKG_NAME "PROTOBUF")
 endfunction(_Protobuf_build)
 
diff --git a/infra/cmake/packages/ProtobufSource.patch b/infra/cmake/packages/ProtobufSource.patch
deleted file mode 100644
index 9a83a80e480..00000000000
--- a/infra/cmake/packages/ProtobufSource.patch
+++ /dev/null
@@ -1,18 +0,0 @@
---- a/cmake/libprotoc.cmake
-+++ b/cmake/libprotoc.cmake
-@@ -209,10 +209,14 @@
-     ${protobuf_source_dir}/src/google/protobuf/compiler/js/well_known_types/timestamp.js
-   )
-   add_executable(js_embed ${protobuf_source_dir}/src/google/protobuf/compiler/js/embed.cc)
-+set(JS_EMBED_EXEC "js_embed")
-+if(DEFINED ENV{EXTERNAL_JS_EMBED})
-+  set(JS_EMBED_EXEC "$ENV{EXTERNAL_JS_EMBED}")
-+endif()
-   add_custom_command(
-     OUTPUT ${protobuf_source_dir}/src/google/protobuf/compiler/js/well_known_types_embed.cc
-     DEPENDS js_embed ${js_well_known_types_sources}
--  COMMAND js_embed ${js_well_known_types_sources} > ${protobuf_source_dir}/src/google/protobuf/compiler/js/well_known_types_embed.cc
-+  COMMAND ${JS_EMBED_EXEC} ${js_well_known_types_sources} > ${protobuf_source_dir}/src/google/protobuf/compiler/js/well_known_types_embed.cc
-   )
- 
-   add_library(libprotoc ${protobuf_SHARED_OR_STATIC}
diff --git a/infra/cmake/packages/ProtobufSourceConfig.cmake b/infra/cmake/packages/ProtobufSourceConfig.cmake
index a1704e53d72..5675b9d35a0 100644
--- a/infra/cmake/packages/ProtobufSourceConfig.cmake
+++ b/infra/cmake/packages/ProtobufSourceConfig.cmake
@@ -8,10 +8,9 @@ function(_ProtobufSource_import)
   nnas_include(OptionTools)
 
   envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
-  envoption(PROTOBUF_URL ${EXTERNAL_DOWNLOAD_SERVER}/protocolbuffers/protobuf/archive/v3.5.2.tar.gz)
+  envoption(PROTOBUF_URL ${EXTERNAL_DOWNLOAD_SERVER}/protocolbuffers/protobuf/archive/v3.20.1.tar.gz)
 
-  ExternalSource_Download(PROTOBUF ${PROTOBUF_URL}
-                          PATCH ${CMAKE_CURRENT_LIST_DIR}/ProtobufSource.patch)
+  ExternalSource_Download(PROTOBUF ${PROTOBUF_URL})
 
   set(ProtobufSource_DIR ${PROTOBUF_SOURCE_DIR} PARENT_SCOPE)
   set(ProtobufSource_FOUND TRUE PARENT_SCOPE)

From 02ee378acbb1310b4781cba9a5ac7837fb2727f1 Mon Sep 17 00:00:00 2001
From: Tomasz Dolbniak
Date: Tue, 21 Jan 2025 11:51:50 +0100
Subject: [PATCH 3/3] Use PB 3.20.2

---
 infra/cmake/packages/ProtobufConfig.cmake       | 4 ++--
 infra/cmake/packages/ProtobufSourceConfig.cmake | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/infra/cmake/packages/ProtobufConfig.cmake b/infra/cmake/packages/ProtobufConfig.cmake
index 8020e731b3c..974fe203a3a 100644
--- a/infra/cmake/packages/ProtobufConfig.cmake
+++ b/infra/cmake/packages/ProtobufConfig.cmake
@@ -24,7 +24,7 @@ endfunction(_Protobuf_module_import)
 function(_Protobuf_import)
   # Let's use find_package here not to export unnecessary definitions
   # NOTE Here we use "exact" match to avoid possible infinite loop
-  find_package(protobuf EXACT 3.20.1.0 QUIET)
+  find_package(protobuf EXACT 3.20.2.0 QUIET)
 
   if(NOT protobuf_FOUND)
     set(Protobuf_FOUND FALSE PARENT_SCOPE)
@@ -65,7 +65,7 @@ function(_Protobuf_build)
                     INSTALL_DIR ${EXT_OVERLAY_DIR}
                     BUILD_FLAGS -fPIC
                     EXTRA_OPTS -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_WITH_ZLIB=OFF
-                    IDENTIFIER "3.20.1.0"
+                    IDENTIFIER "3.20.2.0"
                     PKG_NAME "PROTOBUF")
 endfunction(_Protobuf_build)
 
diff --git a/infra/cmake/packages/ProtobufSourceConfig.cmake b/infra/cmake/packages/ProtobufSourceConfig.cmake
index 5675b9d35a0..88c8c86cbb4 100644
--- a/infra/cmake/packages/ProtobufSourceConfig.cmake
+++ b/infra/cmake/packages/ProtobufSourceConfig.cmake
@@ -8,7 +8,7 @@ function(_ProtobufSource_import)
   nnas_include(OptionTools)
 
   envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
-  envoption(PROTOBUF_URL ${EXTERNAL_DOWNLOAD_SERVER}/protocolbuffers/protobuf/archive/v3.20.1.tar.gz)
+  envoption(PROTOBUF_URL ${EXTERNAL_DOWNLOAD_SERVER}/protocolbuffers/protobuf/archive/v3.20.2.tar.gz)
 
   ExternalSource_Download(PROTOBUF ${PROTOBUF_URL})