Skip to content

Commit

Permalink
[onert-micro] Support S8 and S16 FullyConnected (#13163)
Browse files Browse the repository at this point in the history
This PR adds support for S8 and S16 FullyConnected kernels, including a CMSIS-NN backend.

ONE-DCO-1.0-Signed-off-by: Artem Balyshev <[email protected]>
  • Loading branch information
BalyshevArtem authored Jun 13, 2024
1 parent 1f33982 commit b734be3
Show file tree
Hide file tree
Showing 8 changed files with 578 additions and 11 deletions.
1 change: 1 addition & 0 deletions onert-micro/onert-micro/include/OMStatus.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ enum OMStatus
UnsupportedDynamicShapeCase,
FailReadWOFFile,
FailReadCheckpointFile,
CmsisNNError,
};

} // namespace onert_micro
Expand Down
87 changes: 87 additions & 0 deletions onert-micro/onert-micro/include/pal/cmsisnn/KernelsToBuild.lst
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#/*REGISTER_KERNEL(ABS, Abs)*/
#/*REGISTER_KERNEL(ADD, Add)*/
#/*REGISTER_KERNEL(ADD_N, AddN)*/
#/*REGISTER_KERNEL(AVERAGE_POOL_2D, AveragePool2D)*/
#/*REGISTER_KERNEL(ARG_MAX, ArgMax)*/
#/*REGISTER_KERNEL(ARG_MIN, ArgMin)*/
#/*REGISTER_KERNEL(CONCATENATION, Concatenation)*/
#/*REGISTER_KERNEL(CUSTOM, BroadcastTo)*/
#/*REGISTER_KERNEL(BATCH_TO_SPACE_ND, BatchToSpaceND)*/
#/*REGISTER_KERNEL(CEIL, Ceil)*/
#/*REGISTER_KERNEL(COS, Cos)*/
#/*REGISTER_KERNEL(CAST, Cast)*/
#/*REGISTER_KERNEL(DIV, Div)*/
#/*REGISTER_KERNEL(DEPTHWISE_CONV_2D, DepthwiseConv2D)*/
#/*REGISTER_KERNEL(DEPTH_TO_SPACE, DepthToSpace)*/
#/*REGISTER_KERNEL(DEQUANTIZE, Dequantize)*/
REGISTER_KERNEL(FULLY_CONNECTED, FullyConnected)
#/*REGISTER_KERNEL(CONV_2D, Conv2D)*/
#/*REGISTER_KERNEL(LOGISTIC, Logistic)*/
#/*REGISTER_KERNEL(LOG, Log)*/
#/*REGISTER_KERNEL(GATHER, Gather)*/
#/*REGISTER_KERNEL(GATHER_ND, GatherND)*/
#/*REGISTER_KERNEL(EXP, Exp)*/
#/*REGISTER_KERNEL(GREATER, Greater)*/
#/*REGISTER_KERNEL(GREATER_EQUAL, GreaterEqual)*/
#/*REGISTER_KERNEL(EXPAND_DIMS, ExpandDims)*/
#/*REGISTER_KERNEL(ELU, Elu)*/
#/*REGISTER_KERNEL(EQUAL, Equal)*/
#/*REGISTER_KERNEL(FILL, Fill)*/
#/*REGISTER_KERNEL(FLOOR, Floor)*/
#/*REGISTER_KERNEL(FLOOR_DIV, FloorDiv)*/
#/*REGISTER_KERNEL(FLOOR_MOD, FloorMod)*/
#/*REGISTER_KERNEL(PACK, Pack)*/
#/*REGISTER_KERNEL(PAD, Pad)*/
#/*REGISTER_KERNEL(PADV2, PadV2)*/
#/*REGISTER_KERNEL(PRELU, PRelu)*/
#/*REGISTER_KERNEL(RESHAPE, Reshape)*/
#/*REGISTER_KERNEL(RELU, Relu)*/
#/*REGISTER_KERNEL(RELU6, Relu6)*/
#/*REGISTER_KERNEL(REDUCE_PROD, ReduceCommon)*/
#/*REGISTER_KERNEL(REDUCE_MAX, ReduceMax)*/
#/*REGISTER_KERNEL(ROUND, Round)*/
#/*REGISTER_KERNEL(LESS, Less)*/
#/*REGISTER_KERNEL(L2_NORMALIZATION, L2Normalize)*/
#/*REGISTER_KERNEL(L2_POOL_2D, L2Pool2D)*/
#/*REGISTER_KERNEL(LESS_EQUAL, LessEqual)*/
#/*REGISTER_KERNEL(LOGICAL_AND, LogicalAnd)*/
#/*REGISTER_KERNEL(LOGICAL_NOT, LogicalNot)*/
#/*REGISTER_KERNEL(LOGICAL_OR, LogicalOr)*/
#/*REGISTER_KERNEL(LEAKY_RELU, LeakyRelu)*/
#/*REGISTER_KERNEL(LOG_SOFTMAX, LogSoftmax)*/
#/*REGISTER_KERNEL(MUL, Mul)*/
#/*REGISTER_KERNEL(MIRROR_PAD, MirrorPad)*/
#/*REGISTER_KERNEL(MAXIMUM, Maximum)*/
#/*REGISTER_KERNEL(MEAN, Mean)*/
#/*REGISTER_KERNEL(MAX_POOL_2D, MaxPool2D)*/
#/*REGISTER_KERNEL(MINIMUM, Minimum)*/
#/*REGISTER_KERNEL(SHAPE, Shape)*/
#/*REGISTER_KERNEL(NOT_EQUAL, NotEqual)*/
#/*REGISTER_KERNEL(SIN, Sin)*/
#/*REGISTER_KERNEL(SQUARED_DIFFERENCE, SquaredDifference)*/
#/*REGISTER_KERNEL(SLICE, Slice)*/
#/*REGISTER_KERNEL(SUB, Sub)*/
#/*REGISTER_KERNEL(SPLIT, Split)*/
#/*REGISTER_KERNEL(SPACE_TO_BATCH_ND, SpaceToBatchND)*/
#/*REGISTER_KERNEL(STRIDED_SLICE, StridedSlice)*/
#/*REGISTER_KERNEL(SPLIT_V, SplitV)*/
#/*REGISTER_KERNEL(SQUARE, Square)*/
#/*REGISTER_KERNEL(SQRT, Sqrt)*/
#/*REGISTER_KERNEL(SPACE_TO_DEPTH, SpaceToDepth)*/
#/*REGISTER_KERNEL(QUANTIZE, Quantize)*/
#/*REGISTER_KERNEL(TANH, Tanh)*/
#/*REGISTER_KERNEL(TRANSPOSE, Transpose)*/
#/*REGISTER_KERNEL(TRANSPOSE_CONV, TransposeConv)*/
#/*REGISTER_KERNEL(SOFTMAX, Softmax)*/
#/*REGISTER_KERNEL(SUM, Sum)*/
#/*REGISTER_KERNEL(SELECT_V2, SelectV2)*/
#/*REGISTER_KERNEL(SVDF, SVDF)*/
#/*REGISTER_KERNEL(WHILE, While)*/
#/*REGISTER_KERNEL(UNIDIRECTIONAL_SEQUENCE_LSTM, UnidirectionalSequenceLSTM)*/
#/*REGISTER_KERNEL(RESIZE_BILINEAR, ResizeBilinear)*/
#/*REGISTER_KERNEL(RESIZE_NEAREST_NEIGHBOR, ResizeNearestNeighbor)*/
#/*REGISTER_KERNEL(RSQRT, Rsqrt)*/
#/*REGISTER_KERNEL(NEG, Neg)*/
#/*REGISTER_KERNEL(ZEROS_LIKE, ZerosLike)*/
#/*REGISTER_KERNEL(SQUEEZE, Squeeze)*/
#/*REGISTER_KERNEL(UNPACK, Unpack)*/
168 changes: 168 additions & 0 deletions onert-micro/onert-micro/include/pal/cmsisnn/PALFullyConnected.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/*
* Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef ONERT_MICRO_EXECUTE_PAL_FULLY_CONNECTED_H
#define ONERT_MICRO_EXECUTE_PAL_FULLY_CONNECTED_H

#include "PALFullyConnectedCommon.h"

#include <arm_nnfunctions.h>

#include <cassert>
#include <cstdint>
#include <memory>

namespace onert_micro
{
namespace execute
{
namespace pal
{
template <>
OMStatus FullyConnected<int8_t>(const core::FullyConnectedParams &params, const int8_t *input_data,
const core::OMRuntimeShape &filter_shape, const int8_t *filter_data,
const int32_t *bias_data, const core::OMRuntimeShape &output_shape,
int8_t *output_data)
{
const int filter_dim_count = filter_shape.dimensionsCount();
const int output_dim_count = output_shape.dimensionsCount();
const int batches =
flatSizeSkipDim(output_shape.dimsData(), output_dim_count - 1, output_dim_count);
const int output_depth = output_shape.dims(output_dim_count - 1);
const int accum_depth = filter_shape.dims(filter_dim_count - 1);

cmsis_nn_fc_params fc_params;
fc_params.input_offset = params.input_offset;
fc_params.output_offset = params.output_offset;
fc_params.filter_offset = params.weights_offset;
fc_params.activation.min = params.quantized_activation_min;
fc_params.activation.max = params.quantized_activation_max;

cmsis_nn_per_tensor_quant_params quant_params;
quant_params.multiplier = params.output_multiplier;
quant_params.shift = params.output_shift;

cmsis_nn_dims input_dims;
input_dims.n = batches;
input_dims.h = 1;
input_dims.w = 1;
input_dims.c = accum_depth;

cmsis_nn_dims filter_dims;
filter_dims.n = accum_depth;
filter_dims.h = 1;
filter_dims.w = 1;
filter_dims.c = output_depth;

cmsis_nn_dims bias_dims;
bias_dims.n = 1;
bias_dims.h = 1;
bias_dims.w = 1;
bias_dims.c = output_depth;

cmsis_nn_dims output_dims;
output_dims.n = batches;
output_dims.h = 1;
output_dims.w = 1;
output_dims.c = output_depth;

int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
auto buffer = std::make_unique<int8_t[]>(buf_size);
assert(buffer != nullptr);

cmsis_nn_context ctx;
ctx.buf = buffer.get();
ctx.size = buf_size;

auto res =
arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims,
filter_data, &bias_dims, bias_data, &output_dims, output_data);
assert(res == ARM_CMSIS_NN_SUCCESS);
if (res != ARM_CMSIS_NN_SUCCESS)
return CmsisNNError;

return Ok;
}

template <>
OMStatus FullyConnected(const core::FullyConnectedParams &params, const int16_t *input_data,
const core::OMRuntimeShape &filter_shape, const int8_t *filter_data,
const int64_t *bias_data, const core::OMRuntimeShape &output_shape,
int16_t *output_data)
{
const int filter_dim_count = filter_shape.dimensionsCount();
const int output_dim_count = output_shape.dimensionsCount();
const int batches =
flatSizeSkipDim(output_shape.dimsData(), output_dim_count - 1, output_dim_count);
const int output_depth = output_shape.dims(output_dim_count - 1);
const int accum_depth = filter_shape.dims(filter_dim_count - 1);

cmsis_nn_fc_params fc_params;
fc_params.input_offset = params.input_offset;
fc_params.output_offset = params.output_offset;
fc_params.filter_offset = params.weights_offset;
fc_params.activation.min = params.quantized_activation_min;
fc_params.activation.max = params.quantized_activation_max;

cmsis_nn_per_tensor_quant_params quant_params;
quant_params.multiplier = params.output_multiplier;
quant_params.shift = params.output_shift;

cmsis_nn_dims input_dims;
input_dims.n = batches;
input_dims.h = 1;
input_dims.w = 1;
input_dims.c = accum_depth;

cmsis_nn_dims filter_dims;
filter_dims.n = accum_depth;
filter_dims.h = 1;
filter_dims.w = 1;
filter_dims.c = output_depth;

cmsis_nn_dims bias_dims;
bias_dims.n = 1;
bias_dims.h = 1;
bias_dims.w = 1;
bias_dims.c = output_depth;

cmsis_nn_dims output_dims;
output_dims.n = batches;
output_dims.h = 1;
output_dims.w = 1;
output_dims.c = output_depth;

int32_t buf_size = arm_fully_connected_s16_get_buffer_size(&filter_dims);
auto buffer = std::make_unique<int8_t[]>(buf_size);
assert(buffer != nullptr);

cmsis_nn_context ctx;
ctx.buf = buffer.get();
ctx.size = buf_size;

auto res =
arm_fully_connected_s16(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims,
filter_data, &bias_dims, bias_data, &output_dims, output_data);
assert(res == ARM_CMSIS_NN_SUCCESS);

if (res != ARM_CMSIS_NN_SUCCESS)
return CmsisNNError;

return Ok;
}

} // namespace pal
} // namespace execute
} // namespace onert_micro

#endif // ONERT_MICRO_EXECUTE_PAL_FULLY_CONNECTED_H
Loading

0 comments on commit b734be3

Please sign in to comment.