Skip to content

Commit

Permalink
[onert-micro] Support S8 Add
Browse files Browse the repository at this point in the history
This PR adds support for the quantized S8 Add kernel using CMSIS-NN.

ONE-DCO-1.0-Signed-off-by: Artem Balyshev <[email protected]>
  • Loading branch information
Artem Balyshev committed Jun 14, 2024
1 parent 3eceee5 commit 869e077
Show file tree
Hide file tree
Showing 12 changed files with 760 additions and 16 deletions.
16 changes: 16 additions & 0 deletions onert-micro/onert-micro/include/core/OMKernelData.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,22 @@ struct TransposeParams
int32_t perm[5];
};

// Quantization parameters for element-wise arithmetic kernels (e.g. quantized Add).
// Mirrors the TFLite/CMSIS-NN quantized-add contract: inputs are re-centered by
// their zero-point offsets, rescaled by fixed-point multiplier/shift pairs,
// summed, rescaled to the output scale, offset, and clamped.
struct ArithmeticQuantParams
{
int32_t input1_offset; // zero-point offset added to raw input1 values
int32_t input2_offset; // zero-point offset added to raw input2 values
int left_shift;        // common pre-shift applied to both inputs for extra precision
int32_t input1_multiplier; // fixed-point multiplier rescaling input1 to a common scale
int32_t input2_multiplier; // fixed-point multiplier rescaling input2 to a common scale
int input1_shift;      // exponent paired with input1_multiplier
int input2_shift;      // exponent paired with input2_multiplier
int32_t output_multiplier; // fixed-point multiplier rescaling the sum to the output scale
int output_shift;      // exponent paired with output_multiplier
int32_t output_offset; // zero-point offset added to the rescaled result
int32_t quantized_activation_max; // upper clamp bound (fused activation), in output quantized domain
int32_t quantized_activation_min; // lower clamp bound (fused activation), in output quantized domain
};

struct BinaryArithmeticBroadcastParams
{
// float activation params.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#/*REGISTER_KERNEL(ABS, Abs)*/
#/*REGISTER_KERNEL(ADD, Add)*/
REGISTER_KERNEL(ADD, Add)
#/*REGISTER_KERNEL(ADD_N, AddN)*/
#/*REGISTER_KERNEL(AVERAGE_POOL_2D, AveragePool2D)*/
#/*REGISTER_KERNEL(ARG_MAX, ArgMax)*/
Expand Down
53 changes: 53 additions & 0 deletions onert-micro/onert-micro/include/pal/cmsisnn/PALAdd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef ONERT_MICRO_EXECUTE_PAL_ADD_H
#define ONERT_MICRO_EXECUTE_PAL_ADD_H

#include "PALAddCommon.h"
#include "PALUtils.h"

#include "arm_nnfunctions.h"

namespace onert_micro
{
namespace execute
{
namespace pal
{

/*
 * Quantized S8 element-wise Add, delegated to the CMSIS-NN optimized kernel.
 *
 * @param params      quantization parameters (offsets, multipliers, shifts, clamps)
 * @param flat_size   number of elements in each (already shape-matched) tensor
 * @param input1_data first S8 operand
 * @param input2_data second S8 operand
 * @param output_data destination buffer, flat_size elements
 * @return Ok on success, UnknownError if the CMSIS-NN kernel reports failure
 *
 * NOTE: 'inline' is required here — this is a non-template free function
 * defined in a header, and without it every translation unit that includes
 * PALAdd.h emits its own definition, violating the ODR (multiple-definition
 * link errors).
 */
inline OMStatus Add(const core::ArithmeticQuantParams &params, const uint32_t flat_size,
                    const int8_t *input1_data, const int8_t *input2_data, int8_t *output_data)
{
  // arm_elementwise_add_s8 applies offset, rescale, sum, output rescale and
  // saturation per element; argument order follows the CMSIS-NN API.
  auto status = arm_elementwise_add_s8(
    input1_data, input2_data, params.input1_offset, params.input1_multiplier, params.input1_shift,
    params.input2_offset, params.input2_multiplier, params.input2_shift, params.left_shift,
    output_data, params.output_offset, params.output_multiplier, params.output_shift,
    params.quantized_activation_min, params.quantized_activation_max, flat_size);

  // Debug builds trap immediately; release builds still report the failure.
  assert(status == ARM_CMSIS_NN_SUCCESS);
  if (status != ARM_CMSIS_NN_SUCCESS)
    return UnknownError;

  return Ok;
}

} // namespace pal
} // namespace execute
} // namespace onert_micro

#endif // ONERT_MICRO_EXECUTE_PAL_ADD_H
29 changes: 29 additions & 0 deletions onert-micro/onert-micro/include/pal/common/PALAddCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,25 @@ namespace execute
namespace pal
{

// Reference scalar S8 addition used by the broadcast fallback path.
// Follows the TFLite quantized-add recipe: re-center by zero-point offsets,
// pre-shift left for precision, rescale each input to a common scale, add,
// rescale to the output scale, apply the output offset, and clamp to the
// fused-activation range.
//
// NOTE: 'inline' is required — this non-template free function is defined in a
// header included from multiple translation units; without it the ODR is
// violated (multiple-definition link errors).
inline int8_t AddFunc(int8_t x, int8_t y, const core::ArithmeticQuantParams &params)
{
  // Re-center raw quantized values around zero.
  const int32_t input1_val = params.input1_offset + x;
  const int32_t input2_val = params.input2_offset + y;
  // Common left pre-shift gains fractional precision before the rescale.
  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
  const int32_t scaled_input1_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
    shifted_input1_val, params.input1_multiplier, params.input1_shift);
  const int32_t scaled_input2_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
    shifted_input2_val, params.input2_multiplier, params.input2_shift);
  const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
  // Rescale the sum to the output scale and re-apply the output zero point.
  const int32_t raw_output = multiplyByQuantizedMultiplierSmallerThanOneExp(
                               raw_sum, params.output_multiplier, params.output_shift) +
                             params.output_offset;
  // Clamp to the fused-activation range before narrowing to int8.
  const int32_t clamped_output =
    std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output));
  return static_cast<int8_t>(clamped_output);
}

template <typename T>
OMStatus Add(const core::BinaryArithmeticBroadcastParams &params, const int flat_size,
const T *input1_data, const T *input2_data, T *output_data)
Expand All @@ -46,6 +65,16 @@ OMStatus BroadcastAdd4DSlow(const core::BinaryArithmeticBroadcastParams &params,
return Ok;
}

// Broadcasting S8 Add fallback: dispatches the scalar AddFunc over the two
// operands via the generic 6-D slow broadcast walker.
//
// @return Ok unconditionally (degenerate, non-broadcastable shapes are
//         silently skipped inside BroadcastBinaryFunction6DSlow).
//
// NOTE: 'inline' is required — unlike the template overload above, this is a
// non-template free function in a header; without 'inline' every including
// translation unit defines it, violating the ODR.
inline OMStatus BroadcastAdd4DSlow(const core::ArithmeticQuantParams &params,
                                   const core::OMRuntimeShape &input1_shape,
                                   const int8_t *input1_data,
                                   const core::OMRuntimeShape &input2_shape,
                                   const int8_t *input2_data,
                                   const core::OMRuntimeShape &output_shape, int8_t *output_data)
{
  BroadcastBinaryFunction6DSlow(params, input1_shape, input1_data, input2_shape, input2_data,
                                output_shape, output_data, AddFunc);
  return Ok;
}

} // namespace pal
} // namespace execute
} // namespace onert_micro
Expand Down
131 changes: 131 additions & 0 deletions onert-micro/onert-micro/include/pal/common/PALArithmeticOpCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,17 @@ OMStatus ArithmeticOp(const core::BinaryArithmeticBroadcastParams &params, const
return Ok;
}

// Applies binary_func element-wise over two equally-sized flat arrays.
//
// @param size        number of elements to process
// @param params      quantization parameters forwarded to binary_func
// @param binary_func scalar kernel, e.g. AddFunc
//
// Fix: index is uint32_t to match 'size' — the original 'int i' caused a
// signed/unsigned comparison (and would misbehave for sizes > INT_MAX).
template <typename T>
void ElementWise(const uint32_t size, const core::ArithmeticQuantParams &params,
                 const T *input1_data, const T *input2_data, T *output_data,
                 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
{
  for (uint32_t i = 0; i < size; ++i)
  {
    output_data[i] = binary_func(input1_data[i], input2_data[i], params);
  }
}

template <typename T, typename Fn>
inline void ArithmeticOpScalar(const core::BinaryArithmeticBroadcastParams &params,
const int flat_size, const T *input_data, const T scalar_value,
Expand Down Expand Up @@ -130,6 +141,126 @@ OMStatus BroadcastArithmeticOp4DSlow(const core::BinaryArithmeticBroadcastParams
return Ok;
}

// Broadcast case where input1 is a single scalar: combine input1_data[0] with
// every element of input2_data and write 'size' results to output_data.
// input1_data[0] is re-read each step on purpose, so behavior is unchanged
// even if output_data aliases the inputs.
template <typename T>
void BroadcastInput1(int size, const core::ArithmeticQuantParams &params, const T *input1_data,
                     const T *input2_data, T *output_data,
                     T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
{
  T *out = output_data;
  for (int remaining = size; remaining > 0; --remaining, ++out, ++input2_data)
  {
    *out = binary_func(*input1_data, *input2_data, params);
  }
}

// Broadcast case where input2 is a single scalar: combine every element of
// input1_data with input2_data[0] and write 'size' results to output_data.
// input2_data[0] is re-read each step on purpose, so behavior is unchanged
// even if output_data aliases the inputs.
template <typename T>
void BroadcastInput2(int size, const core::ArithmeticQuantParams &params, const T *input1_data,
                     const T *input2_data, T *output_data,
                     T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
{
  T *out = output_data;
  for (int remaining = size; remaining > 0; --remaining, ++out, ++input1_data)
  {
    *out = binary_func(*input1_data, *input2_data, params);
  }
}

// Recursively walks the compressed broadcast dimensions, from the outermost
// ('dimension') down to dimension 0, advancing the flat input offsets by the
// per-dimension strides and emitting output contiguously. A stride of 0 in a
// compressed input stride marks that input as broadcast along that dimension.
// At dimension 0 the innermost run is dispatched to one of three flat loops
// (input1-broadcast, input2-broadcast, or plain element-wise).
// NOTE(review): all offset parameters are in/out — the mutation order below is
// load-bearing; do not reorder the updates.
template <typename T>
void BroadcastRecursiveDimensions(const core::ArithmeticQuantParams &params, int dimension,
                                  size_t *input1_offset_p, size_t *input2_offset_p,
                                  size_t *output_offset, size_t *compressed_input1_stride,
                                  size_t *compressed_input2_stride, size_t *compressed_output_shape,
                                  const T *input1_data, const T *input2_data, T *output_data,
                                  T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
{
  if (dimension > 0)
  {
    // Outer dimensions: recurse once per slice, with private offset copies so
    // inner levels cannot disturb this level's stride stepping.
    for (size_t c = 0; c < compressed_output_shape[dimension]; ++c)
    {
      size_t input1_offset_c = *input1_offset_p;
      size_t input2_offset_c = *input2_offset_p;
      BroadcastRecursiveDimensions(params, dimension - 1, &input1_offset_c, &input2_offset_c,
                                   output_offset, compressed_input1_stride,
                                   compressed_input2_stride, compressed_output_shape, input1_data,
                                   input2_data, output_data, binary_func);
      // Step this level's offsets by its stride (0 if the input is broadcast here).
      *input1_offset_p += compressed_input1_stride[dimension];
      *input2_offset_p += compressed_input2_stride[dimension];
    }
  }
  else
  {
    assert(dimension == 0);
    // Innermost dimension: a zero stride means that input contributes a single
    // scalar for the whole run. Both inputs cannot be broadcast at once.
    bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
    bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
    assert(!(input1_is_broadcast && input2_is_broadcast));
    const T *input1_data_ptr = input1_data + *input1_offset_p;
    const T *input2_data_ptr = input2_data + *input2_offset_p;
    T *output_data_ptr = output_data + *output_offset;
    if (input1_is_broadcast)
    {
      // input1 is broadcast.
      BroadcastInput1<T>(compressed_output_shape[dimension], params, input1_data_ptr,
                         input2_data_ptr, output_data_ptr, binary_func);
      // Only the non-broadcast input advances past the consumed run.
      *input2_offset_p += compressed_output_shape[dimension];
    }
    else if (input2_is_broadcast)
    {
      // input2 is broadcast.
      BroadcastInput2<T>(compressed_output_shape[dimension], params, input1_data_ptr,
                         input2_data_ptr, output_data_ptr, binary_func);
      *input1_offset_p += compressed_output_shape[dimension];
    }
    else
    {
      // Add element-wise.
      ElementWise<T>(compressed_output_shape[dimension], params, input1_data_ptr, input2_data_ptr,
                     output_data_ptr, binary_func);
      *input1_offset_p += compressed_output_shape[dimension];
      *input2_offset_p += compressed_output_shape[dimension];
    }
    // Output is always dense: advance by the full run length.
    *output_offset += compressed_output_shape[dimension];
  }
}

// Generic slow-path broadcast driver for binary quantized ops (up to 6 dims).
// Compresses the shapes/strides via ReduceDimensionsForBroadcast, then walks
// them recursively, applying binary_func per element.
// NOTE(review): non-broadcastable (degenerate) shape pairs return silently,
// leaving output_data untouched — callers see no error.
template <typename T>
void BroadcastBinaryFunction6DSlow(const core::ArithmeticQuantParams &params,
                                   const core::OMRuntimeShape &input1_shape, const T *input1_data,
                                   const core::OMRuntimeShape &input2_shape, const T *input2_data,
                                   const core::OMRuntimeShape &output_shape, T *output_data,
                                   T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
{
  constexpr int kMaxBroadcastDim = 6;

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  size_t compressed_input1_stride[kMaxBroadcastDim];
  size_t compressed_input2_stride[kMaxBroadcastDim];
  size_t compressed_output_shape[kMaxBroadcastDim];
  bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
    input1_shape, input2_shape, compressed_input1_stride, compressed_input2_stride,
    compressed_output_shape);
  // Skip broadcasting for degenerate shapes.
  if (!broadcastable_shape)
  {
    return;
  }

  // Start at flat offset 0 in all three tensors and recurse from the
  // outermost compressed dimension inward.
  size_t input1_offset = 0;
  size_t input2_offset = 0;
  size_t output_offset = 0;
  BroadcastRecursiveDimensions(params, kMaxBroadcastDim - 1, &input1_offset, &input2_offset,
                               &output_offset, compressed_input1_stride, compressed_input2_stride,
                               compressed_output_shape, input1_data, input2_data, output_data,
                               binary_func);
}

} // namespace pal
} // namespace execute
} // namespace onert_micro
Expand Down
Loading

0 comments on commit 869e077

Please sign in to comment.