
[onert-micro] Support S8 Conv2D (#13185)
This PR adds support for the quantized S8 Conv2D kernel, including a CMSIS-NN accelerated path.

ONE-DCO-1.0-Signed-off-by: Artem Balyshev <[email protected]>
BalyshevArtem authored Jun 14, 2024
1 parent be96c99 commit ff12cae
Showing 15 changed files with 721 additions and 20 deletions.
22 changes: 22 additions & 0 deletions onert-micro/onert-micro/include/core/OMKernelData.h
@@ -115,6 +115,28 @@ struct BinaryArithmeticBroadcastParams
BroadcastableOpCategory broadcast_category;
};

+struct ConvQuant
+{
+  int32_t pad_h;
+  int32_t pad_w;
+  int32_t stride_w;
+  int32_t stride_h;
+  int32_t stride_width;
+  int32_t stride_height;
+  int32_t dilation_width_factor;
+  int32_t dilation_height_factor;
+  int32_t input_offset;
+  int32_t weights_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
+  int32_t output_shift;
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
+  int32_t depth_multiplier;
+  std::vector<int32_t> per_channel_output_multiplier;
+  std::vector<int> per_channel_output_shift;
+};

struct FloatConv2D
{
int32_t stride_w;
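A note on the new per-channel fields: per_channel_output_multiplier and per_channel_output_shift encode each output channel's effective scale (input_scale * filter_scale[c] / output_scale) as a Q31 fixed-point multiplier plus a power-of-two exponent, so that double_multiplier ~= quantized_multiplier * 2^(shift - 31). Below is a minimal sketch of the standard TFLite-style decomposition, which the quantizeMultiplier helper declared in execute/OMUtils.h is expected to implement (quantizeMultiplierSketch is an illustrative name, not code from this PR):

#include <cmath>
#include <cstdint>

// Decompose a positive real multiplier into a Q31 fixed-point value plus a
// power-of-two exponent: double_multiplier ~= quantized_multiplier * 2^(shift - 31).
void quantizeMultiplierSketch(double double_multiplier, int32_t *quantized_multiplier, int *shift)
{
  if (double_multiplier == 0.0)
  {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(double_multiplier, shift); // q in [0.5, 1)
  auto q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) // rounding pushed q up to 1.0; renormalize
  {
    q_fixed /= 2;
    ++*shift;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}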
24 changes: 20 additions & 4 deletions onert-micro/onert-micro/include/execute/OMUtils.h
@@ -17,6 +17,7 @@
#ifndef ONERT_MICRO_EXECUTE_UTILS_H
#define ONERT_MICRO_EXECUTE_UTILS_H

+#include <cmath>
#include "OMStatus.h"
#include "core/reader/OMCircleReader.h"
#include "core/OMRuntimeShape.h"
@@ -85,10 +86,25 @@ void quantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier,
void quantizeMultiplierSmallerThanOneExp(double double_multiplier, int32_t *quantized_multiplier,
int *left_shift);

-void calculateActivationRangeQuantized(circle::ActivationFunctionType activation,
-                                       int32_t output_zero_point, float output_scale,
-                                       circle::TensorType data_type, int32_t *activation_min,
-                                       int32_t *activation_max);
+inline std::vector<double>
+getQuantizedConvolutionMultiplers(float input_scale, const flatbuffers::Vector<float> *filter_scale,
+                                  float output_scale)
+{
+  std::vector<double> effective_output_scales;
+  size_t n = filter_scale->size();
+  effective_output_scales.reserve(n);
+  for (size_t i = 0; i < n; ++i)
+  {
+    effective_output_scales.push_back(
+      getQuantizedConvolutionMultipler(input_scale, filter_scale->operator[](i), output_scale));
+  }
+  return effective_output_scales;
+}

+OMStatus calculateActivationRangeQuantized(circle::ActivationFunctionType activation,
+                                           int32_t output_zero_point, float output_scale,
+                                           circle::TensorType data_type, int32_t *activation_min,
+                                           int32_t *activation_max);

inline int computeOutSize(circle::Padding padding, int image_size, int filter_size, int stride,
int dilation_rate = 1)
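The value collected per channel by getQuantizedConvolutionMultiplers is the effective rescale factor input_scale * filter_scale[i] / output_scale, assuming getQuantizedConvolutionMultipler returns that ratio as its luci-interpreter namesake does. A flatbuffers-free sketch of the same computation:

#include <vector>

// Effective rescale factors from the int32 accumulator domain to the int8
// output domain, one entry per output channel.
std::vector<double> effectiveOutputScales(float input_scale,
                                          const std::vector<float> &filter_scales,
                                          float output_scale)
{
  std::vector<double> scales;
  scales.reserve(filter_scales.size());
  for (float filter_scale : filter_scales)
  {
    // real_multiplier = S_input * S_filter[c] / S_output
    scales.push_back(static_cast<double>(input_scale) * filter_scale / output_scale);
  }
  return scales;
}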
@@ -1,6 +1,5 @@
/*
* Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,8 +14,24 @@
* limitations under the License.
*/

-#ifndef LUCI_INTERPRETER_PAL_CONV2D_H
-#define LUCI_INTERPRETER_PAL_CONV2D_H
-#include "PALConv2DCommon.h"
+#ifndef ONERT_MICRO_EXECUTE_KERNELS_CONVOLUTION_COMMON_H
+#define ONERT_MICRO_EXECUTE_KERNELS_CONVOLUTION_COMMON_H

-#endif // LUCI_INTERPRETER_PAL_CONV2D_H
+#include "OMStatus.h"
+
+#include "core/OMKernelData.h"
+#include "core/OMRuntimeShape.h"
+
+namespace onert_micro
+{
+namespace execute
+{
+
+OMStatus createConvParams(core::ConvQuant &params, const circle::Tensor *input,
+                          const circle::Tensor *filter, const circle::Tensor *output,
+                          const circle::Conv2DOptions *options);
+
+} // namespace execute
+} // namespace onert_micro
+
+#endif // ONERT_MICRO_EXECUTE_KERNELS_CONVOLUTION_COMMON_H
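The definition of createConvParams is not in this header. The outline below is a hypothetical sketch only (simplified signature; createConvParamsSketch is not part of the PR) of the kind of work such a helper does: turn zero points into offsets and fill the per-channel multiplier/shift vectors through quantizeMultiplier, assumed to be the helper declared in execute/OMUtils.h.

#include <vector>

#include "OMStatus.h"
#include "core/OMKernelData.h"
#include "execute/OMUtils.h" // assumed location of quantizeMultiplier

using namespace onert_micro;

// Hypothetical outline -- not the committed implementation.
OMStatus createConvParamsSketch(core::ConvQuant &params, float input_scale,
                                int32_t input_zero_point,
                                const std::vector<float> &filter_scales, float output_scale,
                                int32_t output_zero_point)
{
  // With r = s * (q - Z), the input offset is the negated zero point and the
  // output offset is the zero point itself -- matching how both ConvPerChannel
  // kernels in this commit consume them.
  params.input_offset = -input_zero_point;
  params.output_offset = output_zero_point;

  // One (multiplier, shift) pair per output channel.
  const size_t channels = filter_scales.size();
  params.per_channel_output_multiplier.resize(channels);
  params.per_channel_output_shift.resize(channels);
  for (size_t c = 0; c < channels; ++c)
  {
    const double real_multiplier =
      static_cast<double>(input_scale) * filter_scales[c] / output_scale;
    execute::quantizeMultiplier(real_multiplier, &params.per_channel_output_multiplier[c],
                                &params.per_channel_output_shift[c]);
  }
  return Ok;
}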
@@ -15,7 +15,7 @@
#/*REGISTER_KERNEL(DEPTH_TO_SPACE, DepthToSpace)*/
#/*REGISTER_KERNEL(DEQUANTIZE, Dequantize)*/
REGISTER_KERNEL(FULLY_CONNECTED, FullyConnected)
-#/*REGISTER_KERNEL(CONV_2D, Conv2D)*/
+REGISTER_KERNEL(CONV_2D, Conv2D)
#/*REGISTER_KERNEL(LOGISTIC, Logistic)*/
#/*REGISTER_KERNEL(LOG, Log)*/
#/*REGISTER_KERNEL(GATHER, Gather)*/
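Uncommenting the CONV_2D entry is what actually pulls the new kernel into the build: lists like KernelsToBuild.lst are consumed with the X-macro pattern, sketched below. The macro body shown (declaring a per-kernel execute function taking an OMExecuteArgs) is an assumption about the surrounding registration machinery, not code from this PR.

// Each consumer defines REGISTER_KERNEL, includes the list, then undefines it,
// so only the kernels left uncommented in the .lst get compiled in.
#define REGISTER_KERNEL(builtin_operator, name) \
  OMStatus execute_kernel_Circle##name(const OMExecuteArgs &execute_args);
#include "KernelsToBuild.lst"
#undef REGISTER_KERNEL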
115 changes: 115 additions & 0 deletions onert-micro/onert-micro/include/pal/cmsisnn/PALConv2D.h
@@ -0,0 +1,115 @@
/*
* Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef ONERT_MICRO_EXECUTE_PAL_CONV_2D_H
#define ONERT_MICRO_EXECUTE_PAL_CONV_2D_H

#include "PALConv2DCommon.h"
#include "core/OMKernelData.h"
#include "core/OMRuntimeShape.h"
#include "PALUtils.h"

#include <arm_nnfunctions.h>
#include <memory> // std::make_unique for the CMSIS-NN scratch buffer

namespace onert_micro
{
namespace execute
{
namespace pal
{

// Fixed-point per-channel-quantization convolution kernel, delegated to CMSIS-NN.
OMStatus ConvPerChannel(const core::ConvQuant &params, const core::OMRuntimeShape &input_shape,
const int8_t *input_data, const core::OMRuntimeShape &filter_shape,
const int8_t *filter_data, const int32_t *bias_data,
const core::OMRuntimeShape &output_shape, int8_t *output_data)
{
cmsis_nn_conv_params conv_params;
conv_params.dilation.h = params.dilation_height_factor;
conv_params.dilation.w = params.dilation_width_factor;

assert(conv_params.dilation.h == 1);
assert(conv_params.dilation.w == 1);

conv_params.input_offset = params.input_offset;
conv_params.output_offset = params.output_offset;
conv_params.stride.h = params.stride_height;
conv_params.stride.w = params.stride_width;
conv_params.padding.h = params.pad_h;
conv_params.padding.w = params.pad_w;
conv_params.activation.min = params.quantized_activation_min;
conv_params.activation.max = params.quantized_activation_max;

cmsis_nn_per_channel_quant_params quant_params;
quant_params.multiplier = const_cast<int32_t *>(params.per_channel_output_multiplier.data());
quant_params.shift = const_cast<int32_t *>(
reinterpret_cast<const int32_t *>(params.per_channel_output_shift.data()));

assert(conv_params.activation.min <= conv_params.activation.max);
const int batch_size = input_shape.dims(0);
const int input_depth = input_shape.dims(3);
const int output_depth = filter_shape.dims(0);

cmsis_nn_dims input_dims;
input_dims.n = batch_size;
input_dims.h = input_shape.dims(1);
input_dims.w = input_shape.dims(2);
input_dims.c = input_depth;

cmsis_nn_dims filter_dims;
filter_dims.n = output_depth;
filter_dims.h = filter_shape.dims(1);
filter_dims.w = filter_shape.dims(2);
filter_dims.c = input_depth;

cmsis_nn_dims bias_dims;
bias_dims.n = 1;
bias_dims.h = 1;
bias_dims.w = 1;
bias_dims.c = output_depth;

cmsis_nn_dims output_dims;
output_dims.n = batch_size;
output_dims.h = output_shape.dims(1);
output_dims.w = output_shape.dims(2);
output_dims.c = output_depth;

auto buf_size =
arm_convolve_wrapper_s8_get_buffer_size(&conv_params, &input_dims, &filter_dims, &output_dims);

auto buffer = std::make_unique<int8_t[]>(buf_size);
assert(buffer != nullptr);

cmsis_nn_context ctx;
ctx.buf = buffer.get();
ctx.size = buf_size;

auto res = arm_convolve_wrapper_s8(&ctx, &conv_params, &quant_params, &input_dims, input_data,
&filter_dims, filter_data, &bias_dims, bias_data, &output_dims,
output_data);

assert(res == ARM_CMSIS_NN_SUCCESS);
if (res != ARM_CMSIS_NN_SUCCESS)
return CmsisNNError;
return Ok;
}

} // namespace pal
} // namespace execute
} // namespace onert_micro

#endif // ONERT_MICRO_EXECUTE_PAL_CONV_2D_H
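A design note on the scratch buffer: this path sizes it with arm_convolve_wrapper_s8_get_buffer_size and heap-allocates it per call via std::make_unique. On targets that avoid the heap, a static arena works the same way; a sketch under that assumption (SCRATCH_ARENA_SIZE and makeConvContext are integration-time choices, not part of this PR):

#include <arm_nnfunctions.h>

// Worst-case scratch size for the largest Conv2D in the deployed model
// (an integration-time assumption, not something this PR defines).
constexpr int32_t SCRATCH_ARENA_SIZE = 4 * 1024;
static int8_t scratch_arena[SCRATCH_ARENA_SIZE];

cmsis_nn_context makeConvContext(int32_t required_size)
{
  cmsis_nn_context ctx{};
  ctx.buf = (required_size <= SCRATCH_ARENA_SIZE) ? scratch_arena : nullptr;
  ctx.size = required_size;
  return ctx;
}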
155 changes: 155 additions & 0 deletions onert-micro/onert-micro/include/pal/mcu/PALConv2D.h
@@ -0,0 +1,155 @@
/*
* Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef ONERT_MICRO_EXECUTE_PAL_CONV_2D_H
#define ONERT_MICRO_EXECUTE_PAL_CONV_2D_H

#include "PALConv2DCommon.h"
#include "core/OMKernelData.h"
#include "core/OMRuntimeShape.h"
#include "PALUtils.h"

namespace onert_micro
{
namespace execute
{
namespace pal
{

// Fixed-point per-channel-quantization convolution reference kernel.
OMStatus ConvPerChannel(const core::ConvQuant &params, const core::OMRuntimeShape &input_shape,
const int8_t *input_data, const core::OMRuntimeShape &filter_shape,
const int8_t *filter_data, const int32_t *bias_data,
const core::OMRuntimeShape &output_shape, int8_t *output_data)
{
// Get parameters.
const int32_t input_offset = params.input_offset; // r = s(q - Z)
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.pad_w;
const int pad_height = params.pad_h;
const int32_t output_offset = params.output_offset;

const std::vector<int32_t> &output_multiplier = params.per_channel_output_multiplier;
const std::vector<int32_t> &output_shift = params.per_channel_output_shift;

// Set min and max value of the output.
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;

// Consistency check.
assert(output_activation_max >= output_activation_min);
assert(input_shape.dimensionsCount() == 4);
assert(filter_shape.dimensionsCount() == 4);
assert(output_shape.dimensionsCount() == 4);

const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = input_shape.dims(3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);

// Check dimensions of the tensors.
const int input_height = input_shape.dims(1);
const int input_width = input_shape.dims(2);
const int filter_height = filter_shape.dims(1);
const int filter_width = filter_shape.dims(2);
const int filter_input_depth = filter_shape.dims(3);
const int groups = input_depth / filter_input_depth;
assert(groups != 0);
assert(input_depth % filter_input_depth == 0);
const int filters_per_group = output_depth / groups;
assert(filters_per_group != 0);
const int output_height = output_shape.dims(1);
const int output_width = output_shape.dims(2);
for (int batch = 0; batch < batches; ++batch)
{
for (int out_y = 0; out_y < output_height; ++out_y)
{
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x)
{
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel)
{
auto group = out_channel / filters_per_group;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y)
{
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x)
{
const int in_x = in_x_origin + dilation_width_factor * filter_x;

// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);

if (!is_point_inside_image)
{
continue;
}

for (int in_channel = 0; in_channel < filter_input_depth; ++in_channel)
{
int32_t input_val = input_data[offset(input_shape.dimsData(), batch, in_y, in_x,
in_channel + group * filter_input_depth)];
int32_t filter_val = filter_data[offset(filter_shape.dimsData(), out_channel,
filter_y, filter_x, in_channel)];
// Accumulate with a 32-bit accumulator.
// In the nudging process during model quantization, we force the
// real value 0.0 to be representable by a quantized value. This
// guarantees that input_offset fits in an int8_t, even though it
// is stored as an int32_t. The update is int32_t += int8_t *
// (int8_t - int8_t), so the largest value from each accumulation
// is [-127, 127] * ([-128, 127] - [-128, 127]) = [-32512, 32512].
// log2(32512) = 14.98, which means we can accumulate at least 2^16
// multiplications without overflow. The accumulator runs over one
// filter, so this holds as long as the filter size
// (filter_y * filter_x * in_channel) does not exceed 2^16, which
// is the case in all the models we have seen so far.
acc += filter_val * (input_val + input_offset);
}
}
}

if (bias_data)
{
acc += bias_data[out_channel];
}
acc = multiplyByQuantizedMultiplier(acc, output_multiplier[out_channel],
output_shift[out_channel]);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[offset(output_shape.dimsData(), batch, out_y, out_x, out_channel)] =
static_cast<int8_t>(acc);
}
}
}
}
return Ok;
}

} // namespace pal
} // namespace execute
} // namespace onert_micro

#endif // ONERT_MICRO_EXECUTE_PAL_CONV_2D_H
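The requantization call in the inner loop leans on multiplyByQuantizedMultiplier from PALUtils.h. Below is a self-contained sketch of the standard gemmlowp/TFLite rounding-doubling fixed-point multiply it is expected to match (a positive shift acts as a left shift, a negative one as a rounding right shift); the Sketch suffix marks the names as illustrative:

#include <cstdint>
#include <limits>

// High 32 bits of 2*a*b with round-to-nearest, saturating the one overflow case.
int32_t saturatingRoundingDoublingHighMul(int32_t a, int32_t b)
{
  const bool overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
  const int64_t ab_64 = static_cast<int64_t>(a) * b;
  const int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
  const int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1LL << 31));
  return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
}

// Arithmetic right shift with round-to-nearest (ties away from zero).
int32_t roundingDivideByPOT(int32_t x, int exponent)
{
  const int32_t mask = static_cast<int32_t>((1LL << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

int32_t multiplyByQuantizedMultiplierSketch(int32_t acc, int32_t multiplier, int shift)
{
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  return roundingDivideByPOT(
    saturatingRoundingDoublingHighMul(acc * (1 << left_shift), multiplier), right_shift);
}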
