From 9ada0108e199f35909d63e7331c02539fa6e45f9 Mon Sep 17 00:00:00 2001 From: ragmani Date: Wed, 10 Jan 2024 18:11:44 +0900 Subject: [PATCH] [onert-micro] Reduce duplicate code in binary kernels This commit reduces duplicate code in binary kernels. - Introduce PALBinaryOpCommon.h that has common functions for binary kernels. - Introduce binary function objects. - Introduce `BinaryOp()` that unifies binary kernels without broadcast. - Introduce `BroadcastBinaryOp4DSlow()` that unifies binary kernels with broadcast. - Apply common functions for binary kernels. ONE-DCO-1.0-Signed-off-by: ragmani --- .../pal/common/PALBinaryOpCommon.h | 117 ++++++++++++++++++ .../pal/common/PALFloorDivCommon.h | 54 +------- .../pal/common/PALFloorModCommon.h | 59 +-------- .../pal/common/PALMaximumCommon.h | 50 +------- .../pal/common/PALMinimumCommon.h | 15 +-- .../src/kernels/BinaryOpCommon.h | 9 ++ .../luci-interpreter/src/kernels/FloorDiv.cpp | 21 +--- .../src/kernels/FloorDiv.test.cpp | 66 ++++++++++ .../luci-interpreter/src/kernels/FloorMod.cpp | 21 +--- .../src/kernels/FloorMod.test.cpp | 66 ++++++++++ .../luci-interpreter/src/kernels/Maximum.cpp | 21 +--- .../src/kernels/Maximum.test.cpp | 66 ++++++++++ .../luci-interpreter/src/kernels/Minimum.cpp | 21 +--- .../src/kernels/Minimum.test.cpp | 66 ++++++++++ .../luci-interpreter/src/kernels/Utils.cpp | 16 +++ .../luci-interpreter/src/kernels/Utils.h | 4 + 16 files changed, 434 insertions(+), 238 deletions(-) create mode 100644 onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h diff --git a/onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h b/onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h new file mode 100644 index 00000000000..615c0984242 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H +#define LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H + +#include "Params.h" +#include "PALUtils.h" +#include "ProcessBroadcastShapes.h" + +namespace luci_interpreter_pal +{ + +template ::value, bool> = true> +struct FloorDivFn +{ + T operator()(T lhs, T rhs) + { + return std::floor(static_cast(lhs) / static_cast(rhs)); + } +}; +template ::value, bool> = true> +struct FloorModFn +{ + T operator()(T lhs, T rhs) + { + T trunc_mod = std::fmod(lhs, rhs); + return (trunc_mod != 0) && ((rhs < 0) != (trunc_mod < 0)) ? 
(trunc_mod + rhs) : trunc_mod; + } +}; +template struct MaximumFn +{ + T operator()(T lhs, T rhs) { return std::max(lhs, rhs); } +}; +template struct MinimumFn +{ + T operator()(T lhs, T rhs) { return std::min(lhs, rhs); } +}; + +// TODO: check whether there is a real activation value +template +inline void BinaryOp(const int flat_size, const T *input1_data, const T *input2_data, + T *output_data) +{ + Fn func; + for (int i = 0; i < flat_size; ++i) + { + output_data[i] = func(input1_data[i], input2_data[i]); + } +} + +template +inline void BroadcastBinaryOp4DSlow(const luci_interpreter::RuntimeShape &input1_shape, + const float *input1_data, + const luci_interpreter::RuntimeShape &input2_shape, + const float *input2_data, + const luci_interpreter::RuntimeShape &output_shape, + float *output_data) +{ + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + + const luci_interpreter::RuntimeShape extended_output_shape = + luci_interpreter::RuntimeShape::extendedShape(4, output_shape); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
+ + Fn func; + for (int b = 0; b < extended_output_shape.dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.dims(3); ++c) + { + const int output_data_offset = + ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * + extended_output_shape.dims(3) + + c; + + output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)], + input2_data[subscriptToIndex(desc2, b, y, x, c)]); + } + } + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h b/onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h index 30ac76050ff..a7ba1a3e9fd 100644 --- a/onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h +++ b/onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h @@ -18,18 +18,14 @@ #ifndef LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H #define LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H -#include "Params.h" -#include "PALUtils.h" -#include "ProcessBroadcastShapes.h" +#include "PALBinaryOpCommon.h" namespace luci_interpreter_pal { inline void FloorDiv(const int flat_size, const float *input1_data, const float *input2_data, float *output_data) { - for (int i = 0; i < flat_size; ++i) - output_data[i] = - std::floor(static_cast(input1_data[i]) / static_cast(input2_data[i])); + BinaryOp>(flat_size, input1_data, input2_data, output_data); } inline void @@ -40,50 +36,8 @@ BroadcastFloorDiv4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data) { - const int flat_size = input1_shape.flatSize(); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); - - const luci_interpreter::RuntimeShape extended_output_shape = 
- luci_interpreter::RuntimeShape::extendedShape(4, output_shape); - - auto FloorDivFunc = [](float x, float y) -> float { - return std::floor(static_cast(x) / static_cast(y)); - }; - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - - for (int b = 0; b < extended_output_shape.dims(0); ++b) - { - for (int y = 0; y < extended_output_shape.dims(1); ++y) - { - for (int x = 0; x < extended_output_shape.dims(2); ++x) - { - for (int c = 0; c < extended_output_shape.dims(3); ++c) - { - const int output_data_offset = - ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * - extended_output_shape.dims(3) + - c; - - output_data[output_data_offset] = - FloorDivFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)], - input2_data[subscriptToIndex(desc2, b, y, x, c)]); - } - } - } - } + BroadcastBinaryOp4DSlow>(input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h b/onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h index e6ea63db233..dfb6725f04e 100644 --- a/onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h +++ b/onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h @@ -18,22 +18,14 @@ #ifndef LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H #define LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H -#include "Params.h" -#include "PALUtils.h" -#include "ProcessBroadcastShapes.h" +#include 
"PALBinaryOpCommon.h" namespace luci_interpreter_pal { inline void FloorMod(const int flat_size, const float *input1_data, const float *input2_data, float *output_data) { - for (int i = 0; i < flat_size; ++i) - { - float trunc_mod = std::fmod(input1_data[i], input2_data[i]); - output_data[i] = (trunc_mod != 0) && ((input2_data[i] < 0) != (trunc_mod < 0)) - ? (trunc_mod + input2_data[i]) - : trunc_mod; - } + BinaryOp>(flat_size, input1_data, input2_data, output_data); } inline void @@ -44,51 +36,8 @@ BroadcastFloorMod4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data) { - const int flat_size = input1_shape.flatSize(); - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); - - const luci_interpreter::RuntimeShape extended_output_shape = - luci_interpreter::RuntimeShape::extendedShape(4, output_shape); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - auto FloorModFunc = [](float x, float y) -> float { - float trunc_mod = std::fmod(x, y); - return (trunc_mod != 0) && ((y < 0) != (trunc_mod < 0)) ? 
(trunc_mod + y) : trunc_mod; - }; - - for (int b = 0; b < extended_output_shape.dims(0); ++b) - { - for (int y = 0; y < extended_output_shape.dims(1); ++y) - { - for (int x = 0; x < extended_output_shape.dims(2); ++x) - { - for (int c = 0; c < extended_output_shape.dims(3); ++c) - { - const int output_data_offset = - ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * - extended_output_shape.dims(3) + - c; - - output_data[output_data_offset] = - FloorModFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)], - input2_data[subscriptToIndex(desc2, b, y, x, c)]); - } - } - } - } + BroadcastBinaryOp4DSlow>(input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h b/onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h index 106240eef92..9bf6a689ead 100644 --- a/onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h +++ b/onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h @@ -18,19 +18,14 @@ #ifndef LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H #define LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H -#include "Params.h" -#include "PALUtils.h" -#include "ProcessBroadcastShapes.h" +#include "PALBinaryOpCommon.h" namespace luci_interpreter_pal { inline void Maximum(const int flat_size, const float *input1_data, const float *input2_data, float *output_data) { - for (int i = 0; i < flat_size; ++i) - { - output_data[i] = std::max(input1_data[i], input2_data[i]); - } + BinaryOp>(flat_size, input1_data, input2_data, output_data); } inline void @@ -38,45 +33,8 @@ BroadcastMaximum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data) { - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); 
- - const luci_interpreter::RuntimeShape extended_output_shape = - luci_interpreter::RuntimeShape::extendedShape(4, output_shape); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - - for (int b = 0; b < extended_output_shape.dims(0); ++b) - { - for (int y = 0; y < extended_output_shape.dims(1); ++y) - { - for (int x = 0; x < extended_output_shape.dims(2); ++x) - { - for (int c = 0; c < extended_output_shape.dims(3); ++c) - { - const int output_data_offset = - ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * - extended_output_shape.dims(3) + - c; - - output_data[output_data_offset] = - std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)], - input2_data[subscriptToIndex(desc2, b, y, x, c)]); - } - } - } - } + BroadcastBinaryOp4DSlow>(input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h b/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h index 283314f94da..f2fadcba882 100644 --- a/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h +++ b/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h @@ -18,20 +18,14 @@ #ifndef LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H #define LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H -#include "Params.h" -#include "PALUtils.h" -#include "ProcessBroadcastShapes.h" -#include "Broadcast.h" +#include "PALBinaryOpCommon.h" namespace 
luci_interpreter_pal { inline void Minimum(const int flat_size, const float *input1_data, const float *input2_data, float *output_data) { - for (int i = 0; i < flat_size; ++i) - { - output_data[i] = std::min(input1_data[i], input2_data[i]); - } + BinaryOp>(flat_size, input1_data, input2_data, output_data); } template @@ -40,9 +34,8 @@ BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data) { - auto func = [](const T &a, const T &b) -> const T & { return std::min(a, b); }; - BroadcastTISO4DSlow(input1_shape, input1_data, input2_shape, input2_data, output_shape, - output_data, func); + BroadcastBinaryOp4DSlow>(input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h b/onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h index 990a13c3d8e..e79ef89804a 100644 --- a/onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h +++ b/onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h @@ -19,6 +19,7 @@ #define LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H #include "TISOKernel.h" +#include "PALComparisons.h" #include "ProcessBroadcastShapes.h" #include "Utils.h" @@ -112,6 +113,14 @@ void evalTISOInplaceKernel(TISOFunc tiso_func, TISOBroadcastFunc tiso_broadcast_ } } +inline void CheckBinaryOpDataTypesEqual(const kernels::TISOKernel &kernel) +{ + LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == + Tensor::element_type(kernel.input2())); + LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == + Tensor::element_type(kernel.output())); +} + #ifndef DIS_QUANT template diff --git a/onert-micro/luci-interpreter/src/kernels/FloorDiv.cpp b/onert-micro/luci-interpreter/src/kernels/FloorDiv.cpp index a412c97e7ef..36dc29e4177 100644 --- 
a/onert-micro/luci-interpreter/src/kernels/FloorDiv.cpp +++ b/onert-micro/luci-interpreter/src/kernels/FloorDiv.cpp @@ -29,10 +29,7 @@ void configure_kernel_CircleFloorDiv(const circle::Operator *cur_op, { kernels::TISOKernel kernel(cur_op, runtime_graph); - LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == - Tensor::element_type(kernel.input2())); - LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == - Tensor::element_type(kernel.output())); + CheckBinaryOpDataTypesEqual(kernel); } void execute_kernel_CircleFloorDiv(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph) @@ -67,21 +64,7 @@ void execute_kernel_CircleFloorDiv(const circle::Operator *cur_op, BaseRuntimeGr LUCI_INTERPRETER_CHECK(kernels::getTensorData(input_data2)[i] != 0); } // check that input and output dimensions are equal - auto AreShapesEqual = [](const luci_interpreter::RuntimeShape &input_shape1, - const luci_interpreter::RuntimeShape &input_shape2) -> bool { - if (input_shape1.dimensionsCount() == input_shape2.dimensionsCount()) - { - int N = input_shape1.dimensionsCount(); - for (int i = 0; i < N; ++i) - { - if (input_shape1.dims(i) != input_shape2.dims(i)) - return false; - } - return true; - } - return false; - }; - if (AreShapesEqual(input_shape1, input_shape2)) + if (kernels::areShapesEqual(input_shape1, input_shape2)) { const int flat_size = input_shape1.flatSize(); luci_interpreter_pal::FloorDiv(flat_size, kernels::getTensorData(input_data1), diff --git a/onert-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp b/onert-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp index d87a6cef96b..cb1d0430d86 100644 --- a/onert-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp +++ b/onert-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp @@ -121,3 +121,69 @@ TEST_F(FloorDivTest, Wrong_Input2_Type_NEG) } // namespace } // namespace luci_interpreter + +#include "PALFloorDiv.h" + +#include +#include + +namespace luci_interpreter +{ +namespace +{ + +class 
PALFloorDivTest : public ::testing::Test +{ + // Do nothing +}; + +TEST_F(PALFloorDivTest, Float_P) +{ + // No broadcast + { + const bool is_with_broadcast = false; + test_kernel::TestDataFloatFloorDiv test_data_kernel(is_with_broadcast); + + const auto &input1 = test_data_kernel.get_input_data_by_index(0); + const auto &input2 = test_data_kernel.get_input_data_by_index(1); + + const auto num_elements = input1.size(); + EXPECT_EQ(num_elements, input2.size()); + + std::vector output = std::vector(num_elements); + luci_interpreter_pal::FloorDiv(num_elements, input1.data(), input2.data(), + const_cast(output.data())); + + EXPECT_THAT(output, kernels::testing::FloatArrayNear( + test_data_kernel.get_output_data_by_index(0), 0.0001f)); + } + + // With broadcast + { + const bool is_with_broadcast = true; + test_kernel::TestDataFloatFloorDiv test_data_kernel(is_with_broadcast); + + const auto &input1 = test_data_kernel.get_input_data_by_index(0); + const auto &input2 = test_data_kernel.get_input_data_by_index(1); + + const int32_t shape[2] = {2, 5}; + const int32_t shape_broadcast[2] = {2, 1}; + + assert(input1.size() == + std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies())); + assert(input2.size() == std::accumulate(std::begin(shape_broadcast), std::end(shape_broadcast), + 1, std::multiplies())); + + std::vector output = std::vector( + std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies())); + luci_interpreter_pal::BroadcastFloorDiv4DSlow( + RuntimeShape{2, shape}, input1.data(), RuntimeShape{2, shape_broadcast}, input2.data(), + RuntimeShape{2, shape}, const_cast(output.data())); + + EXPECT_THAT(output, kernels::testing::FloatArrayNear( + test_data_kernel.get_output_data_by_index(0), 0.0001f)); + } +} + +} // namespace +} // namespace luci_interpreter diff --git a/onert-micro/luci-interpreter/src/kernels/FloorMod.cpp b/onert-micro/luci-interpreter/src/kernels/FloorMod.cpp index 1b81a9173ba..537744acfbe 100644 --- 
a/onert-micro/luci-interpreter/src/kernels/FloorMod.cpp +++ b/onert-micro/luci-interpreter/src/kernels/FloorMod.cpp @@ -29,10 +29,7 @@ void configure_kernel_CircleFloorMod(const circle::Operator *cur_op, { kernels::TISOKernel kernel(cur_op, runtime_graph); - LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == - Tensor::element_type(kernel.input2())); - LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == - Tensor::element_type(kernel.output())); + CheckBinaryOpDataTypesEqual(kernel); } void execute_kernel_CircleFloorMod(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph) @@ -67,21 +64,7 @@ void execute_kernel_CircleFloorMod(const circle::Operator *cur_op, BaseRuntimeGr LUCI_INTERPRETER_CHECK(kernels::getTensorData(input_data2)[i] != 0); } // check that input and output dimensions are equal - auto AreShapesEqual = [](const luci_interpreter::RuntimeShape &input_shape1, - const luci_interpreter::RuntimeShape &input_shape2) -> bool { - if (input_shape1.dimensionsCount() == input_shape2.dimensionsCount()) - { - int N = input_shape1.dimensionsCount(); - for (int i = 0; i < N; ++i) - { - if (input_shape1.dims(i) != input_shape2.dims(i)) - return false; - } - return true; - } - return false; - }; - if (AreShapesEqual(input_shape1, input_shape2)) + if (kernels::areShapesEqual(input_shape1, input_shape2)) { const int flat_size = input_shape1.flatSize(); luci_interpreter_pal::FloorMod(flat_size, kernels::getTensorData(input_data1), diff --git a/onert-micro/luci-interpreter/src/kernels/FloorMod.test.cpp b/onert-micro/luci-interpreter/src/kernels/FloorMod.test.cpp index b962db1829f..eaad79d9620 100644 --- a/onert-micro/luci-interpreter/src/kernels/FloorMod.test.cpp +++ b/onert-micro/luci-interpreter/src/kernels/FloorMod.test.cpp @@ -119,3 +119,69 @@ TEST_F(FloorModTest, Wrong_Input2_Type_NEG) } // namespace } // namespace luci_interpreter + +#include "PALFloorMod.h" + +#include +#include + +namespace luci_interpreter +{ +namespace +{ + +class 
PALFloorModTest : public ::testing::Test +{ + // Do nothing +}; + +TEST_F(PALFloorModTest, Float_P) +{ + // No broadcast + { + const bool is_with_broadcast = false; + test_kernel::TestDataFloatFloorMod test_data_kernel(is_with_broadcast); + + const auto &input1 = test_data_kernel.get_input_data_by_index(0); + const auto &input2 = test_data_kernel.get_input_data_by_index(1); + + const auto num_elements = input1.size(); + EXPECT_EQ(num_elements, input2.size()); + + std::vector output = std::vector(num_elements); + luci_interpreter_pal::FloorMod(num_elements, input1.data(), input2.data(), + const_cast(output.data())); + + EXPECT_THAT(output, kernels::testing::FloatArrayNear( + test_data_kernel.get_output_data_by_index(0), 0.0001f)); + } + + // With broadcast + { + const bool is_with_broadcast = true; + test_kernel::TestDataFloatFloorMod test_data_kernel(is_with_broadcast); + + const auto &input1 = test_data_kernel.get_input_data_by_index(0); + const auto &input2 = test_data_kernel.get_input_data_by_index(1); + + const int32_t shape[2] = {2, 5}; + const int32_t shape_broadcast[2] = {2, 1}; + + assert(input1.size() == + std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies())); + assert(input2.size() == std::accumulate(std::begin(shape_broadcast), std::end(shape_broadcast), + 1, std::multiplies())); + + std::vector output = std::vector( + std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies())); + luci_interpreter_pal::BroadcastFloorMod4DSlow( + RuntimeShape{2, shape}, input1.data(), RuntimeShape{2, shape_broadcast}, input2.data(), + RuntimeShape{2, shape}, const_cast(output.data())); + + EXPECT_THAT(output, kernels::testing::FloatArrayNear( + test_data_kernel.get_output_data_by_index(0), 0.0001f)); + } +} + +} // namespace +} // namespace luci_interpreter diff --git a/onert-micro/luci-interpreter/src/kernels/Maximum.cpp b/onert-micro/luci-interpreter/src/kernels/Maximum.cpp index 87c8d04428f..11edb3d9206 100644 --- 
a/onert-micro/luci-interpreter/src/kernels/Maximum.cpp +++ b/onert-micro/luci-interpreter/src/kernels/Maximum.cpp @@ -28,10 +28,7 @@ void configure_kernel_CircleMaximum(const circle::Operator *cur_op, BaseRuntimeG { kernels::TISOKernel kernel(cur_op, runtime_graph); - LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == - Tensor::element_type(kernel.input2())); - LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == - Tensor::element_type(kernel.output())); + kernels::CheckBinaryOpDataTypesEqual(kernel); } void execute_kernel_CircleMaximum(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph) @@ -58,21 +55,7 @@ void execute_kernel_CircleMaximum(const circle::Operator *cur_op, BaseRuntimeGra case DataType::FLOAT32: { // check that input and output dimensions are equal - auto AreShapesEqual = [](const luci_interpreter::RuntimeShape &input_shape1, - const luci_interpreter::RuntimeShape &input_shape2) -> bool { - if (input_shape1.dimensionsCount() == input_shape2.dimensionsCount()) - { - int N = input_shape1.dimensionsCount(); - for (int i = 0; i < N; ++i) - { - if (input_shape1.dims(i) != input_shape2.dims(i)) - return false; - } - return true; - } - return false; - }; - if (AreShapesEqual(input_shape1, input_shape2)) + if (kernels::areShapesEqual(input_shape1, input_shape2)) { const int flat_size = input_shape1.flatSize(); luci_interpreter_pal::Maximum(flat_size, kernels::getTensorData(input_data1), diff --git a/onert-micro/luci-interpreter/src/kernels/Maximum.test.cpp b/onert-micro/luci-interpreter/src/kernels/Maximum.test.cpp index b9f1e068a9e..2316f143c1a 100644 --- a/onert-micro/luci-interpreter/src/kernels/Maximum.test.cpp +++ b/onert-micro/luci-interpreter/src/kernels/Maximum.test.cpp @@ -119,3 +119,69 @@ TEST_F(MaximumTest, Wrong_Input2_Type_NEG) } // namespace } // namespace luci_interpreter + +#include "PALMaximum.h" + +#include +#include + +namespace luci_interpreter +{ +namespace +{ + +class PALMaximumTest : public 
::testing::Test +{ + // Do nothing +}; + +TEST_F(PALMaximumTest, Float_P) +{ + // No broadcast + { + const bool is_with_broadcast = false; + test_kernel::TestDataFloatMaximum test_data_kernel(is_with_broadcast); + + const auto &input1 = test_data_kernel.get_input_data_by_index(0); + const auto &input2 = test_data_kernel.get_input_data_by_index(1); + + const auto num_elements = input1.size(); + EXPECT_EQ(num_elements, input2.size()); + + std::vector output = std::vector(num_elements); + luci_interpreter_pal::Maximum(num_elements, input1.data(), input2.data(), + const_cast(output.data())); + + EXPECT_THAT(output, kernels::testing::FloatArrayNear( + test_data_kernel.get_output_data_by_index(0), 0.0001f)); + } + + // With broadcast + { + const bool is_with_broadcast = true; + test_kernel::TestDataFloatMaximum test_data_kernel(is_with_broadcast); + + const auto &input1 = test_data_kernel.get_input_data_by_index(0); + const auto &input2 = test_data_kernel.get_input_data_by_index(1); + + const int32_t shape[2] = {2, 5}; + const int32_t shape_broadcast[2] = {2, 1}; + + assert(input1.size() == + std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies())); + assert(input2.size() == std::accumulate(std::begin(shape_broadcast), std::end(shape_broadcast), + 1, std::multiplies())); + + std::vector output = std::vector( + std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies())); + luci_interpreter_pal::BroadcastMaximum4DSlow( + RuntimeShape{2, shape}, input1.data(), RuntimeShape{2, shape_broadcast}, input2.data(), + RuntimeShape{2, shape}, const_cast(output.data())); + + EXPECT_THAT(output, kernels::testing::FloatArrayNear( + test_data_kernel.get_output_data_by_index(0), 0.0001f)); + } +} + +} // namespace +} // namespace luci_interpreter diff --git a/onert-micro/luci-interpreter/src/kernels/Minimum.cpp b/onert-micro/luci-interpreter/src/kernels/Minimum.cpp index 0949b8990dc..9ed4841c684 100644 --- 
a/onert-micro/luci-interpreter/src/kernels/Minimum.cpp +++ b/onert-micro/luci-interpreter/src/kernels/Minimum.cpp @@ -28,10 +28,7 @@ void configure_kernel_CircleMinimum(const circle::Operator *cur_op, BaseRuntimeG { kernels::TISOKernel kernel(cur_op, runtime_graph); - LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == - Tensor::element_type(kernel.input2())); - LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) == - Tensor::element_type(kernel.output())); + kernels::CheckBinaryOpDataTypesEqual(kernel); } void execute_kernel_CircleMinimum(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph) @@ -58,21 +55,7 @@ void execute_kernel_CircleMinimum(const circle::Operator *cur_op, BaseRuntimeGra case DataType::FLOAT32: { // check that input and output dimensions are equal - auto AreShapesEqual = [](const luci_interpreter::RuntimeShape &input_shape1, - const luci_interpreter::RuntimeShape &input_shape2) -> bool { - if (input_shape1.dimensionsCount() == input_shape2.dimensionsCount()) - { - int N = input_shape1.dimensionsCount(); - for (int i = 0; i < N; ++i) - { - if (input_shape1.dims(i) != input_shape2.dims(i)) - return false; - } - return true; - } - return false; - }; - if (AreShapesEqual(input_shape1, input_shape2)) + if (kernels::areShapesEqual(input_shape1, input_shape2)) { const int flat_size = input_shape1.flatSize(); luci_interpreter_pal::Minimum(flat_size, kernels::getTensorData(input_data1), diff --git a/onert-micro/luci-interpreter/src/kernels/Minimum.test.cpp b/onert-micro/luci-interpreter/src/kernels/Minimum.test.cpp index 5775ce80a48..5f6d304b34a 100644 --- a/onert-micro/luci-interpreter/src/kernels/Minimum.test.cpp +++ b/onert-micro/luci-interpreter/src/kernels/Minimum.test.cpp @@ -119,3 +119,69 @@ TEST_F(MinimumTest, Wrong_Input2_Type_NEG) } // namespace } // namespace luci_interpreter + +#include "PALMinimum.h" + +#include +#include + +namespace luci_interpreter +{ +namespace +{ + +class PALMinimumTest : public 
::testing::Test +{ + // Do nothing +}; + +TEST_F(PALMinimumTest, Float_P) +{ + // No broadcast + { + const bool is_with_broadcast = false; + test_kernel::TestDataFloatMinimum test_data_kernel(is_with_broadcast); + + const auto &input1 = test_data_kernel.get_input_data_by_index(0); + const auto &input2 = test_data_kernel.get_input_data_by_index(1); + + const auto num_elements = input1.size(); + EXPECT_EQ(num_elements, input2.size()); + + std::vector output = std::vector(num_elements); + luci_interpreter_pal::Minimum(num_elements, input1.data(), input2.data(), + const_cast(output.data())); + + EXPECT_THAT(output, kernels::testing::FloatArrayNear( + test_data_kernel.get_output_data_by_index(0), 0.0001f)); + } + + // With broadcast + { + const bool is_with_broadcast = true; + test_kernel::TestDataFloatMinimum test_data_kernel(is_with_broadcast); + + const auto &input1 = test_data_kernel.get_input_data_by_index(0); + const auto &input2 = test_data_kernel.get_input_data_by_index(1); + + const int32_t shape[2] = {2, 5}; + const int32_t shape_broadcast[2] = {2, 1}; + + assert(input1.size() == + std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies())); + assert(input2.size() == std::accumulate(std::begin(shape_broadcast), std::end(shape_broadcast), + 1, std::multiplies())); + + std::vector output = std::vector( + std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies())); + luci_interpreter_pal::BroadcastMinimum4DSlow( + RuntimeShape{2, shape}, input1.data(), RuntimeShape{2, shape_broadcast}, input2.data(), + RuntimeShape{2, shape}, const_cast(output.data())); + + EXPECT_THAT(output, kernels::testing::FloatArrayNear( + test_data_kernel.get_output_data_by_index(0), 0.0001f)); + } +} + +} // namespace +} // namespace luci_interpreter diff --git a/onert-micro/luci-interpreter/src/kernels/Utils.cpp b/onert-micro/luci-interpreter/src/kernels/Utils.cpp index 35ab821809c..6085071e720 100644 --- a/onert-micro/luci-interpreter/src/kernels/Utils.cpp 
+++ b/onert-micro/luci-interpreter/src/kernels/Utils.cpp @@ -86,6 +86,22 @@ void matrixScalarMultiplyAccumulate(const int8_t *matrix, int32_t scalar, int32_ } } +bool areShapesEqual(const luci_interpreter::RuntimeShape &input_shape1, + const luci_interpreter::RuntimeShape &input_shape2) +{ + if (input_shape1.dimensionsCount() == input_shape2.dimensionsCount()) + { + int N = input_shape1.dimensionsCount(); + for (int i = 0; i < N; ++i) + { + if (input_shape1.dims(i) != input_shape2.dims(i)) + return false; + } + return true; + } + return false; +} + template void calculateActivationRange(Activation activation, float *activation_min, float *activation_max); template void calculateActivationRange(Activation activation, int32_t *activation_min, diff --git a/onert-micro/luci-interpreter/src/kernels/Utils.h b/onert-micro/luci-interpreter/src/kernels/Utils.h index a01d72dfafd..ce5f763d2ee 100644 --- a/onert-micro/luci-interpreter/src/kernels/Utils.h +++ b/onert-micro/luci-interpreter/src/kernels/Utils.h @@ -228,6 +228,10 @@ template constexpr bool one_of_types void matrixScalarMultiplyAccumulate(const int8_t *matrix, int32_t scalar, int32_t n_row, int32_t n_col, int32_t *output); +// Checks whether the two shapes have the same rank and the same extent in every dimension +bool areShapesEqual(const luci_interpreter::RuntimeShape &input_shape1, + const luci_interpreter::RuntimeShape &input_shape2); + #ifndef DIS_QUANT bool checkedLog2(const float x, int *log2_result);