[onert-micro] Reduce duplicate code in binary kernels #12459

Merged
117 changes: 117 additions & 0 deletions onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h
@@ -0,0 +1,117 @@
/*
* Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H
#define LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"

namespace luci_interpreter_pal
{

template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
struct FloorDivFn
{
  T operator()(T lhs, T rhs)
  {
    return std::floor(static_cast<double>(lhs) / static_cast<double>(rhs));
  }
};
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
struct FloorModFn
{
  T operator()(T lhs, T rhs)
  {
    T trunc_mod = std::fmod(lhs, rhs);
    return (trunc_mod != 0) && ((rhs < 0) != (trunc_mod < 0)) ? (trunc_mod + rhs) : trunc_mod;
  }
};
template <typename T> struct MaximumFn
{
  T operator()(T lhs, T rhs) { return std::max(lhs, rhs); }
};
template <typename T> struct MinimumFn
{
  T operator()(T lhs, T rhs) { return std::min(lhs, rhs); }
};

// TODO: check whether a real activation value needs to be applied here
template <typename T, typename Fn>
inline void BinaryOp(const int flat_size, const T *input1_data, const T *input2_data,
                     T *output_data)
{
  Fn func;
  for (int i = 0; i < flat_size; ++i)
  {
    output_data[i] = func(input1_data[i], input2_data[i]);
  }
}

template <typename T, typename Fn>
inline void BroadcastBinaryOp4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
                                    const T *input1_data,
                                    const luci_interpreter::RuntimeShape &input2_shape,
                                    const T *input2_data,
                                    const luci_interpreter::RuntimeShape &output_shape,
                                    T *output_data)
{
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

  const luci_interpreter::RuntimeShape extended_output_shape =
    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.

  Fn func;
  for (int b = 0; b < extended_output_shape.dims(0); ++b)
  {
    for (int y = 0; y < extended_output_shape.dims(1); ++y)
    {
      for (int x = 0; x < extended_output_shape.dims(2); ++x)
      {
        for (int c = 0; c < extended_output_shape.dims(3); ++c)
        {
          const int output_data_offset =
            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
              extended_output_shape.dims(3) +
            c;

          output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
                                                 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
        }
      }
    }
  }
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H
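
Note: the pattern this new header factors out is a small stateless functor type per operation (FloorDivFn, FloorModFn, MaximumFn, MinimumFn) passed as a template parameter, with BinaryOp and BroadcastBinaryOp4DSlow driving the loops. A minimal self-contained sketch of that dispatch, using stand-in names (ElementwiseBinaryOp, MaxFn) rather than the header's own:

#include <algorithm>
#include <cstdio>

// Stand-in for MaximumFn<T>: a stateless functor selected at compile time.
template <typename T> struct MaxFn
{
  T operator()(T lhs, T rhs) { return std::max(lhs, rhs); }
};

// Stand-in for BinaryOp<T, Fn>: one flat pass, no per-element branching on the op.
template <typename T, typename Fn>
void ElementwiseBinaryOp(int flat_size, const T *a, const T *b, T *out)
{
  Fn func;
  for (int i = 0; i < flat_size; ++i)
    out[i] = func(a[i], b[i]);
}

int main()
{
  const float a[4] = {1.f, -2.f, 3.f, 0.5f};
  const float b[4] = {0.f, 4.f, 2.f, 0.75f};
  float out[4];
  ElementwiseBinaryOp<float, MaxFn<float>>(4, a, b, out);
  for (float v : out)
    std::printf("%g ", v); // prints: 1 4 3 0.75
  std::printf("\n");
  return 0;
}

Because the operation is a template parameter rather than a function pointer, the call typically inlines, so the shared helper should cost nothing over the per-kernel loops it replaces.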
54 changes: 4 additions & 50 deletions onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h
@@ -18,18 +18,14 @@
 #ifndef LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H
 #define LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H
 
-#include "Params.h"
-#include "PALUtils.h"
-#include "ProcessBroadcastShapes.h"
+#include "PALBinaryOpCommon.h"
 
 namespace luci_interpreter_pal
 {
 inline void FloorDiv(const int flat_size, const float *input1_data, const float *input2_data,
                      float *output_data)
 {
-  for (int i = 0; i < flat_size; ++i)
-    output_data[i] =
-      std::floor(static_cast<double>(input1_data[i]) / static_cast<double>(input2_data[i]));
+  BinaryOp<float, FloorDivFn<float>>(flat_size, input1_data, input2_data, output_data);
 }
 
 inline void
@@ -40,50 +36,8 @@ BroadcastFloorDiv4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
                         const float *input2_data,
                         const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 {
-  const int flat_size = input1_shape.flatSize();
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
-
-  const luci_interpreter::RuntimeShape extended_output_shape =
-    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
-
-  auto FloorDivFunc = [](float x, float y) -> float {
-    return std::floor(static_cast<double>(x) / static_cast<double>(y));
-  };
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-
-  for (int b = 0; b < extended_output_shape.dims(0); ++b)
-  {
-    for (int y = 0; y < extended_output_shape.dims(1); ++y)
-    {
-      for (int x = 0; x < extended_output_shape.dims(2); ++x)
-      {
-        for (int c = 0; c < extended_output_shape.dims(3); ++c)
-        {
-          const int output_data_offset =
-            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
-              extended_output_shape.dims(3) +
-            c;
-
-          output_data[output_data_offset] =
-            FloorDivFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)],
-                         input2_data[subscriptToIndex(desc2, b, y, x, c)]);
-        }
-      }
-    }
-  }
+  BroadcastBinaryOp4DSlow<float, FloorDivFn<float>>(input1_shape, input1_data, input2_shape,
+                                                    input2_data, output_shape, output_data);
 }
 
 } // namespace luci_interpreter_pal
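
Note: floor division rounds toward negative infinity, unlike C++'s built-in truncating division, which is why FloorDivFn goes through std::floor. A tiny standalone check of the same expression the functor evaluates:

#include <cmath>
#include <cstdio>

int main()
{
  float lhs = -7.f, rhs = 2.f;
  // Same expression as FloorDivFn<float>::operator().
  float floor_div = std::floor(static_cast<double>(lhs) / static_cast<double>(rhs));
  std::printf("floor: %g, trunc: %d\n", floor_div, static_cast<int>(lhs / rhs));
  // prints "floor: -4, trunc: -3" -- the two differ whenever the operands' signs differ.
  return 0;
}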
59 changes: 4 additions & 55 deletions onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h
@@ -18,22 +18,14 @@
 #ifndef LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H
 #define LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H
 
-#include "Params.h"
-#include "PALUtils.h"
-#include "ProcessBroadcastShapes.h"
+#include "PALBinaryOpCommon.h"
 
 namespace luci_interpreter_pal
 {
 inline void FloorMod(const int flat_size, const float *input1_data, const float *input2_data,
                      float *output_data)
 {
-  for (int i = 0; i < flat_size; ++i)
-  {
-    float trunc_mod = std::fmod(input1_data[i], input2_data[i]);
-    output_data[i] = (trunc_mod != 0) && ((input2_data[i] < 0) != (trunc_mod < 0))
-                       ? (trunc_mod + input2_data[i])
-                       : trunc_mod;
-  }
+  BinaryOp<float, FloorModFn<float>>(flat_size, input1_data, input2_data, output_data);
 }
 
 inline void
@@ -44,51 +36,8 @@ BroadcastFloorMod4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
                         const float *input2_data,
                         const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 {
-  const int flat_size = input1_shape.flatSize();
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
-
-  const luci_interpreter::RuntimeShape extended_output_shape =
-    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto FloorModFunc = [](float x, float y) -> float {
-    float trunc_mod = std::fmod(x, y);
-    return (trunc_mod != 0) && ((y < 0) != (trunc_mod < 0)) ? (trunc_mod + y) : trunc_mod;
-  };
-
-  for (int b = 0; b < extended_output_shape.dims(0); ++b)
-  {
-    for (int y = 0; y < extended_output_shape.dims(1); ++y)
-    {
-      for (int x = 0; x < extended_output_shape.dims(2); ++x)
-      {
-        for (int c = 0; c < extended_output_shape.dims(3); ++c)
-        {
-          const int output_data_offset =
-            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
-              extended_output_shape.dims(3) +
-            c;
-
-          output_data[output_data_offset] =
-            FloorModFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)],
-                         input2_data[subscriptToIndex(desc2, b, y, x, c)]);
-        }
-      }
-    }
-  }
+  BroadcastBinaryOp4DSlow<float, FloorModFn<float>>(input1_shape, input1_data, input2_shape,
+                                                    input2_data, output_shape, output_data);
 }
 
 } // namespace luci_interpreter_pal
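
Note: std::fmod keeps the sign of the dividend, so FloorModFn adds rhs back whenever the truncated remainder and the divisor have opposite signs; the result then satisfies lhs == floor(lhs / rhs) * rhs + result. A hand-traced standalone sketch of the same logic:

#include <cmath>
#include <cstdio>

int main()
{
  float lhs = -7.f, rhs = 3.f;
  float trunc_mod = std::fmod(lhs, rhs); // -1: fmod takes the sign of lhs
  bool opposite_signs = (trunc_mod != 0) && ((rhs < 0) != (trunc_mod < 0));
  float floor_mod = opposite_signs ? trunc_mod + rhs : trunc_mod;
  std::printf("%g\n", floor_mod); // prints 2; check: floor(-7/3) * 3 + 2 == -7
  return 0;
}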
50 changes: 4 additions & 46 deletions onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h
@@ -18,65 +18,23 @@
 #ifndef LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H
 #define LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H
 
-#include "Params.h"
-#include "PALUtils.h"
-#include "ProcessBroadcastShapes.h"
+#include "PALBinaryOpCommon.h"
 
 namespace luci_interpreter_pal
 {
 inline void Maximum(const int flat_size, const float *input1_data, const float *input2_data,
                     float *output_data)
 {
-  for (int i = 0; i < flat_size; ++i)
-  {
-    output_data[i] = std::max(input1_data[i], input2_data[i]);
-  }
+  BinaryOp<float, MaximumFn<float>>(flat_size, input1_data, input2_data, output_data);
 }
 
 inline void
 BroadcastMaximum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data,
                        const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data,
                        const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
-
-  const luci_interpreter::RuntimeShape extended_output_shape =
-    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-
-  for (int b = 0; b < extended_output_shape.dims(0); ++b)
-  {
-    for (int y = 0; y < extended_output_shape.dims(1); ++y)
-    {
-      for (int x = 0; x < extended_output_shape.dims(2); ++x)
-      {
-        for (int c = 0; c < extended_output_shape.dims(3); ++c)
-        {
-          const int output_data_offset =
-            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
-              extended_output_shape.dims(3) +
-            c;
-
-          output_data[output_data_offset] =
-            std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)],
-                     input2_data[subscriptToIndex(desc2, b, y, x, c)]);
-        }
-      }
-    }
-  }
+  BroadcastBinaryOp4DSlow<float, MaximumFn<float>>(input1_shape, input1_data, input2_shape,
+                                                   input2_data, output_shape, output_data);
 }
 
 } // namespace luci_interpreter_pal
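Note: for intuition about the broadcast path, NdArrayDescsForElementwiseBroadcast gives any dimension of extent 1 an effective stride of 0, so its single slice is reused across the other operand. A minimal 2-D analogue of what BroadcastBinaryOp4DSlow computes (the real helper first extends both shapes to 4-D):

#include <algorithm>
#include <cstdio>

int main()
{
  // Shape [2, 3] broadcast against shape [1, 3]: the row index of the
  // second operand stays pinned at 0, mirroring a stride-0 dimension.
  const float in1[2][3] = {{1, 5, 2}, {4, 0, 7}};
  const float in2[1][3] = {{3, 3, 3}};
  float out[2][3];
  for (int r = 0; r < 2; ++r)
    for (int c = 0; c < 3; ++c)
      out[r][c] = std::max(in1[r][c], in2[0][c]);
  for (int r = 0; r < 2; ++r)
    std::printf("%g %g %g\n", out[r][0], out[r][1], out[r][2]);
  // prints: 3 5 3 / 4 3 7
  return 0;
}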
15 changes: 4 additions & 11 deletions onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
@@ -18,20 +18,14 @@
 #ifndef LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H
 #define LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H
 
-#include "Params.h"
-#include "PALUtils.h"
-#include "ProcessBroadcastShapes.h"
-#include "Broadcast.h"
+#include "PALBinaryOpCommon.h"
 
 namespace luci_interpreter_pal
 {
 inline void Minimum(const int flat_size, const float *input1_data, const float *input2_data,
                     float *output_data)
 {
-  for (int i = 0; i < flat_size; ++i)
-  {
-    output_data[i] = std::min(input1_data[i], input2_data[i]);
-  }
+  BinaryOp<float, MinimumFn<float>>(flat_size, input1_data, input2_data, output_data);
 }
 
 template <typename T>
@@ -40,9 +34,8 @@ BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const
                        const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
                        const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 {
-  auto func = [](const T &a, const T &b) -> const T & { return std::min(a, b); };
-  BroadcastTISO4DSlow<float>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
-                             output_data, func);
+  BroadcastBinaryOp4DSlow<T, MinimumFn<T>>(input1_shape, input1_data, input2_shape, input2_data,
+                                           output_shape, output_data);
 }
 } // namespace luci_interpreter_pal
 
9 changes: 9 additions & 0 deletions onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h
@@ -19,6 +19,7 @@
 #define LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H
 
 #include "TISOKernel.h"
+#include "PALComparisons.h"
 #include "Params.h"
 #include "ProcessBroadcastShapes.h"
 #include "Utils.h"
@@ -112,6 +113,14 @@ void evalTISOInplaceKernel(TISOFunc tiso_func, TISOBroadcastFunc tiso_broadcast_
   }
 }
 
+inline void CheckBinaryOpDataTypesEqual(const kernels::TISOKernel &kernel)
+{
+  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
+                         Tensor::element_type(kernel.input2()));
+  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
+                         Tensor::element_type(kernel.output()));
+}
+
 #ifndef DIS_QUANT
 template <typename T, typename TISOFunc = nullptr_t, typename TISOBroadcastFunc = nullptr_t,
           typename Options = nullptr_t>
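
Note: CheckBinaryOpDataTypesEqual folds into one call the two dtype checks that each binary kernel's configure step used to repeat. A sketch of the intended call site; the surrounding kernel boilerplate below is an assumption for illustration, not code from this PR:

// Hypothetical configure step for a binary kernel using the new helper.
void configure_kernel_CircleMaximum(const circle::Operator *cur_op,
                                    BaseRuntimeGraph *runtime_graph)
{
  kernels::TISOKernel kernel(cur_op, runtime_graph);
  // Replaces the two repeated LUCI_INTERPRETER_CHECKs in each kernel.
  CheckBinaryOpDataTypesEqual(kernel);
}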