[onert-micro] Reduce duplicate code in binary kernels
This commit reduces duplicate code in the binary kernels.
  - Introduce PALBinaryOpCommon.h, which holds the functions common to binary kernels.
    - Introduce binary function objects.
    - Introduce `BinaryOp()`, which unifies the binary kernels without broadcast.
    - Introduce `BroadcastBinaryOp4DSlow()`, which unifies the binary kernels with broadcast.
  - Apply the common functions to each binary kernel (see the sketch after the file summary below).

ONE-DCO-1.0-Signed-off-by: ragmani <[email protected]>
ragmani committed Jan 12, 2024
1 parent d0cff18 commit 9ada010
Showing 16 changed files with 434 additions and 238 deletions.
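The core of the change is a single function-object-plus-template pattern. Here is a minimal, self-contained sketch of that pattern in plain C++ (the buffers and sizes below are illustrative, not taken from the commit):

#include <algorithm>
#include <cstdio>

// Function object supplying the per-element operation, mirroring the
// MaximumFn/MinimumFn objects introduced by this commit.
template <typename T> struct MaximumFn
{
  T operator()(T lhs, T rhs) { return std::max(lhs, rhs); }
};

// Generic element-wise loop, mirroring BinaryOp(): one loop shared by all
// non-broadcast binary kernels, parameterized on the function object.
template <typename T, typename Fn>
void BinaryOp(int flat_size, const T *in1, const T *in2, T *out)
{
  Fn func;
  for (int i = 0; i < flat_size; ++i)
    out[i] = func(in1[i], in2[i]);
}

int main()
{
  const float a[4] = {1.f, -2.f, 3.f, -4.f};
  const float b[4] = {0.f, 0.f, 5.f, -5.f};
  float out[4];
  BinaryOp<float, MaximumFn<float>>(4, a, b, out);
  for (float v : out)
    std::printf("%g ", v); // prints: 1 0 5 -4
  return 0;
}

Each concrete kernel then shrinks to a one-line call naming its function object, as the per-file diffs below show.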
117 changes: 117 additions & 0 deletions onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h
@@ -0,0 +1,117 @@
/*
* Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H
#define LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"

namespace luci_interpreter_pal
{

template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
struct FloorDivFn
{
  T operator()(T lhs, T rhs)
  {
    return std::floor(static_cast<double>(lhs) / static_cast<double>(rhs));
  }
};
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
struct FloorModFn
{
  T operator()(T lhs, T rhs)
  {
    T trunc_mod = std::fmod(lhs, rhs);
    return (trunc_mod != 0) && ((rhs < 0) != (trunc_mod < 0)) ? (trunc_mod + rhs) : trunc_mod;
  }
};
template <typename T> struct MaximumFn
{
  T operator()(T lhs, T rhs) { return std::max(lhs, rhs); }
};
template <typename T> struct MinimumFn
{
  T operator()(T lhs, T rhs) { return std::min(lhs, rhs); }
};

// TODO: check if there is a real activation value
template <typename T, typename Fn>
inline void BinaryOp(const int flat_size, const T *input1_data, const T *input2_data,
                     T *output_data)
{
  Fn func;
  for (int i = 0; i < flat_size; ++i)
  {
    output_data[i] = func(input1_data[i], input2_data[i]);
  }
}

template <typename T, typename Fn>
inline void BroadcastBinaryOp4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
                                    const T *input1_data,
                                    const luci_interpreter::RuntimeShape &input2_shape,
                                    const T *input2_data,
                                    const luci_interpreter::RuntimeShape &output_shape,
                                    T *output_data)
{
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

  const luci_interpreter::RuntimeShape extended_output_shape =
    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), and with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed: the
  // first dimension has the smallest stride.
  //
  // We name our variables by the TensorFlow convention, but nest the loops so
  // that the innermost loop has the smallest stride, for the best cache
  // behavior.

  Fn func;
  for (int b = 0; b < extended_output_shape.dims(0); ++b)
  {
    for (int y = 0; y < extended_output_shape.dims(1); ++y)
    {
      for (int x = 0; x < extended_output_shape.dims(2); ++x)
      {
        for (int c = 0; c < extended_output_shape.dims(3); ++c)
        {
          const int output_data_offset =
            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
              extended_output_shape.dims(3) +
            c;

          output_data[output_data_offset] =
            func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
                 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
        }
      }
    }
  }
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H
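FloorModFn computes a floored modulo (the remainder takes the sign of the divisor) on top of std::fmod, which truncates toward zero: when the truncated remainder is nonzero and its sign disagrees with the divisor's, one divisor is added. A small standalone check of that adjustment (the values are illustrative):

#include <cmath>
#include <cstdio>

// Floored modulo built from truncated modulo, as in FloorModFn: when the
// truncated remainder is nonzero and its sign differs from the divisor's,
// shift it by one divisor.
static float floor_mod(float lhs, float rhs)
{
  float trunc_mod = std::fmod(lhs, rhs);
  return (trunc_mod != 0) && ((rhs < 0) != (trunc_mod < 0)) ? (trunc_mod + rhs) : trunc_mod;
}

int main()
{
  std::printf("fmod(-7, 3)      = %g\n", std::fmod(-7.f, 3.f)); // -1 (truncated)
  std::printf("floor_mod(-7, 3) = %g\n", floor_mod(-7.f, 3.f)); //  2 (floored)
  std::printf("floor_mod(7, -3) = %g\n", floor_mod(7.f, -3.f)); // -2 (floored)
  return 0;
}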
54 changes: 4 additions & 50 deletions onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h
@@ -18,18 +18,14 @@
#ifndef LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H
#define LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "PALBinaryOpCommon.h"

namespace luci_interpreter_pal
{
inline void FloorDiv(const int flat_size, const float *input1_data, const float *input2_data,
float *output_data)
{
for (int i = 0; i < flat_size; ++i)
output_data[i] =
std::floor(static_cast<double>(input1_data[i]) / static_cast<double>(input2_data[i]));
BinaryOp<float, FloorDivFn<float>>(flat_size, input1_data, input2_data, output_data);
}

inline void
@@ -40,50 +36,8 @@ BroadcastFloorDiv4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
const float *input2_data,
const luci_interpreter::RuntimeShape &output_shape, float *output_data)
{
const int flat_size = input1_shape.flatSize();

NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

auto FloorDivFunc = [](float x, float y) -> float {
return std::floor(static_cast<double>(x) / static_cast<double>(y));
};
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.

for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
FloorDivFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)],
input2_data[subscriptToIndex(desc2, b, y, x, c)]);
}
}
}
}
BroadcastBinaryOp4DSlow<float, FloorDivFn<float>>(input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
}

} // namespace luci_interpreter_pal
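For reference, FloorDivFn differs from a plain cast-and-truncate division only for negative, non-integral quotients: std::floor rounds toward negative infinity rather than toward zero. A quick standalone illustration (values are illustrative):

#include <cmath>
#include <cstdio>

int main()
{
  // Casting the quotient to int would round toward zero (-7/2 -> -3).
  // FloorDivFn instead floors the quotient toward negative infinity,
  // matching the FloorDiv kernel's intended semantics.
  float lhs = -7.f, rhs = 2.f;
  std::printf("lhs / rhs        = %g\n", lhs / rhs); // -3.5
  std::printf("floor(lhs / rhs) = %g\n",
              std::floor(static_cast<double>(lhs) / static_cast<double>(rhs))); // -4
  return 0;
}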
59 changes: 4 additions & 55 deletions onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h
@@ -18,22 +18,14 @@
#ifndef LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H
#define LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "PALBinaryOpCommon.h"

namespace luci_interpreter_pal
{
inline void FloorMod(const int flat_size, const float *input1_data, const float *input2_data,
float *output_data)
{
for (int i = 0; i < flat_size; ++i)
{
float trunc_mod = std::fmod(input1_data[i], input2_data[i]);
output_data[i] = (trunc_mod != 0) && ((input2_data[i] < 0) != (trunc_mod < 0))
? (trunc_mod + input2_data[i])
: trunc_mod;
}
BinaryOp<float, FloorModFn<float>>(flat_size, input1_data, input2_data, output_data);
}

inline void
@@ -44,51 +36,8 @@ BroadcastFloorMod4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
const float *input2_data,
const luci_interpreter::RuntimeShape &output_shape, float *output_data)
{
const int flat_size = input1_shape.flatSize();

NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto FloorModFunc = [](float x, float y) -> float {
float trunc_mod = std::fmod(x, y);
return (trunc_mod != 0) && ((y < 0) != (trunc_mod < 0)) ? (trunc_mod + y) : trunc_mod;
};

for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
FloorModFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)],
input2_data[subscriptToIndex(desc2, b, y, x, c)]);
}
}
}
}
BroadcastBinaryOp4DSlow<float, FloorModFn<float>>(input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
}

} // namespace luci_interpreter_pal
50 changes: 4 additions & 46 deletions onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h
@@ -18,65 +18,23 @@
#ifndef LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H
#define LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "PALBinaryOpCommon.h"

namespace luci_interpreter_pal
{
inline void Maximum(const int flat_size, const float *input1_data, const float *input2_data,
float *output_data)
{
for (int i = 0; i < flat_size; ++i)
{
output_data[i] = std::max(input1_data[i], input2_data[i]);
}
BinaryOp<float, MaximumFn<float>>(flat_size, input1_data, input2_data, output_data);
}

inline void
BroadcastMaximum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data,
const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data,
const luci_interpreter::RuntimeShape &output_shape, float *output_data)
{
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.

for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)],
input2_data[subscriptToIndex(desc2, b, y, x, c)]);
}
}
}
}
BroadcastBinaryOp4DSlow<float, MaximumFn<float>>(input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
}

} // namespace luci_interpreter_pal
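The output_data_offset expression in BroadcastBinaryOp4DSlow is the standard row-major flattening of a 4-D (b, y, x, c) index, so the innermost channel loop touches adjacent elements. A minimal sketch of that flattening, with made-up extents:

#include <cstdio>

// Flatten a 4-D (b, y, x, c) index for an NHWC tensor of extents
// (B, H, W, C), exactly as the output_data_offset expression does.
static int flatten4d(int b, int y, int x, int c, int H, int W, int C)
{
  return ((b * H + y) * W + x) * C + c;
}

int main()
{
  const int B = 2, H = 2, W = 3, C = 4;
  std::printf("%d %d %d %d\n",
              flatten4d(0, 0, 0, 0, H, W, C), // 0
              flatten4d(0, 0, 0, 1, H, W, C), // 1: next channel, stride 1
              flatten4d(1, 1, 2, 3, H, W, C), // 47: last element
              B * H * W * C - 1);             // 47: total size - 1
  return 0;
}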
15 changes: 4 additions & 11 deletions onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
@@ -18,20 +18,14 @@
#ifndef LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H
#define LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "Broadcast.h"
#include "PALBinaryOpCommon.h"

namespace luci_interpreter_pal
{
inline void Minimum(const int flat_size, const float *input1_data, const float *input2_data,
float *output_data)
{
for (int i = 0; i < flat_size; ++i)
{
output_data[i] = std::min(input1_data[i], input2_data[i]);
}
BinaryOp<float, MinimumFn<float>>(flat_size, input1_data, input2_data, output_data);
}

template <typename T>
@@ -40,9 +34,8 @@ BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const
const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
const luci_interpreter::RuntimeShape &output_shape, T *output_data)
{
auto func = [](const T &a, const T &b) -> const T & { return std::min(a, b); };
BroadcastTISO4DSlow<float>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
output_data, func);
BroadcastBinaryOp4DSlow<T, MinimumFn<T>>(input1_shape, input1_data, input2_shape, input2_data,
                                         output_shape, output_data);
}
} // namespace luci_interpreter_pal

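Note that MaximumFn and MinimumFn, unlike the floor functions, carry no floating-point constraint, so the shared helpers could also serve integer tensors. A sketch of such an instantiation (the int32 buffers are hypothetical, not part of this commit):

#include <algorithm>
#include <cstdint>
#include <cstdio>

template <typename T> struct MinimumFn
{
  T operator()(T lhs, T rhs) { return std::min(lhs, rhs); }
};

// Same generic loop as BinaryOp(), restated here so the sketch compiles alone.
template <typename T, typename Fn>
inline void BinaryOp(const int flat_size, const T *in1, const T *in2, T *out)
{
  Fn func;
  for (int i = 0; i < flat_size; ++i)
    out[i] = func(in1[i], in2[i]);
}

int main()
{
  const int32_t a[3] = {5, -2, 7};
  const int32_t b[3] = {3, -4, 9};
  int32_t out[3];
  BinaryOp<int32_t, MinimumFn<int32_t>>(3, a, b, out);
  std::printf("%d %d %d\n", (int)out[0], (int)out[1], (int)out[2]); // 3 -4 7
  return 0;
}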
9 changes: 9 additions & 0 deletions onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h
@@ -19,6 +19,7 @@
#define LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H

#include "TISOKernel.h"
#include "PALComparisons.h"
#include "ProcessBroadcastShapes.h"

#include "Utils.h"
@@ -112,6 +113,14 @@ void evalTISOInplaceKernel(TISOFunc tiso_func, TISOBroadcastFunc tiso_broadcast_
}
}

inline void CheckBinaryOpDataTypesEqual(const kernels::TISOKernel &kernel)
{
LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
Tensor::element_type(kernel.input2()));
LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
Tensor::element_type(kernel.output()));
}

#ifndef DIS_QUANT
template <typename T, typename TISOFunc = nullptr_t, typename TISOBroadcastFunc = nullptr_t,
typename Options = nullptr_t>
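CheckBinaryOpDataTypesEqual centralizes the dtype guard that each binary kernel's configure step repeats. A rough standalone sketch of the invariant it enforces (FakeTensor and assert are stand-ins; the real code uses luci_interpreter::Tensor and LUCI_INTERPRETER_CHECK):

#include <cassert>

// Minimal stand-ins for the interpreter types, only to show the shape of
// the check.
enum class DataType { FLOAT32, INT32 };

struct FakeTensor
{
  DataType dtype;
};

// Mirrors CheckBinaryOpDataTypesEqual: both inputs and the output of a
// binary kernel must share one element type.
void checkBinaryOpDataTypesEqual(const FakeTensor &in1, const FakeTensor &in2,
                                 const FakeTensor &out)
{
  assert(in1.dtype == in2.dtype);
  assert(in1.dtype == out.dtype);
}

int main()
{
  FakeTensor a{DataType::FLOAT32}, b{DataType::FLOAT32}, y{DataType::FLOAT32};
  checkBinaryOpDataTypesEqual(a, b, y); // passes: all FLOAT32
  return 0;
}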