From 9ada0108e199f35909d63e7331c02539fa6e45f9 Mon Sep 17 00:00:00 2001
From: ragmani <ragmani0216@gmail.com>
Date: Wed, 10 Jan 2024 18:11:44 +0900
Subject: [PATCH] [onert-micro] Reduce duplicate code in binary kernels

This commit reduces duplicate code in binary kernels.
  - Introduce PALBinaryOpCommon.h that has common functions for binary kernels.
    - Introduce binary function objects.
    - Introduce `BinaryOp()` that unifies binary kernels without broadcast.
    - Introduce `BroadcastBinaryOp4DSlow()` that unifies binary kernels with broadcast.
  - Apply common functions for binary kernels.

ONE-DCO-1.0-Signed-off-by: ragmani <ragmani0216@gmail.com>
---
 .../pal/common/PALBinaryOpCommon.h            | 117 ++++++++++++++++++
 .../pal/common/PALFloorDivCommon.h            |  54 +-------
 .../pal/common/PALFloorModCommon.h            |  59 +--------
 .../pal/common/PALMaximumCommon.h             |  50 +-------
 .../pal/common/PALMinimumCommon.h             |  15 +--
 .../src/kernels/BinaryOpCommon.h              |   9 ++
 .../luci-interpreter/src/kernels/FloorDiv.cpp |  21 +---
 .../src/kernels/FloorDiv.test.cpp             |  66 ++++++++++
 .../luci-interpreter/src/kernels/FloorMod.cpp |  21 +---
 .../src/kernels/FloorMod.test.cpp             |  66 ++++++++++
 .../luci-interpreter/src/kernels/Maximum.cpp  |  21 +---
 .../src/kernels/Maximum.test.cpp              |  66 ++++++++++
 .../luci-interpreter/src/kernels/Minimum.cpp  |  21 +---
 .../src/kernels/Minimum.test.cpp              |  66 ++++++++++
 .../luci-interpreter/src/kernels/Utils.cpp    |  16 +++
 .../luci-interpreter/src/kernels/Utils.h      |   4 +
 16 files changed, 434 insertions(+), 238 deletions(-)
 create mode 100644 onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h
diff --git a/onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h b/onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h
new file mode 100644
index 00000000000..615c0984242
--- /dev/null
+++ b/onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H
+#define LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H
+
+#include "Params.h"
+#include "PALUtils.h"
+#include "ProcessBroadcastShapes.h"
+
+namespace luci_interpreter_pal
+{
+
+template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
+struct FloorDivFn // floor(lhs / rhs); the quotient is formed in double precision
+{
+  T operator()(T lhs, T rhs) const
+  {
+    return std::floor(static_cast<double>(lhs) / static_cast<double>(rhs));
+  }
+};
+template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
+struct FloorModFn // floored modulo: a non-zero result takes the sign of rhs
+{
+  T operator()(T lhs, T rhs) const
+  {
+    const T trunc_mod = std::fmod(lhs, rhs); // truncated remainder: carries the sign of lhs
+    return (trunc_mod != 0) && ((rhs < 0) != (trunc_mod < 0)) ? (trunc_mod + rhs) : trunc_mod;
+  }
+};
+template <typename T> struct MaximumFn // larger of the two operands, via std::max
+{
+  T operator()(T lhs, T rhs) const { return std::max(lhs, rhs); }
+};
+template <typename T> struct MinimumFn // smaller of the two operands, via std::min
+{
+  T operator()(T lhs, T rhs) const { return std::min(lhs, rhs); }
+};
+
+// Applies Fn to every input pair. TODO: check if there is a real activation value
+template <typename T, typename Fn>
+inline void BinaryOp(const int flat_size, const T *input1_data, const T *input2_data,
+                     T *output_data)
+{
+  Fn binary_fn; // one default-constructed functor, reused for each element
+  for (int idx = 0; idx < flat_size; ++idx)
+  {
+    output_data[idx] = binary_fn(input1_data[idx], input2_data[idx]);
+  }
+}
+
+// Applies Fn element-wise to two inputs that are broadcast against each other as 4-D arrays.
+template <typename T, typename Fn>
+inline void BroadcastBinaryOp4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
+                                    const T *input1_data,
+                                    const luci_interpreter::RuntimeShape &input2_shape,
+                                    const T *input2_data,
+                                    const luci_interpreter::RuntimeShape &output_shape,
+                                    T *output_data)
+{
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+
+  const luci_interpreter::RuntimeShape extended_output_shape =
+    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
+  Fn func;
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.dims(0); ++b)
+  {
+    for (int y = 0; y < extended_output_shape.dims(1); ++y)
+    {
+      for (int x = 0; x < extended_output_shape.dims(2); ++x)
+      {
+        for (int c = 0; c < extended_output_shape.dims(3); ++c)
+        {
+          const int output_data_offset =
+            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
+              extended_output_shape.dims(3) +
+            c;
+
+          output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
+                                                 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
+        }
+      }
+    }
+  }
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H
diff --git a/onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h b/onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h
index 30ac76050ff..a7ba1a3e9fd 100644
--- a/onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h
+++ b/onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h
@@ -18,18 +18,14 @@
 #ifndef LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H
 #define LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H
 
-#include "Params.h"
-#include "PALUtils.h"
-#include "ProcessBroadcastShapes.h"
+#include "PALBinaryOpCommon.h"
 
 namespace luci_interpreter_pal
 {
 inline void FloorDiv(const int flat_size, const float *input1_data, const float *input2_data,
                      float *output_data)
 {
-  for (int i = 0; i < flat_size; ++i)
-    output_data[i] =
-      std::floor(static_cast<double>(input1_data[i]) / static_cast<double>(input2_data[i]));
+  BinaryOp<float, FloorDivFn<float>>(flat_size, input1_data, input2_data, output_data);
 }
 
 inline void
@@ -40,50 +36,8 @@ BroadcastFloorDiv4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
                         const float *input2_data,
                         const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 {
-  const int flat_size = input1_shape.flatSize();
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
-
-  const luci_interpreter::RuntimeShape extended_output_shape =
-    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
-
-  auto FloorDivFunc = [](float x, float y) -> float {
-    return std::floor(static_cast<double>(x) / static_cast<double>(y));
-  };
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-
-  for (int b = 0; b < extended_output_shape.dims(0); ++b)
-  {
-    for (int y = 0; y < extended_output_shape.dims(1); ++y)
-    {
-      for (int x = 0; x < extended_output_shape.dims(2); ++x)
-      {
-        for (int c = 0; c < extended_output_shape.dims(3); ++c)
-        {
-          const int output_data_offset =
-            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
-              extended_output_shape.dims(3) +
-            c;
-
-          output_data[output_data_offset] =
-            FloorDivFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)],
-                         input2_data[subscriptToIndex(desc2, b, y, x, c)]);
-        }
-      }
-    }
-  }
+  BroadcastBinaryOp4DSlow<float, FloorDivFn<float>>(input1_shape, input1_data, input2_shape,
+                                                    input2_data, output_shape, output_data);
 }
 
 } // namespace luci_interpreter_pal
diff --git a/onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h b/onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h
index e6ea63db233..dfb6725f04e 100644
--- a/onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h
+++ b/onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h
@@ -18,22 +18,14 @@
 #ifndef LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H
 #define LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H
 
-#include "Params.h"
-#include "PALUtils.h"
-#include "ProcessBroadcastShapes.h"
+#include "PALBinaryOpCommon.h"
 
 namespace luci_interpreter_pal
 {
 inline void FloorMod(const int flat_size, const float *input1_data, const float *input2_data,
                      float *output_data)
 {
-  for (int i = 0; i < flat_size; ++i)
-  {
-    float trunc_mod = std::fmod(input1_data[i], input2_data[i]);
-    output_data[i] = (trunc_mod != 0) && ((input2_data[i] < 0) != (trunc_mod < 0))
-                       ? (trunc_mod + input2_data[i])
-                       : trunc_mod;
-  }
+  BinaryOp<float, FloorModFn<float>>(flat_size, input1_data, input2_data, output_data);
 }
 
 inline void
@@ -44,51 +36,8 @@ BroadcastFloorMod4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
                         const float *input2_data,
                         const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 {
-  const int flat_size = input1_shape.flatSize();
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
-
-  const luci_interpreter::RuntimeShape extended_output_shape =
-    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  auto FloorModFunc = [](float x, float y) -> float {
-    float trunc_mod = std::fmod(x, y);
-    return (trunc_mod != 0) && ((y < 0) != (trunc_mod < 0)) ? (trunc_mod + y) : trunc_mod;
-  };
-
-  for (int b = 0; b < extended_output_shape.dims(0); ++b)
-  {
-    for (int y = 0; y < extended_output_shape.dims(1); ++y)
-    {
-      for (int x = 0; x < extended_output_shape.dims(2); ++x)
-      {
-        for (int c = 0; c < extended_output_shape.dims(3); ++c)
-        {
-          const int output_data_offset =
-            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
-              extended_output_shape.dims(3) +
-            c;
-
-          output_data[output_data_offset] =
-            FloorModFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)],
-                         input2_data[subscriptToIndex(desc2, b, y, x, c)]);
-        }
-      }
-    }
-  }
+  BroadcastBinaryOp4DSlow<float, FloorModFn<float>>(input1_shape, input1_data, input2_shape,
+                                                    input2_data, output_shape, output_data);
 }
 
 } // namespace luci_interpreter_pal
diff --git a/onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h b/onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h
index 106240eef92..9bf6a689ead 100644
--- a/onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h
+++ b/onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h
@@ -18,19 +18,14 @@
 #ifndef LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H
 #define LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H
 
-#include "Params.h"
-#include "PALUtils.h"
-#include "ProcessBroadcastShapes.h"
+#include "PALBinaryOpCommon.h"
 
 namespace luci_interpreter_pal
 {
 inline void Maximum(const int flat_size, const float *input1_data, const float *input2_data,
                     float *output_data)
 {
-  for (int i = 0; i < flat_size; ++i)
-  {
-    output_data[i] = std::max(input1_data[i], input2_data[i]);
-  }
+  BinaryOp<float, MaximumFn<float>>(flat_size, input1_data, input2_data, output_data);
 }
 
 inline void
@@ -38,45 +33,8 @@ BroadcastMaximum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const
                        const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data,
                        const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
-
-  const luci_interpreter::RuntimeShape extended_output_shape =
-    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-
-  for (int b = 0; b < extended_output_shape.dims(0); ++b)
-  {
-    for (int y = 0; y < extended_output_shape.dims(1); ++y)
-    {
-      for (int x = 0; x < extended_output_shape.dims(2); ++x)
-      {
-        for (int c = 0; c < extended_output_shape.dims(3); ++c)
-        {
-          const int output_data_offset =
-            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
-              extended_output_shape.dims(3) +
-            c;
-
-          output_data[output_data_offset] =
-            std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)],
-                     input2_data[subscriptToIndex(desc2, b, y, x, c)]);
-        }
-      }
-    }
-  }
+  BroadcastBinaryOp4DSlow<float, MaximumFn<float>>(input1_shape, input1_data, input2_shape,
+                                                   input2_data, output_shape, output_data);
 }
 
 } // namespace luci_interpreter_pal
diff --git a/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h b/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
index 283314f94da..f2fadcba882 100644
--- a/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
+++ b/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
@@ -18,20 +18,14 @@
 #ifndef LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H
 #define LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H
 
-#include "Params.h"
-#include "PALUtils.h"
-#include "ProcessBroadcastShapes.h"
-#include "Broadcast.h"
+#include "PALBinaryOpCommon.h"
 
 namespace luci_interpreter_pal
 {
 inline void Minimum(const int flat_size, const float *input1_data, const float *input2_data,
                     float *output_data)
 {
-  for (int i = 0; i < flat_size; ++i)
-  {
-    output_data[i] = std::min(input1_data[i], input2_data[i]);
-  }
+  BinaryOp<float, MinimumFn<float>>(flat_size, input1_data, input2_data, output_data);
 }
 
 template <typename T>
@@ -40,9 +34,8 @@ BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const
                        const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
                        const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 {
-  auto func = [](const T &a, const T &b) -> const T & { return std::min(a, b); };
-  BroadcastTISO4DSlow<float>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
-                             output_data, func);
+  BroadcastBinaryOp4DSlow<T, MinimumFn<T>>(input1_shape, input1_data, input2_shape, input2_data,
+                                           output_shape, output_data); // element type follows T
 }
 } // namespace luci_interpreter_pal
 
diff --git a/onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h b/onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h
index 990a13c3d8e..e79ef89804a 100644
--- a/onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h
+++ b/onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h
@@ -19,6 +19,7 @@
 #define LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H
 
 #include "TISOKernel.h"
+#include "PALComparisons.h"
 #include "ProcessBroadcastShapes.h"
 
 #include "Utils.h"
@@ -112,6 +113,14 @@ void evalTISOInplaceKernel(TISOFunc tiso_func, TISOBroadcastFunc tiso_broadcast_
   }
 }
 
+inline void CheckBinaryOpDataTypesEqual(const kernels::TISOKernel &kernel)
+{
+  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
+                         Tensor::element_type(kernel.input2())); // inputs share an element type
+  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
+                         Tensor::element_type(kernel.output())); // output matches input1's type
+}
+
 #ifndef DIS_QUANT
 template <typename T, typename TISOFunc = nullptr_t, typename TISOBroadcastFunc = nullptr_t,
           typename Options = nullptr_t>
diff --git a/onert-micro/luci-interpreter/src/kernels/FloorDiv.cpp b/onert-micro/luci-interpreter/src/kernels/FloorDiv.cpp
index a412c97e7ef..36dc29e4177 100644
--- a/onert-micro/luci-interpreter/src/kernels/FloorDiv.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/FloorDiv.cpp
@@ -29,10 +29,7 @@ void configure_kernel_CircleFloorDiv(const circle::Operator *cur_op,
 {
   kernels::TISOKernel kernel(cur_op, runtime_graph);
 
-  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
-                         Tensor::element_type(kernel.input2()));
-  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
-                         Tensor::element_type(kernel.output()));
+  kernels::CheckBinaryOpDataTypesEqual(kernel); // inputs and output must share one element type
 }
 
 void execute_kernel_CircleFloorDiv(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
@@ -67,21 +64,7 @@ void execute_kernel_CircleFloorDiv(const circle::Operator *cur_op, BaseRuntimeGr
         LUCI_INTERPRETER_CHECK(kernels::getTensorData<float>(input_data2)[i] != 0);
       }
       // check that input and output dimensions are equal
-      auto AreShapesEqual = [](const luci_interpreter::RuntimeShape &input_shape1,
-                               const luci_interpreter::RuntimeShape &input_shape2) -> bool {
-        if (input_shape1.dimensionsCount() == input_shape2.dimensionsCount())
-        {
-          int N = input_shape1.dimensionsCount();
-          for (int i = 0; i < N; ++i)
-          {
-            if (input_shape1.dims(i) != input_shape2.dims(i))
-              return false;
-          }
-          return true;
-        }
-        return false;
-      };
-      if (AreShapesEqual(input_shape1, input_shape2))
+      if (kernels::areShapesEqual(input_shape1, input_shape2))
       {
         const int flat_size = input_shape1.flatSize();
         luci_interpreter_pal::FloorDiv(flat_size, kernels::getTensorData<float>(input_data1),
diff --git a/onert-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp b/onert-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp
index d87a6cef96b..cb1d0430d86 100644
--- a/onert-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/FloorDiv.test.cpp
@@ -121,3 +121,69 @@ TEST_F(FloorDivTest, Wrong_Input2_Type_NEG)
 
 } // namespace
 } // namespace luci_interpreter
+
+#include "PALFloorDiv.h"
+
+#include <array>
+#include <numeric>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+class PALFloorDivTest : public ::testing::Test
+{
+  // Intentionally empty fixture: it adds no members or set-up of its own
+};
+
+TEST_F(PALFloorDivTest, Float_P)
+{
+  // No broadcast: both inputs hold the same number of elements
+  {
+    const bool is_with_broadcast = false;
+    test_kernel::TestDataFloatFloorDiv test_data_kernel(is_with_broadcast);
+
+    const auto &input1 = test_data_kernel.get_input_data_by_index(0);
+    const auto &input2 = test_data_kernel.get_input_data_by_index(1);
+
+    const auto num_elements = input1.size();
+    ASSERT_EQ(num_elements, input2.size()); // fatal: the kernel reads this many from both inputs
+
+    std::vector<float> output(num_elements);
+    luci_interpreter_pal::FloorDiv(static_cast<int>(num_elements), input1.data(), input2.data(),
+                                   output.data());
+
+    EXPECT_THAT(output, kernels::testing::FloatArrayNear(
+                          test_data_kernel.get_output_data_by_index(0), 0.0001f));
+  }
+
+  // With broadcast: input2 of shape {2, 1} is broadcast against input1 of shape {2, 5}
+  {
+    const bool is_with_broadcast = true;
+    test_kernel::TestDataFloatFloorDiv test_data_kernel(is_with_broadcast);
+
+    const auto &input1 = test_data_kernel.get_input_data_by_index(0);
+    const auto &input2 = test_data_kernel.get_input_data_by_index(1);
+
+    const int32_t shape[2] = {2, 5};
+    const int32_t shape_broadcast[2] = {2, 1};
+    const int32_t num_elements =
+      std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<int32_t>());
+    const int32_t num_broadcast_elements = std::accumulate(
+      std::begin(shape_broadcast), std::end(shape_broadcast), 1, std::multiplies<int32_t>());
+    ASSERT_EQ(input1.size(), static_cast<size_t>(num_elements));
+    ASSERT_EQ(input2.size(), static_cast<size_t>(num_broadcast_elements));
+
+    std::vector<float> output(num_elements);
+    luci_interpreter_pal::BroadcastFloorDiv4DSlow(
+      RuntimeShape{2, shape}, input1.data(), RuntimeShape{2, shape_broadcast}, input2.data(),
+      RuntimeShape{2, shape}, output.data());
+
+    EXPECT_THAT(output, kernels::testing::FloatArrayNear(
+                          test_data_kernel.get_output_data_by_index(0), 0.0001f));
+  }
+}
+
+} // namespace
+} // namespace luci_interpreter
diff --git a/onert-micro/luci-interpreter/src/kernels/FloorMod.cpp b/onert-micro/luci-interpreter/src/kernels/FloorMod.cpp
index 1b81a9173ba..537744acfbe 100644
--- a/onert-micro/luci-interpreter/src/kernels/FloorMod.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/FloorMod.cpp
@@ -29,10 +29,7 @@ void configure_kernel_CircleFloorMod(const circle::Operator *cur_op,
 {
   kernels::TISOKernel kernel(cur_op, runtime_graph);
 
-  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
-                         Tensor::element_type(kernel.input2()));
-  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
-                         Tensor::element_type(kernel.output()));
+  kernels::CheckBinaryOpDataTypesEqual(kernel); // inputs and output must share one element type
 }
 
 void execute_kernel_CircleFloorMod(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
@@ -67,21 +64,7 @@ void execute_kernel_CircleFloorMod(const circle::Operator *cur_op, BaseRuntimeGr
         LUCI_INTERPRETER_CHECK(kernels::getTensorData<float>(input_data2)[i] != 0);
       }
       // check that input and output dimensions are equal
-      auto AreShapesEqual = [](const luci_interpreter::RuntimeShape &input_shape1,
-                               const luci_interpreter::RuntimeShape &input_shape2) -> bool {
-        if (input_shape1.dimensionsCount() == input_shape2.dimensionsCount())
-        {
-          int N = input_shape1.dimensionsCount();
-          for (int i = 0; i < N; ++i)
-          {
-            if (input_shape1.dims(i) != input_shape2.dims(i))
-              return false;
-          }
-          return true;
-        }
-        return false;
-      };
-      if (AreShapesEqual(input_shape1, input_shape2))
+      if (kernels::areShapesEqual(input_shape1, input_shape2))
       {
         const int flat_size = input_shape1.flatSize();
         luci_interpreter_pal::FloorMod(flat_size, kernels::getTensorData<float>(input_data1),
diff --git a/onert-micro/luci-interpreter/src/kernels/FloorMod.test.cpp b/onert-micro/luci-interpreter/src/kernels/FloorMod.test.cpp
index b962db1829f..eaad79d9620 100644
--- a/onert-micro/luci-interpreter/src/kernels/FloorMod.test.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/FloorMod.test.cpp
@@ -119,3 +119,69 @@ TEST_F(FloorModTest, Wrong_Input2_Type_NEG)
 
 } // namespace
 } // namespace luci_interpreter
+
+#include "PALFloorMod.h"
+
+#include <array>
+#include <numeric>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+class PALFloorModTest : public ::testing::Test
+{
+  // Intentionally empty fixture: it adds no members or set-up of its own
+};
+
+TEST_F(PALFloorModTest, Float_P)
+{
+  // No broadcast: both inputs hold the same number of elements
+  {
+    const bool is_with_broadcast = false;
+    test_kernel::TestDataFloatFloorMod test_data_kernel(is_with_broadcast);
+
+    const auto &input1 = test_data_kernel.get_input_data_by_index(0);
+    const auto &input2 = test_data_kernel.get_input_data_by_index(1);
+
+    const auto num_elements = input1.size();
+    ASSERT_EQ(num_elements, input2.size()); // fatal: the kernel reads this many from both inputs
+
+    std::vector<float> output(num_elements);
+    luci_interpreter_pal::FloorMod(static_cast<int>(num_elements), input1.data(), input2.data(),
+                                   output.data());
+
+    EXPECT_THAT(output, kernels::testing::FloatArrayNear(
+                          test_data_kernel.get_output_data_by_index(0), 0.0001f));
+  }
+
+  // With broadcast: input2 of shape {2, 1} is broadcast against input1 of shape {2, 5}
+  {
+    const bool is_with_broadcast = true;
+    test_kernel::TestDataFloatFloorMod test_data_kernel(is_with_broadcast);
+
+    const auto &input1 = test_data_kernel.get_input_data_by_index(0);
+    const auto &input2 = test_data_kernel.get_input_data_by_index(1);
+
+    const int32_t shape[2] = {2, 5};
+    const int32_t shape_broadcast[2] = {2, 1};
+    const int32_t num_elements =
+      std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<int32_t>());
+    const int32_t num_broadcast_elements = std::accumulate(
+      std::begin(shape_broadcast), std::end(shape_broadcast), 1, std::multiplies<int32_t>());
+    ASSERT_EQ(input1.size(), static_cast<size_t>(num_elements));
+    ASSERT_EQ(input2.size(), static_cast<size_t>(num_broadcast_elements));
+
+    std::vector<float> output(num_elements);
+    luci_interpreter_pal::BroadcastFloorMod4DSlow(
+      RuntimeShape{2, shape}, input1.data(), RuntimeShape{2, shape_broadcast}, input2.data(),
+      RuntimeShape{2, shape}, output.data());
+
+    EXPECT_THAT(output, kernels::testing::FloatArrayNear(
+                          test_data_kernel.get_output_data_by_index(0), 0.0001f));
+  }
+}
+
+} // namespace
+} // namespace luci_interpreter
diff --git a/onert-micro/luci-interpreter/src/kernels/Maximum.cpp b/onert-micro/luci-interpreter/src/kernels/Maximum.cpp
index 87c8d04428f..11edb3d9206 100644
--- a/onert-micro/luci-interpreter/src/kernels/Maximum.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/Maximum.cpp
@@ -28,10 +28,7 @@ void configure_kernel_CircleMaximum(const circle::Operator *cur_op, BaseRuntimeG
 {
   kernels::TISOKernel kernel(cur_op, runtime_graph);
 
-  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
-                         Tensor::element_type(kernel.input2()));
-  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
-                         Tensor::element_type(kernel.output()));
+  kernels::CheckBinaryOpDataTypesEqual(kernel);
 }
 
 void execute_kernel_CircleMaximum(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
@@ -58,21 +55,7 @@ void execute_kernel_CircleMaximum(const circle::Operator *cur_op, BaseRuntimeGra
     case DataType::FLOAT32:
     {
       // check that input and output dimensions are equal
-      auto AreShapesEqual = [](const luci_interpreter::RuntimeShape &input_shape1,
-                               const luci_interpreter::RuntimeShape &input_shape2) -> bool {
-        if (input_shape1.dimensionsCount() == input_shape2.dimensionsCount())
-        {
-          int N = input_shape1.dimensionsCount();
-          for (int i = 0; i < N; ++i)
-          {
-            if (input_shape1.dims(i) != input_shape2.dims(i))
-              return false;
-          }
-          return true;
-        }
-        return false;
-      };
-      if (AreShapesEqual(input_shape1, input_shape2))
+      if (kernels::areShapesEqual(input_shape1, input_shape2))
       {
         const int flat_size = input_shape1.flatSize();
         luci_interpreter_pal::Maximum(flat_size, kernels::getTensorData<float>(input_data1),
diff --git a/onert-micro/luci-interpreter/src/kernels/Maximum.test.cpp b/onert-micro/luci-interpreter/src/kernels/Maximum.test.cpp
index b9f1e068a9e..2316f143c1a 100644
--- a/onert-micro/luci-interpreter/src/kernels/Maximum.test.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/Maximum.test.cpp
@@ -119,3 +119,69 @@ TEST_F(MaximumTest, Wrong_Input2_Type_NEG)
 
 } // namespace
 } // namespace luci_interpreter
+
+#include "PALMaximum.h"
+
+#include <array>
+#include <numeric>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+class PALMaximumTest : public ::testing::Test
+{
+  // Intentionally empty fixture: it adds no members or set-up of its own
+};
+
+TEST_F(PALMaximumTest, Float_P)
+{
+  // No broadcast: both inputs hold the same number of elements
+  {
+    const bool is_with_broadcast = false;
+    test_kernel::TestDataFloatMaximum test_data_kernel(is_with_broadcast);
+
+    const auto &input1 = test_data_kernel.get_input_data_by_index(0);
+    const auto &input2 = test_data_kernel.get_input_data_by_index(1);
+
+    const auto num_elements = input1.size();
+    ASSERT_EQ(num_elements, input2.size()); // fatal: the kernel reads this many from both inputs
+
+    std::vector<float> output(num_elements);
+    luci_interpreter_pal::Maximum(static_cast<int>(num_elements), input1.data(), input2.data(),
+                                  output.data());
+
+    EXPECT_THAT(output, kernels::testing::FloatArrayNear(
+                          test_data_kernel.get_output_data_by_index(0), 0.0001f));
+  }
+
+  // With broadcast: input2 of shape {2, 1} is broadcast against input1 of shape {2, 5}
+  {
+    const bool is_with_broadcast = true;
+    test_kernel::TestDataFloatMaximum test_data_kernel(is_with_broadcast);
+
+    const auto &input1 = test_data_kernel.get_input_data_by_index(0);
+    const auto &input2 = test_data_kernel.get_input_data_by_index(1);
+
+    const int32_t shape[2] = {2, 5};
+    const int32_t shape_broadcast[2] = {2, 1};
+    const int32_t num_elements =
+      std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<int32_t>());
+    const int32_t num_broadcast_elements = std::accumulate(
+      std::begin(shape_broadcast), std::end(shape_broadcast), 1, std::multiplies<int32_t>());
+    ASSERT_EQ(input1.size(), static_cast<size_t>(num_elements));
+    ASSERT_EQ(input2.size(), static_cast<size_t>(num_broadcast_elements));
+
+    std::vector<float> output(num_elements);
+    luci_interpreter_pal::BroadcastMaximum4DSlow(
+      RuntimeShape{2, shape}, input1.data(), RuntimeShape{2, shape_broadcast}, input2.data(),
+      RuntimeShape{2, shape}, output.data());
+
+    EXPECT_THAT(output, kernels::testing::FloatArrayNear(
+                          test_data_kernel.get_output_data_by_index(0), 0.0001f));
+  }
+}
+
+} // namespace
+} // namespace luci_interpreter
diff --git a/onert-micro/luci-interpreter/src/kernels/Minimum.cpp b/onert-micro/luci-interpreter/src/kernels/Minimum.cpp
index 0949b8990dc..9ed4841c684 100644
--- a/onert-micro/luci-interpreter/src/kernels/Minimum.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/Minimum.cpp
@@ -28,10 +28,7 @@ void configure_kernel_CircleMinimum(const circle::Operator *cur_op, BaseRuntimeG
 {
   kernels::TISOKernel kernel(cur_op, runtime_graph);
 
-  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
-                         Tensor::element_type(kernel.input2()));
-  LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
-                         Tensor::element_type(kernel.output()));
+  kernels::CheckBinaryOpDataTypesEqual(kernel);
 }
 
 void execute_kernel_CircleMinimum(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
@@ -58,21 +55,7 @@ void execute_kernel_CircleMinimum(const circle::Operator *cur_op, BaseRuntimeGra
     case DataType::FLOAT32:
     {
       // check that input and output dimensions are equal
-      auto AreShapesEqual = [](const luci_interpreter::RuntimeShape &input_shape1,
-                               const luci_interpreter::RuntimeShape &input_shape2) -> bool {
-        if (input_shape1.dimensionsCount() == input_shape2.dimensionsCount())
-        {
-          int N = input_shape1.dimensionsCount();
-          for (int i = 0; i < N; ++i)
-          {
-            if (input_shape1.dims(i) != input_shape2.dims(i))
-              return false;
-          }
-          return true;
-        }
-        return false;
-      };
-      if (AreShapesEqual(input_shape1, input_shape2))
+      if (kernels::areShapesEqual(input_shape1, input_shape2))
       {
         const int flat_size = input_shape1.flatSize();
         luci_interpreter_pal::Minimum(flat_size, kernels::getTensorData<float>(input_data1),
diff --git a/onert-micro/luci-interpreter/src/kernels/Minimum.test.cpp b/onert-micro/luci-interpreter/src/kernels/Minimum.test.cpp
index 5775ce80a48..5f6d304b34a 100644
--- a/onert-micro/luci-interpreter/src/kernels/Minimum.test.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/Minimum.test.cpp
@@ -119,3 +119,69 @@ TEST_F(MinimumTest, Wrong_Input2_Type_NEG)
 
 } // namespace
 } // namespace luci_interpreter
+
+#include "PALMinimum.h"
+
+#include <array>
+#include <numeric>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+class PALMinimumTest : public ::testing::Test
+{
+  // Empty fixture: the PAL functions are called directly, so no shared setup is needed
+};
+
+TEST_F(PALMinimumTest, Float_P)
+{
+  // No broadcast: both inputs hold the same number of elements
+  {
+    const bool is_with_broadcast = false;
+    test_kernel::TestDataFloatMinimum test_data_kernel(is_with_broadcast);
+
+    const auto &input1 = test_data_kernel.get_input_data_by_index(0);
+    const auto &input2 = test_data_kernel.get_input_data_by_index(1);
+
+    const auto num_elements = input1.size();
+    EXPECT_EQ(num_elements, input2.size());
+
+    std::vector<float> output(num_elements);
+    luci_interpreter_pal::Minimum(num_elements, input1.data(), input2.data(), output.data());
+
+    EXPECT_THAT(output, kernels::testing::FloatArrayNear(
+                          test_data_kernel.get_output_data_by_index(0), 0.0001f));
+  }
+
+  // With broadcast: the second input is expanded along its last dimension
+  {
+    const bool is_with_broadcast = true;
+    test_kernel::TestDataFloatMinimum test_data_kernel(is_with_broadcast);
+
+    const auto &input1 = test_data_kernel.get_input_data_by_index(0);
+    const auto &input2 = test_data_kernel.get_input_data_by_index(1);
+
+    const int32_t shape[2] = {2, 5};
+    const int32_t shape_broadcast[2] = {2, 1};
+
+    // Fold dims with an integer functor; ASSERT_EQ (unlike assert) survives NDEBUG builds
+    const int32_t flat_size =
+      std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<int32_t>());
+    const int32_t flat_size_broadcast = std::accumulate(
+      std::begin(shape_broadcast), std::end(shape_broadcast), 1, std::multiplies<int32_t>());
+    ASSERT_EQ(input1.size(), static_cast<size_t>(flat_size));
+    ASSERT_EQ(input2.size(), static_cast<size_t>(flat_size_broadcast));
+    std::vector<float> output(flat_size);
+    luci_interpreter_pal::BroadcastMinimum4DSlow(
+      RuntimeShape{2, shape}, input1.data(), RuntimeShape{2, shape_broadcast}, input2.data(),
+      RuntimeShape{2, shape}, output.data());
+
+    EXPECT_THAT(output, kernels::testing::FloatArrayNear(
+                          test_data_kernel.get_output_data_by_index(0), 0.0001f));
+  }
+}
+
+} // namespace
+} // namespace luci_interpreter
diff --git a/onert-micro/luci-interpreter/src/kernels/Utils.cpp b/onert-micro/luci-interpreter/src/kernels/Utils.cpp
index 35ab821809c..6085071e720 100644
--- a/onert-micro/luci-interpreter/src/kernels/Utils.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/Utils.cpp
@@ -86,6 +86,22 @@ void matrixScalarMultiplyAccumulate(const int8_t *matrix, int32_t scalar, int32_
   }
 }
 
+bool areShapesEqual(const luci_interpreter::RuntimeShape &input_shape1,
+                    const luci_interpreter::RuntimeShape &input_shape2)
+{
+  // Two shapes are equal only when both the rank and every dimension match
+  const int rank = input_shape1.dimensionsCount();
+  if (rank != input_shape2.dimensionsCount())
+    return false;
+
+  for (int axis = 0; axis < rank; ++axis)
+  {
+    if (input_shape1.dims(axis) != input_shape2.dims(axis))
+      return false;
+  }
+  return true;
+}
+
 template void calculateActivationRange(Activation activation, float *activation_min,
                                        float *activation_max);
 template void calculateActivationRange(Activation activation, int32_t *activation_min,
diff --git a/onert-micro/luci-interpreter/src/kernels/Utils.h b/onert-micro/luci-interpreter/src/kernels/Utils.h
index a01d72dfafd..ce5f763d2ee 100644
--- a/onert-micro/luci-interpreter/src/kernels/Utils.h
+++ b/onert-micro/luci-interpreter/src/kernels/Utils.h
@@ -228,6 +228,10 @@ template <typename T, typename U, typename... Other> constexpr bool one_of_types
 void matrixScalarMultiplyAccumulate(const int8_t *matrix, int32_t scalar, int32_t n_row,
                                     int32_t n_col, int32_t *output);
 
+// Returns true when both shapes have the same rank and the same size in every dimension
+bool areShapesEqual(const luci_interpreter::RuntimeShape &input_shape1,
+                    const luci_interpreter::RuntimeShape &input_shape2);
+
 #ifndef DIS_QUANT
 bool checkedLog2(const float x, int *log2_result);