[onert-micro] Reduce duplicate code in binary kernels
This commit reduces duplicate code in the binary kernels.
  - Introduce PALBinaryOpCommon.h, which holds the functions common to binary kernels.
    - Introduce binary function objects.
    - Introduce `BinaryOp()`, which unifies the binary kernels without broadcast.
    - Introduce `BroadcastBinaryOp4DSlow()`, which unifies the binary kernels with broadcast.
  - Apply the common functions to each binary kernel (see the sketch after the file summary below).

ONE-DCO-1.0-Signed-off-by: ragmani <[email protected]>
ragmani committed Jan 12, 2024
1 parent d0cff18 commit 9ada010
Showing 16 changed files with 434 additions and 238 deletions.
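The core of the change is a single function-object-plus-template pattern. Here is a minimal, self-contained sketch of that pattern in plain C++ (the buffers and sizes below are illustrative, not taken from the commit):

#include <algorithm>
#include <cstdio>

// Function object supplying the per-element operation, mirroring the
// MaximumFn/MinimumFn objects introduced by this commit.
template <typename T> struct MaximumFn
{
  T operator()(T lhs, T rhs) { return std::max(lhs, rhs); }
};

// Generic element-wise loop, mirroring BinaryOp(): one loop shared by all
// non-broadcast binary kernels, parameterized on the function object.
template <typename T, typename Fn>
void BinaryOp(int flat_size, const T *in1, const T *in2, T *out)
{
  Fn func;
  for (int i = 0; i < flat_size; ++i)
    out[i] = func(in1[i], in2[i]);
}

int main()
{
  const float a[4] = {1.f, -2.f, 3.f, -4.f};
  const float b[4] = {0.f, 0.f, 5.f, -5.f};
  float out[4];
  BinaryOp<float, MaximumFn<float>>(4, a, b, out);
  for (float v : out)
    std::printf("%g ", v); // prints: 1 0 5 -4
  return 0;
}

Each concrete kernel then shrinks to a one-line call naming its function object, as the per-file diffs below show.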
117 changes: 117 additions & 0 deletions onert-micro/luci-interpreter/pal/common/PALBinaryOpCommon.h
@@ -0,0 +1,117 @@
/*
* Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H
#define LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"

namespace luci_interpreter_pal
{

template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
struct FloorDivFn
{
  T operator()(T lhs, T rhs)
  {
    return std::floor(static_cast<double>(lhs) / static_cast<double>(rhs));
  }
};
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
struct FloorModFn
{
  T operator()(T lhs, T rhs)
  {
    T trunc_mod = std::fmod(lhs, rhs);
    return (trunc_mod != 0) && ((rhs < 0) != (trunc_mod < 0)) ? (trunc_mod + rhs) : trunc_mod;
  }
};
template <typename T> struct MaximumFn
{
  T operator()(T lhs, T rhs) { return std::max(lhs, rhs); }
};
template <typename T> struct MinimumFn
{
  T operator()(T lhs, T rhs) { return std::min(lhs, rhs); }
};

// TODO: check if there is a real activation value
template <typename T, typename Fn>
inline void BinaryOp(const int flat_size, const T *input1_data, const T *input2_data,
                     T *output_data)
{
  Fn func;
  for (int i = 0; i < flat_size; ++i)
  {
    output_data[i] = func(input1_data[i], input2_data[i]);
  }
}

template <typename T, typename Fn>
inline void BroadcastBinaryOp4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
                                    const T *input1_data,
                                    const luci_interpreter::RuntimeShape &input2_shape,
                                    const T *input2_data,
                                    const luci_interpreter::RuntimeShape &output_shape,
                                    T *output_data)
{
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

  const luci_interpreter::RuntimeShape extended_output_shape =
    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), and with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed: the
  // first dimension has the smallest stride.
  //
  // We name our variables by the TensorFlow convention, but nest the loops so
  // that the innermost loop has the smallest stride, for the best cache
  // behavior.

  Fn func;
  for (int b = 0; b < extended_output_shape.dims(0); ++b)
  {
    for (int y = 0; y < extended_output_shape.dims(1); ++y)
    {
      for (int x = 0; x < extended_output_shape.dims(2); ++x)
      {
        for (int c = 0; c < extended_output_shape.dims(3); ++c)
        {
          const int output_data_offset =
            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
              extended_output_shape.dims(3) +
            c;

          output_data[output_data_offset] =
            func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
                 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
        }
      }
    }
  }
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_BINARYOPCOMMON_H
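FloorModFn computes a floored modulo (the remainder takes the sign of the divisor) on top of std::fmod, which truncates toward zero: when the truncated remainder is nonzero and its sign disagrees with the divisor's, one divisor is added. A small standalone check of that adjustment (the values are illustrative):

#include <cmath>
#include <cstdio>

// Floored modulo built from truncated modulo, as in FloorModFn: when the
// truncated remainder is nonzero and its sign differs from the divisor's,
// shift it by one divisor.
static float floor_mod(float lhs, float rhs)
{
  float trunc_mod = std::fmod(lhs, rhs);
  return (trunc_mod != 0) && ((rhs < 0) != (trunc_mod < 0)) ? (trunc_mod + rhs) : trunc_mod;
}

int main()
{
  std::printf("fmod(-7, 3)      = %g\n", std::fmod(-7.f, 3.f)); // -1 (truncated)
  std::printf("floor_mod(-7, 3) = %g\n", floor_mod(-7.f, 3.f)); //  2 (floored)
  std::printf("floor_mod(7, -3) = %g\n", floor_mod(7.f, -3.f)); // -2 (floored)
  return 0;
}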
54 changes: 4 additions & 50 deletions onert-micro/luci-interpreter/pal/common/PALFloorDivCommon.h
@@ -18,18 +18,14 @@
#ifndef LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H
#define LUCI_INTERPRETER_PAL_FLOORDIV_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "PALBinaryOpCommon.h"

namespace luci_interpreter_pal
{
inline void FloorDiv(const int flat_size, const float *input1_data, const float *input2_data,
float *output_data)
{
for (int i = 0; i < flat_size; ++i)
output_data[i] =
std::floor(static_cast<double>(input1_data[i]) / static_cast<double>(input2_data[i]));
BinaryOp<float, FloorDivFn<float>>(flat_size, input1_data, input2_data, output_data);
}

inline void
@@ -40,50 +36,8 @@ BroadcastFloorDiv4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
const float *input2_data,
const luci_interpreter::RuntimeShape &output_shape, float *output_data)
{
const int flat_size = input1_shape.flatSize();

NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

auto FloorDivFunc = [](float x, float y) -> float {
return std::floor(static_cast<double>(x) / static_cast<double>(y));
};
// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.

for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
FloorDivFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)],
input2_data[subscriptToIndex(desc2, b, y, x, c)]);
}
}
}
}
BroadcastBinaryOp4DSlow<float, FloorDivFn<float>>(input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
}

} // namespace luci_interpreter_pal
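For reference, FloorDivFn differs from a plain cast-and-truncate division only for negative, non-integral quotients: std::floor rounds toward negative infinity rather than toward zero. A quick standalone illustration (values are illustrative):

#include <cmath>
#include <cstdio>

int main()
{
  // Casting the quotient to int would round toward zero (-7/2 -> -3).
  // FloorDivFn instead floors the quotient toward negative infinity,
  // matching the FloorDiv kernel's intended semantics.
  float lhs = -7.f, rhs = 2.f;
  std::printf("lhs / rhs        = %g\n", lhs / rhs); // -3.5
  std::printf("floor(lhs / rhs) = %g\n",
              std::floor(static_cast<double>(lhs) / static_cast<double>(rhs))); // -4
  return 0;
}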
59 changes: 4 additions & 55 deletions onert-micro/luci-interpreter/pal/common/PALFloorModCommon.h
@@ -18,22 +18,14 @@
#ifndef LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H
#define LUCI_INTERPRETER_PAL_FLOORMOD_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "PALBinaryOpCommon.h"

namespace luci_interpreter_pal
{
inline void FloorMod(const int flat_size, const float *input1_data, const float *input2_data,
float *output_data)
{
for (int i = 0; i < flat_size; ++i)
{
float trunc_mod = std::fmod(input1_data[i], input2_data[i]);
output_data[i] = (trunc_mod != 0) && ((input2_data[i] < 0) != (trunc_mod < 0))
? (trunc_mod + input2_data[i])
: trunc_mod;
}
BinaryOp<float, FloorModFn<float>>(flat_size, input1_data, input2_data, output_data);
}

inline void
@@ -44,51 +36,8 @@ BroadcastFloorMod4DSlow(const luci_interpreter::RuntimeShape &input1_shape,
const float *input2_data,
const luci_interpreter::RuntimeShape &output_shape, float *output_data)
{
const int flat_size = input1_shape.flatSize();

NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
auto FloorModFunc = [](float x, float y) -> float {
float trunc_mod = std::fmod(x, y);
return (trunc_mod != 0) && ((y < 0) != (trunc_mod < 0)) ? (trunc_mod + y) : trunc_mod;
};

for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
FloorModFunc(input1_data[subscriptToIndex(desc1, b, y, x, c)],
input2_data[subscriptToIndex(desc2, b, y, x, c)]);
}
}
}
}
BroadcastBinaryOp4DSlow<float, FloorModFn<float>>(input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
}

} // namespace luci_interpreter_pal
50 changes: 4 additions & 46 deletions onert-micro/luci-interpreter/pal/common/PALMaximumCommon.h
@@ -18,65 +18,23 @@
#ifndef LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H
#define LUCI_INTERPRETER_PAL_MAXIMUM_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "PALBinaryOpCommon.h"

namespace luci_interpreter_pal
{
inline void Maximum(const int flat_size, const float *input1_data, const float *input2_data,
float *output_data)
{
for (int i = 0; i < flat_size; ++i)
{
output_data[i] = std::max(input1_data[i], input2_data[i]);
}
BinaryOp<float, MaximumFn<float>>(flat_size, input1_data, input2_data, output_data);
}

inline void
BroadcastMaximum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data,
const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data,
const luci_interpreter::RuntimeShape &output_shape, float *output_data)
{
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.

for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)],
input2_data[subscriptToIndex(desc2, b, y, x, c)]);
}
}
}
}
BroadcastBinaryOp4DSlow<float, MaximumFn<float>>(input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
}

} // namespace luci_interpreter_pal
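The output_data_offset expression in BroadcastBinaryOp4DSlow is the standard row-major flattening of a 4-D (b, y, x, c) index, so the innermost channel loop touches adjacent elements. A minimal sketch of that flattening, with made-up extents:

#include <cstdio>

// Flatten a 4-D (b, y, x, c) index for an NHWC tensor of extents
// (B, H, W, C), exactly as the output_data_offset expression does.
static int flatten4d(int b, int y, int x, int c, int H, int W, int C)
{
  return ((b * H + y) * W + x) * C + c;
}

int main()
{
  const int B = 2, H = 2, W = 3, C = 4;
  std::printf("%d %d %d %d\n",
              flatten4d(0, 0, 0, 0, H, W, C), // 0
              flatten4d(0, 0, 0, 1, H, W, C), // 1: next channel, stride 1
              flatten4d(1, 1, 2, 3, H, W, C), // 47: last element
              B * H * W * C - 1);             // 47: total size - 1
  return 0;
}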
15 changes: 4 additions & 11 deletions onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
@@ -18,20 +18,14 @@
#ifndef LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H
#define LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "Broadcast.h"
#include "PALBinaryOpCommon.h"

namespace luci_interpreter_pal
{
inline void Minimum(const int flat_size, const float *input1_data, const float *input2_data,
float *output_data)
{
for (int i = 0; i < flat_size; ++i)
{
output_data[i] = std::min(input1_data[i], input2_data[i]);
}
BinaryOp<float, MinimumFn<float>>(flat_size, input1_data, input2_data, output_data);
}

template <typename T>
@@ -40,9 +34,8 @@ BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const
const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
const luci_interpreter::RuntimeShape &output_shape, T *output_data)
{
auto func = [](const T &a, const T &b) -> const T & { return std::min(a, b); };
BroadcastTISO4DSlow<float>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
output_data, func);
BroadcastBinaryOp4DSlow<T, MinimumFn<T>>(input1_shape, input1_data, input2_shape, input2_data,
                                         output_shape, output_data);
}
} // namespace luci_interpreter_pal

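Note that MaximumFn and MinimumFn, unlike the floor functions, carry no floating-point constraint, so the shared helpers could also serve integer tensors. A sketch of such an instantiation (the int32 buffers are hypothetical, not part of this commit):

#include <algorithm>
#include <cstdint>
#include <cstdio>

template <typename T> struct MinimumFn
{
  T operator()(T lhs, T rhs) { return std::min(lhs, rhs); }
};

// Same generic loop as BinaryOp(), restated here so the sketch compiles alone.
template <typename T, typename Fn>
inline void BinaryOp(const int flat_size, const T *in1, const T *in2, T *out)
{
  Fn func;
  for (int i = 0; i < flat_size; ++i)
    out[i] = func(in1[i], in2[i]);
}

int main()
{
  const int32_t a[3] = {5, -2, 7};
  const int32_t b[3] = {3, -4, 9};
  int32_t out[3];
  BinaryOp<int32_t, MinimumFn<int32_t>>(3, a, b, out);
  std::printf("%d %d %d\n", (int)out[0], (int)out[1], (int)out[2]); // 3 -4 7
  return 0;
}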
9 changes: 9 additions & 0 deletions onert-micro/luci-interpreter/src/kernels/BinaryOpCommon.h
@@ -19,6 +19,7 @@
#define LUCI_INTERPRETER_KERNELS_BINARYOPUTILS_H

#include "TISOKernel.h"
#include "PALComparisons.h"
#include "ProcessBroadcastShapes.h"

#include "Utils.h"
@@ -112,6 +113,14 @@ void evalTISOInplaceKernel(TISOFunc tiso_func, TISOBroadcastFunc tiso_broadcast_
}
}

inline void CheckBinaryOpDataTypesEqual(const kernels::TISOKernel &kernel)
{
LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
Tensor::element_type(kernel.input2()));
LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
Tensor::element_type(kernel.output()));
}

#ifndef DIS_QUANT
template <typename T, typename TISOFunc = nullptr_t, typename TISOBroadcastFunc = nullptr_t,
typename Options = nullptr_t>
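CheckBinaryOpDataTypesEqual centralizes the dtype guard that each binary kernel's configure step repeats. A rough standalone sketch of the invariant it enforces (FakeTensor and assert are stand-ins; the real code uses luci_interpreter::Tensor and LUCI_INTERPRETER_CHECK):

#include <cassert>

// Minimal stand-ins for the interpreter types, only to show the shape of
// the check.
enum class DataType { FLOAT32, INT32 };

struct FakeTensor
{
  DataType dtype;
};

// Mirrors CheckBinaryOpDataTypesEqual: both inputs and the output of a
// binary kernel must share one element type.
void checkBinaryOpDataTypesEqual(const FakeTensor &in1, const FakeTensor &in2,
                                 const FakeTensor &out)
{
  assert(in1.dtype == in2.dtype);
  assert(in1.dtype == out.dtype);
}

int main()
{
  FakeTensor a{DataType::FLOAT32}, b{DataType::FLOAT32}, y{DataType::FLOAT32};
  checkBinaryOpDataTypesEqual(a, b, y); // passes: all FLOAT32
  return 0;
}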