[infra/onert] Bump up ARM Compute Library v24.07 (Samsung#14009)
This commit bumps up ARM Compute Library to v24.07.
It disables some NEON NNAPI unit tests to skip failures as a workaround.

ONE-DCO-1.0-Signed-off-by: Hyeongseok Oh <[email protected]>
hseok-oh authored Sep 23, 2024
1 parent e1fc0d4 commit 355813f
Showing 22 changed files with 223 additions and 423 deletions.
2 changes: 1 addition & 1 deletion Makefile.template
@@ -202,7 +202,7 @@ runtime_tar_internal:
 	tar -zcf $(WORKSPACE)/onert-test-package.tar.gz -C $(INSTALL_PATH) $(shell ls $(INSTALL_PATH) -I lib -I include)

 acl_tar_internal:
-	tar -zcf $(WORKSPACE)/onert-acl.tar.gz -C ${OVERLAY_FOLDER} lib/libarm_compute.so lib/libarm_compute_core.so lib/libarm_compute_graph.so
+	tar -zcf $(WORKSPACE)/onert-acl.tar.gz -C ${OVERLAY_FOLDER} lib/libarm_compute.so lib/libarm_compute_graph.so

 install_acl_internal:
 # Workaround to install acl for test (ignore error when there is no file to copy)

@@ -50,46 +50,16 @@
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "src/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"

namespace arm_compute
{
/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls
* the following kernels:
*
* -# @ref CLTransposeKernel
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
class CLFullyConnectedHybridLayerReshapeWeights : public ICLSimpleFunction
{
public:
/** Set the input and output tensors.
*
* @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
* S8.
* @param[out] output Destination tensor which stores the transposed input tensor. Data type
* supported: Same as @p input.
*/
void configure(const ICLTensor *input, ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref
* CLFullyConnectedHybridLayerReshapeWeights
*
* @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
* S8.
* @param[in] output Destination tensor which stores the transposed input tensor. Data type
* supported: Same as @p input.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
};

/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following
* OpenCL kernels:
*
* -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
* -# @ref CLFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false
* -# @ref CLTranspose (if @p are_weights_reshaped is set to false
* and transpose_weights is set to true ) (called once)
* -# @ref CLGEMMLowpMatrixMultiplyCore (if quantized symmetric)
* -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
@@ -165,7 +135,7 @@ class CLFullyConnectedHybridLayer : public IFunction
                  bool retain_internal_weights);

   MemoryGroup _memory_group;
-  CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel;
+  CLTranspose _reshape_weights_kernel;
   CLScaleFactorSymm8Kernel _scale_factor_kernel;
   CLQuantizationSymmetricKernel _quant_input_kernel;
   CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
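Note: the removed wrapper above is the template for most of this commit. ACL no longer exports its internal src/core kernel headers, so thin ICLSimpleFunction wrappers around single kernels give way to public runtime functions. A minimal sketch of the replacement call pattern, assuming the ACL v24.07 public API and hypothetical, already-allocated tensors:

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"

// Sketch only: reshape (2D-transpose) FC weights with the public CLTranspose
// function instead of the removed CLFullyConnectedHybridLayerReshapeWeights.
void reshape_weights(const arm_compute::ICLTensor *weights,
                     arm_compute::ICLTensor *reshaped_weights)
{
  // Same check CLFullyConnectedHybridLayer::validate() now performs directly.
  ARM_COMPUTE_ERROR_THROW_ON(
    arm_compute::CLTranspose::validate(weights->info(), reshaped_weights->info()));

  arm_compute::CLTranspose transpose;
  transpose.configure(weights, reshaped_weights); // a 2D transpose is the reshape
  transpose.run();
}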

@@ -50,45 +50,15 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "src/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"

namespace arm_compute
{
/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls
* the following kernels:
*
* -# @ref CLTransposeKernel
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
class CLFullyConnectedLayerReshapeWeightsEx : public ICLSimpleFunction
{
public:
/** Set the input and output tensors.
*
* @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
* QASYMM8/F16/F32.
* @param[out] output Destination tensor which stores the transposed input tensor. Data type
* supported: Same as @p input.
*/
void configure(const ICLTensor *input, ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref
* CLFullyConnectedLayerReshapeWeightsEx
*
* @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
* QASYMM8/F16/F32.
* @param[in] output Destination tensor which stores the transposed input tensor. Data type
* supported: Same as @p input.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
};

namespace weights_transformations
{
/** Basic function to manage the reshape weights generated from @ref
* CLFullyConnectedLayerReshapeWeightsEx */
* CLTranspose */
class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights
{
public:
@@ -118,7 +88,7 @@ class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights
 private:
   static constexpr uint32_t _uid = 0x0;
   CLTensor _output{};
-  CLFullyConnectedLayerReshapeWeightsEx _func{};
+  CLTranspose _func{};
 };
 } // namespace weights_transformations

@@ -209,7 +179,7 @@ class CLFullyConnectedLayerEx : public IFunction
   weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged
     _reshape_weights_managed_function;
   CLFlattenLayer _flatten_layer;
-  CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function;
+  CLTranspose _reshape_weights_function;
   CLGEMM _mm_gemm;
   CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
   CLTensor _flatten_output;
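The weights-manager path keeps its managed-transformation class; only its worker becomes a plain CLTranspose. A condensed sketch of the managed object after this change (members as in the diff; reference counting and error handling omitted):

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
#include "arm_compute/runtime/ITransformWeights.h"

namespace arm_compute
{
namespace weights_transformations
{
// Condensed sketch of the managed reshape after the change.
class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights
{
public:
  // Bind the CLTranspose function to the managed output tensor.
  void configure(const ICLTensor *input) { _func.configure(input, &_output); }
  // Allocate the output and run the transpose once; the weights manager caches it.
  void run() override
  {
    _output.allocator()->allocate();
    _func.run();
    _reshape_run = true;
  }
  void release() override { _output.allocator()->free(); }
  ICLTensor *get_weights() override { return &_output; }
  uint32_t uid() override { return _uid; }

private:
  static constexpr uint32_t _uid = 0x0;
  CLTensor _output{};
  CLTranspose _func{}; // was CLFullyConnectedLayerReshapeWeightsEx
};
} // namespace weights_transformations
} // namespace arm_compute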

@@ -43,8 +43,7 @@
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
#include "src/core/gpu/cl/kernels/ClCopyKernel.h"
// #include "arm_compute/runtime/CL/functions/CLCopy.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"
#include <memory>

namespace arm_compute
@@ -123,7 +122,7 @@ class CLPadLayerEx : public IFunction
   void configure_reflect_mode(ICLTensor *input, ICLTensor *output);

   std::unique_ptr<CLPadLayerKernelEx> _pad_kernel;
-  std::unique_ptr<opencl::kernels::ClCopyKernel> _copy_kernel;
+  std::unique_ptr<CLCopy> _copy_kernel;
   bool _perform_pad;
 };
 } // namespace arm_compute
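Same cleanup, different kernel: the internal opencl::kernels::ClCopyKernel becomes the public CLCopy function, so CLPadLayerEx no longer reaches into ACL's src/ tree. A small sketch of the pass-through path this member enables (helper and tensor names are hypothetical):

#include <memory>
#include "arm_compute/runtime/CL/functions/CLCopy.h"

// Hypothetical excerpt: when the requested padding turns out to be a no-op,
// CLPadLayerEx can simply forward the input to the output with CLCopy.
void configure_passthrough(arm_compute::ICLTensor *input, arm_compute::ICLTensor *output,
                           std::unique_ptr<arm_compute::CLCopy> &copy_func)
{
  copy_func = std::make_unique<arm_compute::CLCopy>();
  copy_func->configure(input, output); // executed later via copy_func->run()
}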

@@ -48,43 +48,15 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"

namespace arm_compute
{
/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls
* the following kernels:
*
* -# @ref NETransposeKernel
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder
{
public:
/** Set the input and output tensors.
*
* @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
* QASYMM8/F16/F32.
* @param[out] output Destination tensor. Data type supported: Same as @p input.
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref
* NEFullyConnectedHybridLayerReshapeWeights
*
* @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported:
* QASYMM8/F16/F32.
* @param[in] output Destination tensor info. Data type supported: Same as @p input.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
};

/** Basic function to compute a Fully Connected layer on NEON. This function calls the following
* NEON kernels:
* -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
* -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false
* -# @ref NETranspose (if @p are_weights_reshaped is set to false
* and transpose_weights is set to true ) (called once)
* -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized
* asymmetric)
@@ -162,7 +134,7 @@ class NEFullyConnectedHybridLayer : public IFunction
   void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);

   MemoryGroup _memory_group;
-  NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function;
+  NETranspose _reshape_weights_function;
   NEQuantizationSymmetricKernel _quant_input_kernel;
   NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
   NEMultiplyScaleFactorKernel _multiply_scale_kernel;

@@ -51,21 +51,17 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
#include "src/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"

namespace arm_compute
{
/** Basic function to compute a Fully Connected layer on NEON. This function calls the following
* NEON kernels:
* -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
* -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and
* -# @ref NETranspose (if @p are_weights_reshaped is set to false and
* transpose_weights is set to true ) (called once)
* -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized
* asymmetric)
* -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref
* NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
* not equal to nullptr)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
* @note The difference from NEFullyConnectedLayer is that this class supports weights as input
@@ -136,29 +132,28 @@ class NEFullyConnectedLayerEx : public IFunction
   void prepare() override;

 private:
-  void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output);
-  void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output);
-  void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
+  void configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *bias,
+                       ITensor *output, const FullyConnectedLayerInfo &fc_info);
+  void configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *bias,
+                         ITensor *output, const FullyConnectedLayerInfo &fc_info);
+  void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *bias,
+                    ITensor *output, const FullyConnectedLayerInfo &fc_info);

   MemoryGroup _memory_group;
-  NEFlattenLayer _flatten_kernel;
   NEConvertFullyConnectedWeights _convert_weights;
-  NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
+  NEFlattenLayer _flatten_kernel;
+  NETranspose _reshape_weights_function;
   NEGEMM _mm_gemm;
   NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
-  NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
-  NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
   Tensor _flatten_output;
-  Tensor _gemmlowp_output;
   Tensor _converted_weights_output;
   Tensor _reshape_weights_output;
-  const ITensor *_original_weights;
   bool _are_weights_converted;
   bool _are_weights_reshaped;
   bool _is_fc_after_conv;
-  bool _accumulate_biases;
   bool _is_quantized;
   bool _is_prepared;
+  const ITensor *_original_weights;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */
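The new bias and fc_info parameters exist because NEGEMMMatrixAccumulateBiasesKernel is gone from ACL; the bias now has to reach the GEMM configuration itself so the addition can be fused. A hedged sketch of what the float path of configure_mm() can look like under that assumption (not the literal implementation):

#include "arm_compute/runtime/NEON/functions/NEGEMM.h"

// Sketch: the bias rides along as the GEMM "c" operand (alpha = beta = 1.f),
// giving output = input * weights + bias without a separate accumulate kernel.
// `fc_info` would contribute activation information in a fuller version.
void configure_mm_sketch(arm_compute::NEGEMM &gemm, const arm_compute::ITensor *input,
                         const arm_compute::ITensor *weights, const arm_compute::ITensor *bias,
                         arm_compute::ITensor *output)
{
  // Reshape the weights only on the first run, as FC layers typically do.
  arm_compute::GEMMInfo gemm_info(false, false, true);
  gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info);
}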

@@ -44,6 +44,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"

#include "src/core/helpers/AutoConfiguration.h"

@@ -164,7 +165,7 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte
   _original_weights = weights;
   _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
   _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
-  _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
+  _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis, false);

   auto out_dims = transposeconv_output_dimensions(
     input->info()->dimension(idx_w), input->info()->dimension(idx_h),
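The trailing false matches the use_inverted_axis flag that newer ACL releases added to CLReverse::configure(); passing false keeps the axis numbering this code already relied on. A minimal sketch of the flip setup under that assumption (helper name hypothetical):

#include "arm_compute/runtime/CL/functions/CLReverse.h"

// Sketch: reverse the weights along width and height (axes stored in `axis`).
void configure_weight_flip(const arm_compute::CLCompileContext &ctx,
                           arm_compute::ICLTensor *weights, arm_compute::ICLTensor *flipped,
                           arm_compute::ICLTensor *axis, arm_compute::CLReverse &flip)
{
  flip.configure(ctx, weights, flipped, axis, /*use_inverted_axis=*/false);
}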

@@ -65,19 +65,6 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
 }
 } // namespace

-void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
-{
-  auto k = std::make_unique<CLTransposeKernel>();
-  k->configure(input, output);
-  _kernel = std::move(k);
-}
-
-Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
-                                                           const ITensorInfo *output)
-{
-  return CLTransposeKernel::validate(input, output);
-}
-
 CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer(
   std::shared_ptr<IMemoryManager> memory_manager)
   : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(),
@@ -245,8 +232,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
   if (!weights_reshaped)
   {
     // Validate reshape weights kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(
-      CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(weights_to_use, &reshaped_weights));
     weights_to_use = &reshaped_weights;
   }
