diff --git a/onnxruntime/core/providers/cpu/fp16/fp16_activations.h b/onnxruntime/core/providers/cpu/fp16/fp16_activations.h
index 5404a1b180b64..1a7727921877b 100644
--- a/onnxruntime/core/providers/cpu/fp16/fp16_activations.h
+++ b/onnxruntime/core/providers/cpu/fp16/fp16_activations.h
@@ -75,6 +75,9 @@ struct LeakyRelu : public ElementWiseRangedTransform<MLFloat16> {
 //   MlasTanhActivation,
 //   MlasLogisticActivation,
 //   MlasClipActivation,
+// Once it's added, please update TestNhwcConvReluClipFusion_FP16
+// in xnnpack_basic_test.cc
+// to enable output verification for the Clip activation.
 //   MlasHardSigmoidActivation,
 }  // namespace functors
diff --git a/onnxruntime/core/providers/xnnpack/detail/utils.cc b/onnxruntime/core/providers/xnnpack/detail/utils.cc
index 2adf8339b4b66..7b21916948b29 100644
--- a/onnxruntime/core/providers/xnnpack/detail/utils.cc
+++ b/onnxruntime/core/providers/xnnpack/detail/utils.cc
@@ -9,6 +9,7 @@
 #include "core/common/common.h"
 #include "core/common/safeint.h"
+#include "core/framework/float16.h"
 #include "core/framework/node_unit.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/graph/graph.h"
@@ -245,7 +246,7 @@ std::unique_ptr<IndexedSubGraph::MetaDef> FuseActivation(const NodeUnit& node_un
   const auto& activation_type = activation.OpType();
   if (activation_type == "Clip") {
-    min = std::numeric_limits<float>::min();
+    min = std::numeric_limits<float>::lowest();
     max = std::numeric_limits<float>::max();

     bool min_max_are_attributes = activation.SinceVersion() == 1 || activation.SinceVersion() == 6;
@@ -267,9 +268,19 @@ std::unique_ptr<IndexedSubGraph::MetaDef> FuseActivation(const NodeUnit& node_un
          ORT_ENFORCE(utils::HasExternalData(value) == false,
                      "External data is not supported for the scalar min/max Clip values");

-          value_to_set = utils::HasRawData(value)
-                             ? *reinterpret_cast<const float*>(value.raw_data().data())
-                             : value.float_data()[0];
+          int32_t arg_type;
+          if (GetType(arg, arg_type) && arg_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
+            // arg is of type FP16
+            value_to_set = utils::HasRawData(value)
+                               ? (*reinterpret_cast<const MLFloat16*>(value.raw_data().data())).ToFloat()
+                               : value.float_data()[0];
+          } else if (GetType(arg, arg_type) && arg_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+            value_to_set = utils::HasRawData(value)
+                               ? *reinterpret_cast<const float*>(value.raw_data().data())
+                               : value.float_data()[0];
+          } else {
+            ORT_THROW("Only FP32 and FP16 are supported for Clip activation fusion in the XNNPACK EP. Got: ", arg_type);
+          }
         }
       }
     };
diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
index 4253e36e02548..3b0d8bd9777f4 100644
--- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
@@ -1323,4 +1323,4 @@ TEST(ConvFp16Test, SharedPrepackedWeights) {
 }  // namespace test
 }  // namespace onnxruntime

-#endif  // MLAS_F16VEC_INTRINSICS_SUPPORTED
+#endif  // MLAS_F16VEC_INTRINSICS_SUPPORTED
\ No newline at end of file
diff --git a/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc b/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc
index 65db81e7f4013..56726038a163c 100644
--- a/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc
+++ b/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc
@@ -6,9 +6,11 @@
 #include "core/common/logging/logging.h"
 #include "core/common/span_utils.h"
+#include "core/framework/float16.h"
 #include "core/framework/utils.h"
 #include "core/graph/graph.h"
 #include "core/providers/xnnpack/xnnpack_execution_provider.h"
+#include "core/providers/xnnpack/xnnpack_init.h"
 #include "core/session/inference_session.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
@@ -89,6 +91,91 @@ TEST(XnnpackEP, TestNhwcConvReluClipFusion) {
   RunAndVerifyOutputsWithEP(ort_model_path, "TestNhwcConvReluClipFusion", std::move(ep), feeds, params);
 }

+#ifdef XNNPACK_FP16_SUPPORTED
+// This test can be removed once MLAS implements FP16 Clip fusion.
+// For now, TestNhwcConvReluClipFusion_FP16 skips output verification.
+TEST(XnnpackEP, TestNhwcConvReluFusion_FP16) {
+  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "nhwc_conv_relu_model_fp16.onnx";
+
+  RandomValueGenerator generator;
+  TensorShape input_shape_x{1, 16, 16, 192};
+  std::vector<MLFloat16> input_x = generator.Uniform<MLFloat16>(input_shape_x.GetDims(), -128, 128);
+
+  OrtValue ml_value_x;
+  CreateMLValue<MLFloat16>(input_shape_x.GetDims(), input_x.data(), OrtMemoryInfo(), &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("model_input", ml_value_x));
+
+  std::function<void(const Graph&)> verify = [](const Graph& graph) -> void {
+    ASSERT_EQ(graph.NumberOfNodes(), 2) << "Transpose nodes should have been removed, and "
+                                           "Conv+Relu should have been fused, leaving 2 nodes.";
+    auto node_iter = graph.Nodes().begin();
+    auto check_node = [](const Node& node, const std::string& fusion_type) {
+      const auto& attr = node.GetAttributes();
+      auto activation = attr.find("activation");
+      ASSERT_NE(activation, attr.cend()) << "Fused node should have activation attribute";
+      ASSERT_EQ(activation->second.s(), fusion_type);
+    };
+
+    ++node_iter;
+    check_node(*node_iter, "Relu");
+  };
+
+  EPVerificationParams params;
+  params.ep_node_assignment = ExpectedEPNodeAssignment::Some;
+  params.fp32_abs_err = 0.5f;
+  params.graph_verifier = &verify;
+
+  auto ep = DefaultXnnpackExecutionProvider();
+  RunAndVerifyOutputsWithEP(ort_model_path, "TestNhwcConvReluFusion_FP16", std::move(ep), feeds, params);
+};
+
+// For now, this test mainly checks whether the XNNPACK Clip fusion works.
+TEST(XnnpackEP, TestNhwcConvReluClipFusion_FP16) {
+  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "nhwc_conv_clip_relu_fp16.onnx";
+
+  RandomValueGenerator generator;
+  TensorShape input_shape_x{1, 16, 16, 192};
+  std::vector<MLFloat16> input_x = generator.Uniform<MLFloat16>(input_shape_x.GetDims(), -128, 128);
+
+  OrtValue ml_value_x;
+  CreateMLValue<MLFloat16>(input_shape_x.GetDims(), input_x.data(), OrtMemoryInfo(), &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("model_input", ml_value_x));
+
+  std::function<void(const Graph&)> verify = [](const Graph& graph) -> void {
+    ASSERT_EQ(graph.NumberOfNodes(), 3) << "Transpose nodes should have been removed, and "
+                                           "Conv+Relu and Conv+Clip should have been fused, leaving 3 nodes.";
+    auto node_iter = graph.Nodes().begin();
+    auto check_node = [](const Node& node, const std::string& fusion_type) {
+      const auto& attr = node.GetAttributes();
+      auto activation = attr.find("activation");
+      ASSERT_NE(activation, attr.cend()) << "Fused node should have activation attribute";
+      ASSERT_EQ(activation->second.s(), fusion_type);
+    };
+
+    // check the 2nd and 3rd nodes.
+    // the first node is the Conv that does not get fused (created after the first call to GetCapability)
+    // the 2nd and 3rd nodes are the fused nodes (created after the second call to GetCapability)
+    ++node_iter;
+    check_node(*node_iter, "Clip");
+    ++node_iter;
+    check_node(*node_iter, "Relu");
+  };
+
+  EPVerificationParams params;
+  params.ep_node_assignment = ExpectedEPNodeAssignment::Some;
+  params.fp32_abs_err = 0.5f;
+  params.graph_verifier = &verify;
+
+  auto ep = DefaultXnnpackExecutionProvider();
+  // So far, the CPU EP doesn't support FP16 Conv fusion, so verify_outputs is skipped.
+  RunAndVerifyOutputsWithEP(ort_model_path, "TestNhwcConvReluClipFusion_FP16", std::move(ep), feeds, params, {}, false);
+}
+#endif
+
 // test we can share the cpu ep allocator with the xnnpack EP
 TEST(XnnpackEP, TestAllocatorSharing) {
   auto init_session = [](std::vector<std::shared_ptr<IExecutionProvider>>& eps,
diff --git a/onnxruntime/test/testdata/nhwc_conv_clip_relu_fp16.onnx b/onnxruntime/test/testdata/nhwc_conv_clip_relu_fp16.onnx
new file mode 100644
index 0000000000000..1621d8b17afd5
Binary files /dev/null and b/onnxruntime/test/testdata/nhwc_conv_clip_relu_fp16.onnx differ
diff --git a/onnxruntime/test/testdata/nhwc_conv_relu_model_fp16.onnx b/onnxruntime/test/testdata/nhwc_conv_relu_model_fp16.onnx
new file mode 100644
index 0000000000000..872adfde8d008
Binary files /dev/null and b/onnxruntime/test/testdata/nhwc_conv_relu_model_fp16.onnx differ
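
Note (not part of the diff): the utils.cc change above reads a scalar Clip min/max initializer and, when the model is FP16, reinterprets the 2 raw bytes as an MLFloat16 before widening to float for the fused clamp. The following is a minimal standalone sketch of that idea; the ReadScalarClipBound name is hypothetical and only raw_data-backed scalars are handled.

// Illustrative sketch only; mirrors the FP16/FP32 branches added to FuseActivation.
#include <cstring>

#include "core/framework/float16.h"    // onnxruntime::MLFloat16
#include "core/graph/onnx_protobuf.h"  // ONNX_NAMESPACE::TensorProto

inline float ReadScalarClipBound(const ONNX_NAMESPACE::TensorProto& value) {
  if (value.data_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
    // FP16 scalar: 2 bytes of IEEE half bits in raw_data, widened to float so the
    // same clamp limits can be passed to XNNPACK regardless of the model precision.
    onnxruntime::MLFloat16 half_value;
    std::memcpy(&half_value, value.raw_data().data(), sizeof(half_value));
    return half_value.ToFloat();
  }
  // FP32 scalar: 4 raw bytes holding the float directly.
  float float_value = 0.0f;
  std::memcpy(&float_value, value.raw_data().data(), sizeof(float_value));
  return float_value;
}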