From 6de5ead7da2e997071fafaeb9d870be1f5fb429a Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Tue, 7 Jan 2025 19:19:07 +0800
Subject: [PATCH 01/57] [Paddle TensorRT No.6-7] Add pd_op.affine_channel,
 pd_op.numel converter (#70507)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix

* fix

* fix

* Add opt_shape support in converter.py

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix
---
 .../new_executor/collect_shape_manager.cc     | 69 ++++++++++-------
 .../transforms/tensorrt/trt_op_marker_pass.cc | 30 ++++++++
 python/paddle/tensorrt/converter.py           |  1 -
 python/paddle/tensorrt/export.py              |  5 +-
 python/paddle/tensorrt/impls/manipulation.py  | 10 +++
 python/paddle/tensorrt/impls/others.py        | 51 +++++++++++++
 python/paddle/tensorrt/util.py                |  9 ++-
 test/tensorrt/tensorrt_test_base.py           | 27 ++++++-
 test/tensorrt/test_converter_activation.py    | 22 ++++++
 test/tensorrt/test_converter_attribute.py     |  2 +
 test/tensorrt/test_converter_common.py        | 11 +++
 test/tensorrt/test_converter_conv.py          | 12 +++
 test/tensorrt/test_converter_creation.py      | 11 +++
 test/tensorrt/test_converter_input.py         |  2 +
 test/tensorrt/test_converter_linalg.py        |  9 ++-
 test/tensorrt/test_converter_logic.py         | 29 ++++++++
 test/tensorrt/test_converter_manipulation.py  | 74 ++++++++++++++++++-
 test/tensorrt/test_converter_math.py          | 33 +++++++++
 test/tensorrt/test_converter_norm.py          |  4 +
 test/tensorrt/test_converter_ops.py           |  2 +
 test/tensorrt/test_converter_others.py        | 66 ++++++++++++++++-
 test/tensorrt/test_converter_pooling.py       |  9 ++-
 test/tensorrt/test_converter_search.py        | 12 +++
 test/tensorrt/test_converter_stat.py          |  2 +
 test/tensorrt/test_converter_vision.py        |  1 +
 25 files changed, 467 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/collect_shape_manager.cc b/paddle/fluid/framework/new_executor/collect_shape_manager.cc
index 02c4aaae5dfe5c..053a4055779b95 100644
--- a/paddle/fluid/framework/new_executor/collect_shape_manager.cc
+++ b/paddle/fluid/framework/new_executor/collect_shape_manager.cc
@@ -34,7 +34,9 @@ void CollectShapeManager::CollectShapeInfo(
     auto *var = scope->FindVar(var_name);
     if (!var || !var->IsType<phi::DenseTensor>()) continue;
     auto tensor = var->Get<phi::DenseTensor>();
-    if (!tensor.initialized()) continue;
+    if (!tensor.initialized() && !instr->NoNeedBuffer().count(input.first)) {
+      continue;
+    }
     paddle::platform::DeviceContextPool &pool =
         paddle::platform::DeviceContextPool::Instance();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -124,36 +126,53 @@ void CollectShapeManager::StatisticShapeRangeInfo() {
   for (auto const &it : shape_data) {
     auto val = it.first;
     auto shapes = it.second;
+
     std::vector<int32_t> min_shape(shapes[0].begin(), shapes[0].end());
     std::vector<int32_t> max_shape(shapes[0].begin(), shapes[0].end());
     std::vector<int32_t> opt_shape(shapes[0].begin(), shapes[0].end());
+
+    // Applicable to scenarios where min/opt/max are explicitly specified.
+    if (shapes.size() == 3) {
+      for (size_t d = 0; d < shapes[0].size(); ++d) {
+        std::vector<int32_t> dim_values;
+        for (const auto &shape : shapes) {
+          dim_values.push_back(shape[d]);
+        }
+        std::sort(dim_values.begin(), dim_values.end());
+        min_shape[d] = dim_values[0];
+        opt_shape[d] = dim_values[1];
+        max_shape[d] = dim_values[2];
+      }
+      min_data[val] = min_shape;
+      max_data[val] = max_shape;
+      opt_data[val] = opt_shape;
+    } else {
+      // Suitable for scenarios where shapes are automatically collected.
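+      // Hypothetical worked example (editorial, not from the original
+      // patch): if dimension d was recorded as {1, 4, 4} across runs, the
+      // code below yields min_shape[d] = 1, max_shape[d] = 4, and
+      // opt_shape[d] = 4, the most frequent value returned by ShapeMaxFreq.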
+      auto ShapeMaxFreq =
+          [](const std::map<int32_t, int32_t> &m) -> int32_t {
+        std::vector<std::pair<int32_t, int32_t>> counter;
+        for (auto &it : m) counter.emplace_back(it);
+        std::sort(counter.begin(),
+                  counter.end(),
+                  [](std::pair<int32_t, int32_t> &a,
+                     std::pair<int32_t, int32_t> &b) {
+                    return a.second > b.second;
+                  });
+        return counter[0].first;
+      };
-    auto ShapeMaxFreq =
-        [](const std::map<int32_t, int32_t> &m) -> int32_t {
-      std::vector<std::pair<int32_t, int32_t>> counter;
-      for (auto &it : m) counter.emplace_back(it);
-      std::sort(counter.begin(),
-                counter.end(),
-                [](std::pair<int32_t, int32_t> &a,
-                   std::pair<int32_t, int32_t> &b) {
-                  return a.second > b.second;
-                });
-      return counter[0].first;
-    };
-
-    for (size_t d = 0; d < shapes[0].size(); ++d) {
-      std::map<int32_t, int32_t> counter;
-      for (auto &shape : shapes) {
-        counter[shape[d]] += 1;
-        if (shape[d] < min_shape[d]) min_shape[d] = shape[d];
-        if (shape[d] > max_shape[d]) max_shape[d] = shape[d];
+      for (size_t d = 0; d < shapes[0].size(); ++d) {
+        std::map<int32_t, int32_t> counter;
+        for (auto &shape : shapes) {
+          counter[shape[d]] += 1;
+          if (shape[d] < min_shape[d]) min_shape[d] = shape[d];
+          if (shape[d] > max_shape[d]) max_shape[d] = shape[d];
+        }
+        opt_shape[d] = ShapeMaxFreq(counter);
       }
-      opt_shape[d] = ShapeMaxFreq(counter);
+      min_data[val] = min_shape;
+      max_data[val] = max_shape;
+      opt_data[val] = opt_shape;
     }
-
-    min_data[val] = min_shape;
-    max_data[val] = max_shape;
-    opt_data[val] = opt_shape;
   }
 };
 extract_min_max_opt(min_shapes_, max_shapes_, opt_shapes_, shape_info_);
diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
index 7c2aad2caefdda..c67bd5d012973b 100644
--- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
+++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
@@ -94,6 +94,7 @@ DEFINE_GENERAL_PATTERN(Flip, paddle::dialect::FlipOp)
 DEFINE_GENERAL_PATTERN(Mish, paddle::dialect::MishOp)
 DEFINE_GENERAL_PATTERN(AssignValue, paddle::dialect::AssignValueOp)
 DEFINE_GENERAL_PATTERN(AssignValue_, paddle::dialect::AssignValue_Op)
+DEFINE_GENERAL_PATTERN(Numel, paddle::dialect::NumelOp)
 #undef DEFINE_GENERAL_PATTERN

 // Add ReduceCommonOpPattern base class to simplify code
@@ -2191,6 +2192,33 @@ class InstanceNormOpPattern
   }
 };

+class AffineChannelOpPattern
+    : public pir::OpRewritePattern<paddle::dialect::AffineChannelOp> {
+ public:
+  using pir::OpRewritePattern<
+      paddle::dialect::AffineChannelOp>::OpRewritePattern;
+  bool MatchAndRewrite(paddle::dialect::AffineChannelOp op,
+                       pir::PatternRewriter &rewriter) const override {
+    if (op->HasAttribute(kCanRunTrtAttr) &&
+        op->attribute<pir::BoolAttribute>(kCanRunTrtAttr).data()) {
+      return false;
+    }
+    if (!op->HasAttribute("data_layout")) {
+      VLOG(3) << "pd_op.affine_channel must have a data_layout attribute";
+      return false;
+    }
+    pir::Value x = op.operand_source(0);
+    auto x_shape = pir::GetShapeFromValue(x);
+    if (x_shape.size() == 2) {
+      VLOG(3) << "the rank of x in pd_op.affine_channel cannot be 2";
+      return false;
+    }
+
+    op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true));
+    return true;
+  }
+};
+
 class TrtOpMarkerPass : public pir::PatternRewritePass {
  public:
   TrtOpMarkerPass() : pir::PatternRewritePass("trt_op_marker_pass", 2) {}
@@ -2245,6 +2273,7 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ADD_PATTERN(Mish)
     ADD_PATTERN(AssignValue)
     ADD_PATTERN(AssignValue_)
+    ADD_PATTERN(Numel)
 #if IS_TRT_VERSION_GE(8600)
     ADD_PATTERN(Layer_norm)
 #endif
@@ -2321,6 +2350,7 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
+    ps.Add(std::make_unique<AffineChannelOpPattern>(context));
     return ps;
   }
 };
diff --git 
a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py index 2fe7416a646623..cab46618c4c0ee 100644 --- a/python/paddle/tensorrt/converter.py +++ b/python/paddle/tensorrt/converter.py @@ -294,7 +294,6 @@ def convert_subgraph_to_trt(self, program, group_op): max_shape = get_value_shape_range_info( value, False, paddle.base.core.ShapeMode.kMAX ) - if trt_input.is_shape_tensor: min_value = get_value_shape_range_info( value, True, paddle.base.core.ShapeMode.kMIN diff --git a/python/paddle/tensorrt/export.py b/python/paddle/tensorrt/export.py index 1a36ce3aff74d9..044f58f0041908 100644 --- a/python/paddle/tensorrt/export.py +++ b/python/paddle/tensorrt/export.py @@ -248,17 +248,20 @@ def convert_to_trt(program, trt_config, scope): with paddle.pir_utils.IrGuard(): min_shape_feed = {} max_shape_feed = {} + opt_shape_feed = {} for i, input_instance in enumerate(trt_config.inputs): # get fake inputs - min_data, _, max_data = input_instance.generate_input_data() + min_data, opt_data, max_data = input_instance.generate_input_data() program_with_output = program.list_vars()[-1] min_shape_feed[feed_name[i]] = min_data + opt_shape_feed[feed_name[i]] = opt_data max_shape_feed[feed_name[i]] = max_data # run warmup for collecting shape program = warmup_shape_infer( program, min_shape_feed=min_shape_feed, + opt_shape_feed=opt_shape_feed, max_shape_feed=max_shape_feed, scope=scope, ) diff --git a/python/paddle/tensorrt/impls/manipulation.py b/python/paddle/tensorrt/impls/manipulation.py index 3ec25417953fb1..8f005518d618c7 100644 --- a/python/paddle/tensorrt/impls/manipulation.py +++ b/python/paddle/tensorrt/impls/manipulation.py @@ -947,3 +947,13 @@ def roll_converter(network, paddle_op, inputs): ) return layer.get_output(0) + + +@converter_registry.register("pd_op.numel", trt_version="8.x") +def numel_converter(network, paddle_op, inputs): + input_tensor = inputs[0] + shape_tensor = network.add_shape(input_tensor).get_output(0) + layer = network.add_reduce( + shape_tensor, trt.ReduceOperation.PROD, axes=1, keep_dims=False + ) + return layer.get_output(0) diff --git a/python/paddle/tensorrt/impls/others.py b/python/paddle/tensorrt/impls/others.py index da386091ebcf92..f2f571f6953129 100644 --- a/python/paddle/tensorrt/impls/others.py +++ b/python/paddle/tensorrt/impls/others.py @@ -301,3 +301,54 @@ def share_data_converter(network, paddle_op, inputs): identity_layer = network.add_identity(x) return identity_layer.get_output(0) + + +@converter_registry.register("pd_op.affine_channel", trt_version="8.x") +def affine_channel_converter(network, paddle_op, inputs): + x, scale_weights, bias_weights = inputs + data_layout = paddle_op.attrs().get("data_layout") + + if data_layout == "NCHW": + channel_axis = 1 + x_input = x + elif data_layout == "NHWC": + # Permute NHWC to NCHW + shuffle_layer1 = network.add_shuffle(x) + shuffle_layer1.first_transpose = (0, 3, 1, 2) + x_input = shuffle_layer1.get_output(0) + channel_axis = 1 + else: + raise ValueError(f"affine_channel: Unsupported layout: {data_layout}") + + if not isinstance(scale_weights, trt.Weights): + raise TypeError("affine_channel requires scale as trt.Weights") + if not isinstance(bias_weights, trt.Weights): + raise TypeError("affine_channel requires bias as trt.Weights") + + if scale_weights.size != bias_weights.size: + raise ValueError( + f"affine_channel: scale.size({scale_weights.size}) != bias.size({bias_weights.size})" + ) + + power_array = np.ones((scale_weights.size,), dtype=np.float32) + power_weights = trt.Weights(power_array) + 
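+    # Editorial note (an assumption based on TensorRT's IScaleLayer
+    # semantics, not stated in this patch): in CHANNEL mode the layer
+    # computes (x * scale + shift) ** power per channel, so the all-ones
+    # power above reduces it to x * scale + bias, which is exactly the
+    # affine_channel definition.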
+ layer = network.add_scale_nd( + input=x_input, + mode=trt.ScaleMode.CHANNEL, + shift=bias_weights, + scale=scale_weights, + power=power_weights, + channel_axis=channel_axis, + ) + if not layer: + raise RuntimeError("affine_channel: add_scale_nd failed.") + + out_tensor = layer.get_output(0) + + if data_layout == "NHWC": + shuffle_layer2 = network.add_shuffle(out_tensor) + shuffle_layer2.first_transpose = (0, 2, 3, 1) + out_tensor = shuffle_layer2.get_output(0) + + return out_tensor diff --git a/python/paddle/tensorrt/util.py b/python/paddle/tensorrt/util.py index cba02fb3997622..fbabef8c6178d5 100644 --- a/python/paddle/tensorrt/util.py +++ b/python/paddle/tensorrt/util.py @@ -94,7 +94,9 @@ def predict_program(program, feed_data, fetch_var_list, scope=None): return output -def warmup_shape_infer(program, min_shape_feed, max_shape_feed, scope=None): +def warmup_shape_infer( + program, min_shape_feed, opt_shape_feed, max_shape_feed, scope=None +): paddle.framework.set_flags({"FLAGS_enable_collect_shape": True}) with paddle.pir_utils.IrGuard(): with paddle.static.program_guard(program): @@ -103,6 +105,9 @@ def warmup_shape_infer(program, min_shape_feed, max_shape_feed, scope=None): for _ in range(1): executor.run(program, feed=min_shape_feed, scope=scope) + for _ in range(1): + executor.run(program, feed=opt_shape_feed, scope=scope) + # Run the program with input_data_max_shape (fake max_shape input) for _ in range(1): executor.run(program, feed=max_shape_feed, scope=scope) @@ -120,6 +125,7 @@ def warmup_shape_infer(program, min_shape_feed, max_shape_feed, scope=None): ) ) paddle.framework.set_flags({"FLAGS_enable_collect_shape": False}) + return exe_program @@ -192,6 +198,7 @@ def weight_to_tensor(network, paddle_value, trt_tensor, use_op_name): "pd_op.batch_norm_", "pd_op.layer_norm", "pd_op.depthwise_conv2d_transpose", + "pd_op.affine_channel", ] if use_op_name in forbid_cast_op: return trt_tensor diff --git a/test/tensorrt/tensorrt_test_base.py b/test/tensorrt/tensorrt_test_base.py index 03e51b100fcbaa..a8fc090d00bb00 100755 --- a/test/tensorrt/tensorrt_test_base.py +++ b/test/tensorrt/tensorrt_test_base.py @@ -39,6 +39,7 @@ def __init__(self, methodName='runTest'): self.api_args = None self.program_config = None self.min_shape = None + self.opt_shape = None self.max_shape = None self.target_marker_op = "" self.dynamic_shape_data = {} @@ -62,6 +63,7 @@ def create_fake_program(self): ].items(): if ( feed_name in self.min_shape.keys() + and feed_name in self.opt_shape.keys() and feed_name in self.max_shape.keys() ): input_shape_without_dynamic_dim = ( @@ -89,11 +91,15 @@ def create_fake_program(self): api_args[feed_name] = new_list_args else: empty_min_max_shape = ( - self.min_shape is None or self.max_shape is None + self.min_shape is None + or self.max_shape is None + or self.opt_shape is None ) + if ( not empty_min_max_shape and feed_name in self.min_shape.keys() + and feed_name in self.opt_shape.keys() and feed_name in self.max_shape.keys() ): # dynamic shape condition @@ -181,6 +187,7 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): output_expected = self.run_program(main_program, fetch_list) min_shape_data = dict() # noqa: C408 + opt_shape_data = dict() # noqa: C408 max_shape_data = dict() # noqa: C408 for feed_name in self.program_config["feed_list"]: if self.api_args[feed_name] is None: @@ -190,11 +197,13 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): if ( feed_name not in self.min_shape.keys() and feed_name not in 
self.max_shape.keys() + and feed_name not in self.opt_shape.keys() ): for sub_feed_name, sub_feed_value in self.api_args[ feed_name ].items(): min_shape_data[sub_feed_name] = sub_feed_value + opt_shape_data[sub_feed_name] = sub_feed_value max_shape_data[sub_feed_name] = sub_feed_value continue else: @@ -206,6 +215,11 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): ).astype( self.api_args[feed_name][sub_feed_name].dtype ) + opt_shape_data[sub_feed_name] = np.random.randn( + *self.opt_shape[feed_name][i] + ).astype( + self.api_args[feed_name][sub_feed_name].dtype + ) max_shape_data[sub_feed_name] = np.random.randn( *self.max_shape[feed_name][i] ).astype( @@ -216,8 +230,10 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): if ( feed_name not in self.min_shape.keys() and feed_name not in self.max_shape.keys() + and feed_name not in self.opt_shape.keys() ): min_shape_data[feed_name] = self.api_args[feed_name] + opt_shape_data[feed_name] = self.api_args[feed_name] max_shape_data[feed_name] = self.api_args[feed_name] continue else: @@ -225,6 +241,9 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): min_shape_data[feed_name] = self.dynamic_shape_data[ feed_name ](self.min_shape[feed_name]) + opt_shape_data[feed_name] = self.dynamic_shape_data[ + feed_name + ](self.opt_shape[feed_name]) max_shape_data[feed_name] = self.dynamic_shape_data[ feed_name ](self.max_shape[feed_name]) @@ -232,6 +251,9 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): min_shape_data[feed_name] = np.random.randn( *self.min_shape[feed_name] ).astype(self.api_args[feed_name].dtype) + opt_shape_data[feed_name] = np.random.randn( + *self.opt_shape[feed_name] + ).astype(self.api_args[feed_name].dtype) max_shape_data[feed_name] = np.random.randn( *self.max_shape[feed_name] ).astype(self.api_args[feed_name].dtype) @@ -239,6 +261,7 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): main_program = warmup_shape_infer( main_program, min_shape_feed=min_shape_data, + opt_shape_feed=opt_shape_data, max_shape_feed=max_shape_data, scope=scope, ) @@ -262,7 +285,7 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): input = Input( min_input_shape=self.min_shape, - optim_input_shape=self.min_shape, + optim_input_shape=self.opt_shape, max_input_shape=self.max_shape, ) trt_config = TensorRTConfig(inputs=[input]) diff --git a/test/tensorrt/test_converter_activation.py b/test/tensorrt/test_converter_activation.py index 268dc1e592e073..2e95b50e20e95e 100644 --- a/test/tensorrt/test_converter_activation.py +++ b/test/tensorrt/test_converter_activation.py @@ -29,6 +29,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [1]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -44,6 +45,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [1]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -58,6 +60,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [1, 3], "y": [1, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -72,6 +75,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [1, 3], "y": [1, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def 
test_trt_result(self): @@ -84,6 +88,7 @@ def setUp(self): self.api_args = {"x": np.random.randn(3).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [1]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -96,6 +101,7 @@ def setUp(self): self.api_args = {"x": np.random.randn(3).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -111,6 +117,7 @@ def setUp(self): self.api_args = {"x": np.random.randn(3).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [1]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -125,6 +132,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -139,6 +147,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -153,6 +162,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -167,6 +177,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -184,6 +195,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -198,6 +210,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -212,6 +225,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result_fp16(self): @@ -231,6 +245,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -246,6 +261,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -261,6 +277,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -275,6 +292,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -289,6 +307,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -303,6 +322,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -317,6 +337,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4, 2]} 
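+        # TensorRT requires min_shape <= opt_shape <= max_shape element-wise
+        # for every dynamic input.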
+ self.opt_shape = {"x": [2, 3, 4, 2]} self.max_shape = {"x": [5, 3, 4, 2]} def test_trt_result(self): @@ -331,6 +352,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_attribute.py b/test/tensorrt/test_converter_attribute.py index ff4defcf70187a..cdb647857804a2 100644 --- a/test/tensorrt/test_converter_attribute.py +++ b/test/tensorrt/test_converter_attribute.py @@ -28,6 +28,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -42,6 +43,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_common.py b/test/tensorrt/test_converter_common.py index dce25797b26e09..58f65e49b31802 100644 --- a/test/tensorrt/test_converter_common.py +++ b/test/tensorrt/test_converter_common.py @@ -44,6 +44,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [1, 2, 3]} self.max_shape = {"x": [10, 2, 3]} def test_trt_result(self): @@ -60,6 +61,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [1, 2, 3]} self.max_shape = {"x": [10, 2, 3]} def test_trt_result(self): @@ -126,6 +128,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -155,6 +158,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 6, 10, 3]} + self.opt_shape = {"x": [2, 6, 10, 3]} self.max_shape = {"x": [12, 6, 10, 3]} def test_trt_result(self): @@ -182,6 +186,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "OutSize"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -214,6 +219,7 @@ def setUp(self): "feed_list": ["x", "OutSize", "SizeTensor", "Scale"] } self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -243,6 +249,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 6, 10, 3]} + self.opt_shape = {"x": [2, 6, 10, 3]} self.max_shape = {"x": [12, 6, 10, 3]} def test_trt_result(self): @@ -274,6 +281,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "SizeTensor"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -302,6 +310,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -314,6 +323,7 @@ def setUp(self): self.api_args = {"x": np.random.random([2, 3, 6, 10]).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -331,6 +341,7 @@ def setUp(self): 
self.api_args = {"x": np.random.random([2, 3, 6, 10]).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_conv.py b/test/tensorrt/test_converter_conv.py index 4dd17c977caf88..4c6d5c0d212341 100644 --- a/test/tensorrt/test_converter_conv.py +++ b/test/tensorrt/test_converter_conv.py @@ -39,6 +39,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 8, 8]} + self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} def test_trt_result_fp16(self): @@ -58,6 +59,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 8, 8]} + self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} def test_trt_result(self): @@ -75,6 +77,7 @@ def setUp(self): self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 8, 8]} + self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} def test_trt_result(self): @@ -132,6 +135,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 5, 5]} + self.opt_shape = {"x": [2, 3, 5, 5]} self.max_shape = {"x": [4, 3, 5, 5]} def test_trt_result(self): @@ -154,6 +158,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 5, 5]} + self.opt_shape = {"x": [2, 3, 5, 5]} self.max_shape = {"x": [4, 3, 5, 5]} def test_trt_result(self): @@ -176,6 +181,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 5, 5]} + self.opt_shape = {"x": [2, 3, 5, 5]} self.max_shape = {"x": [4, 3, 5, 5]} @@ -205,6 +211,7 @@ def setUp(self): self.api_args = {"x": np.random.random([3, 2, 8, 8]).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -221,6 +228,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -238,6 +246,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -262,6 +271,7 @@ def setUp(self): self.api_args = {"x": np.random.random([3, 2, 8, 8]).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -279,6 +289,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -295,6 +306,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_creation.py b/test/tensorrt/test_converter_creation.py index 1478a67cc8bbf6..8c1623d1a2ebad 100644 --- a/test/tensorrt/test_converter_creation.py +++ b/test/tensorrt/test_converter_creation.py @@ -27,6 +27,7 @@ def setUp(self): self.api_args = 
{"shape": [3, 2], "fill_value": 1.0} self.program_config = {"feed_list": []} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): @@ -41,6 +42,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [3, 2]} def test_trt_result(self): @@ -96,6 +98,7 @@ def test_trt_result(self): self.api_args = api_args self.program_config = {"feed_list": ["x"]} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} self.check_trt_result() @@ -110,6 +113,7 @@ def setUp(self): } self.program_config = {"feed_list": []} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): @@ -125,6 +129,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "output"]} self.min_shape = {"x": [1, 2], "output": [1, 2]} + self.opt_shape = {"x": [2, 2], "output": [2, 2]} self.max_shape = {"x": [3, 2], "output": [3, 2]} def test_trt_result(self): @@ -140,6 +145,7 @@ def setUp(self): } self.program_config = {"feed_list": ["input"]} self.min_shape = {"input": [1, 2]} + self.opt_shape = {"input": [3, 2]} self.max_shape = {"input": [5, 2]} def test_trt_result(self): @@ -155,6 +161,7 @@ def setUp(self): } self.program_config = {"feed_list": ["input"]} self.min_shape = {"input": [1, 2]} + self.opt_shape = {"input": [3, 2]} self.max_shape = {"input": [5, 2]} def test_trt_result(self): @@ -170,6 +177,7 @@ def setUp(self): } self.program_config = {"feed_list": ["input"]} self.min_shape = {"input": [1, 2]} + self.opt_shape = {"input": [3, 2]} self.max_shape = {"input": [5, 2]} def test_trt_result(self): @@ -185,6 +193,7 @@ def setUp(self): } self.program_config = {"feed_list": ["input", "fill_value"]} self.min_shape = {"input": [1, 2]} + self.opt_shape = {"input": [3, 2]} self.max_shape = {"input": [5, 2]} def test_trt_result(self): @@ -201,6 +210,7 @@ def setUp(self): } self.program_config = {"feed_list": ["value", "shape"]} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): @@ -217,6 +227,7 @@ def setUp(self): } self.program_config = {"feed_list": ["value"]} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_input.py b/test/tensorrt/test_converter_input.py index 945ff2133efd1b..c4f0254c8b4dcb 100644 --- a/test/tensorrt/test_converter_input.py +++ b/test/tensorrt/test_converter_input.py @@ -35,6 +35,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "num_classes"]} self.min_shape = {"x": [1, 1]} + self.opt_shape = {"x": [3, 1]} self.max_shape = {"x": [6, 1]} def test_trt_result(self): @@ -58,6 +59,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1]} + self.opt_shape = {"x": [3, 1]} self.max_shape = {"x": [6, 1]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_linalg.py b/test/tensorrt/test_converter_linalg.py index 28162d1da0359b..910ffffcdd5448 100644 --- a/test/tensorrt/test_converter_linalg.py +++ b/test/tensorrt/test_converter_linalg.py @@ -31,10 +31,11 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3, 2]} + self.opt_shape = {"x": [1, 3], "y": [3, 2]} self.max_shape = {"x": [5, 3], "y": [3, 2]} def test_trt_result(self): - self.check_trt_result() + self.check_trt_result(rtol=1e-3, atol=1e-3) class TestTransposeTRTPattern(TensorRTBaseTest): @@ -46,6 +47,7 @@ def setUp(self): } self.program_config = 
{"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -61,6 +63,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 2, 3], "y": [1, 3, 2]} + self.opt_shape = {"x": [1, 2, 3], "y": [1, 3, 2]} self.max_shape = {"x": [5, 2, 3], "y": [5, 3, 2]} def test_trt_result(self): @@ -76,6 +79,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -91,6 +95,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -106,6 +111,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -121,6 +127,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_logic.py b/test/tensorrt/test_converter_logic.py index cfc3375c3896c0..30c920c4137439 100644 --- a/test/tensorrt/test_converter_logic.py +++ b/test/tensorrt/test_converter_logic.py @@ -29,6 +29,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result(self): @@ -44,6 +45,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -59,6 +61,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result(self): @@ -74,6 +77,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -89,6 +93,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -104,6 +109,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -119,6 +125,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -134,6 +141,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -149,6 +157,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def 
test_trt_result_fp16(self): @@ -167,6 +176,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 5], "y": [1, 5]} + self.opt_shape = {"x": [2, 5], "y": [1, 5]} self.max_shape = {"x": [10, 5], "y": [1, 5]} def test_trt_result_fp16(self): @@ -185,6 +195,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result_fp16(self): @@ -203,6 +214,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [2, 1], "y": [2, 3]} + self.opt_shape = {"x": [2, 1], "y": [2, 3]} self.max_shape = {"x": [2, 1], "y": [2, 3]} def test_trt_result_fp16(self): @@ -221,6 +233,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result_fp16(self): @@ -238,6 +251,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result_fp16(self): @@ -255,6 +269,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result_fp16(self): @@ -275,6 +290,7 @@ def test_trt_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} self.check_trt_result() @@ -285,6 +301,7 @@ def test_trt_diff_shape_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [4, 3], "y": [3]} self.check_trt_result() @@ -298,6 +315,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result(self): @@ -313,6 +331,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result(self): @@ -327,6 +346,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -341,6 +361,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result_fp16(self): @@ -375,6 +396,7 @@ def test_trt_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} self.check_trt_result() @@ -385,6 +407,7 @@ def test_trt_diff_shape_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [4, 3], "y": [3]} self.check_trt_result() @@ -414,6 +437,7 @@ def test_trt_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} 
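+        # With this patch, check_trt_result warms up the program with
+        # min/opt/max feeds so shape collection records all three shapes.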
self.check_trt_result() @@ -424,6 +448,7 @@ def test_trt_diff_shape_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [4, 3], "y": [3]} self.check_trt_result() @@ -450,6 +475,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [2, 3]} def test_trt_result(self): @@ -462,6 +488,7 @@ def setUp(self): self.api_args = {"x": np.random.random([2]).astype("bool")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [2]} def test_trt_result(self): @@ -479,6 +506,7 @@ def test_trt_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} self.check_trt_result() @@ -489,6 +517,7 @@ def test_trt_diff_shape_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [4, 3], "y": [3]} self.check_trt_result() diff --git a/test/tensorrt/test_converter_manipulation.py b/test/tensorrt/test_converter_manipulation.py index 6b38f0b91a5b09..595091a928bb82 100644 --- a/test/tensorrt/test_converter_manipulation.py +++ b/test/tensorrt/test_converter_manipulation.py @@ -30,6 +30,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [5, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): @@ -45,6 +46,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [5, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): @@ -60,6 +62,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [5, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): @@ -79,6 +82,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [[1, 3], [1, 3], [1, 2]]} + self.opt_shape = {"x": [[5, 3], [5, 3], [5, 2]]} self.max_shape = {"x": [[5, 3], [5, 3], [5, 2]]} def test_trt_result(self): @@ -95,6 +99,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 1, 19]} + self.opt_shape = {"x": [10, 1, 1, 19]} self.max_shape = {"x": [10, 1, 1, 19]} def test_trt_result(self): @@ -110,6 +115,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [6, 3]} self.max_shape = {"x": [6, 3]} def test_trt_result(self): @@ -125,6 +131,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "shape"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [6, 3]} self.max_shape = {"x": [6, 3]} def test_trt_result(self): @@ -148,6 +155,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 6, 64, 64]} + self.opt_shape = {"x": [4, 6, 64, 64]} self.max_shape = {"x": [8, 6, 64, 64]} def test_trt_result(self): @@ -163,6 +171,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): @@ -180,6 +189,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 6, 64, 64]} + self.opt_shape = {"x": [4, 6, 64, 64]} 
self.max_shape = {"x": [8, 6, 64, 64]} def test_trt_result(self): @@ -197,6 +207,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [4, 3]} self.max_shape = {"x": [4, 3]} def test_trt_result(self): @@ -214,6 +225,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "starts", "ends"]} self.min_shape = {"x": [3, 4, 5, 6]} + self.opt_shape = {"x": [6, 4, 5, 6]} self.max_shape = {"x": [6, 4, 5, 6]} def test_trt_result(self): @@ -230,6 +242,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [3, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -246,6 +259,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "axis"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [3, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -262,7 +276,8 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "axis"]} self.min_shape = {"x": [1, 2]} - self.max_shape = {"x": [1, 2]} + self.opt_shape = {"x": [1, 2]} + self.max_shape = {"x": [3, 2]} def test_trt_result(self): self.check_trt_result() @@ -278,6 +293,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -294,6 +310,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -310,6 +327,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "axis"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -330,6 +348,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "num_or_sections"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -346,6 +365,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "num_or_sections", "axis"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -365,6 +385,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [[1, 2], [1, 2], [1, 2]]} + self.opt_shape = {"x": [[2, 2], [2, 2], [2, 2]]} self.max_shape = {"x": [[3, 2], [3, 2], [3, 2]]} def test_trt_result(self): @@ -384,6 +405,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [[1, 2], [1, 2], [1, 2]]} + self.opt_shape = {"x": [[2, 2], [2, 2], [2, 2]]} self.max_shape = {"x": [[3, 2], [3, 2], [3, 2]]} def test_trt_result(self): @@ -399,6 +421,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [2, 2, 3]} def test_trt_result(self): @@ -414,6 +437,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "repeat_times"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [2, 2, 3]} def test_trt_result(self): @@ -429,6 +453,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [2, 2, 3]} def test_trt_result(self): @@ -447,6 +472,7 @@ def setUp(self): } 
self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -465,6 +491,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -483,6 +510,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -501,7 +529,8 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 56, 56, 128]}
-        self.max_shape = {"x": [1, 56, 56, 128]}
+        self.opt_shape = {"x": [3, 56, 56, 128]}
+        self.max_shape = {"x": [3, 56, 56, 128]}
 
     def test_trt_result(self):
         self.check_trt_result()
@@ -522,7 +551,8 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 56, 56, 128]}
-        self.max_shape = {"x": [1, 56, 56, 128]}
+        self.opt_shape = {"x": [3, 56, 56, 128]}
+        self.max_shape = {"x": [3, 56, 56, 128]}
 
     def test_trt_result(self):
         self.check_trt_result()
@@ -538,6 +568,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -554,6 +585,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -570,6 +602,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x", "shift"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -585,6 +618,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 1, 28]}
+        self.opt_shape = {"x": [2, 1, 28]}
         self.max_shape = {"x": [5, 1, 28]}
 
     def test_trt_result(self):
@@ -600,11 +634,45 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 1, 28]}
+        self.opt_shape = {"x": [2, 1, 28]}
         self.max_shape = {"x": [5, 1, 28]}
 
     def test_trt_result(self):
         self.check_trt_result()
 
+
+class TestNumelTRTCase1Pattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.numel
+        self.api_args = {
+            "x": np.random.randn(2, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [1, 3]}
+        self.opt_shape = {"x": [2, 3]}
+        self.max_shape = {"x": [5, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_fp16_result(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestNumelTRTCase2Pattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.numel
+        self.api_args = {
+            "x": np.random.randn(1, 2, 33, 33).astype("int64"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [1, 2, 33, 33]}
+        self.opt_shape = {"x": [2, 2, 33, 33]}
+        self.max_shape = {"x": [5, 2, 33, 33]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/tensorrt/test_converter_math.py b/test/tensorrt/test_converter_math.py
index cdb9858f738d07..3783615ddbde1c 100644
--- a/test/tensorrt/test_converter_math.py
+++ b/test/tensorrt/test_converter_math.py
@@ -29,6 +29,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -44,6 +45,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -59,6 +61,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -74,6 +77,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -89,6 +93,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -104,6 +109,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result_fp16(self): @@ -130,6 +136,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -155,6 +162,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -170,6 +178,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -185,6 +194,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4, 6]} + self.opt_shape = {"x": [2, 4, 6]} self.max_shape = {"x": [5, 4, 6]} def test_trt_result(self): @@ -200,6 +210,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4, 6]} + self.opt_shape = {"x": [2, 4, 6]} self.max_shape = {"x": [5, 4, 6]} def test_trt_result(self): @@ -216,6 +227,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): @@ -232,6 +244,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): @@ -248,6 +261,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): @@ -264,6 +278,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): @@ -279,6 +294,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [5, 2, 3]} def test_trt_result_fp16(self): 
@@ -297,6 +313,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [5, 2, 3]} def test_trt_result_fp16(self): @@ -315,6 +332,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [5, 2, 3]} def test_trt_result_fp16(self): @@ -333,6 +351,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [5, 2, 3]} def test_trt_result(self): @@ -348,6 +367,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -371,6 +391,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -385,6 +406,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -399,6 +421,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -419,6 +442,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -436,6 +460,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -456,6 +481,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "min", "max"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -474,6 +500,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "min", "max"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -489,6 +516,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4], "y": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4], "y": [5, 3, 4]} def test_trt_result_fp16(self): @@ -507,6 +535,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [4]} + self.opt_shape = {"x": [2, 3, 4], "y": [4]} self.max_shape = {"x": [5, 3, 4], "y": [4]} def test_trt_result_fp16(self): @@ -537,6 +566,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4], "y": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4], "y": [5, 3, 4]} def test_trt_result_fp16(self): @@ -555,6 +585,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4], "y": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4], "y": [5, 3, 4]} def test_trt_result_fp16(self): @@ -573,6 +604,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} 
self.min_shape = {"x": [1, 3, 4], "y": [4]} + self.opt_shape = {"x": [2, 3, 4], "y": [4]} self.max_shape = {"x": [5, 3, 4], "y": [4]} def test_trt_result_fp16(self): @@ -603,6 +635,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4], "y": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4], "y": [5, 3, 4]} def test_trt_result_fp16(self): diff --git a/test/tensorrt/test_converter_norm.py b/test/tensorrt/test_converter_norm.py index 9144a64386395a..d33880c73d9c21 100644 --- a/test/tensorrt/test_converter_norm.py +++ b/test/tensorrt/test_converter_norm.py @@ -33,6 +33,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [2, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -53,6 +54,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "weight", "bias"]} self.min_shape = {"x": [1, 2, 1, 3]} + self.opt_shape = {"x": [2, 2, 1, 3]} self.max_shape = {"x": [5, 2, 1, 3]} def test_trt_result(self): @@ -69,6 +71,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "weight", "bias"]} self.min_shape = {"x": [1, 2, 1]} + self.opt_shape = {"x": [2, 2, 1]} self.max_shape = {"x": [5, 2, 1]} def test_trt_result(self): @@ -85,6 +88,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "weight", "bias"]} self.min_shape = {"x": [1, 2, 1, 3]} + self.opt_shape = {"x": [2, 2, 1, 3]} self.max_shape = {"x": [5, 2, 1, 3]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_ops.py b/test/tensorrt/test_converter_ops.py index 8bc188e3e5514b..544fca80fbecc0 100644 --- a/test/tensorrt/test_converter_ops.py +++ b/test/tensorrt/test_converter_ops.py @@ -28,6 +28,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): @@ -42,6 +43,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_others.py b/test/tensorrt/test_converter_others.py index a26b5546c9a719..0c88733296f262 100644 --- a/test/tensorrt/test_converter_others.py +++ b/test/tensorrt/test_converter_others.py @@ -66,6 +66,7 @@ def setUp(self): } self.program_config = {"feed_list": ["bboxes", "scores"]} self.min_shape = {"bboxes": [1, 5, 4], "scores": [1, 4, 5]} + self.opt_shape = {"bboxes": [2, 5, 4], "scores": [2, 4, 5]} self.max_shape = {"bboxes": [3, 5, 4], "scores": [3, 4, 5]} def test_trt_result(self): @@ -170,6 +171,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -193,7 +195,8 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} - self.max_shape = {"x": [20, 2]} + self.opt_shape = {"x": [2, 2]} + self.max_shape = {"x": [5, 2]} def test_trt_result(self): self.check_marker(expected_result=False) @@ -216,6 +219,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -239,6 +243,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape 
= {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -262,6 +267,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -285,6 +291,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "starts"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -307,6 +314,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -329,6 +337,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "values"]} self.min_shape = {"x": [1, 3, 3], "values": [1, 2, 3]} + self.opt_shape = {"x": [2, 3, 3], "values": [2, 2, 3]} self.max_shape = {"x": [4, 3, 3], "values": [4, 2, 3]} def test_trt_result(self): @@ -352,6 +361,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "values"]} self.min_shape = {"x": [1, 3, 3], "values": [1, 2, 3]} + self.opt_shape = {"x": [2, 3, 3], "values": [2, 2, 3]} self.max_shape = {"x": [4, 3, 3], "values": [4, 2, 3]} def test_trt_result(self): @@ -374,6 +384,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "values"]} self.min_shape = {"x": [1, 3, 3], "values": [1, 2, 3]} + self.opt_shape = {"x": [2, 3, 3], "values": [2, 2, 3]} self.max_shape = {"x": [4, 3, 3], "values": [4, 2, 3]} def test_trt_result(self): @@ -388,11 +399,64 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [4, 3, 5]} + self.opt_shape = {"x": [5, 3, 5]} self.max_shape = {"x": [6, 3, 5]} def test_trt_result(self): self.check_trt_result() +def affine_channel(x, scale_shape, bias_shape, layout): + scale = paddle.static.create_parameter( + shape=scale_shape, dtype='float32', name="scale" + ) + bias = paddle.static.create_parameter( + shape=bias_shape, dtype='float32', name="bias" + ) + return _C_ops.affine_channel(x, scale, bias, layout) + + +class TestAffineChannelTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = affine_channel + self.api_args = { + "x": np.random.random((2, 100, 3, 3)).astype("float32"), + "scale_shape": [100], + "bias_shape": [100], + "layout": "NCHW", + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 100, 3, 3]} + self.opt_shape = {"x": [2, 100, 3, 3]} + self.max_shape = {"x": [3, 100, 3, 3]} + + def test_fp32_trt_result(self): + self.check_trt_result() + + def test_fp16_trt_result(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAffineChannelCas1TRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = affine_channel + self.api_args = { + "x": np.random.random((2, 3, 3, 100)).astype("float32"), + "scale_shape": [100], + "bias_shape": [100], + "layout": "NHWC", + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 3, 3, 100]} + self.opt_shape = {"x": [2, 3, 3, 100]} + self.max_shape = {"x": [3, 3, 3, 100]} + + def test_fp32_trt_result(self): + self.check_trt_result() + + def test_fp16_trt_result(self): + self.check_trt_result(precision_mode="fp16") + + if __name__ == '__main__': unittest.main() diff --git a/test/tensorrt/test_converter_pooling.py b/test/tensorrt/test_converter_pooling.py index 32523ba4c27e96..5219b71df28d47 100644 --- a/test/tensorrt/test_converter_pooling.py +++ b/test/tensorrt/test_converter_pooling.py @@ -56,6 +56,7 @@ def 
setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [1, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -80,6 +81,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [1, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -104,6 +106,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [1, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -128,6 +131,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [1, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -152,6 +156,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 5, 5]} + self.opt_shape = {"x": [1, 1, 5, 5]} self.max_shape = {"x": [5, 1, 5, 5]} def test_trt_result(self): @@ -176,6 +181,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 16, 56, 56]} + self.opt_shape = {"x": [1, 16, 56, 56]} self.max_shape = {"x": [5, 16, 56, 56]} def test_trt_result(self): @@ -200,7 +206,8 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 5, 5]} - self.max_shape = {"x": [2, 3, 5, 5]} # 动态批次大小,宽度保持为 1 + self.opt_shape = {"x": [1, 3, 5, 5]} + self.max_shape = {"x": [2, 3, 5, 5]} def test_trt_result(self): self.check_trt_result() diff --git a/test/tensorrt/test_converter_search.py b/test/tensorrt/test_converter_search.py index 2665caee450dd5..dffb0348bd64b6 100644 --- a/test/tensorrt/test_converter_search.py +++ b/test/tensorrt/test_converter_search.py @@ -29,6 +29,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -89,6 +90,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -150,6 +152,7 @@ def setUp(self): } self.program_config = {"feed_list": ["condition", "x", "y"]} self.min_shape = {"condition": [1, 3], "x": [1, 3], "y": [1, 3]} + self.opt_shape = {"condition": [2, 3], "x": [2, 3], "y": [2, 3]} self.max_shape = {"condition": [5, 3], "x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -165,6 +168,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -180,6 +184,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -195,6 +200,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -226,6 +232,7 @@ def setUp(self): } self.program_config = {"feed_list": ["condition", "x", "y"]} self.min_shape = {"condition": [1, 3], "x": [1, 3], "y": [1, 3]} + self.opt_shape = {"condition": [2, 3], "x": [2, 3], "y": [2, 3]} self.max_shape = {"condition": [5, 3], "x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -241,6 +248,7 @@ def setUp(self): } 
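# TensorRT optimization profiles require min_shape <= opt_shape <= max_shape for every dimension of a dynamic input.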
self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -256,6 +264,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -272,6 +281,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -288,6 +298,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "index"]} self.min_shape = {"x": [1, 3, 3], "index": [1]} + self.opt_shape = {"x": [2, 3, 3], "index": [2]} self.max_shape = {"x": [5, 3, 3], "index": [5]} def test_trt_result_fp16(self): @@ -307,6 +318,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "index"]} self.min_shape = {"x": [1, 3, 3], "index": [1]} + self.opt_shape = {"x": [2, 3, 3], "index": [2]} self.max_shape = {"x": [5, 3, 3], "index": [5]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_stat.py b/test/tensorrt/test_converter_stat.py index 4ea43f9bbb2f6c..9a7e8d19c4cc99 100644 --- a/test/tensorrt/test_converter_stat.py +++ b/test/tensorrt/test_converter_stat.py @@ -30,6 +30,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -46,6 +47,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_vision.py b/test/tensorrt/test_converter_vision.py index 59d735311eaf6a..62b0b14f49ae7a 100644 --- a/test/tensorrt/test_converter_vision.py +++ b/test/tensorrt/test_converter_vision.py @@ -40,6 +40,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "grid"]} self.min_shape = {"x": [1, 1, 3, 3], "grid": [1, 3, 4, 2]} + self.opt_shape = {"x": [1, 1, 3, 3], "grid": [1, 3, 4, 2]} self.max_shape = {"x": [5, 1, 3, 3], "grid": [5, 3, 4, 2]} From 5718f746d945449b5bb9a8615e0b1f06aff09475 Mon Sep 17 00:00:00 2001 From: Zhou Xin Date: Tue, 7 Jan 2025 19:47:14 +0800 Subject: [PATCH 02/57] [CINN][Backend Pass Update No.10] Update ReplaceCrossThreadReduction pass (#70592) * Update replaceCrossThreadReduction * Add visit logics for IfThenElse stmt * Refine test and exception message * Leverage help function Mutate to refactor CrossThreadReductionReplacer --- paddle/cinn/ir/schedule/ir_schedule_util.cc | 12 ++ paddle/cinn/ir/schedule/ir_schedule_util.h | 7 + .../optim/replace_cross_thread_reduction.cc | 199 ++++++++++-------- .../optim/replace_cross_thread_reduction.h | 11 +- .../replace_cross_thread_reduction_test.cc | 26 ++- 5 files changed, 152 insertions(+), 103 deletions(-) diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index 19bed9130494dd..316854db08ebed 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -110,6 +110,18 @@ int GetLoopExtent(const Expr& loop) { return static_cast(loop.As()->extent.get_constant()); } +int GetLoopExtent(const ir::stmt::For loop) { + PADDLE_ENFORCE_EQ( + cinn::common::is_zero(loop->min()), + true, + ::common::errors::InvalidArgument("For node's min should be zero.")); + 
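+  // The extent must be a compile-time constant before it can be narrowed to an int.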
PADDLE_ENFORCE_EQ(loop->extent().is_constant(), + true, + ::common::errors::InvalidArgument( + "For node's extent should be constant.")); + return static_cast(loop->extent().get_constant()); +} + void SetCudaAxisInfo(ir::LoweredFunc lowered_func) { auto CannotProveLT = [](const ir::Expr& lhs, const ir::Expr& rhs) -> bool { std::vector exprs{rhs, lhs}; diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h index 7ec7e4f96f4a2c..576a7448147e6e 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.h +++ b/paddle/cinn/ir/schedule/ir_schedule_util.h @@ -65,6 +65,13 @@ Tensor GetReadTensor(const Expr& block, int index); */ int GetLoopExtent(const Expr& loop); +/** + * \brief Given a For node, return its extent as int. + * @param loop The given For node + * @return The extent of For node + */ +int GetLoopExtent(const ir::stmt::For loop); + /** * \brief Given a vector of Exprs, return whether they contain a var with * specific name. diff --git a/paddle/cinn/optim/replace_cross_thread_reduction.cc b/paddle/cinn/optim/replace_cross_thread_reduction.cc index 487214c08c6b90..947911eeef30c4 100644 --- a/paddle/cinn/optim/replace_cross_thread_reduction.cc +++ b/paddle/cinn/optim/replace_cross_thread_reduction.cc @@ -26,7 +26,9 @@ #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/ir_printer.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/ir/utils/stmt_converter.h" #include "paddle/cinn/lang/compute.h" +#include "paddle/cinn/pass/pass_manager.h" namespace cinn { namespace optim { @@ -40,27 +42,17 @@ struct BufferCmp { }; thread_local std::set shm_buffer_; -struct CrossThreadReductionReplacer : public ir::IRMutator<> { +struct CrossThreadReductionReplacer { void operator()(ir::LoweredFunc fn) { Visit(fn.As()); } private: - bool CanReplace(const ir::ScheduleBlockRealize* block_realize) { - const ir::ScheduleBlock* schedule_block = - block_realize->schedule_block.As(); - - PADDLE_ENFORCE_NOT_NULL( - schedule_block, - ::common::errors::PreconditionNotMet( - "The schedule block pointer in CanReplace must not be null.")); - - if (block_realize->schedule_block.As()->name.substr( - 0, 4) == "root") { + bool CanReplace(const ir::stmt::Schedule block) { + if (block->name().substr(0, 4) == "root") { return false; } - const std::vector& iter_values = block_realize->iter_values; - const std::vector& iter_vars = schedule_block->iter_vars; - ir::Expr body = schedule_block->body; + const std::vector& iter_values = block->iter_values(); + const std::vector& iter_vars = block->iter_vars(); std::unordered_set reduce_var_names; for (int i = 0; i < iter_values.size(); ++i) { @@ -72,23 +64,22 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { if (x->as_var()) { reduce_var_names.insert(x->as_var()->name); } - return false; }); } - auto IsThreadBindOnReduceAxis = [&](const ir::For* for_node) { - return reduce_var_names.count(for_node->loop_var->name) > 0 && + auto IsThreadBindOnReduceAxis = [&](const ir::stmt::For& for_node) { + return reduce_var_names.count(for_node->loop_var()->name) > 0 && for_node->is_gpu_thread_binded(); }; std::vector thread_binded_reduce_loop_indices; bool is_thread_binded_inner_loop = false; for (int i = 0; i < cur_loops_.size(); ++i) { - bool is_thread_bind_on_reduce = - IsThreadBindOnReduceAxis(cur_loops_[i].As()); - if (is_thread_bind_on_reduce && ir::GetLoopExtent(cur_loops_[i]) == 1) + bool is_thread_bind_on_reduce = IsThreadBindOnReduceAxis(cur_loops_[i]); + if (is_thread_bind_on_reduce && 
ir::GetLoopExtent(cur_loops_[i]) == 1) { return false; + } if (is_thread_binded_inner_loop || is_thread_bind_on_reduce) { if (ir::GetLoopExtent(cur_loops_[i]) > 1024) { return false; @@ -115,7 +106,7 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { int GetBlockSize() const { int block_size = 1; for (auto& loop : cur_loops_) { - if (loop->as()->is_gpu_thread_binded()) { + if (loop->is_gpu_thread_binded()) { block_size *= ir::GetLoopExtent(loop); } } @@ -123,13 +114,14 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { } template - void ReplaceByContinuousReduceExternCall(ir::Expr* store, bool return_warp) { - auto* node = store->As()->value.As(); + void ReplaceByContinuousReduceExternCall(ir::stmt::Store store, + bool return_warp) { + auto* node = store->value().As(); PADDLE_ENFORCE_NOT_NULL( node, ::common::errors::InvalidArgument("The node must not be null.")); auto& operand = node->b(); std::string reduce_func_name = hlir::pe::CrossThreadReduceExternalFuncName( - store->As()->value, operand.template As()->tensor); + store->value(), operand.template As()->tensor); auto tmp_dtype = operand.template As()->tensor.as_tensor()->type(); auto tmp_buffer = ir::_Buffer_::Make( @@ -138,18 +130,18 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { tmp_buffer->dtype = tmp_dtype; tmp_buffer->memory_type = ir::MemoryType::GPUShared; shm_buffer_.insert(tmp_buffer); - store->As()->value = lang::CallExtern( - reduce_func_name, {node->b(), tmp_buffer, ir::Expr(return_warp)}); + store->set_value(lang::CallExtern( + reduce_func_name, {node->b(), tmp_buffer, ir::Expr(return_warp)})); } template - void ReplaceByDiscreteReduceExternCall(ir::Expr* store) { - auto* node = store->As()->value.As(); + void ReplaceByDiscreteReduceExternCall(ir::stmt::Store store) { + auto* node = store->value().As(); PADDLE_ENFORCE_NOT_NULL( node, ::common::errors::InvalidArgument("The node must not be null.")); auto& operand = node->b(); std::string reduce_func_name = hlir::pe::DiscreteReduceExternalFuncName( - store->As()->value, operand.template As()->tensor); + store->value(), operand.template As()->tensor); auto tmp_dtype = operand.template As()->tensor.as_tensor()->type(); auto tmp_buffer = ir::_Buffer_::Make( @@ -158,12 +150,12 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { tmp_buffer->dtype = tmp_dtype; tmp_buffer->memory_type = ir::MemoryType::GPUShared; shm_buffer_.insert(tmp_buffer); - store->As()->value = - lang::CallExtern(reduce_func_name, {node->b(), tmp_buffer}); + store->set_value( + lang::CallExtern(reduce_func_name, {node->b(), tmp_buffer})); } template - void ReplaceByReduceExternCall(ir::Expr* store, + void ReplaceByReduceExternCall(ir::stmt::Store store, const ir::ReduceMethod& method) { std::visit(cinn::adt::match{ [&](const ir::NoneReduceMethod&) { @@ -181,10 +173,11 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { method); } - void Visit(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - void Visit(ir::_LoweredFunc_* fn) override { - ir::IRMutator<>::Visit(fn); + void Visit(ir::_LoweredFunc_* fn) { + ir::stmt::Mutate( + fn->body_block, + [&](ir::stmt::StmtRef stmt) { PreCall(stmt); }, + [&](ir::stmt::StmtRef stmt) { PostCall(stmt); }); if (std::find_if(fn->temp_bufs.begin(), fn->temp_bufs.end(), [&](const ir::Buffer& buf) -> bool { @@ -198,74 +191,98 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { shm_buffer_.clear(); } - void Visit(const ir::ScheduleBlockRealize* expr, ir::Expr* op) override { - if 
(!CanReplace(expr)) { - VLOG(6) << "Can't replace cross thread reduction: " << *op; - IRMutator::Visit(expr, op); - return; + void PreCall(ir::stmt::StmtRef stmt) { + switch (stmt->stmt_type()) { + case ir::StmtNodeTy::Schedule: + VisitStmt(stmt.as()); + break; + case ir::StmtNodeTy::For: + cur_loops_.push_back(stmt.as()); + break; + default: + break; } - VLOG(6) << "Can replace cross thread reduction: " << *op; + } - const ir::ScheduleBlock* schedule_block = - expr->schedule_block.As(); - PADDLE_ENFORCE_NOT_NULL( - schedule_block, - ::common::errors::PreconditionNotMet( - "The schedule block pointer in Visit must not be null.")); - ir::Expr original_update_body = schedule_block->body; - ir::Expr original_update_stmt; - PADDLE_ENFORCE_EQ(original_update_body.As() || - original_update_body.As(), - true, - ::common::errors::InvalidArgument( - "The type of original_update_body is incorrect." - "Expected type is Block or Store.")); - if (original_update_body.As()) { - PADDLE_ENFORCE_EQ( - original_update_body.As()->stmts.size(), - 1, - ::common::errors::InvalidArgument( - "The size of stmts is incorrect." - "Expected size is 1, but receive %d.", - original_update_body.As()->stmts.size())); - original_update_stmt = original_update_body.As()->stmts[0]; - } else if (original_update_body.As()) { - original_update_stmt = original_update_body; + void PostCall(ir::stmt::StmtRef stmt) { + switch (stmt->stmt_type()) { + case ir::StmtNodeTy::For: + cur_loops_.pop_back(); + break; + default: + break; } - -#define REPLACE_TO_EXTERNAL_CALL(Op) \ - if (original_update_stmt.As()->value.As()) { \ - ReplaceByReduceExternCall(&original_update_stmt, \ - schedule_block->reduce_method); \ } - REPLACE_TO_EXTERNAL_CALL(ir::Add) - REPLACE_TO_EXTERNAL_CALL(ir::Mul) - REPLACE_TO_EXTERNAL_CALL(ir::Max) - REPLACE_TO_EXTERNAL_CALL(ir::Min) - REPLACE_TO_EXTERNAL_CALL(ir::And) - REPLACE_TO_EXTERNAL_CALL(ir::Or) -#undef REPLACE_TO_EXTERNAL_CALL - - VLOG(6) << "Replace cross thread reduction: " << *op; - - IRMutator::Visit(expr, op); - } + void VisitStmt(ir::stmt::Schedule stmt) { + if (!CanReplace(stmt)) { + return; + } + ir::stmt::BlockRef original_update_body = stmt->body(); - void Visit(const ir::For* expr, ir::Expr* op) override { - cur_loops_.push_back(*op); - IRMutator::Visit(expr, op); - cur_loops_.pop_back(); + ir::stmt::Store original_update_stmt; + PADDLE_ENFORCE_EQ(original_update_body->stmts().size(), + 1, + ::common::errors::InvalidArgument( + "The size of statements is incorrect." 
+ "Expected size is 1, but receive %d.", + original_update_body->stmts().size())); + PADDLE_ENFORCE_EQ(original_update_body->stmts()[0].isa(), + true, + ::common::errors::InvalidArgument( + "The stmt in schedule's body should be store " + "statement, but get %s.", + original_update_body->stmts()[0]->stmt_type())); + original_update_stmt = + original_update_body->stmts()[0].as(); + + switch (original_update_stmt->value()->node_type()) { + case cinn::ir::IrNodeTy::Add: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::Mul: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::Max: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::Min: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::And: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::Or: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + default: + PADDLE_THROW(::common::errors::InvalidArgument( + "The node type is not supported in cross thread reduction.")); + } } private: - std::vector cur_loops_; + std::vector cur_loops_; }; } // namespace void ReplaceCrossThreadReduction(ir::LoweredFunc fn) { - CrossThreadReductionReplacer()(fn); + FuncPassManager manager; + manager.AddPass(std::make_unique()); + manager.Run(fn); +} + +LogicalResult ReplaceCrossThreadReductionPass::Run(ir::LoweredFunc func) { + CrossThreadReductionReplacer replacer; + replacer(func); + return LogicalResult::success(); } } // namespace optim diff --git a/paddle/cinn/optim/replace_cross_thread_reduction.h b/paddle/cinn/optim/replace_cross_thread_reduction.h index 9de7bfba8e1aed..16d83d384ff8d3 100644 --- a/paddle/cinn/optim/replace_cross_thread_reduction.h +++ b/paddle/cinn/optim/replace_cross_thread_reduction.h @@ -16,14 +16,21 @@ * This file implements the strategy to remove the unnecessary nested block. */ #pragma once -#include -#include "paddle/cinn/common/common.h" #include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/pass/pass.h" namespace cinn { namespace optim { +class ReplaceCrossThreadReductionPass : public FuncPass { + public: + ReplaceCrossThreadReductionPass() + : FuncPass("replace_cross_thread_reduction") {} + + LogicalResult Run(ir::LoweredFunc func) override; +}; + /** * Replace cross thread reduction to external call. 
*/ diff --git a/paddle/cinn/optim/replace_cross_thread_reduction_test.cc b/paddle/cinn/optim/replace_cross_thread_reduction_test.cc index dd304a43213f5f..a7d3fc6c9f973b 100644 --- a/paddle/cinn/optim/replace_cross_thread_reduction_test.cc +++ b/paddle/cinn/optim/replace_cross_thread_reduction_test.cc @@ -63,19 +63,25 @@ TEST(CrossThreadReductionReplacer, basic) { EXPECT_EQ(utils::GetStreamCnt(new_func->body), utils::Trim(R"ROC({ ScheduleBlock(root) { - thread_bind[blockIdx.x] for (i, 0, 64) { - ScheduleBlock(B__reduce_init) + thread_bind[blockIdx.x] for (i, 0, 64) { - i0 = axis.bind(i) - B__reduce_init[i0] = 0.00000000f - } - thread_bind[threadIdx.x] for (reduce_j, 0, 128) - { - ScheduleBlock(B) + ScheduleBlock(B__reduce_init) + { + i0 = axis.bind(i) + { + B__reduce_init[i0] = 0.00000000f + } + } + thread_bind[threadIdx.x] for (reduce_j, 0, 128) { - i0_0, i1 = axis.bind(i, reduce_j) - B[i0_0] = cinn_partial_block_reduce_sum_fp32_internal_shm(A[i0_0, i1], _Buffer_(shm32__fp32_reduce), false) + ScheduleBlock(B) + { + i0_0, i1 = axis.bind(i, reduce_j) + { + B[i0_0] = cinn_partial_block_reduce_sum_fp32_internal_shm(A[i0_0, i1], _Buffer_(shm32__fp32_reduce), false) + } + } } } } From 63527fb67da56d975895f0468ece7bf2881c4919 Mon Sep 17 00:00:00 2001 From: Zhou Xin Date: Tue, 7 Jan 2025 19:53:10 +0800 Subject: [PATCH 03/57] [CINN][Backend Pass Update No.8] Update rearrange_load_instruction pass (#70437) * update rearrange_load_ins_pass * Implement rearrange_laod_ins as a FuncPass * Leverage Visit to implement ContainsStmtInStmt * Remove commentted code --- paddle/cinn/optim/CMakeLists.txt | 2 +- paddle/cinn/optim/optimize.cc | 8 +- .../cinn/optim/rearrange_load_instruction.cc | 305 ------------ .../optim/rearrange_load_instruction_pass.cc | 449 ++++++++++++++++++ ...on.h => rearrange_load_instruction_pass.h} | 11 +- 5 files changed, 464 insertions(+), 311 deletions(-) delete mode 100644 paddle/cinn/optim/rearrange_load_instruction.cc create mode 100644 paddle/cinn/optim/rearrange_load_instruction_pass.cc rename paddle/cinn/optim/{rearrange_load_instruction.h => rearrange_load_instruction_pass.h} (94%) diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index 25b9d5032b6555..d95c6e1d238401 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -36,7 +36,7 @@ gather_srcs( if_fusion_pass.cc merge_block_utils.cc eliminate_common_global_memory_read.cc - rearrange_load_instruction.cc + rearrange_load_instruction_pass.cc check_tensor_buffer_map.cc longlong2int_pass.cc vectorize_for_trans.cc diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc index 7f7815ba2b9670..fec6877220b8c7 100644 --- a/paddle/cinn/optim/optimize.cc +++ b/paddle/cinn/optim/optimize.cc @@ -31,7 +31,7 @@ #include "paddle/cinn/optim/lower_function_call_bind_vars.h" #include "paddle/cinn/optim/lower_intrin.h" #include "paddle/cinn/optim/map_extern_call.h" -#include "paddle/cinn/optim/rearrange_load_instruction.h" +#include "paddle/cinn/optim/rearrange_load_instruction_pass.h" #include "paddle/cinn/optim/remove_schedule_block_pass.h" #include "paddle/cinn/optim/replace_const_param_to_integer.h" #include "paddle/cinn/optim/replace_cross_block_reduction.h" @@ -114,7 +114,7 @@ ir::LoweredFunc Optimize(ir::LoweredFunc fn, // Simplify already contains CastSimplify Simplify(&copied->body); - VLOG(10) << "After Optimize Simplify:" << copied; + VLOG(4) << "After Optimize Simplify:" << copied; BlockPassManager pass_manager; 
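  // Block-level passes run in the order they are added to the manager.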
pass_manager.AddPass(CreateIfFusionPass()); @@ -122,7 +122,9 @@ ir::LoweredFunc Optimize(ir::LoweredFunc fn, target.arch.Match( [&](common::NVGPUArch) { - RearrangeLoadInstruction(&copied->body); + FuncPassManager func_pass_manager; + func_pass_manager.AddPass(CreateRearrangeLoadInstructionPass()); + func_pass_manager.Run(copied); VLOG(4) << "After Optimize RearrangeLoadInstruction:" << copied; }, [](auto) {}); diff --git a/paddle/cinn/optim/rearrange_load_instruction.cc b/paddle/cinn/optim/rearrange_load_instruction.cc deleted file mode 100644 index 9c3b8b067e1f58..00000000000000 --- a/paddle/cinn/optim/rearrange_load_instruction.cc +++ /dev/null @@ -1,305 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/cinn/optim/rearrange_load_instruction.h" - -#include -#include "paddle/cinn/ir/ir_analyzer/ir_analyzer.h" -#include "paddle/cinn/ir/ir_mutator.h" -#include "paddle/cinn/ir/ir_printer.h" - -PD_DECLARE_bool(cinn_enable_rearrange_load); - -namespace cinn { -namespace optim { -namespace { - -constexpr int MaxRearrangeLoadNum = 8; - -template -bool ContainsExprNode(const ir::Expr& expr) { - auto res = ir::ir_utils::CollectIRNodes( - expr, - [](const ir::Expr* x) { return x->As(); }, - /* uniq_target = */ true); - return !res.empty(); -} - -/** - * Calculate the buffer size as a constant. For dynamic dims, since they are - * difficult to compare, we just estimate them to be 32. - * Note: this is a heuristic optimization, so the exact number is not very - * important. - */ -int64_t EstimateBufferSize(const ir::Buffer& buffer) { - int64_t size = 1; - for (auto& dim_size : buffer->shape) { - if (dim_size.is_constant()) { - size *= dim_size.as_int64(); - } else { - size *= 32; - } - } - return size; -} - -std::vector SortLoadsByBufferSizes( - const std::unordered_map& load_map, - std::vector load_list) { - // Calculate the buffer sizes of loads (with estimation). - std::map buffer_size_map; - for (auto& [_, load_expr] : load_map) { - auto& buffer = load_expr->As()->tensor.as_tensor()->buffer; - if (buffer_size_map.count(buffer)) { - continue; - } - buffer_size_map[buffer] = EstimateBufferSize(buffer); - } - - const auto GetBufferSize = [&](const std::string& key) { - auto& buffer = load_map.at(key)->As()->tensor.as_tensor()->buffer; - return buffer_size_map[buffer]; - }; - - // Sort loads by their buffer sizes from large to small. - // Note: we use stable sort here, because for equal-size loads, we want to - // keep their original order. 
- std::stable_sort(load_list.begin(), - load_list.end(), - [&](const std::string& key1, const std::string& key2) { - return GetBufferSize(key1) > GetBufferSize(key2); - }); - return load_list; -} - -struct LoadCollector : public ir::IRMutator<> { - explicit LoadCollector(const std::set& locally_defined_buffers) - : locally_defined_buffers_(locally_defined_buffers) {} - - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - private: - // Collect loads that meet the following criteria: - // 1) It is loading from global memory. Local loads are simply register reads - // and do not require rearrangement. - // 2) The value being loaded is not defined locally by a previous store. In - // such cases, the value resides in a register rather than in memory, thus - // doesn't need rearrangement. This criteria also prevents data-dependency - // harzards. - // 3) It doesn't contains indirect indices (i.e. loads within indices). - // Indirect indices are hard to manage and are seldom seem, so we choose - // not to handle them. - void Visit(const ir::Load* op, ir::Expr* expr) override { - auto& buffer = op->tensor.as_tensor()->buffer; - if (buffer->memory_type != ir::MemoryType::Heap) { - return; - } - if (locally_defined_buffers_.count(buffer) > 0) { - return; - } - for (auto& index_expr : op->indices) { - if (ContainsExprNode(index_expr)) { - return; - } - } - std::string key = utils::GetStreamCnt(*expr); - CollectLoad(key, expr); - } - - // Handle Select as a special op. - // Since Select evaluates only one of its two branches, we can rearrange a - // load in Select only if the load appears in both branches, otherwise we may - // violate the control dependency. - void Visit(const ir::Select* op, ir::Expr* expr) override { - auto* node = expr->As(); - ir::IRMutator<>::Visit(&node->condition, &node->condition); - - LoadCollector true_collector(locally_defined_buffers_); - true_collector(&node->true_value); - LoadCollector false_collector(locally_defined_buffers_); - false_collector(&node->false_value); - - for (auto& key : true_collector.load_list_) { - if (false_collector.load_map_.count(key) > 0) { - CollectLoad(key, true_collector.load_map_[key]); - } - } - } - - void CollectLoad(const std::string& key, const ir::Expr* expr) { - auto [_, is_first] = load_map_.emplace(key, expr); - if (is_first) { - load_list_.push_back(key); - } - } - - public: - // map from the signatures of loads to the load nodes - std::unordered_map load_map_; - // list of the signatures of loads in the order they are visited - std::vector load_list_; - - private: - const std::set& locally_defined_buffers_; -}; - -struct LoadReplacer : public ir::IRMutator<> { - explicit LoadReplacer(const std::unordered_map& var_map) - : var_map_(var_map) {} - - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - private: - void Visit(const ir::Load* op, ir::Expr* expr) override { - std::string key = utils::GetStreamCnt(*expr); - if (var_map_.count(key) > 0) { - *expr = Expr(var_map_.at(key)); - } - } - - const std::unordered_map& var_map_; -}; - -struct RearrangeLoadInstructionMutator : public ir::IRMutator<> { - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - private: - // A block is a leaf block if it is inside at least one loop, and all of its - // stmts are schedule blocks. 
- bool IsLeafBlock(const ir::Block& block) { - if (parent_loops_.empty()) { - return false; - } - for (auto& stmt : block.stmts) { - if (!stmt.As()) { - return false; - } - auto* node = stmt.As() - ->schedule_block.As(); - if (node->name.substr(0, 4) == "root") { - return false; - } - } - return true; - } - - // Local buffer initialization is like: - // var_1_local[0] = var_1[blockIdx.x], - // where the lhs is a local buffer and the rhs is a single load. - bool IsLocalBufferInit(const ir::Store& store) { - auto& store_buffer = store.tensor.as_tensor()->buffer; - return store_buffer->memory_type == ir::MemoryType::GPULocal && - store.value.As(); - } - - void DoRearrangeLoadInstruction(ir::Block* block) { - // Step 1. Collect loads in each schedule block under this block. - // Requirements: - // 1) The schedule block cannot contain IfThenElse, or we will violate the - // control dependency. Schedule blocks that have IfThenElse usually don't - // benefit from rearranging loads, so it's ok to skip them. - // 2) The schedule block is not local buffer initialization, because when - // initializing the local buffer with a load, we have already rearranged - // that load. - // 3) There are more constrains on the loads to collect, see LoadCollector - // for details. - LoadCollector collector(locally_defined_buffers_); - for (auto& stmt : block->stmts) { - ir::Expr store = ir::analyzer::GetStoreOfSBlock(stmt); - auto* store_node = store.As(); - if (ContainsExprNode(stmt)) continue; - if (IsLocalBufferInit(*store_node)) continue; - collector(&store_node->value); - } - - // Step 2. Sort the loads by their buffer sizes from large to small, and - // only keep the first `MaxRearrangeLoadNum` loads. - // Performance concerns: - // 1) Larger buffers need more time to access, so we should issue their - // corresponding loads earlier. - // 2) Rearranged loads will consume registers, so we should set a limit - // to prevent register overflow. - std::vector load_list = - SortLoadsByBufferSizes(collector.load_map_, collector.load_list_); - if (load_list.size() > MaxRearrangeLoadNum) { - load_list.resize(MaxRearrangeLoadNum); - } - - // Step 3. Create loads with Let at the beginning of the block. - std::vector new_stmts; - std::unordered_map var_map; - for (auto& key : load_list) { - auto* load_expr = collector.load_map_[key]; - auto* tensor = load_expr->As()->tensor.as_tensor(); - ir::Var local_var = ir::Var(common::UniqName(tensor->name + "_local"), - tensor->buffer->dtype); - ir::Expr let_expr = ir::Let::Make(local_var, *load_expr); - new_stmts.push_back(let_expr); - var_map[key] = local_var; - } - - // Step 4. Replace loads in schedule blocks with the above Let vars. 
- LoadReplacer replacer(var_map); - for (auto& stmt : block->stmts) { - replacer(&stmt); - new_stmts.push_back(stmt); - } - block->stmts = std::move(new_stmts); - } - - void Visit(const ir::Block* op, ir::Expr* expr) override { - auto* node = expr->As(); - ir::IRMutator<>::Visit(op, expr); - if (IsLeafBlock(*op)) { - DoRearrangeLoadInstruction(node); - } - } - - void Visit(const ir::ScheduleBlockRealize* op, ir::Expr* expr) override { - auto* block_node = op->schedule_block.As(); - if (block_node->name.substr(0, 4) == "root") { - ir::IRMutator<>::Visit(op, expr); - return; - } - for (auto& buffer_range : block_node->write_buffers) { - auto& write_buffer = buffer_range.As()->buffer; - locally_defined_buffers_.insert(write_buffer.as_buffer_ref()); - } - } - - void Visit(const ir::For* op, ir::Expr* expr) override { - parent_loops_.push_back(op); - ir::IRMutator<>::Visit(op, expr); - parent_loops_.pop_back(); - } - - private: - // Buffers whose values are defined locally inside this function. - // Note: even if a buffer is allocated on global memory, its value may be - // assigned locally. If so, it also belongs to this set. - std::set locally_defined_buffers_; - - std::vector parent_loops_; -}; - -} // namespace - -void RearrangeLoadInstruction(Expr* expr) { - if (!FLAGS_cinn_enable_rearrange_load) return; - RearrangeLoadInstructionMutator mutator; - mutator(expr); -} - -} // namespace optim -} // namespace cinn diff --git a/paddle/cinn/optim/rearrange_load_instruction_pass.cc b/paddle/cinn/optim/rearrange_load_instruction_pass.cc new file mode 100644 index 00000000000000..366c077306d7a9 --- /dev/null +++ b/paddle/cinn/optim/rearrange_load_instruction_pass.cc @@ -0,0 +1,449 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/cinn/optim/rearrange_load_instruction_pass.h" +#include "paddle/cinn/common/cinn_value.h" +#include "paddle/cinn/ir/buffer.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/ir/ir_mutator.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/ir_visitor.h" +#include "paddle/cinn/ir/stmt_visitors.h" +#include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/stmt_converter.h" +#include "paddle/phi/core/enforce.h" + +PD_DECLARE_bool(cinn_enable_rearrange_load); + +namespace cinn { +namespace optim { + +using ir::stmt::Alloc; +using ir::stmt::BlockRef; +using ir::stmt::Evaluate; +using ir::stmt::For; +using ir::stmt::Free; +using ir::stmt::IfThenElse; +using ir::stmt::Let; +using ir::stmt::Schedule; +using ir::stmt::StmtRef; +using ir::stmt::Store; + +namespace { +constexpr int MaxRearrangeLoadNum = 8; + +template +bool ContainsExprNodeInExpr(const ir::Expr& expr) { + auto res = ir::ir_utils::CollectIRNodes( + expr, + [](const ir::Expr* x) { return x->As(); }, + /* uniq_target = */ true); + return !res.empty(); +} + +template +bool ContainsStmtInStmt(const StmtRef& stmt) { + bool found = false; + auto CheckStmt = [&found](const StmtRef& stmt) { + if (!found && stmt.isa()) { + found = true; + } + }; + ir::stmt::Visit(stmt, CheckStmt, [](const StmtRef&) {}); + return found; +} + +/** + * Calculate the buffer size as a constant. For dynamic dims, since they are + * difficult to compare, we just estimate them to be 32. + * Note: this is a heuristic optimization, so the exact number is not very + * important. + */ +int64_t EstimateBufferSize(const ir::Buffer& buffer) { + int64_t size = 1; + for (auto& dim_size : buffer->shape) { + if (dim_size.is_constant()) { + size *= dim_size.as_int64(); + } else { + size *= 32; + } + } + return size; +} + +std::vector SortLoadsByBufferSizes( + const std::unordered_map& load_map, + std::vector load_list) { + // Calculate the buffer sizes of loads (with estimation). + std::map buffer_size_map; + for (auto& [_, load_expr] : load_map) { + auto& buffer = load_expr->As()->tensor.as_tensor()->buffer; + if (buffer_size_map.count(buffer)) { + continue; + } + buffer_size_map[buffer] = EstimateBufferSize(buffer); + } + + const auto GetBufferSize = [&](const std::string& key) { + auto& buffer = load_map.at(key)->As()->tensor.as_tensor()->buffer; + return buffer_size_map[buffer]; + }; + + // Sort loads by their buffer sizes from large to small. + // Note: we use stable sort here, because for equal-size loads, we want to + // keep their original order. + std::stable_sort(load_list.begin(), + load_list.end(), + [&](const std::string& key1, const std::string& key2) { + return GetBufferSize(key1) > GetBufferSize(key2); + }); + return load_list; +} + +struct LoadCollector : public ir::IRMutator<> { + explicit LoadCollector(const std::set& locally_defined_buffers) + : locally_defined_buffers_(locally_defined_buffers) {} + + void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + + private: + // Collect loads that meet the following criteria: + // 1) It is loading from global memory. Local loads are simply register + // reads and do not require rearrangement. + // 2) The value being loaded is not defined locally by a previous store. In + // such cases, the value resides in a register rather than in memory, + // thus doesn't need rearrangement. This criteria also prevents + // data-dependency harzards. + // 3) It doesn't contains indirect indices (i.e. loads within indices). 
+ // Indirect indices are hard to manage and are seldom seen, so we choose + // not to handle them. + void Visit(const ir::Load* op, ir::Expr* expr) override { + auto& buffer = op->tensor.as_tensor()->buffer; + if (buffer->memory_type != ir::MemoryType::Heap) { + return; + } + if (locally_defined_buffers_.count(buffer) > 0) { + return; + } + for (auto& index_expr : op->indices) { + if (ContainsExprNodeInExpr(index_expr)) { + return; + } + } + std::string key = utils::GetStreamCnt(*expr); + CollectLoad(key, expr); + } + + // Handle Select as a special op. + // Since Select evaluates only one of its two branches, we can rearrange a + // load in Select only if the load appears in both branches, otherwise we + // may violate the control dependency. + void Visit(const ir::Select* op, ir::Expr* expr) override { + auto* node = expr->As(); + ir::IRMutator<>::Visit(&node->condition, &node->condition); + + LoadCollector true_collector(locally_defined_buffers_); + true_collector(&node->true_value); + LoadCollector false_collector(locally_defined_buffers_); + false_collector(&node->false_value); + + for (auto& key : true_collector.load_list_) { + if (false_collector.load_map_.count(key) > 0) { + CollectLoad(key, true_collector.load_map_[key]); + } + } + } + + void CollectLoad(const std::string& key, const ir::Expr* expr) { + auto [_, is_first] = load_map_.emplace(key, expr); + if (is_first) { + load_list_.push_back(key); + } + } + + public: + // map from the signatures of loads to the load nodes + std::unordered_map load_map_; + // list of the signatures of loads in the order they are visited + std::vector load_list_; + + private: + const std::set& locally_defined_buffers_; +}; + +struct LoadReplacer : public ir::IRMutator<>, public ir::stmt::StmtMutator<> { + explicit LoadReplacer(const std::unordered_map& var_map) + : var_map_(var_map) {} + + void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + + void operator()(StmtRef stmt) { ir::stmt::StmtMutator<>::VisitStmt(stmt); } + + private: + void Visit(const ir::Load* op, ir::Expr* expr) override { + std::string key = utils::GetStreamCnt(*expr); + if (var_map_.count(key) > 0) { + *expr = Expr(var_map_.at(key)); + } + } + + void VisitStmt(ir::stmt::Let stmt) override { + if (stmt->body().defined()) { + Expr body = stmt->body(); + ir::IRMutator<>::Visit(&body, &body); + stmt->set_body(body); + } + } + + void VisitStmt(ir::stmt::Store stmt) override { + auto* tensor = stmt->tensor().as_tensor(); + + std::vector new_indices = stmt->indices(); + for (Expr& index : new_indices) { + ir::IRMutator<>::Visit(&index, &index); + } + stmt->set_indices(new_indices); + + Expr tensor_expr = stmt->tensor(); + ir::IRMutator<>::Visit(&tensor_expr, &tensor_expr); + stmt->set_tensor(tensor_expr); + + Expr value = stmt->value(); + ir::IRMutator<>::Visit(&value, &value); + stmt->set_value(value); + } + + void VisitStmt(ir::stmt::For stmt) override { + Expr min = stmt->min(); + ir::IRMutator<>::Visit(&min, &min); + Expr extent = stmt->extent(); + ir::IRMutator<>::Visit(&extent, &extent); + VisitBlock(stmt->body()); + ir::Expr loop_var = stmt->loop_var(); + ir::IRMutator<>::Visit(&loop_var, &loop_var); + stmt->set_loop_var(loop_var); + } + + void VisitStmt(ir::stmt::IfThenElse stmt) override { + Expr condition = stmt->condition(); + ir::IRMutator<>::Visit(&condition, &condition); + ir::stmt::BlockRef true_case = stmt->true_case(); + VisitBlock(true_case); + stmt->set_true_case(true_case); + if (stmt->false_case().defined()) { + ir::stmt::BlockRef false_case =
stmt->false_case(); + VisitBlock(false_case); + stmt->set_false_case(false_case); + } + } + + void VisitStmt(ir::stmt::Schedule stmt) override { + std::vector vars = stmt->iter_vars(); + for (ir::Var& var : vars) { + if (var->lower_bound.defined()) { + ir::IRMutator<>::Visit(&var->lower_bound, &var->lower_bound); + } + if (var->upper_bound.defined()) { + ir::IRMutator<>::Visit(&var->upper_bound, &var->upper_bound); + } + } + std::vector new_read_buffers = stmt->read_buffers(); + for (Expr& read_buffer : new_read_buffers) { + ir::IRMutator<>::Visit(&read_buffer, &read_buffer); + } + stmt->set_read_buffers(new_read_buffers); + + std::vector new_write_buffers = stmt->write_buffers(); + for (Expr& write_buffer : new_write_buffers) { + ir::IRMutator<>::Visit(&write_buffer, &write_buffer); + } + stmt->set_write_buffers(new_write_buffers); + VisitBlock(stmt->body()); + } + + void VisitStmt(ir::stmt::Alloc stmt) override { return; } + + void VisitStmt(ir::stmt::Free stmt) override { return; } + + void VisitStmt(ir::stmt::Evaluate) override { return; } + + const std::unordered_map& var_map_; +}; + +struct RearrangeLoadInstructionMutator : public ir::stmt::StmtMutator<> { + void operator()(BlockRef block) { VisitBlock(block); } + + private: + // A block is a leaf block if it is inside at least one loop, and all of its + // stmts are schedule blocks. + bool IsLeafBlock(BlockRef block) { + if (parent_loops_.empty()) return false; + for (StmtRef stmt : block->stmts()) { + if (!stmt.isa()) return false; + Schedule schedule_stmt = stmt.as(); + if (schedule_stmt->name().substr(0, 4) == "root") return false; + } + return true; + } + + // Local buffer initialization is like: + // var_1_local[0] = var_1[blockIdx.x], + // where the lhs is a local buffer and the rhs is a single load. + bool IsLocalBufferInit(Store store_stmt) { + const ir::Buffer& store_buffer = store_stmt->tensor().as_tensor()->buffer; + return store_buffer->memory_type == ir::MemoryType::GPULocal && + store_stmt->value().As(); + } + + void DoRearrangeLoadInstruction(BlockRef block) { + auto GetStoreOfScheduleStmt = [](Schedule schedule_stmt) -> Store { + bool found = false; + Store ret; + for (StmtRef stmt : schedule_stmt->body()->stmts()) { + if (stmt.isa()) { + PADDLE_ENFORCE(found == false, + ::common::errors::InvalidArgument( + "One schedule statement should only have one " + "store statement.")); + found = true; + ret = stmt.as(); + } + } + PADDLE_ENFORCE(found == true, + ::common::errors::InvalidArgument( + "One schedule statement should have one store " + "statement, but not found.")); + return ret; + }; + + // Step 1. Collect loads in each schedule block under this block. + // Requirements: + // 1) The schedule block cannot contain IfThenElse, or we will violate the + // control dependency. Schedule blocks that have IfThenElse usually + // don't benefit from rearranging loads, so it's ok to skip them. + // 2) The schedule block is not local buffer initialization, because when + // initializing the local buffer with a load, we have already + // rearranged that load. + // 3) There are more constraints on the loads to collect, see LoadCollector + // for details. + LoadCollector collector(locally_defined_buffers_); + for (StmtRef stmt : block->stmts()) { + if (ContainsStmtInStmt(stmt)) continue; + if (!stmt.isa()) continue; + Schedule schedule_stmt = stmt.as(); + Store store_stmt = GetStoreOfScheduleStmt(schedule_stmt); + if (IsLocalBufferInit(store_stmt)) continue; + collector(const_cast(&store_stmt->value())); + } + + // Step 2.
Sort the loads by their buffer sizes from large to small, and + // only keep the first `MaxRearrangeLoadNum` loads. + // Performance concerns: + // 1) Larger buffers need more time to access, so we should issue their + // corresponding loads earlier. + // 2) Rearranged loads will consume registers, so we should set a limit + // to prevent register overflow. + std::vector load_list = + SortLoadsByBufferSizes(collector.load_map_, collector.load_list_); + if (load_list.size() > MaxRearrangeLoadNum) { + load_list.resize(MaxRearrangeLoadNum); + } + + // Step 3. Create loads with Let at the beginning of the block. + std::vector new_stmts; + std::unordered_map var_map; + for (std::string& key : load_list) { + const ir::Expr* load_expr = collector.load_map_[key]; + const auto tensor = load_expr->As()->tensor.as_tensor(); + ir::Var local_var = ir::Var(common::UniqName(tensor->name + "_local"), + tensor->buffer->dtype); + Let let_stmt = Let(local_var, *load_expr); + new_stmts.push_back(let_stmt); + var_map[key] = local_var; + } + + // Step 4. Replace loads in schedule blocks with the above Let vars. + LoadReplacer replacer(var_map); + for (StmtRef stmt : block->stmts()) { + if (stmt.isa()) { + replacer(stmt); + } + new_stmts.push_back(stmt); + } + block->set_stmts(new_stmts); + } + + void VisitBlock(BlockRef block) override { + ir::stmt::StmtMutator<>::VisitBlock(block); + if (IsLeafBlock(block)) { + DoRearrangeLoadInstruction(block); + } + } + + void VisitStmt(Schedule stmt) override { + if (stmt->name().substr(0, 4) == "root") { + ir::stmt::StmtMutator<>::VisitBlock(stmt->body()); + return; + } + for (auto& buffer_range : stmt->write_buffers()) { + auto& write_buffer = buffer_range.As()->buffer; + locally_defined_buffers_.insert(write_buffer.as_buffer_ref()); + } + } + + void VisitStmt(For stmt) override { + parent_loops_.push_back(stmt); + VisitBlock(stmt->body()); + parent_loops_.pop_back(); + } + + void VisitStmt(IfThenElse stmt) override { + ir::stmt::BlockRef true_case = stmt->true_case(); + VisitBlock(true_case); + stmt->set_true_case(true_case); + if (stmt->false_case().defined()) { + ir::stmt::BlockRef false_case = stmt->false_case(); + VisitBlock(false_case); + stmt->set_false_case(false_case); + } + } + + void VisitStmt(Let stmt) override { return; } + void VisitStmt(Store stmt) override { return; } + void VisitStmt(Alloc stmt) override { return; } + void VisitStmt(Free stmt) override { return; } + void VisitStmt(Evaluate stmt) override { return; } + + private: + std::set locally_defined_buffers_; + std::vector parent_loops_; +}; +} // namespace + +LogicalResult cinn::optim::RearrangeLoadInstructionPass::Run( + ir::LoweredFunc func) { + if (FLAGS_cinn_enable_rearrange_load) { + BlockRef body = func->body_block; + RearrangeLoadInstructionMutator mutator; + mutator(body); + } + return LogicalResult::success(); +} + +std::unique_ptr CreateRearrangeLoadInstructionPass() { + return std::make_unique(); +} +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/rearrange_load_instruction.h b/paddle/cinn/optim/rearrange_load_instruction_pass.h similarity index 94% rename from paddle/cinn/optim/rearrange_load_instruction.h rename to paddle/cinn/optim/rearrange_load_instruction_pass.h index 4c4d2b10728528..0128234a6ffa2e 100644 --- a/paddle/cinn/optim/rearrange_load_instruction.h +++ b/paddle/cinn/optim/rearrange_load_instruction_pass.h @@ -13,10 +13,17 @@ // limitations under the License. 
#pragma once -#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/pass/pass.h" + +PD_DECLARE_bool(cinn_enable_rearrange_load); namespace cinn { namespace optim { +class RearrangeLoadInstructionPass : public FuncPass { + public: + RearrangeLoadInstructionPass() : FuncPass("rearrange_load_instruction") {} + LogicalResult Run(ir::LoweredFunc func) override; +}; /* * Rearrange global memory loads in front of expressions to optimize the @@ -149,7 +156,7 @@ namespace optim { * branch of Select, `var_3[k]` in ScheduleBlock(var_4) has data dependency * with ScheduleBlock(var_3); none of them can be rearranged. */ -void RearrangeLoadInstruction(Expr *expr); +std::unique_ptr CreateRearrangeLoadInstructionPass(); } // namespace optim } // namespace cinn From 299ff2b35b01557492f8702bef1e157af8fd9c9c Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 7 Jan 2025 21:13:17 +0800 Subject: [PATCH 04/57] fix generator shape int63 to int32 bug (#70658) --- paddle/cinn/hlir/op/elementwise.cc | 1 + paddle/cinn/hlir/pe/elementwise.cc | 9 ++++++++- paddle/cinn/hlir/pe/elementwise.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/cinn/hlir/op/elementwise.cc b/paddle/cinn/hlir/op/elementwise.cc index 8f23a6b32b913a..db2a65b68c9c0d 100644 --- a/paddle/cinn/hlir/op/elementwise.cc +++ b/paddle/cinn/hlir/op/elementwise.cc @@ -1267,6 +1267,7 @@ std::shared_ptr StrategyForGenerateShapeSymbolic( symbol_bindings, output_dim_exprs, output_shapes[0], + out_type, tensor_name); std::vector res; res.push_back(CINNValue(out)); diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc index de0d8b63d872ec..8e16bd6a8c6d19 100644 --- a/paddle/cinn/hlir/pe/elementwise.cc +++ b/paddle/cinn/hlir/pe/elementwise.cc @@ -351,6 +351,7 @@ ir::Tensor GenerateShape(const std::vector& inputs, const cinn::dialect::SymbolBindings& symbol_bindings, const std::vector& output_dim_exprs, const std::vector& out_shape, + const std::vector& out_type, const std::string& name) { if (output_dim_exprs.size() != 1) { VLOG(4) << "pe::GenerateShape will return a meaningless tensor when " @@ -365,7 +366,13 @@ ir::Tensor GenerateShape(const std::vector& inputs, auto res = Compute( ToCinnExprs(out_shape), [=, &converter](const std::vector& indice) { - return converter.ConvertToIrExpr(output_dim_exprs[0]); + auto dim_expr = converter.ConvertToIrExpr(output_dim_exprs[0]); + + if (out_type[0] == type_of()) { + dim_expr = ir::Cast::Make(type_of(), dim_expr); + } + + return dim_expr; }, name); return res; diff --git a/paddle/cinn/hlir/pe/elementwise.h b/paddle/cinn/hlir/pe/elementwise.h index fd58bd39146f5a..5d7dd55416e3fb 100644 --- a/paddle/cinn/hlir/pe/elementwise.h +++ b/paddle/cinn/hlir/pe/elementwise.h @@ -165,6 +165,7 @@ ir::Tensor GenerateShape( const cinn::dialect::SymbolBindings& symbol_bindings, const std::vector& output_dim_exprs, const std::vector& out_shape, + const std::vector& out_type, const std::string& name = UniqName("T_Generate_Shape_out")); // This operator checks if all x and y satisfy the condition: |x - y| <= atol + From 1b1d815f7349fa3350a1e165f72755dc6fda4cde Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Tue, 7 Jan 2025 21:30:48 +0800 Subject: [PATCH 05/57] =?UTF-8?q?=E3=80=90CINN=E3=80=91Fix=20ir=20simplify?= =?UTF-8?q?=20bug=20(#70654)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update y layout * fix bug --- paddle/cinn/optim/ir_simplify.cc | 9 
+++++++++ 1 file changed, 9 insertions(+) diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc index 562b06686eaa4e..396e4b6e5c0697 100644 --- a/paddle/cinn/optim/ir_simplify.cc +++ b/paddle/cinn/optim/ir_simplify.cc @@ -266,6 +266,7 @@ struct SimplifyLogicalMutator : public ir::ExprMutator<> { } void Visit(const ir::Not* op, Expr* expr) override { + VLOG(7) << "Begin Visit Not op: " << *expr; auto* node = expr->As(); auto v = node->v(); ir::ExprMutator<>::Visit(&v, &v); @@ -273,19 +274,27 @@ struct SimplifyLogicalMutator : public ir::ExprMutator<> { case ir::IrNodeTy::IntImm: case ir::IrNodeTy::UIntImm: *expr = common::IsZero(v) ? Expr(true) : Expr(false); + return; case ir::IrNodeTy::Not: *expr = v.As()->v(); + return; case ir::IrNodeTy::LE: *expr = ir::GT::Make(v->operand(0), v->operand(1)); + return; case ir::IrNodeTy::LT: *expr = ir::GE::Make(v->operand(0), v->operand(1)); + return; case ir::IrNodeTy::GE: *expr = ir::LT::Make(v->operand(0), v->operand(1)); + return; case ir::IrNodeTy::GT: *expr = ir::LE::Make(v->operand(0), v->operand(1)); + return; default: + VLOG(7) << "End Visit Not op: " << *expr; return; } + VLOG(7) << "End Visit Not op: " << *expr; } }; From 80c376f3d0e0918a93819cafa304c27eb335fad1 Mon Sep 17 00:00:00 2001 From: Xinyi Li Date: Wed, 8 Jan 2025 09:14:34 +0800 Subject: [PATCH 06/57] [PIR][oneDNN] Optimize bfloat16 placement logic (#70630) * optimize placement logic * fix format * fix copyright * reduce repetitive match --- .../transforms/onednn/cpu_bfloat16_pass.cc | 14 +- .../onednn/cpu_bfloat16_placement_pass.cc | 139 +++++------------- .../onednn/cpu_special_ops_bf16_pass.cc | 23 +-- .../onednn/cpu_special_ops_bf16_pass.h | 2 +- .../onednn/onednn_placement_pass.cc | 3 +- .../onednn/test_cpu_bfloat16_pir_pass.py | 3 +- 6 files changed, 54 insertions(+), 130 deletions(-) diff --git a/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_pass.cc b/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_pass.cc index 80ed42414cdbaf..c1a3d4eea3dfdf 100644 --- a/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -96,9 +96,6 @@ class CpuBfloat16Pattern : public paddle::drr::DrrPatternBase { op_attrs.emplace("keepdim", pat.Attr("keepdim")); op_attrs.emplace("dtype", pat.Attr("dtype")); - } else if (bfloat16_ops_ == "onednn_op.concat") { - op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); - op_attrs.emplace("use_quantizer", pat.Attr("use_quantizer")); } else if (bfloat16_ops_ == "onednn_op.reshape_" || bfloat16_ops_ == "onednn_op.reshape") { op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); @@ -224,11 +221,7 @@ class CpuBfloat16DequantPattern : public paddle::drr::DrrPatternBase { paddle::drr::SourcePattern pat = ctx->SourcePattern(); std::unordered_map op_attrs; - if (bfloat16_ops_ == "onednn_op.concat") { - op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); - op_attrs.emplace("use_quantizer", pat.Attr("use_quantizer")); - - } else if (bfloat16_ops_ == "onednn_op.conv2d") { + if (bfloat16_ops_ == "onednn_op.conv2d") { op_attrs.emplace("strides", pat.Attr("strides")); op_attrs.emplace("paddings", pat.Attr("paddings")); op_attrs.emplace("padding_algorithm", pat.Attr("padding_algorithm")); @@ -272,9 +265,6 @@ class CpuBfloat16DequantPattern : public paddle::drr::DrrPatternBase { op_attrs.emplace("keepdim", pat.Attr("keepdim")); op_attrs.emplace("dtype", pat.Attr("dtype")); - } else if (bfloat16_ops_ == "onednn_op.concat") { - op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); - op_attrs.emplace("use_quantizer", pat.Attr("use_quantizer")); } else if (bfloat16_ops_ == "onednn_op.reshape_" || bfloat16_ops_ == "onednn_op.reshape") { op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); diff --git a/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_placement_pass.cc index 649389585915d7..ce0f873be31c74 100644 --- a/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_placement_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
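The placement hunks below add ConcatOp to the allowlist, reject any op whose mkldnn_data_type is already set to something other than "float32" (to avoid re-matching), and extend the operand check to pir::VectorType: an op may be marked bfloat16 only if every operand is a float DenseTensor or a flat VectorType whose elements are all float DenseTensors (nested vectors are rejected). A rough, runnable Python rendering of that operand check; the type stubs stand in for the pir C++ API:

    from dataclasses import dataclass

    @dataclass
    class DenseTensorType:
        dtype: str                      # e.g. "float32", "bfloat16", "int8"

    @dataclass
    class VectorType:
        elements: list                  # element types of the combined value

    def is_float(dtype):
        return dtype in ("float16", "float32", "float64")

    def inputs_can_be_bf16(operand_types):
        for t in operand_types:
            if isinstance(t, VectorType):
                # Flat vectors only (e.g. concat's combined inputs); a nested
                # VectorType element blocks the rewrite.
                for elem in t.elements:
                    if not isinstance(elem, DenseTensorType) or not is_float(elem.dtype):
                        return False
            elif isinstance(t, DenseTensorType):
                if not is_float(t.dtype):
                    return False
            else:
                return False
        return True

    # A vector of float tensors (concat-style input) is now accepted.
    assert inputs_can_be_bf16([VectorType([DenseTensorType("float32")] * 3)])
    assert not inputs_can_be_bf16([VectorType([DenseTensorType("int8")])])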
@@ -59,6 +59,7 @@ class OneDNNBf16PlacementPattern : public pir::RewritePattern { if (!op->isa() && !op->isa() && !op->isa() && + !op->isa() && !op->isa() && !op->isa() && !op->isa() && @@ -100,7 +101,8 @@ class OneDNNBf16PlacementPattern : public pir::RewritePattern { auto mkldnn_data_type = op_attr.at("mkldnn_data_type") .dyn_cast() .AsString(); - if (mkldnn_data_type == "int8") { + // Reduce repetitive match + if (mkldnn_data_type != "float32") { return false; } } @@ -143,14 +145,28 @@ class OneDNNBf16PlacementPattern : public pir::RewritePattern { } pir::Type type = op->operand_type(i); if (!type) continue; - if (!type.isa()) { - // We skip pir::VectorType - // TODO(Lirong, Xinyi): Support pir::VectorType in bf16 - return false; - } - pir::Type op_dtype = pir::GetDataTypeFromValue(value); - // Only float input can be converted to bfloat16 - if (!op_dtype.isa()) { + if (type.isa()) { + // Support pir::VectorType in bf16 + // Special op will do detailed check in its pattern + pir::VectorType vector_type = value.type().dyn_cast(); + for (size_t idx = 0; idx < static_cast(vector_type.size()); + idx++) { + auto input_type = + vector_type[idx].isa(); + // We don't precess nested VectorType + if (!input_type) return false; + pir::Type input_dtype = + vector_type[idx] + .dyn_cast() + .dtype(); + // Only float input can be converted to bfloat16 + if (!input_dtype.isa()) return false; + } + } else if (type.isa()) { + pir::Type op_dtype = pir::GetDataTypeFromValue(value); + // Only float input can be converted to bfloat16 + if (!op_dtype.isa()) return false; + } else { return false; } } @@ -211,6 +227,7 @@ class RemoveOrphanedPattern : public pir::RewritePattern { if (!op->isa() && !op->isa() && !op->isa() && + !op->isa() && !op->isa() && !op->isa() && !op->isa() && @@ -292,15 +309,17 @@ class RemoveOrphanedPattern : public pir::RewritePattern { } } } else { - // The first op in graph - return false; + // The first op in graph should be treated as prev_fp32 = true + prev_fp32 = true; } + size_t num_useops = 0; for (uint32_t i = 0; i < op->num_results(); i++) { if (!op->result(i) || !op->result(i).type()) { continue; } auto next_op_list = pir::GetUseOpsForOutput(op, i); + num_useops += next_op_list.size(); for (auto const& [next_op, op_index] : next_op_list) { // Some ops do not need to be processed std::string next_op_name = next_op->name(); @@ -325,6 +344,10 @@ class RemoveOrphanedPattern : public pir::RewritePattern { } } + // Check if it's the last op on graph. 
If it is, this op can be seen as a + // fp32 op down here + if (num_useops == 0) next_fp32 = true; + return prev_fp32 && next_fp32; } @@ -354,97 +377,6 @@ class RemoveOrphanedPattern : public pir::RewritePattern { } }; -class RemoveUnsupportedOpPattern : public pir::RewritePattern { - public: - explicit RemoveUnsupportedOpPattern(pir::IrContext* context) - : pir::RewritePattern(MatchAnyOpTypeTag(), - 1 /*benefit*/, - context, - {} /*generated_names*/) {} - - bool Match(pir::Operation* op) const override { // NOLINT - if (!op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa()) { - return false; - } - auto op_attr = op->attributes(); - if (op_attr.find("mkldnn_data_type") != op_attr.end()) { - auto mkldnn_data_type = op_attr.at("mkldnn_data_type") - .dyn_cast() - .AsString(); - if (mkldnn_data_type != "bfloat16") { - return false; - } - } - - uint32_t num_operands = op->num_operands(); - for (uint32_t i = 0; i < num_operands; i++) { - auto* pre_op = pir::GetDefiningOpForInput(op, i); - if (pre_op->HasAttribute("mkldnn_data_type")) { - return false; - } - } - - return true; - } - - void Rewrite(pir::Operation* op, - pir::PatternRewriter& rewriter) const override { // NOLINT - std::string target_op_name = op->name(); - auto op_info = - pir::IrContext::Instance()->GetRegisteredOpInfo(target_op_name); - if (op_info) { - std::vector op_item_inner_output_types; - for (size_t i = 0; i < op->num_results(); ++i) { - op_item_inner_output_types.push_back(op->result_type(i)); - } - auto attributes = op->attributes(); - if (attributes.find("mkldnn_data_type") != attributes.end()) { - attributes["mkldnn_data_type"] = - pir::StrAttribute::get(pir::IrContext::Instance(), "float32"); - } - pir::Operation* op_item_inner = rewriter.Build(op->operands_source(), - attributes, - op_item_inner_output_types, - op_info); - rewriter.ReplaceOp(op, op_item_inner->results()); - } - } -}; - class OneDNNPlacementBf16Pass : public pir::PatternRewritePass { public: OneDNNPlacementBf16Pass() @@ -454,7 +386,6 @@ class OneDNNPlacementBf16Pass : public pir::PatternRewritePass { pir::RewritePatternSet ps(context); ps.Add(context); ps.Add(context); - ps.Add(context); return ps; } diff --git a/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.cc b/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.cc index eb586c40c16773..22179947f25afe 100644 --- a/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
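With placement now marking concat as bfloat16 up front, the ConcatBf16QuantizePattern hunks below invert the earlier guard: the pattern only fires when mkldnn_data_type is already "bfloat16", and it bails out if any input of the preceding combine is produced by onednn_op.quantize, so the rewrite does not fire twice. A small Python sketch of that guard, using a made-up IR value stub:

    # Hypothetical stub: each SSA value records the op that produced it.
    class Value:
        def __init__(self, producer_name):
            self.producer_name = producer_name

    def needs_quantize_rewrite(op_dtype, combine_inputs):
        if op_dtype != "bfloat16":      # placement pass must have marked the op
            return False
        for v in combine_inputs:
            if v.producer_name == "onednn_op.quantize":
                return False            # already rewritten once; do not fire again
        return True

    inputs = [Value("pd_op.conv2d"), Value("pd_op.relu")]
    assert needs_quantize_rewrite("bfloat16", inputs)
    inputs[0] = Value("onednn_op.quantize")
    assert not needs_quantize_rewrite("bfloat16", inputs)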
@@ -59,18 +59,15 @@ class ConcatBf16QuantizePattern auto onednn_data_type = op_attributes.at("mkldnn_data_type") .dyn_cast() .AsString(); - if (onednn_data_type == "bfloat16") return false; - op_attributes["mkldnn_data_type"] = rewriter.str_attr("bfloat16"); + if (onednn_data_type != "bfloat16") return false; auto combine_inputs = pre_op.inputs(); for (size_t idx = 0; idx < combine_inputs.size(); idx++) { - auto type = pre_op->operand_type(idx); - // Currently we only process case where elements are all DenseTensor(s) - if (!type.isa()) return false; - // All Tensors should be fp32 - auto dtype = pir::GetDataTypeFromValue(pre_op->operand_source(idx)); - if (!dtype.isa()) return false; + // Check if it's already quantized + auto pre_pre_op = pir::GetDefiningOpForInput(pre_op, idx); + if (pre_pre_op && pre_pre_op->name() == "onednn_op.quantize") + return false; } pir::IrContext *ctx = rewriter.ir_context(); @@ -95,6 +92,7 @@ class ConcatBf16QuantizePattern quant_op->result(0).set_type(new_type); new_combine_inputs[idx] = quant_op.output(); } + // Create new combine pir::CombineOp new_combine = rewriter.Build(new_combine_inputs); @@ -146,7 +144,12 @@ class CPUSpecialOpsBf16Pass : public pir::PatternRewritePass { auto concat_bf16_quant_pattern = std::make_unique( - context, benefit--, std::vector{}); + context, + benefit--, + std::vector{ + paddle::onednn::dialect::QuantizeOp::name(), + paddle::onednn::dialect::DequantizeOp::name(), + }); ps.Add(std::move(concat_bf16_quant_pattern)); return ps; diff --git a/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.h b/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.h index 9dcf771121c24b..781858f00e0a5d 100644 --- a/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.h +++ b/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc b/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc index c4f6c4824ecdd4..fd26907815c1e8 100644 --- a/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc @@ -1,5 +1,4 @@ -// REGISTER_IR_PASS(onednn_placement_pass, OneDNNPlacementPass); -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/ir/pir/fused_pass/onednn/test_cpu_bfloat16_pir_pass.py b/test/ir/pir/fused_pass/onednn/test_cpu_bfloat16_pir_pass.py index 734611b5fe52ff..d8de881c364980 100644 --- a/test/ir/pir/fused_pass/onednn/test_cpu_bfloat16_pir_pass.py +++ b/test/ir/pir/fused_pass/onednn/test_cpu_bfloat16_pir_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
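The test hunk below reflects the resulting pass ordering: onednn_placement_pass first moves ops onto the oneDNN dialect, cpu_bfloat16_placement_pass then marks the ops that may run in bfloat16, and cpu_special_ops_bf16_pass finally inserts the quantize/dequantize ops around special cases such as concat. In these PIR pass tests the pipeline is configured as a list of single-entry dicts, e.g.:

    # Order matters: the special-ops pass relies on the bfloat16 marking done
    # by the placement pass one step earlier.
    pass_attr_list = [
        {'onednn_placement_pass': {}},        # convert ops to the oneDNN dialect
        {'cpu_bfloat16_placement_pass': {}},  # mark bf16-capable ops
        {'cpu_special_ops_bf16_pass': {}},    # insert quantize/dequantize
    ]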
@@ -1143,6 +1143,7 @@ def build_ir_program(self): out = paddle.assign(out) self.pass_attr_list = [ {'onednn_placement_pass': {}}, + {'cpu_bfloat16_placement_pass': {}}, {'cpu_special_ops_bf16_pass': {}}, ] self.feeds = { From fbc9a6a3755b6a35aef4593fb7e9af5ffccb6788 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Wed, 8 Jan 2025 10:26:15 +0800 Subject: [PATCH 07/57] [CodeStyle][Typos][F-[12-17],F-[19-24],F-[26-28]] Fix typo(`Flattend`,`flattend`,`flattern`,`Flattern`,`filpped`,`flaot`,`follwed`,`folowing`,`formater`,`formating`,`foramt`,`formt`,`formate`,`forwad`,`forwrad`,`forword`,`founf`,`framwork`,`frequence`,`fron`,`fullfill`) (#70646) --------- Co-authored-by: Nyakku Shigure --- CONTRIBUTING.md | 4 +- _typos.toml | 21 ---- paddle/cinn/common/ir_util.h | 2 +- paddle/cinn/runtime/cuda/cuda_util.cc | 2 +- paddle/fluid/framework/data_transform.cc | 2 +- .../framework/new_executor/pir_interpreter.cc | 2 +- paddle/fluid/inference/tensorrt/op_teller.cc | 4 +- .../tensorrt/plugin/custom_generic_plugin.cu | 6 +- .../tensorrt/plugin/gelu_op_plugin.cu | 2 +- .../multiary_infer_sym.cc | 2 +- paddle/phi/kernels/cpu/unique_kernel.cc | 18 +-- paddle/phi/kernels/funcs/math_cuda_utils.h | 2 +- paddle/phi/kernels/funcs/unique_functor.h | 56 +++++----- paddle/phi/kernels/gpu/rms_norm_funcs.h | 2 +- .../phi/kernels/gpu/rms_norm_grad_kernel.cu | 2 +- .../kernels/gpu/unique_consecutive_functor.h | 4 +- paddle/phi/kernels/gpu/unique_kernel.cu | 104 +++++++++--------- paddle/phi/kernels/gpudnn/conv_gpudnn_base.h | 2 +- python/paddle/amp/grad_scaler.py | 16 +-- .../hybrid_parallel_gradscaler.py | 6 +- .../passes/auto_parallel_sharding.py | 2 +- .../pipeline_zero_bubble.py | 6 +- python/paddle/incubate/asp/utils.py | 14 +-- .../paddle/io/dataloader/dataloader_iter.py | 2 +- python/paddle/text/datasets/imikolov.py | 3 +- test/legacy_test/test_gather_op.py | 2 +- test/legacy_test/test_lstm_op.py | 4 +- tools/gen_ut_cmakelists.py | 4 +- 28 files changed, 138 insertions(+), 158 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d0c06e6ccf443f..8f03b35783a5ff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,7 +39,7 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful- pre-commit install ``` - Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python. + Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python. Once installed, `pre-commit` checks the style of code and documentation in every commit. 
We will see something like the following when you run `git commit`: @@ -52,7 +52,7 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful- Check for broken symlinks................................................Passed Detect Private Key...................................(no files to check)Skipped Fix End of Files.....................................(no files to check)Skipped - clang-formater.......................................(no files to check)Skipped + clang-format.........................................(no files to check)Skipped [my-cool-stuff c703c041] add test file 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 233 diff --git a/_typos.toml b/_typos.toml index 169423520b98d5..2d6bcfacf8f928 100644 --- a/_typos.toml +++ b/_typos.toml @@ -46,27 +46,6 @@ dobule = 'dobule' Dowloading = 'Dowloading' downsteram = 'downsteram' fetchs = 'fetchs' -Flattend = 'Flattend' -flattend = 'flattend' -flattern = 'flattern' -Flattern = 'Flattern' -filpped = 'filpped' -flaot = 'flaot' -follwed = 'follwed' -folowing = 'folowing' -formater = 'formater' -formating = 'formating' -foramt = 'foramt' -formate = 'formate' -formt = 'formt' -forwrad = 'forwrad' -forwad = 'forwad' -forword = 'forword' -founf = 'founf' -framwork = 'framwork' -frequence = 'frequence' -fron = 'fron' -fullfill = 'fullfill' Indexs = 'Indexs' indexs = 'indexs' indiates = 'indiates' diff --git a/paddle/cinn/common/ir_util.h b/paddle/cinn/common/ir_util.h index 724be629e6e93e..cbfe072d307016 100644 --- a/paddle/cinn/common/ir_util.h +++ b/paddle/cinn/common/ir_util.h @@ -191,7 +191,7 @@ inline void UnpackReduction(const ir::IndexExpr &expr, FLeaf fleaf) { } /*! - * \brief Flattern the expression into a vector of expressions splited by `Add` + * \brief Flatten the expression into a vector of expressions splited by `Add` * or `Mul`. * * For example (Add): diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index a0c12732a4ad5d..af0017222231bc 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -1742,7 +1742,7 @@ void cinn_call_cholesky_nvgpu(void *v_args, cinn_buffer_t *x = args[0].operator cinn_buffer_t *(); cinn_buffer_t *out = args[1].operator cinn_buffer_t *(); // In cuSOLVER, dense matrix stores in COL_MAJOR, thus FILL_MODE needs to be - // filpped. See also: + // flipped. See also: // https://docs.nvidia.com/cuda/cusolver/index.html#matrix-dense-format cublasFillMode_t uplo = upper ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index c8cf06fe27aec8..71d1ae8047105b 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -173,7 +173,7 @@ phi::GetKernelTypeForVarContext BuildGetKernelTypeForVarContext( if (has_infer_varkernel_fn) { for (auto &attr : fluid_attrs) { switch (attr.second.index()) { - case 3: // string type in framwork::Attribute + case 3: // string type in framework::Attribute (*phi_attrs)[attr.first] = PADDLE_GET_CONST(std::string, attr.second); break; default: diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 75def437deafda..287ca3fb178ea5 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -979,7 +979,7 @@ void PirInterpreter::BuildInstruction() { } std::string PirInterpreter::DebugInstructions() { - // log formate: var[101] = pd_op.relu(var[100]) or for inplace op var[100] = + // log format: var[101] = pd_op.relu(var[100]) or for inplace op var[100] = // pd_op.relu_(var[100]) std::stringstream ss; ss << "{outputs}" diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index d0becae8c45ed6..15f2fba66b1932 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -3467,9 +3467,9 @@ struct CustomGenericPluginTeller : public Teller { "SetTrtInferShapeFn."; return false; } - auto& trt_supports_formate_config = + auto& trt_supports_format_config = OpMetaInfoHelper::GetTrtSupportsFormatConfig(op_info); - if (trt_supports_formate_config.empty()) { + if (trt_supports_format_config.empty()) { VLOG(3) << op_type << " has no trt supportsFormatCombination config. 
Please set by " diff --git a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu index af5db479f10592..d6d76c6b9618ea 100644 --- a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu @@ -311,9 +311,9 @@ bool CustomGenericPlugin::supportsFormatCombination( auto& op_meta_info_map = OpMetaInfoMap::Instance(); const auto& meta_info_map = op_meta_info_map.GetMap(); auto& op_info = meta_info_map.at(op_desc_.Type()).front(); - auto& supports_formate_config = + auto& supports_format_config = OpMetaInfoHelper::GetTrtSupportsFormatConfig(op_info); - PADDLE_ENFORCE_NE(supports_formate_config.empty(), + PADDLE_ENFORCE_NE(supports_format_config.empty(), true, common::errors::InvalidArgument( "The %s op has no tensorrt plugin " @@ -325,7 +325,7 @@ bool CustomGenericPlugin::supportsFormatCombination( size_t output_num = OpMetaInfoHelper::GetOutputs(op_info).size(); std::vector>> format_combinations; - for (auto& config : supports_formate_config) { + for (auto& config : supports_format_config) { auto format_combination = parseConfig(op_desc_.Type(), config); PADDLE_ENFORCE_EQ(input_num + output_num, format_combination.size(), diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index f5369eb691c69e..c1b4aad6d73c06 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -77,7 +77,7 @@ __device__ half do_tanh(half a) { return __float2half(tmp); } -// the kernel below is not aligned with fluid fp32 forwrad ones, use it for +// the kernel below is not aligned with fluid fp32 forward ones, use it for // fp16. template __global__ void no_exact_gelu_kernel( diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 3f2c8397a61415..9809acfb576b71 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -2631,7 +2631,7 @@ bool GroupNormOpInferSymbolicShape( channel_idx = 1; } else { PADDLE_THROW(common::errors::Unimplemented( - "GroupNorm only suport NHWC and NCHW data formt")); + "GroupNorm only suport NHWC and NCHW data format")); } symbol::DimExpr channel_dim = x_shape.shape()[channel_idx]; diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc index e3be49af16ed3c..8a0b9046a15b84 100644 --- a/paddle/phi/kernels/cpu/unique_kernel.cc +++ b/paddle/phi/kernels/cpu/unique_kernel.cc @@ -83,15 +83,15 @@ void UniqueRawKernel(const Context& context, if (axis.empty()) { phi::VisitDataTypeTiny( dtype, - phi::funcs::UniqueFlattendTensorFunctor(context, - x, - out, - indices, - index, - counts, - return_index, - return_inverse, - return_counts)); + phi::funcs::UniqueFlattenedTensorFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); } else { int axis_value = axis[0]; axis_value = (axis_value == -1) ? 
(x.dims().size() - 1) : axis_value; diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h index a5aaa1310b16db..f14b2af8c72609 100644 --- a/paddle/phi/kernels/funcs/math_cuda_utils.h +++ b/paddle/phi/kernels/funcs/math_cuda_utils.h @@ -298,7 +298,7 @@ __inline__ __device__ T PartialWarpReduceMin(T val, warp_mask_t lane_mask) { T warp_val = __shfl_sync(lane_mask, val, 0, warpSize); #else T warp_val = __shfl( - val, 0, warpSize); // To fullfill the data in each thread of this warp. + val, 0, warpSize); // To fulfill the data in each thread of this warp. #endif warp_val = val; diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h index 8d62a0c5255e46..4365f1a5f4cfe6 100644 --- a/paddle/phi/kernels/funcs/unique_functor.h +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -130,15 +130,15 @@ static bool Equal(const DenseTensor& a, const DenseTensor& b) { } template -static void UniqueFlattendTensor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* count, - bool return_index, - bool return_inverse, - bool return_counts) { +static void UniqueFlattenedTensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts) { const InT* in_data = in.data(); std::set unique(in_data, in_data + in.numel()); out->Resize(common::make_ddim({static_cast(unique.size())})); @@ -327,7 +327,7 @@ static void UniqueDim(const Context& context, } template -struct UniqueFlattendTensorFunctor { +struct UniqueFlattenedTensorFunctor { const Context& ctx_; /* */ const DenseTensor& in_; DenseTensor* out_; @@ -338,15 +338,15 @@ struct UniqueFlattendTensorFunctor { const bool return_inverse_; const bool return_counts_; - UniqueFlattendTensorFunctor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* count, - bool return_index, - bool return_inverse, - bool return_counts) + UniqueFlattenedTensorFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts) : ctx_(context), in_(in), out_(out), @@ -359,15 +359,15 @@ struct UniqueFlattendTensorFunctor { template void apply() const { - UniqueFlattendTensor(ctx_, - in_, - out_, - indices_, - index_, - count_, - return_index_, - return_inverse_, - return_counts_); + UniqueFlattenedTensor(ctx_, + in_, + out_, + indices_, + index_, + count_, + return_index_, + return_inverse_, + return_counts_); } }; diff --git a/paddle/phi/kernels/gpu/rms_norm_funcs.h b/paddle/phi/kernels/gpu/rms_norm_funcs.h index 2954d593014a6c..db6a137a02d386 100644 --- a/paddle/phi/kernels/gpu/rms_norm_funcs.h +++ b/paddle/phi/kernels/gpu/rms_norm_funcs.h @@ -14,7 +14,7 @@ limitations under the License. */ /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ -/*This code is copied fron NVIDIA apex: +/*This code is copied from NVIDIA apex: * https://github.com/NVIDIA/apex * with minor changes. 
*/ diff --git a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu index 5be55226813646..342737e9b20bd5 100644 --- a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ -/*This code is copied fron NVIDIA apex: +/*This code is copied from NVIDIA apex: * https://github.com/NVIDIA/apex * with minor changes. */ diff --git a/paddle/phi/kernels/gpu/unique_consecutive_functor.h b/paddle/phi/kernels/gpu/unique_consecutive_functor.h index dae83a45a8e917..f094da335f396d 100644 --- a/paddle/phi/kernels/gpu/unique_consecutive_functor.h +++ b/paddle/phi/kernels/gpu/unique_consecutive_functor.h @@ -32,7 +32,7 @@ namespace phi { -// The core logic of computing Unique Consecutive for a flattend Tensor +// The core logic of computing Unique Consecutive for a flattened Tensor template struct UniqueConsecutiveFlattenedCUDAFunctor { const Context& ctx_; diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index 341483e57d56b4..e08aa5bece3bc4 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -106,21 +106,21 @@ struct BinaryNotEqual { } }; -// The core logic of computing Unique for a flattend DenseTensor +// The core logic of computing Unique for a flattened DenseTensor template static typename std::enable_if< !std::is_same::value && !std::is_same::value>::type -UniqueFlattendCUDATensor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* counts, - bool return_index, - bool return_inverse, - bool return_counts, - int64_t num_input) { +UniqueFlattenedCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + int64_t num_input) { // 0. Prepration auto equal = thrust::equal_to(); auto not_equal = thrust::not_equal_to(); @@ -242,21 +242,21 @@ UniqueFlattendCUDATensor(const Context& context, } } -// The core logic of computing Unique for a flattend DenseTensor +// The core logic of computing Unique for a flattened DenseTensor template static typename std::enable_if< std::is_same::value || std::is_same::value>::type -UniqueFlattendCUDATensor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* counts, - bool return_index, - bool return_inverse, - bool return_counts, - int64_t num_input) { +UniqueFlattenedCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + int64_t num_input) { // 1. 
Sort indices DenseTensor in_resize; in_resize.ShareDataWith(in); @@ -526,9 +526,9 @@ static void UniqueDimsCUDATensor(const Context& context, } } -// functor for processing a flattend DenseTensor +// functor for processing a flattened DenseTensor template -struct UniqueFlattendCUDAFunctor { +struct UniqueFlattenedCUDAFunctor { const Context& ctx_; const DenseTensor& in_; DenseTensor* out_; @@ -539,15 +539,15 @@ struct UniqueFlattendCUDAFunctor { const bool return_inverse_; const bool return_counts_; - UniqueFlattendCUDAFunctor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* counts, - bool return_index, - bool return_inverse, - bool return_counts) + UniqueFlattenedCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts) : ctx_(context), in_(in), out_(out), @@ -560,16 +560,16 @@ struct UniqueFlattendCUDAFunctor { template void apply() const { - UniqueFlattendCUDATensor(ctx_, - in_, - out_, - indices_, - index_, - counts_, - return_index_, - return_inverse_, - return_counts_, - in_.numel()); + UniqueFlattenedCUDATensor(ctx_, + in_, + out_, + indices_, + index_, + counts_, + return_index_, + return_inverse_, + return_counts_, + in_.numel()); } }; @@ -650,15 +650,15 @@ void UniqueRawKernel(const Context& context, if (axis.empty()) { phi::VisitDataTypeTiny( dtype, - UniqueFlattendCUDAFunctor(context, - x, - out, - indices, - index, - counts, - return_index, - return_inverse, - return_counts)); + UniqueFlattenedCUDAFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); } else { // 'axis' is required. 
int axis_value = axis[0]; diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h index 5b55aa8f70394a..a21ed28d839a4a 100644 --- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h @@ -120,7 +120,7 @@ struct ConvArgsBase { // groups int group; - // data foramt + // data format GPUDNNDataLayout data_layout; ConvArgsBase(const HandleT& h, diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 4ba1524a307d9d..c371918e3f0e4f 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -181,7 +181,7 @@ def __init__( self._scale = paddle.to_tensor( np.array([self._init_loss_scaling]).astype(np.float32) ) - self._cache_founf_inf = None + self._cache_found_inf = None self._optimizer_states = defaultdict(_refresh_optimizer_state) def scale(self, var: Tensor) -> Tensor: @@ -335,13 +335,13 @@ def minimize( optimizer._set_auxiliary_var('found_inf', self._found_inf) optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) # TODO: Fix to _cache_found_inf after PaddleNLP update - self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + self._cache_found_inf = optimizer._get_auxiliary_var('found_inf') else: if self._found_inf: - self._cache_founf_inf = True + self._cache_found_inf = True else: optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) - self._cache_founf_inf = False + self._cache_found_inf = False if self._use_dynamic_loss_scaling: # update the scale @@ -462,7 +462,7 @@ def _update(self): if not self._enable: return - if self._cache_founf_inf: + if self._cache_found_inf: self._incr_count = 0 self._decr_count = self._decr_count + 1 if self._decr_count == self._decr_every_n_nan_or_inf: @@ -846,13 +846,13 @@ def step(self, optimizer: Optimizer) -> None: if hasattr(optimizer, "_set_auxiliary_var"): optimizer._set_auxiliary_var('found_inf', self._found_inf) optimizer.step() - self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + self._cache_found_inf = optimizer._get_auxiliary_var('found_inf') else: if self._found_inf: - self._cache_founf_inf = True + self._cache_found_inf = True else: optimizer.step() - self._cache_founf_inf = False + self._cache_found_inf = False optimizer_state["state"] = OptimizerState.STEPPED diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py index 358c6023e6c6f7..c9a684ae807be4 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py @@ -45,13 +45,13 @@ def minimize(self, optimizer, *args, **kwargs): optimizer._set_auxiliary_var('found_inf', self._found_inf) optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) # TODO: Fix to _cache_found_inf after PaddleNLP update - self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + self._cache_found_inf = optimizer._get_auxiliary_var('found_inf') else: if self._found_inf: - self._cache_founf_inf = True + self._cache_found_inf = True else: optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) - self._cache_founf_inf = False + self._cache_found_inf = False if self._use_dynamic_loss_scaling: self._update() diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py 
b/python/paddle/distributed/passes/auto_parallel_sharding.py index 4b7814af7f53ea..c6315c78ad4617 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -196,7 +196,7 @@ def _apply_single_impl(self, main_program, startup_program, context): # NOTE Multi / Sub-Block Support # we assume that only parameter are present and partitioned in main_block, # there is NO new param in sub_block, and all params in sub_block follows the same - # partition as main_block. the above constraint fullfill the 3 most common use-cases in Paddle sub_block: + # partition as main_block. the above constraint fulfill the 3 most common use-cases in Paddle sub_block: # 1. subblock for lr scheduler # 2. sub-block uses the same or partial network of main-block, e.g. GPT3 generation model # 3. sub-block used for double backward diff --git a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_zero_bubble.py b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_zero_bubble.py index d6025d80e5e7c8..112373cebcd404 100644 --- a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_zero_bubble.py +++ b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_zero_bubble.py @@ -604,11 +604,11 @@ def _insert_jobs_after_backward_start( break # Step3: Insert forward jobs after backward_b - forword_insert_order = range(self.num_stage) + forward_insert_order = range(self.num_stage) if self.num_model_chunks % 2: - forword_insert_order = range(self.num_stage - 1, -1, -1) + forward_insert_order = range(self.num_stage - 1, -1, -1) - for stage_id in forword_insert_order: + for stage_id in forward_insert_order: for chunk_id in range(self.num_model_chunks - 1, -1, -1): if self._can_schedule_f_task(stage_id, chunk_id): while ( diff --git a/python/paddle/incubate/asp/utils.py b/python/paddle/incubate/asp/utils.py index 408c3d3a6b0866..1fef294dc41826 100644 --- a/python/paddle/incubate/asp/utils.py +++ b/python/paddle/incubate/asp/utils.py @@ -220,14 +220,14 @@ def get_mask_1d(mat: npt.NDArray[Any], n: int, m: int) -> npt.NDArray[Any]: """ mat_flatten, shape = _reshape_1d(mat, m) - mask_flattern = np.ones_like(mat_flatten) + mask_flatten = np.ones_like(mat_flatten) mask = np.ones_like(mat) for i in range(mat_flatten.shape[0]): sub_mat = mat_flatten[i] min_order_indices = np.argsort(np.absolute(sub_mat)) - mask_flattern[i, min_order_indices[:n].tolist()] = 0 - mask_flattern = mask_flattern.reshape(shape) - mask[:, :] = mask_flattern[:, : mat.shape[1]] + mask_flatten[i, min_order_indices[:n].tolist()] = 0 + mask_flatten = mask_flatten.reshape(shape) + mask[:, :] = mask_flatten[:, : mat.shape[1]] return mask @@ -486,13 +486,13 @@ def get_mask_2d_best(mat: npt.NDArray[Any], n: int, m: int) -> npt.NDArray[Any]: patterns = _compute_valid_2d_patterns(n, m) mat_flatten, shape = _reshape_2d(mat, m) - mask_flattern = np.ones_like(mat_flatten).reshape(-1, m, m) + mask_flatten = np.ones_like(mat_flatten).reshape(-1, m, m) pmax = np.argmax( np.matmul(mat_flatten, patterns.reshape(patterns.shape[0], m * m).T), axis=1, ) - mask_flattern[:] = patterns[pmax[:]] + mask_flatten[:] = patterns[pmax[:]] mask = np.empty(shape) curr_idx = 0 @@ -500,7 +500,7 @@ def get_mask_2d_best(mat: npt.NDArray[Any], n: int, m: int) -> npt.NDArray[Any]: row_end = row_start + m for col_start in range(0, shape[1], m): col_end = col_start + m - mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx] + mask[row_start:row_end, 
col_start:col_end] = mask_flatten[curr_idx] curr_idx += 1 return mask[: mat.shape[0], : mat.shape[1]] diff --git a/python/paddle/io/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py index 836c0b40224c6f..8b3ba314388eab 100644 --- a/python/paddle/io/dataloader/dataloader_iter.py +++ b/python/paddle/io/dataloader/dataloader_iter.py @@ -52,7 +52,7 @@ # layers processing) after iterate **the first few data** in # distributed launch mode, distributed launch will call # terminate() to kill main process on each devices, but thread -# is still iterating to fullfill blocking queue caches, which +# is still iterating to fulfill blocking queue caches, which # may cause thread error `terminate called without an active # exception` for terminate is a strong signal and `__del__` # of DataLoader may not be called, so we add a global link to diff --git a/python/paddle/text/datasets/imikolov.py b/python/paddle/text/datasets/imikolov.py index df7b4383f6318f..6691b1fd6ef5c1 100644 --- a/python/paddle/text/datasets/imikolov.py +++ b/python/paddle/text/datasets/imikolov.py @@ -43,7 +43,7 @@ class Imikolov(Dataset): data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'. window_size(int): sliding window size for 'NGRAM' data. Default -1. mode(str): 'train' 'test' mode. Default 'train'. - min_word_freq(int): minimal word frequence for building word dictionary. Default 50. + min_word_freq(int): minimal word frequencies for building word dictionary. Default 50. download(bool): whether to download dataset automatically if :attr:`data_file` is not set. Default True @@ -54,6 +54,7 @@ class Imikolov(Dataset): .. code-block:: python + >>> # doctest: +TIMEOUT(60) >>> import paddle >>> from paddle.text.datasets import Imikolov diff --git a/test/legacy_test/test_gather_op.py b/test/legacy_test/test_gather_op.py index c4ebe86af2ad97..d8227134d6b5d2 100644 --- a/test/legacy_test/test_gather_op.py +++ b/test/legacy_test/test_gather_op.py @@ -471,7 +471,7 @@ def config(self): class TestOutOfRangeError(unittest.TestCase): - def test_dygraph_forwad_and_backward(self): + def test_dygraph_forward_and_backward(self): with dygraph_guard(): x = paddle.randn([100, 3]).cpu() x.stop_gradient = False diff --git a/test/legacy_test/test_lstm_op.py b/test/legacy_test/test_lstm_op.py index 2f3f3fe4ed683e..fca6d226e90705 100644 --- a/test/legacy_test/test_lstm_op.py +++ b/test/legacy_test/test_lstm_op.py @@ -207,7 +207,7 @@ def test_check_output(self): self.check_output(atol=1e-8, check_dygraph=False) def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. + # TODO(qingqing) remove following lines after the check_grad is refined. N = len(self.lod[0]) self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( @@ -259,7 +259,7 @@ def test_check_grad(self): # self.use_peepholes = True # def test_check_grad(self): -# # TODO(qingqing) remove folowing lines after the check_grad is refined. +# # TODO(qingqing) remove following lines after the check_grad is refined. 
# N = len(self.lod[0]) # self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') # self.outputs['BatchCellPreAct'] = np.zeros( diff --git a/tools/gen_ut_cmakelists.py b/tools/gen_ut_cmakelists.py index f64f065c19da65..50819aa9a0de58 100644 --- a/tools/gen_ut_cmakelists.py +++ b/tools/gen_ut_cmakelists.py @@ -238,7 +238,7 @@ def process_dist_port_num(self, port_num): re.compile("^[0-9]+$").search(port_num) and int(port_num) > 0 or port_num.strip() == "" - ), f"""port_num must be foramt as a positive integer or empty, but this port_num is '{port_num}'""" + ), f"""port_num must be format as a positive integer or empty, but this port_num is '{port_num}'""" port_num = port_num.strip() if len(port_num) == 0: return 0 @@ -270,7 +270,7 @@ def _init_dist_ut_ports_from_cmakefile(self, cmake_file_name): break name = lines[k - 1].strip() - # matcg right tets name format, the name must start with 'test_' follwed bu at least one char of + # matcg right tets name format, the name must start with 'test_' followed bu at least one char of # '0-9'. 'a-z'. 'A-Z' or '_' assert re.compile("^test_[0-9a-zA-Z_]+").search( name From 6b69d206f5e1cc914610faee31700f1adde07aaf Mon Sep 17 00:00:00 2001 From: Chandler <44045446+BaolanChen@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:33:43 +0800 Subject: [PATCH 08/57] [CodeStyle][Typos][D-[37-44]] Fix typos (`dito`,`devide`,`documention`,`doens`,`doen`,`dobule`,`doubel`,`dowloading`,`downsteram`) (#70642) * Typos fix D37-44 * Typos fix D37-44 changes * merge changes --- _typos.toml | 9 --------- paddle/cinn/operator_fusion/graph_transformer/matcher.h | 6 +++--- paddle/fluid/inference/tensorrt/op_teller.cc | 2 +- .../fluid/operators/controlflow/control_flow_op_helper.h | 2 +- paddle/fluid/operators/elementwise/elementwise_mul_op.cc | 2 +- paddle/fluid/operators/elementwise/elementwise_sub_op.cc | 2 +- paddle/fluid/pir/dialect/op_generator/op_gen.py | 2 +- paddle/phi/kernels/funcs/values_vectors_functor.h | 1 - paddle/scripts/paddle_build.sh | 2 +- paddle/utils/string/printf.h | 2 +- .../static/tuner/to_distributed_api_patterns.py | 2 +- python/paddle/nn/clip.py | 2 +- .../hybrid_strategy/to_distributed_api_for_llama.py | 2 +- test/ir/pir/cinn/llama_test_model.py | 2 +- test/legacy_test/test_cond.py | 2 +- 15 files changed, 15 insertions(+), 25 deletions(-) diff --git a/_typos.toml b/_typos.toml index 2d6bcfacf8f928..5355bd0657d18b 100644 --- a/_typos.toml +++ b/_typos.toml @@ -36,15 +36,6 @@ unpacket = "unpacket" # These words need to be fixed Creater = 'Creater' creater = 'creater' -dito = 'dito' -devide = 'devide' -documention = 'documention' -doens = 'doens' -doen = 'doen' -doubel = 'doubel' -dobule = 'dobule' -Dowloading = 'Dowloading' -downsteram = 'downsteram' fetchs = 'fetchs' Indexs = 'Indexs' indexs = 'indexs' diff --git a/paddle/cinn/operator_fusion/graph_transformer/matcher.h b/paddle/cinn/operator_fusion/graph_transformer/matcher.h index 80c205529009b1..36352e81a2f24c 100644 --- a/paddle/cinn/operator_fusion/graph_transformer/matcher.h +++ b/paddle/cinn/operator_fusion/graph_transformer/matcher.h @@ -285,9 +285,9 @@ struct LeafReshapeConnectionMatcher { struct NotAllElementWiseDownstreamMatcher { bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { size_t count = 0; - for (const auto& downsteram : node->downstream()) { - if (StmtPatternGraphMatcher()(graph, downsteram)) { - auto ops = std::get(downsteram->stmt_pattern()).ops(); + for (const auto& downstream : node->downstream()) { + if (StmtPatternGraphMatcher()(graph, 
downstream)) { + auto ops = std::get(downstream->stmt_pattern()).ops(); bool is_elementwise = std::all_of(ops.begin(), ops.end(), [](pir::Operation* op) { return GetOpPatternKind(op) == hlir::framework::kElementWise; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 15f2fba66b1932..9a21edd52d838a 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -2383,7 +2383,7 @@ struct SimpleOpTypeSetTeller : public Teller { // conv3d_transpose if (op_type == "conv3d_transpose") { - // trt doen't support output_padding when < 8406 + // trt doesn't support output_padding when < 8406 // output_padding is usually set when stride > 1 #if !IS_TRT_VERSION_GE(8400) if (desc.HasAttr("output_padding")) { diff --git a/paddle/fluid/operators/controlflow/control_flow_op_helper.h b/paddle/fluid/operators/controlflow/control_flow_op_helper.h index 188aa87c2bf9fb..52039c1049b958 100644 --- a/paddle/fluid/operators/controlflow/control_flow_op_helper.h +++ b/paddle/fluid/operators/controlflow/control_flow_op_helper.h @@ -121,7 +121,7 @@ static void AssignZeroToParentScope( PADDLE_ENFORCE_EQ(input_tensors.size(), outside_tensors->size(), common::errors::InvalidArgument( - "DenseTensorArray outside_var %s doen't have same " + "DenseTensorArray outside_var %s doesn't have same " "size as input_var %s.", outside_grad_name, input_name)); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index fcbded2b78adbb..844a6e5c750a0d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -139,7 +139,7 @@ class ElementwiseMulCompositeDoubleGradOpMaker -1, common::errors::InvalidArgument( "We only support axis = -1 in composite " - "add_doubel_grad but we got: ", + "add_double_grad but we got: ", axis)); // get output diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index cce90902bd7c02..e2126db86e7e3d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -125,7 +125,7 @@ class ElementwiseSubCompositeDoubleGradOpMaker -1, common::errors::InvalidArgument( "We only support axis = -1 in composite " - "subtract_doubel_grad but we got: ", + "subtract_double_grad but we got: ", axis)); paddle::Tensor* grad_out_grad = this->GetOutputPtr(&grad_out_grad_t); diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index a45964841116aa..3e33fe2205d618 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -342,7 +342,7 @@ class {TEST_API} {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{ 'int': 'pir::Int32Attribute', 'int64_t': 'pir::Int64Attribute', 'float': 'pir::FloatAttribute', - 'dobule': 'pir::DoubleAttribute', + 'double': 'pir::DoubleAttribute', 'bool': 'pir::BoolAttribute', } diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index d95b28fc59718f..b14f36aebb7cb0 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -467,7 +467,6 @@ struct MatrixEighFunctor { "When has_vectors is true," "the eigenvectors needs to be calculated," "so the eigenvectors 
must be provided.")); - // input_trans = dito.Transpose(input_trans); input_trans = phi::TransposeLast2Dim(dev_ctx, input_trans); eigen_vectors->ShareDataWith(input_trans); } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d0c0ed8db6b8f7..7090df20d6a5e4 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3674,7 +3674,7 @@ function distribute_test() { parallel_fa_unit echo "End FA tests" - echo "Dowloading ...." + echo "Downloading ...." cd ${work_dir} wget https://paddlenlp.bj.bcebos.com/wheels/PaddleNLP_stable_paddle.tar.gz --no-proxy tar -zvxf PaddleNLP_stable_paddle.tar.gz diff --git a/paddle/utils/string/printf.h b/paddle/utils/string/printf.h index f4576c6bc4aa54..f2c87fb5e8ed31 100644 --- a/paddle/utils/string/printf.h +++ b/paddle/utils/string/printf.h @@ -54,7 +54,7 @@ // weekday, month, day, hour, min); // // 2. High-performance -- most printed strings are not too long and -// doens't need dynamic memory allocation. Many StringPrintf +// doesn't need dynamic memory allocation. Many StringPrintf // implementations doesn't enforce type-safe, but are // high-performance, including // diff --git a/python/paddle/distributed/auto_parallel/static/tuner/to_distributed_api_patterns.py b/python/paddle/distributed/auto_parallel/static/tuner/to_distributed_api_patterns.py index 4887be8b757412..f0f564663d5e12 100644 --- a/python/paddle/distributed/auto_parallel/static/tuner/to_distributed_api_patterns.py +++ b/python/paddle/distributed/auto_parallel/static/tuner/to_distributed_api_patterns.py @@ -563,7 +563,7 @@ def apply( key_states = paddle.transpose(key_states, [0, 2, 1, 3]) value_states = paddle.transpose(value_states, [0, 2, 1, 3]) - # matmul and devide by sqrt(head_dim) + # matmul and divide by sqrt(head_dim) attn_weights = paddle.matmul( query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]), diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index f0ccab9ff068f1..9913063eb946f6 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -1407,7 +1407,7 @@ def set_gradient_clip(clip, param_list=None, program=None): "We recommend a new strategy: set 'grad_clip' " "when initializing the 'optimizer'. " "This method can reduce the mistakes, please " - "refer to documention of 'optimizer'." + "refer to documentation of 'optimizer'." 
) if not isinstance(clip, ClipGradBase): diff --git a/test/auto_parallel/hybrid_strategy/to_distributed_api_for_llama.py b/test/auto_parallel/hybrid_strategy/to_distributed_api_for_llama.py index 2fe3a039b635be..da5402ed1031e5 100644 --- a/test/auto_parallel/hybrid_strategy/to_distributed_api_for_llama.py +++ b/test/auto_parallel/hybrid_strategy/to_distributed_api_for_llama.py @@ -117,7 +117,7 @@ def scaled_dot_product_attention( key_states = paddle.transpose(key_states, [0, 2, 1, 3]) value_states = paddle.transpose(value_states, [0, 2, 1, 3]) - # matmul and devide by sqrt(head_dim) + # matmul and divide by sqrt(head_dim) attn_weights = paddle.matmul( query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]) ) diff --git a/test/ir/pir/cinn/llama_test_model.py b/test/ir/pir/cinn/llama_test_model.py index 4761aa6f649246..250ce96d7c2f72 100644 --- a/test/ir/pir/cinn/llama_test_model.py +++ b/test/ir/pir/cinn/llama_test_model.py @@ -187,7 +187,7 @@ def scaled_dot_product_attention( key_states = paddle.transpose(key_states, [0, 2, 1, 3]) value_states = paddle.transpose(value_states, [0, 2, 1, 3]) - # matmul and devide by sqrt(head_dim) + # matmul and divide by sqrt(head_dim) attn_weights = paddle.matmul( query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]) ) diff --git a/test/legacy_test/test_cond.py b/test/legacy_test/test_cond.py index bc9c61545cb473..d966db3587f4ae 100644 --- a/test/legacy_test/test_cond.py +++ b/test/legacy_test/test_cond.py @@ -471,7 +471,7 @@ def test_extremely_simple_net_with_op_in_condition(self): main_program, fetch_list=[out, b, a.grad_name, b.grad_name] ) # Note: fill_constant has loss of precision, you have to assertEqual - # with values doens't lose precision in float-point number. + # with values doesn't lose precision in float-point number. 
self.assertEqual(ret[0][0], ret[1][0]) self.assertEqual(ret[2][0], 0.0) self.assertEqual(ret[3][0], 1.0) From e8c33cdff9f32e72f6b09b92317e7f2a3f2928ef Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Wed, 8 Jan 2025 10:34:31 +0800 Subject: [PATCH 09/57] [CodeStyle][Typos][I-[12-14],I-[16-20]] Fix typos (`indiates`,`indeces`,`inferrence`,`infering`,`imformation`,`infomation`,`informations`,`Infor`,`infor`,`inheritted`,`initilization`) (#70650) * fix * fix --- _typos.toml | 18 +++++++----------- .../operator/transforms/lowering_pass/utils.cc | 2 +- paddle/cinn/hlir/framework/pir/utils.cc | 2 +- paddle/cinn/ir/ir.h | 2 +- paddle/cinn/poly/stage.h | 4 ++-- .../fluid/distributed/ps/service/brpc_utils.cc | 2 +- .../eager_generated/backwards/scale_node.h | 2 +- .../interpreter/interpreter_util.cc | 2 +- .../tensorrt/convert/emb_eltwise_layernorm.cc | 2 +- .../convert/preln_emb_eltwise_layernorm.cc | 2 +- .../prompt_tuning_emb_eltwise_layernorm.cc | 2 +- .../fluid/operators/generator/generate_op.py | 2 +- paddle/fluid/pybind/tensor.cc | 2 +- paddle/phi/infermeta/unary.cc | 2 +- paddle/phi/kernels/funcs/seq2col.h | 4 ++-- paddle/phi/kernels/impl/einsum_impl.h | 2 +- .../kernels/sparse/cpu/elementwise_kernel.cc | 6 +++--- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 2 +- paddle/pir/include/core/ir_context.h | 6 +++--- .../auto_parallel/static/auto_align_tool.py | 4 ++-- .../distributed/auto_parallel/static/engine.py | 4 ++-- .../paddle/distributed/fleet/utils/log_util.py | 2 +- python/paddle/distributed/rpc/rpc.py | 2 +- python/paddle/framework/io_utils.py | 18 +++++++++--------- .../test_align_tool_deprecated.py | 6 ++---- tools/parallel_UT_rule.py | 2 -- 26 files changed, 48 insertions(+), 56 deletions(-) diff --git a/_typos.toml b/_typos.toml index 5355bd0657d18b..a29bf57b1677b1 100644 --- a/_typos.toml +++ b/_typos.toml @@ -12,6 +12,13 @@ extend-exclude = [ [default] # Ignore 1-3 letter words, refer to https://github.com/crate-ci/typos/issues/1079 extend-ignore-words-re = ["^[a-zA-Z]{1,3}$"] +# refer to https://github.com/crate-ci/typos/blob/master/docs/reference.md#example-configurations +extend-ignore-re = [ + # Ignore lines by `# typos: disable-line` + "(?Rm)^.*(#|//)\\s*typos:\\s*disable-line$", + # Ignore block by `# typos: off` and `# typos: on` + "(?s)(#|//)\\s*typos:\\s*off.*?\\n\\s*(#|//)\\s*typos:\\s*on" +] [default.extend-words] # PaddlePaddle specific words @@ -39,19 +46,8 @@ creater = 'creater' fetchs = 'fetchs' Indexs = 'Indexs' indexs = 'indexs' -indiates = 'indiates' -indeces = 'indeces' -inferrence = 'inferrence' Infered = 'Infered' infered = 'infered' -infering = 'infering' -informations = 'informations' -imformation = 'imformation' -infomation = 'infomation' -Infor = 'Infor' -infor = 'infor' -inheritted = 'inheritted' -initilization = 'initilization' initilized = 'initilized' initalized = 'initalized' initalize = 'initalize' diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc index 59d00770c5e753..b1273e42868024 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc @@ -117,7 +117,7 @@ OpLoweringGroupPtr BuildOpLoweringGroup(pir::Operation* fusion_op_ptr) { if (FLAGS_cinn_enable_map_expr) { cinn::adt::TryGenerateMapExprFromGroup(group); } - // Rebuild other informations + // Rebuild other information // TODO(zhangyuqin1998): Do we need group.master_ops? 
return group; } diff --git a/paddle/cinn/hlir/framework/pir/utils.cc b/paddle/cinn/hlir/framework/pir/utils.cc index 69482f296531a7..fb3d754e669e45 100644 --- a/paddle/cinn/hlir/framework/pir/utils.cc +++ b/paddle/cinn/hlir/framework/pir/utils.cc @@ -95,7 +95,7 @@ std::string GetDebugInfo(const std::unordered_set& names) { return debug_info; } -// OpTransInfo contains informations used to detect subgraphs +// OpTransInfo contains information used to detect subgraphs // supported by the CINN compiler. class OpTransInfo { using DeParamCondT = diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index 94d6000da798ad..9cfad3ba57ca8c 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -848,7 +848,7 @@ struct ForBase { BindInfo bind_info_; }; -/// LLVM loop unroll metadata infomation +/// LLVM loop unroll metadata information struct LLVMForLoopMeta { enum UnrollMode { DefaultUnroll, FullyUnroll, NoUnroll }; diff --git a/paddle/cinn/poly/stage.h b/paddle/cinn/poly/stage.h index b15d0149ed426a..7653bd8e5bfee9 100644 --- a/paddle/cinn/poly/stage.h +++ b/paddle/cinn/poly/stage.h @@ -57,7 +57,7 @@ struct StageForloopInfo { ir::DeviceAPI device; }; -//! Store the informations about some other tensor `compute_at` this tensor. +//! Store the information about some other tensor `compute_at` this tensor. struct ComputeAtInfo { ComputeAtInfo(const std::string& consumer_tensor_name, const std::string& producer_tensor_name, @@ -84,7 +84,7 @@ struct ComputeAtInfo { }; /** - * Meta infomation for tensor. + * Meta information for tensor. */ struct TensorScheduleMeta { //! Store the information of all the other producer tensors `compute_at` this diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 21ce06030c71f3..a1645302f0bfba 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -321,7 +321,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) { if (nullptr == hp) { LOG(ERROR) << "Brpc Start failed, ip_port= " << ip_port - << " , Error infomation: " << hstrerror(h_errno); + << " , Error information: " << hstrerror(h_errno); } int i = 0; diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index e2036bc8363d87..377a8354afde43 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -18,7 +18,7 @@ #include "paddle/fluid/eager/tensor_wrapper.h" /* - Each Operation has a specific GradNode inheritted from GradNodeBase + Each Operation has a specific GradNode inherited from GradNodeBase A specific GradNode defines 1. Input Tensors 2. overrides operator() to perform actual backward computations diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 43267f1babb4a6..5b446605af2cc7 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -859,7 +859,7 @@ void BuildOpFuncList(const phi::Place& place, op->Attr(kAllKernelsMustComputeRuntimeShape))) { RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); // TODO(Aurelius84): In case of control flow ops, they are NOT - // inheritted from OperatorWithKernel. 
+ // inherited from OperatorWithKernel. op_with_kernel->Info().infer_shape_(&infer_shape_ctx); } } diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index d1efdc4cddc2fc..8b67d0df3ff011 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -92,7 +92,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { output_fp16, 1, common::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " + "Only Precision::KHalf(fp16) is supported when inferring " "ernie(bert) model with config.EnableVarseqlen(). " "But Precision::KFloat32 is setted.")); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index ee6eaa1730fa23..f1a3b64cbd0f75 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -145,7 +145,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { output_fp16, 1, common::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " + "Only Precision::KHalf(fp16) is supported when inferring " "ernie(bert) model with config.EnableVarseqlen(). " "But Precision::KFloat32 is setted.")); diff --git a/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc index fd935b27393c22..47b9386ee2b621 100644 --- a/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc @@ -91,7 +91,7 @@ class PromptTuningEmbEltwiseLayerNormOpConverter : public OpConverter { output_fp16, 1, common::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " + "Only Precision::KHalf(fp16) is supported when inferring " "ernie(bert) model with config.EnableVarseqlen(). " "But Precision::KFloat32 is setted.")); diff --git a/paddle/fluid/operators/generator/generate_op.py b/paddle/fluid/operators/generator/generate_op.py index 47ac091598eafc..a680f716ac58a4 100644 --- a/paddle/fluid/operators/generator/generate_op.py +++ b/paddle/fluid/operators/generator/generate_op.py @@ -309,7 +309,7 @@ def add_grad_op_compat_name(grad_op_item, args_name_map): if new_op_name != op_name: forward_op_item['op_name'] = op_name - # add complex promote infomation + # add complex promote information if "complex_promote" in op_args: forward_op_item["complex_promote"] = op_args["complex_promote"] if has_backward: diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index c0dce7d167371d..1d290729a54401 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -878,7 +878,7 @@ void BindTensor(pybind11::module &m) { // NOLINT Returns: tuple: contains ipc name, data size, data type, - tensor dims and lod imformation. + tensor dims and lod information. Examples: .. 
code-block:: python diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 115bc417a4ff1c..11d9ab80a48ef3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1242,7 +1242,7 @@ void EigvalshInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out) { - // collect the following informations to prepare einsum. + // collect the following information to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); std::vector label2perms(inputs.size(), LabelMap(-1)); diff --git a/paddle/phi/kernels/funcs/seq2col.h b/paddle/phi/kernels/funcs/seq2col.h index 14665ada7b4a8a..656c96a8bfed69 100644 --- a/paddle/phi/kernels/funcs/seq2col.h +++ b/paddle/phi/kernels/funcs/seq2col.h @@ -35,7 +35,7 @@ struct Seq2ColFunctor { /* Convert sequences to frames. - 1. Dimension infomation: + 1. Dimension information: Sequences Frames (N, seq_length) -> (N, frame_length, n_frames) @@ -105,7 +105,7 @@ struct Col2SeqFunctor { /* Accumulate output gradient d_out to d_x. - 1. Dimension infomation: + 1. Dimension information: d_out d_x (N, frame_length, n_frames) -> (N, seq_length) diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 9c73be86c05689..d8b0826ba75746 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -603,7 +603,7 @@ void EinsumKernelImpl(const Context& dev_ctx, VLOG(5) << " inputs [ " << i << " ].shape=" << i->dims(); } ValidationCheck(equation); - // collect the following informations to prepare einsum. + // collect the following information to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); std::vector label2perms(inputs.size(), LabelMap(-1)); diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc index 3b04652701835a..004f22c66804e5 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc @@ -240,11 +240,11 @@ void ElementWiseCooKernelImpl(const Context& dev_ctx, common::make_ddim( {static_cast(sparse_dim), static_cast(nnz)}), DataLayout::NCHW); - auto indeces_dim = common::vectorize( + auto indices_dim = common::vectorize( slice_ddim(x.values().dims(), 1, x.values().dims().size())); - indeces_dim.insert(indeces_dim.begin(), nnz); + indices_dim.insert(indices_dim.begin(), nnz); DenseTensorMeta values_meta( - x.dtype(), common::make_ddim(indeces_dim), DataLayout::NCHW); + x.dtype(), common::make_ddim(indices_dim), DataLayout::NCHW); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 7fd99c9166ba21..a049ee03047284 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -109,7 +109,7 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, /** * @brief: update the out index and indices * unique_keys: save the index of the output feature list - * unique_values: indiates the index of key before deduplication + * unique_values: indicates the index of key before deduplication * out_indexs: indicates the position of the output index in the rulebook * rulebook_len: indicates the length of rulebook * out_dims: indicates the output dims diff 
--git a/paddle/pir/include/core/ir_context.h b/paddle/pir/include/core/ir_context.h index 50ce178531673a..1e8d70b3b08e63 100644 --- a/paddle/pir/include/core/ir_context.h +++ b/paddle/pir/include/core/ir_context.h @@ -105,7 +105,7 @@ class IR_API IrContext { AbstractAttribute *GetRegisteredAbstractAttribute(TypeId id); /// - /// \brief Register an op infomation to IrContext + /// \brief Register an op information to IrContext /// void RegisterOpInfo(Dialect *dialect, TypeId op_id, @@ -118,12 +118,12 @@ class IR_API IrContext { void (*verify_region)(Operation *)); /// - /// \brief Get registered operation infomation. + /// \brief Get registered operation information. /// OpInfo GetRegisteredOpInfo(const std::string &name); /// - /// \brief Get registered operation infomation map. + /// \brief Get registered operation information map. /// const OpInfoMap ®istered_op_info_map(); diff --git a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py index ec64c7e7b0e708..fc37b09b1599aa 100644 --- a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py +++ b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py @@ -402,7 +402,7 @@ def find_diff_vars(fixed_vars_map, query_vars_map): return diff_var_name_list @staticmethod - def diff_informations(right_dir, wrong_dir): + def diff_information(right_dir, wrong_dir): """ Find the corresponding operator according to the variable name. """ @@ -448,7 +448,7 @@ def diff_informations(right_dir, wrong_dir): return diff_ops_varname_dict @staticmethod - def diff_informations_from_dirs(right_dirs, wrong_dirs): + def diff_information_from_dirs(right_dirs, wrong_dirs): right_vars_list = [] right_program_list = [] right_dist_attr_map = {} diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index db31165134b15b..f06c935c2f2c29 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -1338,12 +1338,12 @@ def _initialize(self, mode, init_parameters=True): ) if self._in_pir_mode: - # FIXME(ljz) avoid shared same tensro more than once in different mode + # FIXME(ljz) avoid shared same tensor more than once in different mode if mode != "train": return # TODO(2024-Q2) # 1. unify random control - # 2. initilization of non-parameter buffer + # 2. initialization of non-parameter buffer # 3. run startup program for pir # 4. lazy init adaption # 5. amp init adaption diff --git a/python/paddle/distributed/fleet/utils/log_util.py b/python/paddle/distributed/fleet/utils/log_util.py index 7c8dea70ba8708..13e8bceae97654 100644 --- a/python/paddle/distributed/fleet/utils/log_util.py +++ b/python/paddle/distributed/fleet/utils/log_util.py @@ -157,7 +157,7 @@ def check_memory_usage(msg=""): mem_msg = f"checking pinned memory usage {msg}:" for key in mem_dict: mem_msg += f"\n{key}: {mem_dict[key]}GB" - logger.infor(mem_msg) + logger.info(mem_msg) if hasattr(paddle.device, 'cpu') and hasattr( paddle.device.cpu, 'max_memory_allocated' diff --git a/python/paddle/distributed/rpc/rpc.py b/python/paddle/distributed/rpc/rpc.py index 4b6e80c8320dff..04c10039eb4de5 100644 --- a/python/paddle/distributed/rpc/rpc.py +++ b/python/paddle/distributed/rpc/rpc.py @@ -365,7 +365,7 @@ class `WorkerInfo` with attribute `name`, `rank`, `ip` and `port`. def get_all_worker_infos() -> list[WorkerInfo]: """ - Get all worker informations. 
+ Get all worker information. Returns: List[WorkerInfo]. diff --git a/python/paddle/framework/io_utils.py b/python/paddle/framework/io_utils.py index b6b6aac0188294..258b8d1ff65b84 100644 --- a/python/paddle/framework/io_utils.py +++ b/python/paddle/framework/io_utils.py @@ -215,7 +215,7 @@ def _pickle_loads_mac(path, f): def _pack_loaded_dict(load_obj): if isinstance(load_obj, dict): - unpack_info = 'UnpackBigParamInfor@@' + unpack_info = 'UnpackBigParamInfor@@' # typos: disable-line if unpack_info in load_obj: removes = [] for key, value in load_obj[unpack_info].items(): @@ -233,7 +233,7 @@ def _pack_loaded_dict(load_obj): def _unpack_saved_dict(saved_obj, protocol): temp_saved_obj = {} - unpack_infor = {} + unpack_info = {} # When pickle protocol=2 or protocol=3 the serialized object cannot be larger than 4G. if 1 < protocol < 4: if isinstance(saved_obj, dict): @@ -244,9 +244,9 @@ def _unpack_saved_dict(saved_obj, protocol): ) num_element = np.prod(value.shape) if num_element > MAX_NUMBER_OF_ELEMENT: - unpack_infor[key] = {} - unpack_infor[key]["OriginShape"] = value.shape - unpack_infor[key]["slices"] = [] + unpack_info[key] = {} + unpack_info[key]["OriginShape"] = value.shape + unpack_info[key]["slices"] = [] value = value.flatten() for i in range( int( @@ -256,20 +256,20 @@ def _unpack_saved_dict(saved_obj, protocol): ) ): part_name = key + "@@." + str(i) - unpack_infor[key]["slices"].append(part_name) + unpack_info[key]["slices"].append(part_name) temp_saved_obj[part_name] = value[ i * MAX_NUMBER_OF_ELEMENT : MAX_NUMBER_OF_ELEMENT * (i + 1) ] - if unpack_infor: - for key, value in unpack_infor.items(): + if unpack_info: + for key, value in unpack_info.items(): if key in saved_obj: saved_obj.pop(key) for part in value['slices']: saved_obj[part] = temp_saved_obj[part] - saved_obj['UnpackBigParamInfor@@'] = unpack_infor + saved_obj['UnpackBigParamInfor@@'] = unpack_info # typos: disable-line return saved_obj diff --git a/test/deprecated/auto_parallel/test_align_tool_deprecated.py b/test/deprecated/auto_parallel/test_align_tool_deprecated.py index 85e5482ae5e0cb..b83f45d4c61457 100644 --- a/test/deprecated/auto_parallel/test_align_tool_deprecated.py +++ b/test/deprecated/auto_parallel/test_align_tool_deprecated.py @@ -97,10 +97,8 @@ def test_align_tool(self): os.mkdir("./serial") align_tool.save("./serial", vars, fetch_list) break - AutoAlignTool.diff_informations("./serial", "./serial") - AutoAlignTool.diff_informations_from_dirs( - ["./serial"], ["./serial"] - ) + AutoAlignTool.diff_information("./serial", "./serial") + AutoAlignTool.diff_information_from_dirs(["./serial"], ["./serial"]) break print("test auto parallel align tool successfully!") diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index da905e8f04cf2f..6a28d63c017e95 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -89,7 +89,6 @@ 'test_sampling_id_op', 'test_nce', 'graph_helper_test', - 'test_static_shape_inferrence_for_shape_tensor', 'test_layer_norm_mkldnn_op', 'test_fleet_launch_async', 'test_multi_gru_fuse_pass', @@ -1570,7 +1569,6 @@ 'test_sysconfig', 'test_sync_batch_norm_pass', 'test_switch', - 'test_static_shape_inferrence_for_shape_tensor', 'test_static_analysis', 'test_squared_mat_sub_fuse_pass', 'test_spawn_and_init_parallel_env', From 29c3d915e0864022c31fc9830264ed7096e4616b Mon Sep 17 00:00:00 2001 From: fangfangssj <99968055+fangfangssj@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:05:00 +0800 Subject: [PATCH 10/57] [HEU][Paddle TensorRT No.69-72,74-85] Add 
UnaryOp converter (#70535) * add converter * fix * add marker * fix * fix --- .../transforms/tensorrt/trt_op_marker_pass.cc | 79 +++-- python/paddle/tensorrt/converter_utils.py | 30 +- python/paddle/tensorrt/impls/ops.py | 35 ++- test/tensorrt/CMakeLists.txt | 2 +- test/tensorrt/test_converter_ops.py | 288 ++++++++++++++++++ 5 files changed, 397 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc index c67bd5d012973b..0ad509a9601882 100644 --- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc +++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc @@ -94,7 +94,24 @@ DEFINE_GENERAL_PATTERN(Flip, paddle::dialect::FlipOp) DEFINE_GENERAL_PATTERN(Mish, paddle::dialect::MishOp) DEFINE_GENERAL_PATTERN(AssignValue, paddle::dialect::AssignValueOp) DEFINE_GENERAL_PATTERN(AssignValue_, paddle::dialect::AssignValue_Op) +DEFINE_GENERAL_PATTERN(Exp, paddle::dialect::ExpOp) +DEFINE_GENERAL_PATTERN(Abs, paddle::dialect::AbsOp) +DEFINE_GENERAL_PATTERN(Abs_, paddle::dialect::Abs_Op) +DEFINE_GENERAL_PATTERN(Sin, paddle::dialect::SinOp) +DEFINE_GENERAL_PATTERN(Cos, paddle::dialect::CosOp) +DEFINE_GENERAL_PATTERN(Sinh, paddle::dialect::SinhOp) +DEFINE_GENERAL_PATTERN(Cosh, paddle::dialect::CoshOp) +DEFINE_GENERAL_PATTERN(Asinh, paddle::dialect::AsinhOp) +DEFINE_GENERAL_PATTERN(Acosh, paddle::dialect::AcoshOp) +DEFINE_GENERAL_PATTERN(Atanh, paddle::dialect::AtanhOp) +DEFINE_GENERAL_PATTERN(Ceil, paddle::dialect::CeilOp) +DEFINE_GENERAL_PATTERN(Rsqrt, paddle::dialect::RsqrtOp) +DEFINE_GENERAL_PATTERN(Reciprocal, paddle::dialect::ReciprocalOp) +DEFINE_GENERAL_PATTERN(Erf, paddle::dialect::ErfOp) +DEFINE_GENERAL_PATTERN(Sign, paddle::dialect::SignOp) +DEFINE_GENERAL_PATTERN(Round, paddle::dialect::RoundOp) DEFINE_GENERAL_PATTERN(Numel, paddle::dialect::NumelOp) + #undef DEFINE_GENERAL_PATTERN // Add ReduceCommonOpPattern base class to simplify code @@ -267,8 +284,30 @@ class ActOpPattern : public pir::OpRewritePattern { using TanhOpPattern = ActOpPattern; using CeluOpPattern = ActOpPattern; using TanhShrinkOpPattern = ActOpPattern; -using LogicalNotOpPattern = ActOpPattern; -using LogicalNot_OpPattern = ActOpPattern; + +template +class Logical_NotOpPattern : public pir::OpRewritePattern { + public: + using pir::OpRewritePattern::OpRewritePattern; + bool MatchAndRewrite(OpType op, + pir::PatternRewriter &rewriter) const override { + if (op->HasAttribute(kCanRunTrtAttr) && + op->template attribute(kCanRunTrtAttr).data()) { + return false; + } + pir::Value x = op.operand_source(0); + auto x_dtype = pir::GetDataTypeFromValue(x); + if (!x_dtype.isa()) { + VLOG(3) << " logical_not op only support bool input in tensorrt."; + return false; + } + op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true)); + return true; + } +}; +using LogicalNotOpPattern = Logical_NotOpPattern; +using LogicalNot_OpPattern = + Logical_NotOpPattern; class Pool2dOpPattern : public pir::OpRewritePattern { @@ -538,24 +577,6 @@ class ArangeOpPattern } }; -class SignOpPattern : public pir::OpRewritePattern { - public: - using pir::OpRewritePattern::OpRewritePattern; - bool MatchAndRewrite(paddle::dialect::SignOp op, - pir::PatternRewriter &rewriter) const override { - if (op->HasAttribute(kCanRunTrtAttr) && - op->attribute(kCanRunTrtAttr).data()) { - return false; - } -#if IS_TRT_VERSION_LT(8200) - VLOG(3) << "sign op is only supported by tensorrt8.2 above "; - return false; -#endif - 
op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true));
-    return true;
-  }
-};
-
 class GroupNormOpPattern
     : public pir::OpRewritePattern {
  public:
@@ -2273,6 +2294,22 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ADD_PATTERN(Mish)
     ADD_PATTERN(AssignValue)
     ADD_PATTERN(AssignValue_)
+    ADD_PATTERN(Exp)
+    ADD_PATTERN(Abs)
+    ADD_PATTERN(Abs_)
+    ADD_PATTERN(Sin)
+    ADD_PATTERN(Cos)
+    ADD_PATTERN(Sinh)
+    ADD_PATTERN(Cosh)
+    ADD_PATTERN(Asinh)
+    ADD_PATTERN(Acosh)
+    ADD_PATTERN(Atanh)
+    ADD_PATTERN(Ceil)
+    ADD_PATTERN(Rsqrt)
+    ADD_PATTERN(Reciprocal)
+    ADD_PATTERN(Erf)
+    ADD_PATTERN(Sign)
+    ADD_PATTERN(Round)
     ADD_PATTERN(Numel)
 #if IS_TRT_VERSION_GE(8600)
     ADD_PATTERN(Layer_norm)
 #endif
@@ -2283,7 +2320,6 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
-    ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
diff --git a/python/paddle/tensorrt/converter_utils.py b/python/paddle/tensorrt/converter_utils.py
index 5735b33ad42ace..76ccec354b0c5e 100644
--- a/python/paddle/tensorrt/converter_utils.py
+++ b/python/paddle/tensorrt/converter_utils.py
@@ -686,6 +686,29 @@ def squeeze_trt(network, input_tensor, axes):
 def unary_op_converter(network, paddle_op, inputs):
     from paddle.tensorrt import PrecisionMode
 
+    ops_type_map = {
+        "pd_op.sqrt": [trt.UnaryOperation.SQRT],
+        "pd_op.sqrt_": [trt.UnaryOperation.SQRT],
+        "pd_op.floor": [trt.UnaryOperation.FLOOR],
+        "pd_op.exp": [trt.UnaryOperation.EXP],
+        "pd_op.abs": [trt.UnaryOperation.ABS],
+        "pd_op.abs_": [trt.UnaryOperation.ABS],
+        "pd_op.sin": [trt.UnaryOperation.SIN],
+        "pd_op.cos": [trt.UnaryOperation.COS],
+        "pd_op.sinh": [trt.UnaryOperation.SINH],
+        "pd_op.cosh": [trt.UnaryOperation.COSH],
+        "pd_op.asinh": [trt.UnaryOperation.ASINH],
+        "pd_op.acosh": [trt.UnaryOperation.ACOSH],
+        "pd_op.atanh": [trt.UnaryOperation.ATANH],
+        "pd_op.ceil": [trt.UnaryOperation.CEIL],
+        "pd_op.reciprocal": [trt.UnaryOperation.RECIP],
+        "pd_op.erf": [trt.UnaryOperation.ERF],
+        "pd_op.sign": [trt.UnaryOperation.SIGN],
+        "pd_op.round": [trt.UnaryOperation.ROUND],
+        "pd_op.logical_not": [trt.UnaryOperation.NOT],
+        "pd_op.rsqrt": [trt.UnaryOperation.SQRT, trt.UnaryOperation.RECIP],
+    }
+
     input_tensor = inputs[0]
     layer = None
     org_type = input_tensor.dtype
@@ -707,9 +730,10 @@ def unary_op_converter(network, paddle_op, inputs):
             identity_layer.set_output_type(0, trt.float16)
             input_tensor = identity_layer.get_output(0)
 
-    if paddle_op.name() in ["pd_op.logical_not", "pd_op.logical_not_"]:
-        layer = network.add_unary(input_tensor, trt.UnaryOperation.NOT)
-        input_tensor = layer.get_output(0)
+    if paddle_op.name() in ops_type_map:
+        for trt_op in ops_type_map[paddle_op.name()]:
+            layer = network.add_unary(input_tensor, trt_op)
+            input_tensor = layer.get_output(0)
     else:
         raise NotImplementedError(
             f"Unsupported unary operation: {paddle_op.name()}"
diff --git a/python/paddle/tensorrt/impls/ops.py b/python/paddle/tensorrt/impls/ops.py
index 6416cb96e6af38..7370f10edc1eeb 100644
--- a/python/paddle/tensorrt/impls/ops.py
+++ b/python/paddle/tensorrt/impls/ops.py
@@ -11,21 +11,32 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
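# A hedged sketch of what the registrations below delegate to (names come
# from the converter_utils.py hunk above; `network` and `input_tensor` are
# assumed to be the usual TRT network and tensor handles): every op in
# ops_type_map is lowered by chaining its listed TRT unary layers in order,
# so a one-entry op emits a single layer while rsqrt becomes
# reciprocal(sqrt(x)):
#
#   for trt_op in [trt.UnaryOperation.SQRT, trt.UnaryOperation.RECIP]:
#       layer = network.add_unary(input_tensor, trt_op)
#       input_tensor = layer.get_output(0)  # SQRT's output feeds RECIP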
-import tensorrt as trt +from paddle.tensorrt.converter_utils import unary_op_converter from paddle.tensorrt.register import converter_registry -ops_type_map = { - "pd_op.sqrt": trt.UnaryOperation.SQRT, - "pd_op.sqrt_": trt.UnaryOperation.SQRT, - "pd_op.floor": trt.UnaryOperation.FLOOR, -} - @converter_registry.register("pd_op.sqrt", trt_version="trt_version_ge=8.0") @converter_registry.register("pd_op.sqrt_", trt_version="trt_version_ge=8.0") -@converter_registry.register("pd_op.floor", trt_version="8.x") -def sqrt_converter(network, paddle_op, inputs): - input_tensor = inputs[0] - layer = network.add_unary(input_tensor, ops_type_map[paddle_op.name()]) - return layer.get_output(0) +@converter_registry.register("pd_op.floor", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.exp", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.abs", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.abs_", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.sin", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.cos", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.sinh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.cosh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.asinh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.acosh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.atanh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.ceil", trt_version="trt_version_ge=8.0") +@converter_registry.register( + "pd_op.reciprocal", trt_version="trt_version_ge=8.0" +) +@converter_registry.register("pd_op.erf", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.rsqrt", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.sign", trt_version="trt_version_ge=8.2") +@converter_registry.register("pd_op.round", trt_version="trt_version_ge=8.2") +def UnaryOpConverter(network, paddle_op, inputs): + layer_output = unary_op_converter(network, paddle_op, inputs) + return layer_output diff --git a/test/tensorrt/CMakeLists.txt b/test/tensorrt/CMakeLists.txt index 4735dc6def3345..201a1e02f2f3f7 100644 --- a/test/tensorrt/CMakeLists.txt +++ b/test/tensorrt/CMakeLists.txt @@ -14,7 +14,7 @@ if(NOT WIN32 AND TENSORRT_FOUND) set_tests_properties(test_converter_conv PROPERTIES TIMEOUT "300") set_tests_properties(test_export PROPERTIES TIMEOUT "500") set_tests_properties(test_converter_norm PROPERTIES TIMEOUT "300") - set_tests_properties(test_converter_ops PROPERTIES TIMEOUT "300") + set_tests_properties(test_converter_ops PROPERTIES TIMEOUT "500") set_tests_properties(test_converter_stat PROPERTIES TIMEOUT "300") set_tests_properties(test_converter_math PROPERTIES TIMEOUT "300") set_tests_properties(test_converter_activation PROPERTIES TIMEOUT "300") diff --git a/test/tensorrt/test_converter_ops.py b/test/tensorrt/test_converter_ops.py index 544fca80fbecc0..155a93d2827a19 100644 --- a/test/tensorrt/test_converter_ops.py +++ b/test/tensorrt/test_converter_ops.py @@ -34,6 +34,9 @@ def setUp(self): def test_trt_result(self): self.check_trt_result() + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + class TestFloorFloatTRTPattern(TensorRTBaseTest): def setUp(self): @@ -49,6 +52,291 @@ def setUp(self): def test_trt_result(self): self.check_trt_result() + def test_trt_result_fp16(self): + 
self.check_trt_result(precision_mode="fp16") + + +class TestExpFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.exp + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAbsFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.abs + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAbsIntTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.abs + self.api_args = { + "x": np.random.randn(7, 3).astype("int64"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestSinFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.sin + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestCosFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.cos + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestSinhFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.sinh + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestCoshFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.cosh + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAsinhFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.asinh + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + 
self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestAcoshFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.acosh
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestCeilFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.ceil
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestRsqrtFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.rsqrt
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestReciprocalFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.reciprocal
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestErfFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.erf
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestSignFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.sign
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestSignIntTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.sign
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("int64"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+
+class TestRoundFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.round
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x":
[7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
 
 if __name__ == '__main__':
     unittest.main()

From 1cb01143febde6bd5bcc914bb4f3d2c6a0f30019 Mon Sep 17 00:00:00 2001
From: Junjie Zhang <1356732652@qq.com>
Date: Wed, 8 Jan 2025 11:06:37 +0800
Subject: [PATCH 11/57] [SCU][Paddle Tensor Phase 2: API 0-size Tensor support
 No.46] paddle.linalg.solve supports 0-size Tensor (#70575)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* support_0size

* fix codestyle

* Update solve_kernel_impl.h

* update

* fix codestyle

* Update test_solve_op.py

* Update test_solve_op.py

* Update test_solve_op.py

---
 paddle/phi/kernels/impl/solve_kernel_impl.h | 34 +++++++++
 test/legacy_test/test_solve_op.py | 82 +++++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/paddle/phi/kernels/impl/solve_kernel_impl.h b/paddle/phi/kernels/impl/solve_kernel_impl.h
index 52d9cd131ba3ed..bbe63896fc4d91 100644
--- a/paddle/phi/kernels/impl/solve_kernel_impl.h
+++ b/paddle/phi/kernels/impl/solve_kernel_impl.h
@@ -195,6 +195,40 @@ void SolveKernel(const Context& dev_ctx,
                  const DenseTensor& x,
                  const DenseTensor& y,
                  DenseTensor* out) {
+  if (x.numel() == 0 || y.numel() == 0) {
+    auto x_dims = x.dims();
+    auto y_dims = y.dims();
+    std::vector<int64_t> out_dims;
+    if (y_dims.size() == 1) {
+      out_dims =
+          std::vector<int64_t>(x_dims.Get(), x_dims.Get() + x_dims.size() - 2);
+      out_dims.push_back(y_dims[y_dims.size() - 1]);
+    } else {
+      // broadcast
+      std::vector<int64_t> x_shape(x_dims.Get(), x_dims.Get() + x_dims.size() - 2);
+      std::vector<int64_t> y_shape(y_dims.Get(), y_dims.Get() + y_dims.size() - 2);
+      auto x_it = x_shape.rbegin();
+      auto y_it = y_shape.rbegin();
+      while (x_it != x_shape.rend() || y_it != y_shape.rend()) {
+        int x_dim = (x_it != x_shape.rend()) ? *x_it : 1;
+        int y_dim = (y_it != y_shape.rend()) ?
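// Worked example of the broadcast above (editorial sketch; shapes taken from
// the new 0-size tests further down): x: [10, 0, 0], y: [10, 0, 0]. The batch
// dims are everything but the last two, so x_shape = {10} and y_shape = {10};
// the loop takes max(10, 10) = 10 (a 0 on either side would propagate as 0),
// and y's trailing (0, 0) is appended afterwards, so out is resized to
// [10, 0, 0] and no actual factorization is performed.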
*y_it : 1; + if (x_dim == 0 || y_dim == 0) { + out_dims.push_back(0); + } else { + out_dims.push_back(std::max(x_dim, y_dim)); + } + if (x_it != x_shape.rend()) ++x_it; + if (y_it != y_shape.rend()) ++y_it; + } + std::reverse(out_dims.begin(), out_dims.end()); + out_dims.insert(out_dims.end(), + y_dims.Get() + y_dims.size() - 2, + y_dims.Get() + y_dims.size()); + } + out->Resize(phi::make_ddim(out_dims)); + dev_ctx.template Alloc(out); + return; + } linalg_solve(dev_ctx, x, y, out); } diff --git a/test/legacy_test/test_solve_op.py b/test/legacy_test/test_solve_op.py index 71ca1b5accae53..874f49ce5b3124 100644 --- a/test/legacy_test/test_solve_op.py +++ b/test/legacy_test/test_solve_op.py @@ -923,5 +923,87 @@ def test_dygraph(self): print("The mat is singular") +class TestSolveOpAPIZeroDimCase(unittest.TestCase): + def setUp(self): + np.random.seed(2021) + self.place = [] + self.dtype = "float32" + if ( + os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() + in ['1', 'true', 'on'] + or not core.is_compiled_with_cuda() + ): + self.place.append(paddle.CPUPlace()) + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place, x_shape, y_shape, np_y_shape): + paddle.enable_static() + with base.program_guard(base.Program(), base.Program()): + paddle_input_x = paddle.static.data( + name="input_x", shape=x_shape, dtype=self.dtype + ) + paddle_input_y = paddle.static.data( + name="input_y", shape=y_shape, dtype=self.dtype + ) + paddle_result = paddle.linalg.solve( + paddle_input_x, paddle_input_y, left=False + ) + + np_input_x = np.random.random(x_shape).astype(self.dtype) + np_input_y = np.random.random(np_y_shape).astype(self.dtype) + + np_result = np.linalg.solve(np_input_x, np_input_y) + + exe = base.Executor(place) + fetches = exe.run( + base.default_main_program(), + feed={"input_x": np_input_x, "input_y": np_input_y}, + fetch_list=[paddle_result], + ) + np.testing.assert_allclose(fetches[0], np_result, rtol=0.0001) + + def test_static(self): + for place in self.place: + self.check_static_result( + place=place, + x_shape=[10, 0, 0], + y_shape=[6, 0, 0], + np_y_shape=[10, 0, 0], + ) + with self.assertRaises(ValueError) as context: + self.check_static_result( + place=place, + x_shape=[10, 0, 0], + y_shape=[10], + np_y_shape=[10], + ) + + def test_dygraph(self): + def run(place, x_shape, y_shape): + with base.dygraph.guard(place): + input_x_np = np.random.random(x_shape).astype(self.dtype) + input_y_np = np.random.random(y_shape).astype(self.dtype) + + tensor_input_x = paddle.to_tensor(input_x_np) + tensor_input_y = paddle.to_tensor(input_y_np) + + numpy_output = np.linalg.solve(input_x_np, input_y_np) + paddle_output = paddle.linalg.solve( + tensor_input_x, tensor_input_y, left=False + ) + np.testing.assert_allclose( + numpy_output, paddle_output.numpy(), rtol=0.0001 + ) + self.assertEqual( + numpy_output.shape, paddle_output.numpy().shape + ) + + for place in self.place: + run(place, x_shape=[10, 0, 0], y_shape=[10, 0, 0]) + with self.assertRaises(ValueError) as context: + run(place, x_shape=[10, 0, 0], y_shape=[10]) + + if __name__ == "__main__": unittest.main() From 418327b8b6f46651cf0248c77ef6d361a680f7a5 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:24:24 +0800 Subject: [PATCH 12/57] [XPU] add data_type_transform_test_xpu (#70638) * [XPU] add data_type_transform_test_xpu * [XPU] add data_type_transform_test_xpu --- paddle/fluid/framework/data_type.h | 13 ++ 
paddle/fluid/framework/data_type_transform.cc | 66 +++--- .../phi/core/framework/data_type_transform.cc | 66 +++--- paddle/phi/core/framework/var_type_helper.h | 13 ++ test/cpp/phi/core/CMakeLists.txt | 4 + .../phi/core/data_type_transform_test_xpu.cc | 219 ++++++++++++++++++ 6 files changed, 319 insertions(+), 62 deletions(-) create mode 100644 test/cpp/phi/core/data_type_transform_test_xpu.cc diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index a40f33e2f3fbfa..16df876079931c 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -112,6 +112,19 @@ struct DataTypeTrait { _ForEachDataTypeHelper_(callback, ::phi::dtype::complex, COMPLEX64); \ _ForEachDataTypeHelper_(callback, ::phi::dtype::complex, COMPLEX128); +// complex and float8 are not supported on XPU. +#define _ForEachDataTypeForXPU_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::phi::dtype::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::phi::dtype::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8); + #define DefineDataTypeTrait(cpp_type, proto_type) \ template <> \ struct DataTypeTrait { \ diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9fba57e10fd0b1..83905084907687 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -64,10 +64,11 @@ static void XPUTransDataType( } \ } while (0) - if (dst_type == proto::VarType::FP32 && dst_type == proto::VarType::FP16 && - dst_type == proto::VarType::BOOL && dst_type == proto::VarType::INT16 && - dst_type == proto::VarType::INT32 && dst_type == proto::VarType::INT64) { - _ForEachDataType_(XPUCastCallback); + if (dst_type == proto::VarType::FP32 || dst_type == proto::VarType::FP16 || + dst_type == proto::VarType::BOOL || dst_type == proto::VarType::INT16 || + dst_type == proto::VarType::INT32 || dst_type == proto::VarType::INT64 || + dst_type == proto::VarType::FP64) { + _ForEachDataTypeForXPU_(XPUCastCallback); } else { PADDLE_THROW(common::errors::Unimplemented( "Data type (%s) is not supported in XPU when casting data type.", @@ -155,33 +156,37 @@ void TransDataType(const phi::DenseTensor& in, auto ctx = pool.Get(in.place()); #if defined(PADDLE_WITH_XPU) - switch (src_type) { - case proto::VarType::FP16: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::FP32: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::BOOL: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT16: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT32: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT64: - XPUTransDataType(in, out, dst_type, ctx); - break; - default: - PADDLE_THROW(common::errors::Unimplemented( - "Data type (%s) is not supported in XPU when casting data type.", - DataTypeToString(src_type))); + if (phi::is_xpu_place(in.place())) { + switch (src_type) { + case proto::VarType::FP16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP32: + 
XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP64: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::BOOL: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT64: + XPUTransDataType(in, out, dst_type, ctx); + break; + default: + PADDLE_THROW(common::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(src_type))); + } + return; } - -#else - +#endif switch (src_type) { case proto::VarType::FP16: framework::VisitDataType(dst_type, @@ -225,7 +230,6 @@ void TransDataType(const phi::DenseTensor& in, "Data type (%s) is not supported when casting data type.", DataTypeToString(src_type))); } -#endif } void TransComplexToReal(const proto::VarType::Type& dst_type, diff --git a/paddle/phi/core/framework/data_type_transform.cc b/paddle/phi/core/framework/data_type_transform.cc index c20da1023b3310..6ed397d85d378e 100644 --- a/paddle/phi/core/framework/data_type_transform.cc +++ b/paddle/phi/core/framework/data_type_transform.cc @@ -66,10 +66,11 @@ static void XPUTransDataType( } \ } while (0) - if (dst_type == proto::VarType::FP32 && dst_type == proto::VarType::FP16 && - dst_type == proto::VarType::BOOL && dst_type == proto::VarType::INT16 && - dst_type == proto::VarType::INT32 && dst_type == proto::VarType::INT64) { - _ForEachDataType_(XPUCastCallback); + if (dst_type == proto::VarType::FP32 || dst_type == proto::VarType::FP16 || + dst_type == proto::VarType::BOOL || dst_type == proto::VarType::INT16 || + dst_type == proto::VarType::INT32 || dst_type == proto::VarType::INT64 || + dst_type == proto::VarType::FP64) { + _ForEachDataTypeForXPU_(XPUCastCallback); } else { PADDLE_THROW(common::errors::Unimplemented( "Data type (%s) is not supported in XPU when casting data type.", @@ -158,33 +159,37 @@ void TransDataType(const phi::DenseTensor& in, auto ctx = pool.Get(in.place()); #if defined(PADDLE_WITH_XPU) - switch (src_type) { - case proto::VarType::FP16: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::FP32: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::BOOL: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT16: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT32: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT64: - XPUTransDataType(in, out, dst_type, ctx); - break; - default: - PADDLE_THROW(common::errors::Unimplemented( - "Data type (%s) is not supported in XPU when casting data type.", - VarDataTypeToString(src_type))); + if (phi::is_xpu_place(in.place())) { + switch (src_type) { + case proto::VarType::FP16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP64: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::BOOL: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT64: + XPUTransDataType(in, out, dst_type, ctx); + break; + default: + PADDLE_THROW(common::errors::Unimplemented( + "Data type (%s) is not supported in XPU when 
casting data type.", + VarDataTypeToString(src_type))); + } + return; } - -#else - +#endif switch (src_type) { case proto::VarType::FP16: phi::VisitDataType(dst_type, @@ -228,7 +233,6 @@ void TransDataType(const phi::DenseTensor& in, "Data type (%s) is not supported when casting data type.", VarDataTypeToString(src_type))); } -#endif } } // namespace phi diff --git a/paddle/phi/core/framework/var_type_helper.h b/paddle/phi/core/framework/var_type_helper.h index 81636930019331..9a6306da520a27 100644 --- a/paddle/phi/core/framework/var_type_helper.h +++ b/paddle/phi/core/framework/var_type_helper.h @@ -113,6 +113,19 @@ struct DataTypeTrait { _ForEachDataTypeHelper_(callback, ::phi::dtype::complex, COMPLEX64); \ _ForEachDataTypeHelper_(callback, ::phi::dtype::complex, COMPLEX128); +// complex and float8 are not supported on XPU. +#define _ForEachDataTypeForXPU_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::phi::dtype::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::phi::dtype::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8); + #define DefineDataTypeTrait(cpp_type, proto_type) \ template <> \ struct DataTypeTrait { \ diff --git a/test/cpp/phi/core/CMakeLists.txt b/test/cpp/phi/core/CMakeLists.txt index 5eb78dacd7cd31..30cebae20e1f08 100644 --- a/test/cpp/phi/core/CMakeLists.txt +++ b/test/cpp/phi/core/CMakeLists.txt @@ -86,3 +86,7 @@ endif() if(NOT WIN32) paddle_test(test_c_tcp_store SRCS test_tcp_store.cc DEPS phi common) endif() + +if(WITH_XPU) + paddle_test(data_type_transform_test_xpu SRCS data_type_transform_test_xpu.cc) +endif() diff --git a/test/cpp/phi/core/data_type_transform_test_xpu.cc b/test/cpp/phi/core/data_type_transform_test_xpu.cc new file mode 100644 index 00000000000000..c897f59d8f9aca --- /dev/null +++ b/test/cpp/phi/core/data_type_transform_test_xpu.cc @@ -0,0 +1,219 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/framework/data_type_transform.h" +#include "paddle/phi/core/kernel_factory.h" + +template +void TransformTest(const phi::KernelKey& kernel_type_for_var, + const phi::KernelKey& expected_kernel_type, + const phi::CPUPlace& cpu_place, + const phi::XPUPlace& xpu_place, + const InT* cpu_data, + const int data_number) { + phi::XPUContext context(xpu_place); + phi::DenseTensor in; + phi::DenseTensor in_xpu; + phi::DenseTensor out; + phi::DenseTensor out_xpu; + + // copy from cpu_data to cpu tensor + InT* in_ptr = + in.mutable_data(common::make_ddim({data_number}), cpu_place); + memcpy(in_ptr, cpu_data, sizeof(InT) * data_number); + + // test case 1: on xpu + { + // copy from cpu tensor to xpu tensor + paddle::framework::TensorCopy(in, xpu_place, context, &in_xpu); + context.Wait(); + + // call trans data + phi::TransDataType( + kernel_type_for_var, expected_kernel_type, in_xpu, &out_xpu); + + // copy from xpu tensor to cpu tensor + paddle::framework::TensorCopy(out_xpu, cpu_place, context, &out); + context.Wait(); + + // check result + OutT* out_ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_ptr[i], static_cast(cpu_data[i])); + } + } + + // test case 2: on cpu + { + // call trans data + phi::TransDataType(kernel_type_for_var, expected_kernel_type, in, &out); + + // check result + OutT* out_ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_ptr[i], static_cast(cpu_data[i])); + } + } +} + +TEST(DataTypeTransform, XPUTransform) { + auto cpu_place = phi::CPUPlace(); + auto xpu_place = phi::XPUPlace(0); + phi::XPUContext context(xpu_place); + + auto kernel_fp16 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::FLOAT16); + auto kernel_fp32 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::FLOAT32); + auto kernel_fp64 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::FLOAT64); + auto kernel_int16 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::INT16); + auto kernel_int32 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::INT32); + auto kernel_int64 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::INT64); + auto kernel_bool = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::BOOL); + + { + // float16 -> any + phi::dtype::float16 cpu_data[6] = {phi::dtype::float16(0), + phi::dtype::float16(1), + phi::dtype::float16(2), + phi::dtype::float16(3), + phi::dtype::float16(4), + phi::dtype::float16(5)}; + TransformTest( + kernel_fp16, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp16, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp16, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp16, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp16, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // float -> any + float cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_fp32, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( 
+ kernel_fp32, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // double -> any + double cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_fp64, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // int16 -> any + int16_t cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_int16, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // int32 -> any + int32_t cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_int32, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // int64 -> any + int64_t cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_int64, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // bool -> any + bool cpu_data[6] = {0, 1, 0, 1, 1, 0}; + TransformTest( + kernel_bool, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } +} From 34e7b8811a4f5f067d32d45f31171ffa586d1758 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:28:48 +0800 Subject: [PATCH 
13/57] [CINN] Use ArithSimplify instead of AutoSimplify -- Part0 (#70594)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* del autosimplify

* fix bug

* empty commit

---
 paddle/cinn/backends/codegen_gpu_dev.cc | 4 +-
 paddle/cinn/backends/llvm/codegen_llvm.cc | 6 +-
 paddle/cinn/common/ir_util.cc | 7 +-
 paddle/cinn/hlir/op/contrib/sort.cc | 8 +-
 paddle/cinn/hlir/pe/elementwise.cc | 2 +-
 paddle/cinn/hlir/pe/nn.cc | 117 +++++++++---------
 paddle/cinn/hlir/pe/transform.cc | 24 ++--
 paddle/cinn/ir/buffer.cc | 2 +-
 .../ir/schedule/impl/loop_transformation.cc | 18 ++-
 paddle/cinn/ir/schedule/ir_schedule_util.cc | 21 ++--
 paddle/cinn/ir/schedule/ir_schedule_util.h | 6 +-
 paddle/cinn/ir/tensor.cc | 4 +-
 .../eliminate_common_factor_of_local_index.cc | 14 +--
 .../eliminate_common_global_memory_read.cc | 9 +-
 paddle/cinn/optim/ir_simplify.cc | 5 +-
 15 files changed, 117 insertions(+), 130 deletions(-)

diff --git a/paddle/cinn/backends/codegen_gpu_dev.cc b/paddle/cinn/backends/codegen_gpu_dev.cc
index a3dbddfdb132e8..9886d7c3a9fc45 100644
--- a/paddle/cinn/backends/codegen_gpu_dev.cc
+++ b/paddle/cinn/backends/codegen_gpu_dev.cc
@@ -115,7 +115,7 @@ std::vector<Expr> FilterDeallocTempBuffers(const std::vector<Expr> &frees) {
     bool has_symbolic_constant = false;
     const ir::_Buffer_ *buffer = op->destination.As<ir::_Buffer_>();
     for (Expr shape : buffer->shape) {
-      shape = common::AutoSimplify(shape);
+      shape = optim::ArithSimplify(shape);
       ir::ir_utils::CollectIRNodes(shape, [&](const Expr *x) {
         if (x->as_var()) {
           PADDLE_ENFORCE_EQ(
@@ -540,7 +540,7 @@ ir::Expr CalculateSharedMemory(const ir::LoweredFunc &func) {
       shm_size = shm_size + CalculateSharedMemory(buffer);
     }
   }
-  return common::AutoSimplify(shm_size);
+  return optim::ArithSimplify(shm_size);
 }
 
 }  // namespace backends
diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc
index 355ae881c6476e..3462325edd9b36 100644
--- a/paddle/cinn/backends/llvm/codegen_llvm.cc
+++ b/paddle/cinn/backends/llvm/codegen_llvm.cc
@@ -928,7 +928,7 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Store *op) {
   // fit the total_lanes in native_lanes(split into multiple native steps)
   for (int offset = 0; offset < total_lanes; offset += total_lanes) {
     int lanes = total_lanes;
-    Expr base = cinn::common::AutoSimplify(ramp->base + offset);
+    Expr base = optim::ArithSimplify(ramp->base + offset);
     optim::VarModSimplify(&base);
     auto *ptr =
         CreateBufferPtr(op->type().ElementOf(), buffer, Visit(&base));
@@ -1242,10 +1242,8 @@ llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) {
 
   for (int i = 0; i < load_lanes; i += load_lanes) {
     int slice_lanes = load_lanes;
-    auto slice_base = cinn::common::AutoSimplify(ramp->base + i);
+    auto slice_base = optim::ArithSimplify(ramp->base + i);
     optim::VarModSimplify(&slice_base);
-    auto slide_stride = Expr(1);
-    auto slide_index = slice_base;
 
 #if LLVM_VERSION_MAJOR >= 11
     const llvm::ElementCount elem_count(slice_lanes, /*scalable*/ false);
diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc
index 0204e8dc1c5d0a..ace5e2e4cd2981 100644
--- a/paddle/cinn/common/ir_util.cc
+++ b/paddle/cinn/common/ir_util.cc
@@ -101,8 +101,8 @@ Expr RampRelatedAdd(ir::Ramp *ramp, ir::Ramp *other) {
                     ::common::errors::InvalidArgument(
                         "Other ramp pointer should not be null."));
   if (ramp->lanes == other->lanes) {
-    Expr base_add = cinn::common::AutoSimplify(ramp->base + other->base);
-    Expr stride_add =
cinn::common::AutoSimplify(ramp->stride + other->stride); + Expr base_add = optim::ArithSimplify(ramp->base + other->base); + Expr stride_add = optim::ArithSimplify(ramp->stride + other->stride); VLOG(2) << base_add; VLOG(2) << stride_add; return ir::Ramp::Make(base_add, stride_add, ramp->lanes); @@ -641,8 +641,7 @@ ir::IndexExpr SimplifySymbolicDivide(const ir::IndexExpr &lhs, bool ProveDivisible(const ir::IndexExpr &lhs, const ir::IndexExpr &rhs) { if (IsZero(lhs % rhs)) return true; - // remove AutoSimplify later. - if (IsZero(AutoSimplify(lhs % rhs))) return true; + if (IsZero(optim::ArithSimplify(lhs % rhs))) return true; return false; } diff --git a/paddle/cinn/hlir/op/contrib/sort.cc b/paddle/cinn/hlir/op/contrib/sort.cc index 897bf288c4f812..ec6403d5c7dd77 100644 --- a/paddle/cinn/hlir/op/contrib/sort.cc +++ b/paddle/cinn/hlir/op/contrib/sort.cc @@ -95,8 +95,8 @@ std::vector ArgSort(const ir::Tensor &A, stride = stride * A->shape[i]; } } - offset = cinn::common::AutoSimplify(offset); - stride = cinn::common::AutoSimplify(stride); + offset = optim::ArithSimplify(offset); + stride = optim::ArithSimplify(stride); auto A_shape_axis = A->shape[pos_axis]; return lang::CallExtern(index_func_name, {A, A_shape_axis, A(indices), offset, stride}); @@ -117,8 +117,8 @@ std::vector ArgSort(const ir::Tensor &A, stride = stride * A->shape[i]; } } - offset = cinn::common::AutoSimplify(offset); - stride = cinn::common::AutoSimplify(stride); + offset = optim::ArithSimplify(offset); + stride = optim::ArithSimplify(stride); auto A_shape_axis = A->shape[pos_axis]; auto idx = lang::CallExtern( diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc index 8e16bd6a8c6d19..ec4b687f88a7fb 100644 --- a/paddle/cinn/hlir/pe/elementwise.cc +++ b/paddle/cinn/hlir/pe/elementwise.cc @@ -222,7 +222,7 @@ Expr ReshapeHandler(const ir::Tensor& A, if (i > A_s) { temp = temp % A->shape[i]; } - A_indice[i] = common::AutoSimplify(temp); + A_indice[i] = optim::ArithSimplify(temp); } }; diff --git a/paddle/cinn/hlir/pe/nn.cc b/paddle/cinn/hlir/pe/nn.cc index 15fc0575cae466..4954cda7976e0f 100644 --- a/paddle/cinn/hlir/pe/nn.cc +++ b/paddle/cinn/hlir/pe/nn.cc @@ -204,12 +204,12 @@ std::vector Conv2d_winograd_NCHW(const ir::Tensor &input, output_shape = { input->shape[0], // B weights->shape[0], // O - cinn::common::AutoSimplify( + optim::ArithSimplify( (input->shape[2] - ((weights_dilation->shape[2] - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1), // H - cinn::common::AutoSimplify( + optim::ArithSimplify( (input->shape[3] - ((weights_dilation->shape[3] - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + @@ -222,8 +222,8 @@ std::vector Conv2d_winograd_NCHW(const ir::Tensor &input, ir::Tensor B = winograd_transform[1]; ir::Tensor G = winograd_transform[2]; - int nH = (cinn::common::AutoSimplify(output_shape[2]).as_int32() + m - 1) / m; - int nW = (cinn::common::AutoSimplify(output_shape[3]).as_int32() + m - 1) / m; + int nH = (optim::ArithSimplify(output_shape[2]).as_int32() + m - 1) / m; + int nW = (optim::ArithSimplify(output_shape[3]).as_int32() + m - 1) / m; int P = input->shape[0].as_int32() * nH * nW; @@ -489,9 +489,9 @@ std::vector Conv2d_NCHW_5D(const ir::Tensor &input, shape_weights.size(), 4U, ::common::errors::InvalidArgument("weight's shape size should be 4")); - Expr c_in = cinn::common::AutoSimplify(shape_input[1]); - Expr c_filter = cinn::common::AutoSimplify(shape_weights[1]); - Expr c_out = cinn::common::AutoSimplify(shape_weights[0]); + Expr c_in = 
optim::ArithSimplify(shape_input[1]); + Expr c_filter = optim::ArithSimplify(shape_weights[1]); + Expr c_out = optim::ArithSimplify(shape_weights[0]); absl::flat_hash_map conv2d_factors; int oc = c_out.as_int32(); int ic = c_in.as_int32(); @@ -559,12 +559,12 @@ std::vector Conv2d_NCHW_5D(const ir::Tensor &input, std::vector output_shape = { batch, // B c_out, // O - cinn::common::AutoSimplify( - (h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + - 1), // H - cinn::common::AutoSimplify( - (w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + - 1) // W + optim::ArithSimplify((h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / + stride_h + + 1), // H + optim::ArithSimplify((w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / + stride_w + + 1) // W }; auto res = Compute( output_shape, @@ -601,33 +601,33 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, ::common::errors::InvalidArgument("weight's shape size should be 6")); Expr batch = shape_input[0]; - Expr c_in_outer = cinn::common::AutoSimplify(shape_input[1]); + Expr c_in_outer = optim::ArithSimplify(shape_input[1]); Expr h_in = shape_input[2]; Expr w_in = shape_input[3]; - Expr c_in_inner = cinn::common::AutoSimplify(shape_input[4]); + Expr c_in_inner = optim::ArithSimplify(shape_input[4]); Expr c_out_outer = shape_weights[0]; - Expr c_filter_outer = cinn::common::AutoSimplify(shape_weights[1]); + Expr c_filter_outer = optim::ArithSimplify(shape_weights[1]); Expr h_f = shape_weights[2]; Expr w_f = shape_weights[3]; - Expr c_filter_inner = cinn::common::AutoSimplify(shape_weights[4]); - Expr c_out_inner = cinn::common::AutoSimplify(shape_weights[5]); + Expr c_filter_inner = optim::ArithSimplify(shape_weights[4]); + Expr c_out_inner = optim::ArithSimplify(shape_weights[5]); - Expr c_filter = cinn::common::AutoSimplify(c_filter_outer * c_filter_inner); - Expr c_out = cinn::common::AutoSimplify(c_out_outer * c_out_inner); - Expr c_in = cinn::common::AutoSimplify(c_in_outer * c_in_inner); + Expr c_filter = optim::ArithSimplify(c_filter_outer * c_filter_inner); + Expr c_out = optim::ArithSimplify(c_out_outer * c_out_inner); + Expr c_in = optim::ArithSimplify(c_in_outer * c_in_inner); Var fc(c_filter, UniqName("fc")); Var fy(h_f, UniqName("fy")); Var fx(w_f, UniqName("fx")); std::vector output_shape = { batch, // B c_out_outer, // O - cinn::common::AutoSimplify( - (h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + - 1), // H - cinn::common::AutoSimplify( - (w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + - 1), // W + optim::ArithSimplify((h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / + stride_h + + 1), // H + optim::ArithSimplify((w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / + stride_w + + 1), // W c_out_inner}; ir::Tensor input_pad; @@ -639,18 +639,18 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, }, UniqName("input_pad")); } else { - auto pad_h_bound = cinn::common::AutoSimplify( - (output_shape[2] - 1) * stride_h + (h_f - 1) * dilation_h + 1); - auto pad_w_bound = cinn::common::AutoSimplify( - (output_shape[3] - 1) * stride_w + (w_f - 1) * dilation_w + 1); + auto pad_h_bound = optim::ArithSimplify((output_shape[2] - 1) * stride_h + + (h_f - 1) * dilation_h + 1); + auto pad_w_bound = optim::ArithSimplify((output_shape[3] - 1) * stride_w + + (w_f - 1) * dilation_w + 1); auto pad_out_h = std::min(pad_h_bound.as_int32(), - cinn::common::AutoSimplify(h_in + 2 * pad_h).as_int32()); + optim::ArithSimplify(h_in + 2 * pad_h).as_int32()); auto pad_out_w = std::min(pad_w_bound.as_int32(), 
- cinn::common::AutoSimplify(w_in + 2 * pad_w).as_int32()); - auto h_in_pad = cinn::common::AutoSimplify(h_in + pad_h); - auto w_in_pad = cinn::common::AutoSimplify(w_in + pad_w); + optim::ArithSimplify(w_in + 2 * pad_w).as_int32()); + auto h_in_pad = optim::ArithSimplify(h_in + pad_h); + auto w_in_pad = optim::ArithSimplify(w_in + pad_w); input_pad = Compute( {batch, c_in_outer, Expr(pad_out_h), Expr(pad_out_w), c_in_inner}, [=](Expr n, Expr icc, Expr yy, Expr xx, Expr icb) { @@ -670,23 +670,20 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, auto packed_out = Compute( output_shape, [=](Expr n, Expr oc_chunk, Expr oh, Expr ow, Expr oc_block) { - Expr c_out_per_group = - cinn::common::AutoSimplify(c_out * c_filter / c_in); + Expr c_out_per_group = optim::ArithSimplify(c_out * c_filter / c_in); Expr ic_outer, ic_inner; if (c_in == c_filter) { - ic_outer = cinn::common::AutoSimplify(fc / c_in_inner); - ic_inner = cinn::common::AutoSimplify(fc % c_in_inner); + ic_outer = optim::ArithSimplify(fc / c_in_inner); + ic_inner = optim::ArithSimplify(fc % c_in_inner); } else { - ic_outer = - cinn::common::AutoSimplify(((oc_chunk * c_out_inner + oc_block) / - c_out_per_group * c_filter + - fc) / - c_in_inner); - ic_inner = - cinn::common::AutoSimplify(((oc_chunk * c_out_inner + oc_block) / - c_out_per_group * c_filter + - fc) % - c_in_inner); + ic_outer = optim::ArithSimplify(((oc_chunk * c_out_inner + oc_block) / + c_out_per_group * c_filter + + fc) / + c_in_inner); + ic_inner = optim::ArithSimplify(((oc_chunk * c_out_inner + oc_block) / + c_out_per_group * c_filter + + fc) % + c_in_inner); } return lang::ReduceSum(input_pad(n, ic_outer, @@ -1264,8 +1261,8 @@ Tensor Pad(const Tensor &tensor, if (i >= pad_before.size()) { output_shape.push_back(tensor->shape[i]); } else { - auto shape = cinn::common::AutoSimplify(tensor->shape[i] + pad_before[i] + - pad_after[i]); + auto shape = + optim::ArithSimplify(tensor->shape[i] + pad_before[i] + pad_after[i]); output_shape.push_back(shape); } } @@ -1291,8 +1288,8 @@ Tensor Pad(const Tensor &tensor, } Expr sel_after; if (!MathEqual(pad_after[i], Expr(0))) { - sel_after = cinn::common::AutoSimplify(ovars[i] < pad_before[i] + - tensor->shape[i]); + sel_after = + optim::ArithSimplify(ovars[i] < pad_before[i] + tensor->shape[i]); sel.push_back(sel_after); } if (pad_mode == "edge") { @@ -1407,7 +1404,7 @@ std::vector PoolImpl(const Tensor &tensor, do_pad = (do_pad) ? 
do_pad : (padding_size[i] || padding_size[i + k_size]); if (ceil_mode) { - pad_tail[i] = cinn::common::AutoSimplify(pad_tail[i] + stride[i] - 1); + pad_tail[i] = optim::ArithSimplify(pad_tail[i] + stride[i] - 1); } daxis.emplace_back(Var(kernel[i], UniqName("kernel_idx"))); @@ -1415,7 +1412,7 @@ std::vector PoolImpl(const Tensor &tensor, pad_before[ii] = pad_head[i]; pad_after[ii] = pad_tail[i]; - auto out_dim = cinn::common::AutoSimplify( + auto out_dim = optim::ArithSimplify( (tensor->shape[ii] - kernel[i] + pad_head[i] + pad_tail[i]) / stride[i] + 1); @@ -1470,13 +1467,13 @@ std::vector PoolImpl(const Tensor &tensor, auto temp_factor = make_const(Int(32), 1); for (int i = 0; i < k_size; i++) { int ii = axis[i]; - start[i] = cinn::common::AutoSimplify(output[ii] * stride[i] - - pad_head[i]); + start[i] = + optim::ArithSimplify(output[ii] * stride[i] - pad_head[i]); end[i] = Min::Make(start[i] + kernel[i], tensor->shape[ii]); start[i] = Max::Make(start[i], make_const(Int(32), 0)); temp_factor = temp_factor * (end[i] - start[i]); } - cinn::common::AutoSimplify(temp_factor); + optim::ArithSimplify(temp_factor); Expr divide_factor = Max::Make(temp_factor, make_const(Int(32), 1)); return lang::ReduceSum( ir::Div::Make(temp(indices), @@ -1487,7 +1484,7 @@ std::vector PoolImpl(const Tensor &tensor, for (int i = 0; i < k_size; i++) { temp_factor = temp_factor * kernel[i]; } - cinn::common::AutoSimplify(temp_factor); + optim::ArithSimplify(temp_factor); return lang::ReduceSum( ir::Div::Make(temp(indices), ir::Cast::Make(temp->type(), temp_factor)), @@ -1553,7 +1550,7 @@ std::vector PoolImpl(const Tensor &tensor, Expr(static_cast(tensor->shape[axis[i]].get_constant()) / kernel_size[i]); } - cinn::common::AutoSimplify(temp_factor); + optim::ArithSimplify(temp_factor); Expr divide_factor = Max::Make(temp_factor, make_const(Int(32), 1)); return lang::ReduceSum( ir::Div::Make(temp(indices), diff --git a/paddle/cinn/hlir/pe/transform.cc b/paddle/cinn/hlir/pe/transform.cc index c4d6d649b2d264..9aed131a42a494 100644 --- a/paddle/cinn/hlir/pe/transform.cc +++ b/paddle/cinn/hlir/pe/transform.cc @@ -420,7 +420,7 @@ std::vector Split( out_shape[i], [=](const std::vector& indice) { auto temp = indice; - temp[axis] = cinn::common::AutoSimplify(temp[axis] + Expr(start[i])); + temp[axis] = optim::ArithSimplify(temp[axis] + Expr(start[i])); return A(temp); }, names[i]); @@ -442,7 +442,7 @@ ir::Tensor Concat(const ir::Tensor& A, std::vector output_shape = A->shape; Expr pivot = A->shape[axis]; output_shape[axis] = - cinn::common::AutoSimplify(output_shape[axis] + B->shape[axis]); + optim::ArithSimplify(output_shape[axis] + B->shape[axis]); auto res = Compute( output_shape, [=](const std::vector& indice) { @@ -481,8 +481,8 @@ ir::Tensor Concat(const std::vector& input_tensors, ::common::errors::InvalidArgument( "Dimensions of inputs tensors in Concat should be equal! 
Please " "check.")); - output_shape[axis] = cinn::common::AutoSimplify( - output_shape[axis] + input_tensors[i]->shape[axis]); + output_shape[axis] = optim::ArithSimplify(output_shape[axis] + + input_tensors[i]->shape[axis]); } auto res = Compute( @@ -491,7 +491,7 @@ ir::Tensor Concat(const std::vector& input_tensors, auto ret = input_tensors[0](indice); Expr accumulate_shape = Expr(0); for (int i = 0; i < input_size - 1; i++) { - accumulate_shape = cinn::common::AutoSimplify( + accumulate_shape = optim::ArithSimplify( accumulate_shape + input_tensors[i]->shape[axis]); std::vector new_indice = indice; new_indice[axis] = @@ -1068,7 +1068,7 @@ std::vector InferShapeLayoutTransform( int dst_prim_index = (*split_index_map)[i][0]; int dst_sub_index = (*split_index_map)[i][1]; int factor = (*split_index_map)[i][2]; - Expr chunk_shape = cinn::common::AutoSimplify(input_shapes[i] / factor); + Expr chunk_shape = optim::ArithSimplify(input_shapes[i] / factor); Expr block_shape = Expr(factor); output_shape[dst_prim_index] = chunk_shape; output_shape[dst_sub_index] = block_shape; @@ -1100,7 +1100,7 @@ std::vector InferShapeLayoutTransform( ::common::errors::InvalidArgument( "input_shapes[src_sub_index] should be equal to factor")); output_shape[i] = - cinn::common::AutoSimplify(input_shapes[src_prim_index] * factor); + optim::ArithSimplify(input_shapes[src_prim_index] * factor); } else if ((*split_index_map)[i].size() == 1) { int src_prim_index = (*split_index_map)[i][0]; output_shape[i] = input_shapes[src_prim_index]; @@ -1164,13 +1164,11 @@ ir::Tensor LayoutTransform(const Tensor& input, int sub_index = split_infos[1]; int factor = split_infos[2]; if (dst_dim > src_dim) { - new_indice[i] = cinn::common::AutoSimplify( - indice[prim_index] * factor + indice[sub_index]); + new_indice[i] = optim::ArithSimplify(indice[prim_index] * factor + + indice[sub_index]); } else { - new_indice[prim_index] = - cinn::common::AutoSimplify(indice[i] / factor); - new_indice[sub_index] = - cinn::common::AutoSimplify(indice[i] % factor); + new_indice[prim_index] = optim::ArithSimplify(indice[i] / factor); + new_indice[sub_index] = optim::ArithSimplify(indice[i] % factor); } } else if (split_infos.size() == 1) { diff --git a/paddle/cinn/ir/buffer.cc b/paddle/cinn/ir/buffer.cc index cec3f91db7e650..9dc6f4e209b1d6 100644 --- a/paddle/cinn/ir/buffer.cc +++ b/paddle/cinn/ir/buffer.cc @@ -160,7 +160,7 @@ ir::Expr _Buffer_::SymbolicNumel() const { for (auto &i : shape) { res = res * i; } - return common::AutoSimplify(res); + return optim::ArithSimplify(res); } void _Buffer_::Verify() const { diff --git a/paddle/cinn/ir/schedule/impl/loop_transformation.cc b/paddle/cinn/ir/schedule/impl/loop_transformation.cc index f54b0fd81a9d81..e0797212ad4d78 100644 --- a/paddle/cinn/ir/schedule/impl/loop_transformation.cc +++ b/paddle/cinn/ir/schedule/impl/loop_transformation.cc @@ -125,7 +125,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, Expr(temp_var) + substitute_value * Expr(processed_factors[i]); new_loop_vars.push_back(temp_var); } - substitute_value = cinn::common::AutoSimplify(substitute_value); + substitute_value = optim::ArithSimplify(substitute_value); Expr new_node = ir::ir_utils::IRCopy(for_node->body); ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value}); std::vector splited_loops; @@ -167,8 +167,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, for (auto factor : factors) prod_size = prod_size * Expr(factor); std::for_each(factors.begin(), factors.end(), [&](int factor) { if (factor == -1) { - 
process_factors.push_back( - cinn::common::AutoSimplify(tot_extent / prod_size)); + process_factors.push_back(optim::ArithSimplify(tot_extent / prod_size)); idx_neg1 = -idx_neg1; } else { process_factors.push_back(Expr(factor)); @@ -180,12 +179,11 @@ std::vector DyScheduleImpl::Split(const Expr& loop, idx_neg1 = (-idx_neg1) - 1; - bool exact_split = - (tot_extent == - cinn::common::AutoSimplify(process_factors[0] * process_factors[1])); + bool exact_split = (tot_extent == optim::ArithSimplify(process_factors[0] * + process_factors[1])); if (!exact_split) { process_factors[idx_neg1] = - cinn::common::AutoSimplify(process_factors[idx_neg1] + Expr(1)); + optim::ArithSimplify(process_factors[idx_neg1] + Expr(1)); } PADDLE_ENFORCE_LE( @@ -218,7 +216,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, substitute_value = Expr(temp_var) + substitute_value * process_factors[i]; new_loop_vars.push_back(temp_var); } - substitute_value = cinn::common::AutoSimplify(substitute_value); + substitute_value = optim::ArithSimplify(substitute_value); Expr new_node = ir::ir_utils::IRCopy(for_node->body); ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value}); std::vector splited_loops; @@ -329,7 +327,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, substitute_value = Expr(temp_var) + substitute_value * process_factors[i]; new_loop_vars.push_back(temp_var); } - substitute_value = cinn::common::AutoSimplify(substitute_value); + substitute_value = optim::ArithSimplify(substitute_value); Expr new_node = ir::ir_utils::IRCopy(for_node->body); ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value}); std::vector splited_loops; @@ -442,7 +440,7 @@ Expr DyScheduleImpl::Fuse(const std::vector& loops) { for (int i = 0; i < loops_number; ++i) { fused_extent = fused_extent * for_nodes[i]->extent; } - fused_extent = cinn::common::AutoSimplify(fused_extent); + fused_extent = optim::ArithSimplify(fused_extent); if (!fused_body.As()) fused_body = Block::Make({fused_body}); Expr new_stmt = For::Make(fused_var, Expr(0), diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index 316854db08ebed..756b76f271efb7 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -478,8 +478,8 @@ IterRange GetAccessedRange(const Expr& index, ReplaceExpr(&indice_min, iter_vars, var_mins); ReplaceExpr(&indice_max, iter_vars, var_maxs); // simplify expression - indice_min = cinn::common::AutoSimplify(indice_min); - indice_max = cinn::common::AutoSimplify(indice_max); + indice_min = optim::ArithSimplify(indice_min); + indice_max = optim::ArithSimplify(indice_max); Expr indice_extent; Expr mod_extent(0); @@ -487,7 +487,7 @@ IterRange GetAccessedRange(const Expr& index, Expr mod_right_min = indice_min.As()->a(); Expr mod_right_max = indice_max.As()->a(); Expr mod_right_extent = - cinn::common::AutoSimplify(mod_right_max - mod_right_min + 1); + optim::ArithSimplify(mod_right_max - mod_right_min + 1); mod_extent = indice_min.As()->b(); if (mod_right_extent.get_constant() < mod_extent.get_constant()) { mod_extent = mod_right_extent; @@ -502,9 +502,8 @@ IterRange GetAccessedRange(const Expr& index, indice_extent = mod_extent; } } else { - indice_extent = - cinn::common::AutoSimplify(cinn::common::AutoSimplify(indice_max) - - cinn::common::AutoSimplify(indice_min) + 1); + indice_extent = optim::ArithSimplify(optim::ArithSimplify(indice_max) - + optim::ArithSimplify(indice_min) + 1); } if (indice_extent.is_constant() && 
indice_extent.get_constant() < 0) { @@ -650,7 +649,7 @@ Expr MakeCacheBlock(const std::vector& buffer_ranges, cinn::common::UniqName("cache_ax" + std::to_string(loop_vars.size()))); // Var loop_var("ax" + std::to_string(loop_vars.size())); loop_vars.push_back(loop_var); - iter_values.push_back(cinn::common::AutoSimplify(range.min + loop_var)); + iter_values.push_back(optim::ArithSimplify(range.min + loop_var)); } // block variables std::vector block_vars; @@ -681,7 +680,7 @@ Expr MakeCacheBlock(const std::vector& buffer_ranges, for (int i = static_cast(loop_vars.size()) - 1; i >= 0; i--) { new_body = For::Make(loop_vars[i], Expr(0), - cinn::common::AutoSimplify(buffer_ranges[i].extent), + optim::ArithSimplify(buffer_ranges[i].extent), ir::ForType::Serial, device_api, ir::Block::Make({new_body})); @@ -1284,9 +1283,9 @@ void InsertBlock(Expr& for_loop, const Expr& insertion, int index) { // NOLINT } IterRange RangeUnion(const IterRange& range1, const IterRange& range2) { - Expr new_min = cinn::common::AutoSimplify(Min::Make(range1.min, range2.min)); - Expr new_extent = cinn::common::AutoSimplify( - cinn::common::AutoSimplify( + Expr new_min = optim::ArithSimplify(Min::Make(range1.min, range2.min)); + Expr new_extent = optim::ArithSimplify( + optim::ArithSimplify( Max::Make(range1.min + range1.extent, range2.min + range2.extent)) - new_min); return IterRange(new_min, new_extent); diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h index 576a7448147e6e..d0e102b0050751 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.h +++ b/paddle/cinn/ir/schedule/ir_schedule_util.h @@ -555,7 +555,7 @@ struct RfMutator : public ir::IRMutator<> { true, ::common::errors::InvalidArgument( "The rfactor loop's minimum value should be zero.")); - auto extent = cinn::common::AutoSimplify(rf_for->extent); + auto extent = optim::ArithSimplify(rf_for->extent); auto& shape = tensor->shape; auto& domain = tensor->domain; PADDLE_ENFORCE_LE( @@ -673,9 +673,9 @@ struct LoopReconstructor : public ir::IRMutator<> { Var var(var_name, Int(32)); loop_vars.push_back(var); loop_extents.push_back(range.extent); - iter_values.push_back(cinn::common::AutoSimplify(range.min) + var); + iter_values.push_back(optim::ArithSimplify(range.min) + var); } else { - iter_values.push_back(cinn::common::AutoSimplify(range.min)); + iter_values.push_back(optim::ArithSimplify(range.min)); } } auto schedule_block_node = diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc index f05da6ce6dcfeb..8e065541d10407 100644 --- a/paddle/cinn/ir/tensor.cc +++ b/paddle/cinn/ir/tensor.cc @@ -443,8 +443,8 @@ bool _Tensor_::HasSameShapeWith(const Tensor &other) const { if (shape.size() != other->shape.size()) return false; for (int i = 0; i < shape.size(); i++) { - Expr dim0 = cinn::common::AutoSimplify(shape[i]); - Expr dim1 = cinn::common::AutoSimplify(other->shape[i]); + Expr dim0 = optim::ArithSimplify(shape[i]); + Expr dim1 = optim::ArithSimplify(other->shape[i]); if (dim0 != dim1) return false; } diff --git a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc index 4a3c101f3c325f..ea2af9033a5423 100644 --- a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc +++ b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc @@ -136,7 +136,7 @@ CollectLocalVarToIndexes(ir::Expr* expr) { } int ExtractMulNumberFromExpr(const ir::Expr& expr) { - ir::Expr simplied_expr = cinn::common::AutoSimplify(expr); + ir::Expr 
simplied_expr = optim::ArithSimplify(expr); if (simplied_expr.is_constant()) { return static_cast(simplied_expr.get_constant()); } else if (expr.As()) { @@ -151,7 +151,7 @@ int ExtractMulNumberFromExpr(const ir::Expr& expr) { } int ExtractAddNumberFromExpr(const ir::Expr& expr) { - ir::Expr simplied_expr = cinn::common::AutoSimplify(expr); + ir::Expr simplied_expr = optim::ArithSimplify(expr); if (simplied_expr.is_constant()) { return static_cast(simplied_expr.get_constant()); } else if (expr.As()) { @@ -173,7 +173,7 @@ int gcd(int a, int b) { } ir::Expr ExtractSymbolicFromExpr(const ir::Expr& expr) { - ir::Expr simplied_expr = cinn::common::AutoSimplify(expr); + ir::Expr simplied_expr = optim::ArithSimplify(expr); if (simplied_expr.is_constant()) { return ir::Expr(0); } else if (expr.As()) { @@ -210,7 +210,7 @@ struct CommonFactorTrait { static ir::Expr Simplify(const ir::Expr& expr, const ir::Expr& factor) { if (factor != unit) { - return cinn::common::AutoSimplify(ir::Div::Make(expr, factor)); + return optim::ArithSimplify(ir::Div::Make(expr, factor)); } return expr; } @@ -229,7 +229,7 @@ struct CommonFactorTrait { static ir::Expr Simplify(const ir::Expr& expr, const ir::Expr& factor) { if (factor != unit) { - return cinn::common::AutoSimplify(ir::Sub::Make(expr, factor)); + return optim::ArithSimplify(ir::Sub::Make(expr, factor)); } return expr; } @@ -244,7 +244,7 @@ struct CommonFactorTrait { static ir::Expr Calculate(const ir::Expr& expr1, const ir::Expr& expr2) { auto IsSymbolicNotEqual = [&](const ir::Expr& expr1, const ir::Expr& expr2) -> bool { - return cinn::common::AutoSimplify( + return optim::ArithSimplify( ir::Sub::Make(ExtractSymbolicFromExpr(expr1), ExtractSymbolicFromExpr(expr2))) != ir::Expr(0); }; @@ -256,7 +256,7 @@ struct CommonFactorTrait { static ir::Expr Simplify(const ir::Expr& expr, const ir::Expr& factor) { if (factor != unit) { - return cinn::common::AutoSimplify(ir::Sub::Make(expr, factor)); + return optim::ArithSimplify(ir::Sub::Make(expr, factor)); } return expr; } diff --git a/paddle/cinn/optim/eliminate_common_global_memory_read.cc b/paddle/cinn/optim/eliminate_common_global_memory_read.cc index 4af31da3b2ecaa..42bc0805137d3c 100644 --- a/paddle/cinn/optim/eliminate_common_global_memory_read.cc +++ b/paddle/cinn/optim/eliminate_common_global_memory_read.cc @@ -46,7 +46,7 @@ std::unordered_map ConstructForVarReplaceMap( for (const auto& [lhs_var, lhs_extent] : lhs_extents) { for (std::size_t i = 0; i < rhs_extents.size(); ++i) { const auto& [rhs_var, rhs_extent] = rhs_extents[i]; - if (cinn::common::AutoSimplify(ir::Sub::Make(lhs_extent, rhs_extent)) == + if (optim::ArithSimplify(ir::Sub::Make(lhs_extent, rhs_extent)) == ir::Expr(0) && visited_rhs_index.count(i) == 0) { ret[lhs_var] = rhs_var; @@ -88,8 +88,7 @@ struct GlobalTensorInfoCollector : public ir::IRMutator { for (size_t i = 0; i < indice1.size(); ++i) { ir::Expr lhs = IndiceToExprWithForVar(indice1.at(i), for_var_map); ir::Expr rhs = IndiceToExprWithForVar(indice2.at(i), for_var_map); - if (cinn::common::AutoSimplify(ir::Sub::Make(lhs, rhs)) != - ir::Expr(0)) { + if (optim::ArithSimplify(ir::Sub::Make(lhs, rhs)) != ir::Expr(0)) { return false; } } @@ -166,7 +165,7 @@ struct GlobalTensorInfoCollector : public ir::IRMutator { } VLOG(6) << "Iter var name: " << iter_var_name << " with extent: " << iter_var_name_to_extent_.at(iter_var_name); - buffer_size = cinn::common::AutoSimplify(ir::Mul::Make( + buffer_size = optim::ArithSimplify(ir::Mul::Make( buffer_size, 
iter_var_name_to_extent_.at(iter_var_name))); } return buffer_size; @@ -182,7 +181,7 @@ struct GlobalTensorInfoCollector : public ir::IRMutator { CalculateBufferSize(indices_and_extent[0].indices); VLOG(6) << "Global buffer name: " << name << " with size: " << buffer_size; - size = cinn::common::AutoSimplify(ir::Add::Make(size, buffer_size)); + size = optim::ArithSimplify(ir::Add::Make(size, buffer_size)); } if (BufferSizeContainsSymbolic(size)) { VLOG(6) << "Local buffer size contains symbolic: " << size; diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc index 396e4b6e5c0697..7dc54f5b47c1ac 100644 --- a/paddle/cinn/optim/ir_simplify.cc +++ b/paddle/cinn/optim/ir_simplify.cc @@ -132,9 +132,8 @@ struct SimplifyRampMutator : public ir::IRMutator { auto b_ramp = b.As(); if (a_ramp && b_ramp && a_ramp->lanes == b_ramp->lanes) { - Expr base_add = cinn::common::AutoSimplify(a_ramp->base + b_ramp->base); - Expr stride_add = - cinn::common::AutoSimplify(a_ramp->stride + b_ramp->stride); + Expr base_add = optim::ArithSimplify(a_ramp->base + b_ramp->base); + Expr stride_add = optim::ArithSimplify(a_ramp->stride + b_ramp->stride); *expr = ir::Ramp::Make(base_add, stride_add, a_ramp->lanes); } } From 440570ee8aeef4015a2a0ce3ae76f005b0f6dd03 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:29:55 +0800 Subject: [PATCH 14/57] [CINN] Enhance reduce anchor fusion with different flatten axis (#70665) --- .../policy/iters_fusion_policy.cc | 68 ++++++++++++------- .../policy/iters_fusion_policy.h | 2 + test/ir/pir/cinn/test_reduce_fusion.py | 20 ++++++ 3 files changed, 64 insertions(+), 26 deletions(-) diff --git a/paddle/cinn/operator_fusion/policy/iters_fusion_policy.cc b/paddle/cinn/operator_fusion/policy/iters_fusion_policy.cc index c0da3a56ea9a67..f07b57ab596ffd 100644 --- a/paddle/cinn/operator_fusion/policy/iters_fusion_policy.cc +++ b/paddle/cinn/operator_fusion/policy/iters_fusion_policy.cc @@ -154,14 +154,37 @@ std::optional ItersFusionPolicy::GetReuseItersTransform( } } +std::optional ItersFusionPolicy::GetAppendItersTransform( + FusionIters* source_iters, const FusionIters& target_iters) { + const auto target_unique_iters = + GatherFirstNotInSecond(target_iters, *source_iters); + if (!target_unique_iters.empty()) { + if (!transform_strategy_[ItersTransformType::AppendIters] || + !FLAGS_enable_append_iters_in_fusion) { + VLOG(4) << "Can not append iters in fusion, because of AppendIters " + "tranform is disabled."; + return std::nullopt; + } + std::vector append_axis; + std::vector append_symbols; + for (const auto& iter : target_unique_iters) { + const size_t pos = + std::find(target_iters.begin(), target_iters.end(), iter) - + target_iters.begin(); + append_axis.push_back(pos); + append_symbols.push_back(iters_manager_->GetIterSymbol(iter)); + source_iters->insert(source_iters->begin() + pos, iter); + } + return AppendItersTransform(append_axis, append_symbols); + } + return IdentityItersTransform(); +} + std::optional ItersFusionPolicy::SearchTransformRouteFromReduce2Reduce( const FusionItersSignature& source, const FusionItersSignature& target) { VLOG(4) << "Start search transform Route from reduce to reduce."; - if (source.loop_iters.size() == target.loop_iters.size() && - source.reduce_iter_nums == target.reduce_iter_nums) { - // Currently only support fusion with same iter_nums and same reduce axis - // TODO(huangjiyi): Analysis fusion with different non reduce axis + if (source.reduce_iter_nums == 
target.reduce_iter_nums) { auto [source_flatten_iters, source_reduce_iters] = SplitReduceIters(source); auto [target_flatten_iters, target_reduce_iters] = SplitReduceIters(target); @@ -186,6 +209,15 @@ ItersFusionPolicy::SearchTransformRouteFromReduce2Reduce( route.push_back(flatten_reuse_iters_transform.value()); route.push_back(reduce_reuse_iters_transform.value()); + // 2. Apply AppendItersTransform for flatten iters + const auto flatten_append_iters_transform = + GetAppendItersTransform(&source_flatten_iters, target_flatten_iters); + if (flatten_append_iters_transform == std::nullopt) { + return std::nullopt; + } else { + route.push_back(flatten_append_iters_transform.value()); + } + // 2. Apply TransposeItersTransform if (source_flatten_iters == target_flatten_iters && source_reduce_iters == target_reduce_iters) { @@ -317,28 +349,12 @@ std::optional ItersFusionPolicy::SearchItersTransformRoute( // 3. Apply AppendItersTransform // if exist iters in target can not find in source FusionIters appended_source_iters = reused_source_iters; - if (!reused_target_unique_iters.empty()) { - if (!transform_strategy_[ItersTransformType::AppendIters] || - !FLAGS_enable_append_iters_in_fusion) { - VLOG(4) << "Can not append iters in fusion, because of AppendIters " - "tranform is disabled."; - return std::nullopt; - } - std::vector append_axis; - std::vector append_symbols; - for (const auto& iter : reused_target_unique_iters) { - const size_t pos = - std::find(target_iters.begin(), target_iters.end(), iter) - - target_iters.begin(); - append_axis.push_back(pos); - append_symbols.push_back(iters_manager_->GetIterSymbol(iter)); - appended_source_iters.insert(appended_source_iters.begin() + pos, iter); - } - iters_transforms.push_back( - AppendItersTransform(append_axis, append_symbols)); - if (appended_source_iters == target_iters) { - return iters_transforms; - } + const auto append_iters_transform = + GetAppendItersTransform(&appended_source_iters, target_iters); + if (append_iters_transform == std::nullopt) { + return std::nullopt; + } else { + iters_transforms.push_back(append_iters_transform.value()); } VLOG(4) << "source iters after reuse and append: " << PrintFusionIters(appended_source_iters); diff --git a/paddle/cinn/operator_fusion/policy/iters_fusion_policy.h b/paddle/cinn/operator_fusion/policy/iters_fusion_policy.h index dfe4b78030e2b4..636d330e1990b5 100644 --- a/paddle/cinn/operator_fusion/policy/iters_fusion_policy.h +++ b/paddle/cinn/operator_fusion/policy/iters_fusion_policy.h @@ -62,6 +62,8 @@ struct ItersFusionPolicy final : public PolicyBase { private: std::optional GetReuseItersTransform( FusionIters* source_iters, const FusionIters& target_iters); + std::optional GetAppendItersTransform( + FusionIters* source_iters, const FusionIters& target_iters); std::optional SearchTransformRouteFromReduce2Reduce( const FusionItersSignature& source, const FusionItersSignature& target); std::optional SearchItersTransformRoute( diff --git a/test/ir/pir/cinn/test_reduce_fusion.py b/test/ir/pir/cinn/test_reduce_fusion.py index 58a1d9184c42b1..a8f3140672ee04 100644 --- a/test/ir/pir/cinn/test_reduce_fusion.py +++ b/test/ir/pir/cinn/test_reduce_fusion.py @@ -197,6 +197,26 @@ def init(): self.check_accuracy_and_kernel_num(init, func) + def test_reduce_anchor_fusion(self): + # T + # / \ + # R --> T + # / \ + # R --> T + def func(x): + x = x + 1 + a = paddle.max(x, axis=-1, keepdim=True) + b = x + a + c = paddle.max(b, axis=-1, keepdim=True) + d = c + b + return d + + def init(): + x = 
paddle.rand((1, 32, 4, 8), dtype='float32') + return (x,) + + self.check_accuracy_and_kernel_num(init, func, kernel_num=1) + if __name__ == "__main__": unittest.main() From 961393d766fefdb85b322de86369adb6bb8001c9 Mon Sep 17 00:00:00 2001 From: fangfangssj <99968055+fangfangssj@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:31:42 +0800 Subject: [PATCH 15/57] add support complex (#70635) --- .../phi/kernels/cpu/activation_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/activation_kernel.cc | 2 +- paddle/phi/kernels/funcs/activation_functor.h | 34 +++++++++++++++++ .../phi/kernels/gpu/activation_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/activation_kernel.cu | 2 +- python/paddle/tensor/ops.py | 2 + test/legacy_test/test_activation_op.py | 37 +++++++++++++++---- 7 files changed, 69 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index b8ced8d4defe2f..2e95e70a9c5a2a 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -307,7 +307,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(stanh_grad, STanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(reciprocal_grad, ReciprocalGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sqrt_grad, SqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softplus_grad, SoftplusGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 9db1466b4b7ae6..1ac2ed0f1a26b8 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -212,7 +212,7 @@ PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(silu, SiluKernel) PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(stanh, STanhKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(reciprocal, ReciprocalKernel) -PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sqrt, SqrtKernel) PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(softplus, SoftplusKernel) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 9e02d9ae860ba5..57f3e08121c545 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -759,6 +759,24 @@ struct SqrtGradFunctor : public BaseActivationFunctor { } }; +template +struct SqrtGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * (static_cast>(0.5) / out).unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + // rsqrt(x) = x^(-1/2) template struct RsqrtFunctor : public BaseActivationFunctor { @@ -4050,6 +4068,22 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { } }; +template +struct CudaSqrtGradFunctor> + : public BaseActivationFunctor> { + ComplexType one_half = static_cast>(0.5f); + + // dx = dout * 0.5 / out + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType out) const { + return dout * conj(one_half / out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + 
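// Editorial note: the complex sqrt gradient above is expressed through the
// forward output (dx = dout * conj(0.5 / out)), so the backward kernel only
// needs `out`; the kDepOut value returned below declares that dependency.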
return ActBwdOpFwdDeps::kDepOut; + } +}; + template struct CudaRsqrtFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index ecfd46852c1343..602a4b8f2dd617 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -386,7 +386,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softplus_grad, SoftplusGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softplus_double_grad, SoftplusDoubleGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sqrt_grad, SqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_double_grad, SqrtDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_double_grad, RsqrtDoubleGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 0ad0cb9f8c8f6c..3afc392a01497d 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -261,7 +261,7 @@ PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(stanh, StanhKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(reciprocal, ReciprocalKernel) -PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sqrt, SqrtKernel) PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(softplus, SoftplusKernel) diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index 7d48614d176295..28fbfe7c277cf9 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -1114,6 +1114,8 @@ def sqrt(x: Tensor, name: str | None = None) -> Tensor: 'int16', 'int32', 'int64', + 'complex64', + 'complex128', ], 'sqrt', ) diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index ad556f57af1c6b..16515942aaf4f4 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -1667,6 +1667,11 @@ def setUp(self): np.random.seed(1023) x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + if self.dtype == np.complex64 or self.dtype == np.complex128: + x = ( + np.random.uniform(-1, 1, self.shape) + + 1j * np.random.uniform(-1, 1, self.shape) + ).astype(self.dtype) out = np.sqrt(x) self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} @@ -1679,14 +1684,20 @@ def if_enable_cinn(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad( - ['X'], - 'Out', - check_prim=True, - check_pir=True, - check_prim_pir=True, - check_pir_onednn=self.check_pir_onednn, - ) + if self.dtype not in [np.complex64, np.complex128]: + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + check_pir_onednn=self.check_pir_onednn, + ) + else: + self.check_grad( + ['X'], + 'Out', + ) def test_check_output(self): self.check_output( @@ -1746,6 +1757,16 @@ def init_shape(self): self.shape = [] +class TestSqrt_Complex64(TestSqrt): + def init_dtype(self): + self.dtype = np.complex64 + + +class TestSqrt_Complex128(TestSqrt): + def init_dtype(self): + self.dtype = np.complex128 + + @unittest.skipIf( not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), "core is not 
compiled with CUDA", From e40f1da13d0e264283e371dfc60a89f029d314b7 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:10:18 +0800 Subject: [PATCH 16/57] disable pattern match one log (#70669) --- paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc b/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc index 57754f583b0450..cbac44b94a4517 100644 --- a/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc +++ b/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc @@ -231,7 +231,7 @@ std::pair ApplyPatternsGreedily( GreedyPatternRewriteDriver driver(region.ir_context(), patterns, config); auto [converged, num_rewrites] = driver.Simplify(); - if (!converged) { + if (!converged && config.max_iterations != 1) { LOG(WARNING) << "The pattern rewrite did not converge after scanning " << config.max_iterations << " times"; } From 05f3be6a5d5814e6c19609fb05d3adbbc1b37334 Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 8 Jan 2025 13:30:55 +0800 Subject: [PATCH 17/57] [fluid_ops]Modify c_allreduce_sum in collective_allreduce_op_wait.py (#70671) --- test/collective/collective_allreduce_op_wait.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test/collective/collective_allreduce_op_wait.py b/test/collective/collective_allreduce_op_wait.py index 9d677211e71449..5c020fb4e45f3e 100644 --- a/test/collective/collective_allreduce_op_wait.py +++ b/test/collective/collective_allreduce_op_wait.py @@ -70,11 +70,13 @@ def get_model(self, main_prog, startup_program, dtype="float32"): ) main_prog.global_block().append_op( - type="c_allreduce_sum", - inputs={'X': toutdata}, - attrs={'ring_id': ring_id}, - outputs={'Out': toutdata}, - attr={'use_calc_stream': False}, + type="all_reduce", + inputs={'x': toutdata}, + attrs={ + 'ring_id': ring_id, + 'reduce_type': paddle.distributed.ReduceOp.SUM, + }, + outputs={'out': toutdata}, ) main_prog.global_block().append_op( From cdebfcdb9b930721e41de8b0e9c084c926ebddf9 Mon Sep 17 00:00:00 2001 From: AIbin <37361953+chang-wenbin@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:13:43 +0800 Subject: [PATCH 18/57] update return isinstance(args, (list, tuple)) (#70657) [inference] support jit.inference input is tuple(Paddle.Tensor) --- python/paddle/incubate/jit/inference_decorator.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/paddle/incubate/jit/inference_decorator.py b/python/paddle/incubate/jit/inference_decorator.py index a162489a971b81..4aa3b028a04634 100644 --- a/python/paddle/incubate/jit/inference_decorator.py +++ b/python/paddle/incubate/jit/inference_decorator.py @@ -74,11 +74,15 @@ def is_fixed_type(input): return False +def is_list_or_tuple(args): + return isinstance(args, (list, tuple)) + + # get paddle.Tensor for paddle inference use. 
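# Editorial sketch of the helper below (t0, t1 are hypothetical Tensors):
#   get_tensor(t0, "x")         -> [t0]
#   get_tensor([t0, t1], "xs")  -> [t0, t1]
#   get_tensor((t0, t1), "xs")  -> [t0, t1]   # tuples now take the list path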
def get_tensor(run_time_args, arg_name): if isinstance(run_time_args, paddle.Tensor): return [run_time_args] - elif isinstance(run_time_args, list): + elif is_list_or_tuple(run_time_args): this_input_tensor_lists = [] for ele in run_time_args: assert isinstance( @@ -90,7 +94,7 @@ def get_tensor(run_time_args, arg_name): return [run_time_args] else: raise AssertionError( - f'''we only support adding paddle.incubate.jit.inference() in functions whose arguments are paddle.Tensor or list[paddle.Tensor] or None, + f'''we only support adding paddle.incubate.jit.inference() in functions whose arguments are paddle.Tensor or list[paddle.Tensor] & tuple[paddle.Tensor] or None, but here we get {arg_name} in your function is {type(run_time_args)}, please modify your function to meet our requirement.''' ) @@ -99,7 +103,7 @@ def get_tensor(run_time_args, arg_name): def get_d2s_spec(run_time_args, name): if isinstance(run_time_args, paddle.Tensor): return InputSpec.from_tensor(run_time_args, name=name) - elif isinstance(run_time_args, list): + elif is_list_or_tuple(run_time_args): this_input_spec = [] suffix = 0 for ele in run_time_args: @@ -273,7 +277,7 @@ def forward(self, args): input_specs.append(this_input) for i in range(len(input_specs)): - if isinstance(input_specs[i], list): + if is_list_or_tuple(input_specs[i]): for j in range(len(input_specs[i])): input_specs[i][j].stop_gradient = True elif isinstance(input_specs[i], paddle.static.InputSpec): @@ -285,7 +289,7 @@ def forward(self, args): if len(self.d2s_input_names) == 0: self.d2s_input_names.extend([None] * len(input_tensor_lists)) for i in range(len(input_specs)): - if isinstance(input_specs[i], list): + if is_list_or_tuple(input_specs[i]): for j in range(len(input_specs[i])): input_specs[i][j].shape = self.d2s_input_shapes[ d2s_shapes_id From be71e9766bde4d455d8955f99c7cf6ce5a43746c Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 8 Jan 2025 14:50:58 +0800 Subject: [PATCH 19/57] Remove Wait in if_instruction (#70599) --- .../new_executor/instruction/control_flow/if_instruction.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc index 1b1231359fe833..01b97bf9bb12a5 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc @@ -226,7 +226,6 @@ void IfInstruction::Run() { // phi::is_xpu_place(cond.place()) is true #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) - DeviceContext().Wait(); phi::DenseTensor cpu_cond; paddle::framework::TensorCopySync( cond_tensor, phi::CPUPlace(), &cpu_cond); From fe00bd8e47f6ba8721062a5ab724d439baa1325c Mon Sep 17 00:00:00 2001 From: bukejiyu <52310069+bukejiyu@users.noreply.github.com> Date: Wed, 8 Jan 2025 15:21:14 +0800 Subject: [PATCH 20/57] [Infrence]Temporarily disable AVX kernel inlining for GCC12 (#70603) --- cmake/simd.cmake | 10 ++++++++++ paddle/phi/CMakeLists.txt | 5 +++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 500e8c234407ff..119d9e91cacdeb 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -13,6 +13,11 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") set(AVX512F_FLAG "-mavx512f") set(Wno_Maybe_Uninitialized "-Wno-maybe-uninitialized") set(FMA_FLAG "-mfma") + 
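# Editor's note: NO_INLINE, defined in the block below for compiler versions
# >= 12, is later appended to the COMPILE_FLAGS of the AVX fusion kernel
# sources in paddle/phi/CMakeLists.txt, so only those translation units are
# built without inlining rather than the whole target.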
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0) + set(NO_INLINE "-fno-inline") + else() + set(NO_INLINE "") + endif() elseif(MSVC) set(MMX_FLAG "/arch:MMX") set(SSE2_FLAG "/arch:SSE2") @@ -22,6 +27,11 @@ elseif(MSVC) set(AVX512F_FLAG "/arch:AVX512") set(Wno_Maybe_Uninitialized "/wd4701") set(FMA_FLAG "/arch:AVX2") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0) + set(NO_INLINE "/Ob0") + else() + set(NO_INLINE "") + endif() endif() set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 6162de3b58cd8b..6a17e55d9bcb94 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -141,8 +141,9 @@ if(WITH_AVX kernels/fusion/cpu/fused_layer_norm_avx_kernel.cc kernels/fusion/cpu/self_dp_attention_kernel.cc kernels/fusion/cpu/rms_norm_avx_kernel.cc - PROPERTIES COMPILE_FLAGS - "${Wno_Maybe_Uninitialized} ${FMA_FLAG} ${AVX512F_FLAG}") + PROPERTIES + COMPILE_FLAGS + "${Wno_Maybe_Uninitialized} ${FMA_FLAG} ${AVX512F_FLAG} ${NO_INLINE}") endif() if(WITH_GPU) From afcd24b0af4bdafed6f5e946c43e45519334e008 Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Wed, 8 Jan 2025 15:23:39 +0800 Subject: [PATCH 21/57] add local layer api (#70600) * add local layer api * add doc and example codes --------- Co-authored-by: andsonder --- python/paddle/distributed/__init__.py | 2 + .../distributed/auto_parallel/local_layer.py | 95 +++++++++++++++++++ test/auto_parallel/pir/CMakeLists.txt | 2 + test/auto_parallel/pir/local_layer_demo.py | 64 +++++++++++++ test/auto_parallel/pir/test_local_layer.py | 42 ++++++++ 5 files changed, 205 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/local_layer.py create mode 100644 test/auto_parallel/pir/local_layer_demo.py create mode 100644 test/auto_parallel/pir/test_local_layer.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index ac0cf6ba3eac9e..61bd791948bdc6 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -59,6 +59,7 @@ SequenceParallelEnable, SequenceParallelEnd, ) +from .auto_parallel.local_layer import LocalLayer from .auto_parallel.placement_type import ( Partial, Replicate, @@ -190,6 +191,7 @@ "to_static", "Strategy", "DistModel", + "LocalLayer", "unshard_dtensor", "parallelize", "SequenceParallelEnd", diff --git a/python/paddle/distributed/auto_parallel/local_layer.py b/python/paddle/distributed/auto_parallel/local_layer.py new file mode 100644 index 00000000000000..63b114a2c84946 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/local_layer.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
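# Editorial cross-reference: test/auto_parallel/pir/local_layer_demo.py (added
# later in this patch) exercises the class below end to end: a LocalLayer
# subclass wraps nn.MSELoss and declares its output Partial(kRedSum) so that
# per-rank losses are reduce-summed across the mesh.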
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import paddle
+import paddle.distributed as dist
+from paddle.nn import Layer
+
+if TYPE_CHECKING:
+    from paddle.distributed import Placement
+    from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
+
+
+class LocalLayer(Layer):
+    """
+    The `LocalLayer` class is a specialized `Layer` for managing distributed tensors during
+    forward and backward passes in a parallelized training environment. It converts distributed tensors
+    to local tensors for computation and then back to distributed tensors as output, ensuring seamless
+    integration with distributed parallelism frameworks.
+
+    Args:
+        out_dist_attrs (list[tuple[ProcessMesh, list[Placement]]]):
+            A list where each entry is a tuple containing the `ProcessMesh` and the list of `Placement`
+            attributes for the corresponding output tensors. These attributes define the distribution
+            strategy for the outputs.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.distributed as dist
+            from paddle import nn
+
+            class CustomLayer(dist.LocalLayer):
+                def __init__(self, mesh):
+                    super().__init__(
+                        out_dist_attrs=[(mesh, [dist.Partial(dist.ReduceType.kRedSum)])]
+                    )
+                    self.fc = nn.Linear(16, 8)
+
+                def forward(self, x):
+                    return self.fc(x)
+
+            # doctest: +REQUIRES(env:DISTRIBUTED)
+            mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+            custom_layer = CustomLayer(mesh)
+            input_tensor = dist.auto_parallel.api.dtensor_from_local(
+                paddle.randn([4, 16]), mesh, [dist.Replicate()]
+            )
+
+            output_tensor = custom_layer(input_tensor)
+            print(output_tensor)
+    """
+
+    def __init__(
+        self, out_dist_attrs: list[tuple[ProcessMesh, list[Placement]]]
+    ):
+        super().__init__()
+        self.out_dist_attrs = out_dist_attrs
+
+    def __call__(self, *inputs: Any, **kwargs: Any) -> Any:
+        """
+        Overrides the base `Layer`'s `__call__` method. Transforms distributed tensors to local tensors
+        before computation, invokes the parent class's `__call__` method, and then transforms the
+        outputs back to distributed tensors based on the specified distribution attributes.
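+
+        Note (editorial): as implemented below, flattened output ``i`` is
+        re-wrapped with the ``i``-th entry of ``out_dist_attrs``, so a
+        subclass must declare one (ProcessMesh, placements) pair per output
+        tensor.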
+ """ + inputs = list(inputs) + for idx in range(len(inputs)): + if inputs[idx].is_dist(): + inputs[idx] = dist.auto_parallel.api.dtensor_to_local( + inputs[idx] + ) + outputs = Layer.__call__(self, *inputs, **kwargs) + list_outs = paddle.utils.flatten(outputs) + for idx in range(len(list_outs)): + list_outs[idx] = dist.auto_parallel.api.dtensor_from_local( + list_outs[idx], + self.out_dist_attrs[idx][0], + self.out_dist_attrs[idx][1], + ) + return paddle.utils.pack_sequence_as(outputs, list_outs) diff --git a/test/auto_parallel/pir/CMakeLists.txt b/test/auto_parallel/pir/CMakeLists.txt index 06172a555fef90..71a649276240bd 100644 --- a/test/auto_parallel/pir/CMakeLists.txt +++ b/test/auto_parallel/pir/CMakeLists.txt @@ -17,6 +17,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_pir_reshard_s_to_r MODULES test_pir_reshard_s_to_r) set_tests_properties(test_pir_reshard_s_to_r PROPERTIES TIMEOUT 120) py_test_modules(test_mlp MODULES test_mlp ENVS FLAGS_enable_pir_api=1) + py_test_modules(test_local_layer MODULES test_local_layer ENVS + FLAGS_enable_pir_api=1) py_test_modules( test_semi_auto_parallel_dist_to_static_pir MODULES test_semi_auto_parallel_dist_to_static_pir ENVS FLAGS_enable_pir_api=1) diff --git a/test/auto_parallel/pir/local_layer_demo.py b/test/auto_parallel/pir/local_layer_demo.py new file mode 100644 index 00000000000000..be66d50fb8aa48 --- /dev/null +++ b/test/auto_parallel/pir/local_layer_demo.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from test_to_static_pir_program import ( + DemoNet, + create_data_loader, +) + +import paddle +import paddle.distributed as dist +from paddle import nn + +BATCH_SIZE = 4 +BATCH_NUM = 40 +IMAGE_SIZE = 16 +CLASS_NUM = 8 +np.random.seed(2025) +paddle.seed(2025) + + +class LocalLossLayer(dist.LocalLayer): + def __init__(self, mesh): + super().__init__( + out_dist_attrs=[(mesh, [dist.Partial(dist.ReduceType.kRedSum)])] + ) + self.loss = nn.MSELoss() + + def forward(self, input, label): + return self.loss(input, label) + + +class TestMLPTensorParallel(unittest.TestCase): + def test_to_static_program(self): + mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + mp_layer = DemoNet(mesh) + opt = paddle.optimizer.SGD( + learning_rate=0.1, parameters=mp_layer.parameters() + ) + loss_fn = LocalLossLayer(mesh) + loader = create_data_loader() + dist_loader = dist.shard_dataloader(loader, meshes=[mesh]) + dist_model = dist.to_static(mp_layer, dist_loader, loss_fn, opt) + + dist_model.train() + for batch_id, (image, label) in enumerate(dist_loader()): + loss = dist_model(image, label) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/pir/test_local_layer.py b/test/auto_parallel/pir/test_local_layer.py new file mode 100644 index 00000000000000..ddd5afa52e13cc --- /dev/null +++ b/test/auto_parallel/pir/test_local_layer.py @@ -0,0 +1,42 @@ +# Copyright (c) 2025 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import collective.test_communication_api_base as test_base + + +class TestLocalLayer(test_base.CommunicationTestDistBase): + def setUp(self): + super().setUp( + num_of_devices=2, + timeout=300, + ) + self._default_envs = {"dtype": "float32", "seed": "2023"} + self._changeable_envs = {"backend": ["gpu"]} + + def test_local_layer(self): + envs_list = test_base.gen_product_envs_list( + {"dtype": "float32", "seed": "2023"}, {"backend": ["gpu"]} + ) + # self._log_dir.name = "./log" + for envs in envs_list: + self.run_test_case( + "local_layer_demo.py", + user_defined_envs=envs, + ) + + +if __name__ == "__main__": + unittest.main() From 2834e1e0300bbe575e4b0448b01174d57c28f19d Mon Sep 17 00:00:00 2001 From: Zhou Xin Date: Wed, 8 Jan 2025 15:50:10 +0800 Subject: [PATCH 22/57] [CINN][Backend Pass Update No.1] Update EliminateCommonFactorOfLocalIndex pass (#70619) * Add comment for eliminateCommonFactorOfLocalIndex, test=document_fix * Update eliminateCommonFactorOfLocalIndex --- .../eliminate_common_factor_of_local_index.cc | 329 +++++++++++++----- .../eliminate_common_factor_of_local_index.h | 8 +- paddle/cinn/optim/transform_gpu_forloop.cc | 7 +- 3 files changed, 249 insertions(+), 95 deletions(-) diff --git a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc index ea2af9033a5423..3eaa1723f46179 100644 --- a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc +++ b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc @@ -27,32 +27,24 @@ namespace cinn { namespace optim { namespace { +using ir::Expr; -class GatherLocalIndexVisitor : public ir::IRMutator<> { +class GatherLocalIndexAndProhibitedLocalVarVisitor + : public ir::IRMutator<>, + public ir::stmt::StmtVisitor<> { public: - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + void operator()(ir::stmt::BlockRef func_body) { VisitBlock(func_body); } const std::unordered_map>>& local_var_to_indexes() const { return local_var_to_indexes_; } - private: - void Visit(const ir::Store* op, Expr* expr) override { - auto store = expr->As(); - - ir::IRMutator<>::Visit(op, expr); - if (!store->tensor.as_tensor_ref()->buffer.defined()) { - return; - } - - if (store->tensor.as_tensor_ref()->buffer->memory_type == - ir::MemoryType::GPULocal) { - local_var_to_indexes_[store->tensor.as_tensor_ref()->buffer->name] - .push_back(store->indices); - } + const std::unordered_set& prohibited_local_vars() const { + return prohibited_local_vars_; } + private: void Visit(const ir::Load* op, Expr* expr) override { auto load = expr->As(); @@ -71,40 +63,81 @@ class GatherLocalIndexVisitor : public ir::IRMutator<> { ir::IRMutator<>::Visit(op, expr); } - std::unordered_map>> - local_var_to_indexes_; -}; - -class GatherProhibitedLocalVarVisitor : public ir::IRMutator<> { - public: - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - const std::unordered_set& 
prohibited_local_vars() const { - return prohibited_local_vars_; + void Visit(const Expr& expr) { + Expr expr_ = expr; + ir::IRMutator<>::Visit(&expr_, &expr_); } - private: - void Visit(const ir::Store* op, Expr* expr) override { - auto store = expr->As(); + void VisitStmt(const ir::stmt::Store& stmt) override { + Visit(stmt->value()); - ir::IRMutator<>::Visit(op, expr); - if (!store->tensor.as_tensor_ref()->buffer.defined()) { + if (!stmt->tensor().as_tensor_ref()->buffer.defined()) { return; } - if (store->tensor.as_tensor_ref()->buffer->memory_type != + + if (stmt->tensor().as_tensor_ref()->buffer->memory_type == ir::MemoryType::GPULocal) { - return; - } - const auto& local_var_name = store->tensor.as_tensor_ref()->buffer->name; - if (store->value.As()) { - const auto& call_name = store->value.As()->name; - if (cinn::utils::GetProhibitScheduleExternalFuncNames().count(call_name) > - 0) { - prohibited_local_vars_.insert(local_var_name); + local_var_to_indexes_[stmt->tensor().as_tensor_ref()->buffer->name] + .push_back(stmt->indices()); + + if (stmt->value().As()) { + const std::string& local_var_name = + stmt->tensor().as_tensor_ref()->buffer->name; + const std::string& call_name = stmt->value().As()->name; + if (cinn::utils::GetProhibitScheduleExternalFuncNames().count( + call_name) > 0) { + prohibited_local_vars_.insert(local_var_name); + } } } } + void VisitStmt(const ir::stmt::IfThenElse& stmt) override { + Visit(stmt->condition()); + VisitBlock(stmt->true_case()); + if (stmt->false_case().defined()) { + VisitBlock(stmt->false_case()); + } + } + + void VisitStmt(const ir::stmt::Schedule& stmt) override { + for (const Expr& value : stmt->iter_values()) { + Visit(value); + } + VisitBlock(stmt->body()); + } + + void VisitStmt(const ir::stmt::For& stmt) override { + Visit(stmt->min()); + Visit(stmt->extent()); + VisitBlock(stmt->body()); + } + + void VisitStmt(const ir::stmt::Alloc& stmt) override { + for (const Expr& extent : stmt->extents()) { + Visit(extent); + } + if (stmt->condition().defined()) { + Visit(stmt->condition()); + } + if (stmt->body().defined()) { + Visit(stmt->body()); + } + } + + void VisitStmt(const ir::stmt::Evaluate& stmt) override { + Visit(stmt->value()); + } + + void VisitStmt(const ir::stmt::Free& stmt) override { + Visit(stmt->destination()); + } + + void VisitStmt(const ir::stmt::Let& stmt) override { Visit(stmt->body()); } + + private: + std::unordered_map>> + local_var_to_indexes_; std::unordered_set prohibited_local_vars_; }; @@ -123,16 +156,12 @@ EraseProhibitedLocalVar( } std::unordered_map>> -CollectLocalVarToIndexes(ir::Expr* expr) { - GatherLocalIndexVisitor gather_local_index_visitor; - gather_local_index_visitor(expr); +CollectLocalVarToIndexes(ir::stmt::BlockRef func_body) { + GatherLocalIndexAndProhibitedLocalVarVisitor gather; + gather(func_body); - GatherProhibitedLocalVarVisitor gather_prohibited_local_var_visitor; - gather_prohibited_local_var_visitor(expr); - - return EraseProhibitedLocalVar( - gather_local_index_visitor.local_var_to_indexes(), - gather_prohibited_local_var_visitor.prohibited_local_vars()); + return EraseProhibitedLocalVar(gather.local_var_to_indexes(), + gather.prohibited_local_vars()); } int ExtractMulNumberFromExpr(const ir::Expr& expr) { @@ -284,11 +313,12 @@ std::vector CalculateIndexCommonFactor( "We should guarantee indexes.size() >= 2, because local variable " "should at least load and store once. 
")); for (std::size_t i = 1; i < indexes.size(); ++i) { - // NOTE(Hongyu Jia): Ideally, we can guarantee the size of indexes are equal - // However, some unit tests (e.g. test_resnet_cinn, test_instance_norm_op - // are still running with the deprecated OpScheduler, and the ir::Expr - // will break this guarantee after IRGpuScheduleBlockReduce function. - // So we have to relax the restriction here. + // NOTE(Hongyu Jia): Ideally, we can guarantee the size of indexes are + // equal However, some unit tests (e.g. test_resnet_cinn, + // test_instance_norm_op are still running with the deprecated + // OpScheduler, and the ir::Expr will break this guarantee after + // IRGpuScheduleBlockReduce function. So we have to relax the restriction + // here. if (indexes[i].size() != indexes[0].size()) { LOG(WARNING) << "Not supported for calculating common factor, local var = " @@ -330,14 +360,15 @@ CalculateLocalVarCommonFactor( } template -class EliminateCommonFactorVisitor : public ir::IRMutator<> { +class EliminateCommonFactorVisitor : public ir::IRMutator<>, + public ir::stmt::StmtMutator<> { public: EliminateCommonFactorVisitor( const std::unordered_map>& local_var_to_common_factor) : local_var_to_common_factor_(local_var_to_common_factor) {} - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + void operator()(ir::stmt::BlockRef func_body) { VisitBlock(func_body); } private: void Visit(const ir::Store* op, Expr* expr) override { @@ -386,27 +417,106 @@ class EliminateCommonFactorVisitor : public ir::IRMutator<> { } ir::IRMutator<>::Visit(op, expr); } + + void Visit(const Expr& expr) { + Expr expr_ = expr; + ir::IRMutator<>::Visit(&expr_, &expr_); + } + + void VisitStmt(ir::stmt::Store stmt) override { + Visit(stmt->value()); + const auto& store_buffer = stmt->tensor().as_tensor_ref()->buffer; + + if (!store_buffer.defined()) { + return; + } + + if (store_buffer->memory_type == ir::MemoryType::GPULocal) { + if (local_var_to_common_factor_.count(store_buffer->name) == 0) { + return; + } + const auto& common_factors = + local_var_to_common_factor_.at(store_buffer->name); + for (std::size_t i = 0; i < stmt->indices().size(); ++i) { + std::vector new_indices = stmt->indices(); + new_indices[i] = + CommonFactorTrait::Simplify(new_indices[i], common_factors[i]); + stmt->set_indices(new_indices); + } + } + } + + void VisitStmt(ir::stmt::IfThenElse stmt) override { + Visit(stmt->condition()); + VisitBlock(stmt->true_case()); + if (stmt->false_case().defined()) { + VisitBlock(stmt->false_case()); + } + } + + void VisitStmt(ir::stmt::Schedule stmt) override { + for (const Expr& value : stmt->iter_values()) { + Visit(value); + } + VisitBlock(stmt->body()); + } + + void VisitStmt(ir::stmt::For stmt) override { + Visit(stmt->min()); + Visit(stmt->extent()); + VisitBlock(stmt->body()); + } + + void VisitStmt(ir::stmt::Alloc stmt) override { + for (const Expr& extent : stmt->extents()) { + Visit(extent); + } + if (stmt->condition().defined()) { + Visit(stmt->condition()); + } + if (stmt->body().defined()) { + Visit(stmt->body()); + } + } + + void VisitStmt(ir::stmt::Evaluate stmt) override { Visit(stmt->value()); } + + void VisitStmt(ir::stmt::Free stmt) override { Visit(stmt->destination()); } + + void VisitStmt(ir::stmt::Let stmt) override { Visit(stmt->body()); } + + private: std::unordered_map> local_var_to_common_factor_; }; } // namespace +// Eliminate common factors from local indices in a function's body. 
+// If applied to various statement blocks, this may incorrectly simplify
+// distinct local buffer indices across different statement blocks to the same
+// value.
 template <typename CommonFactorTrait>
-void EliminateCommonFactorHelper(ir::Expr* expr) {
+void EliminateCommonFactorHelper(ir::stmt::BlockRef func_body) {
   std::unordered_map<std::string, std::vector<std::vector<ir::Expr>>>
-      local_var_to_indexes = CollectLocalVarToIndexes(expr);
+      local_var_to_indexes = CollectLocalVarToIndexes(func_body);
   std::unordered_map<std::string, std::vector<ir::Expr>>
       local_var_to_common_factor =
           CalculateLocalVarCommonFactor(local_var_to_indexes);
   EliminateCommonFactorVisitor<CommonFactorTrait>
       eliminate_common_factor_visitor(local_var_to_common_factor);
-  eliminate_common_factor_visitor(expr);
+  eliminate_common_factor_visitor(func_body);
 }

-class TransformLocalIndicesVisitor : public ir::IRMutator<> {
+class TransformLocalIndicesVisitor : public ir::IRMutator<>,
+                                     public ir::stmt::StmtMutator<> {
  public:
-  void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+  void operator()(ir::stmt::BlockRef func_body) { VisitBlock(func_body); }

  private:
   template
@@ -463,30 +573,14 @@ class TransformLocalIndicesVisitor : public ir::IRMutator<> {
   };

   std::unordered_map<std::string, ir::Var> name_to_iter;
-  for (const auto& indice : indices) {
-    ExtractIterFromIndice(indice, &name_to_iter);
-    VLOG(6) << "extract iter: " << indice
+  for (const auto& index : indices) {
+    ExtractIterFromIndice(index, &name_to_iter);
+    VLOG(6) << "extract iter: " << index
             << " iter_set size: " << name_to_iter.size();
   }
   return CopyIndiceItersToLocalBuffer(name_to_iter, indices);
 }

-  void Visit(const ir::For* op, ir::Expr* expr) override {
-    auto* for_ir = expr->As<ir::For>();
-    loop_vars_.push_back(for_ir->loop_var);
-    IRMutator<>::Visit(op, expr);
-    loop_vars_.pop_back();
-  }
-
-  void Visit(const ir::Store* op, ir::Expr* expr) override {
-    auto store = expr->As<ir::Store>();
-    if (store->tensor.as_tensor_ref()->buffer->memory_type ==
-        ir::MemoryType::GPULocal) {
-      store->indices = ConvertIndicesToIters(store->indices);
-    }
-    ir::IRMutator<>::Visit(op, expr);
-  }
-
   void Visit(const ir::Load* op, ir::Expr* expr) override {
     auto load = expr->As<ir::Load>();
     if (load->tensor.as_tensor_ref()->buffer->memory_type ==
         ir::MemoryType::GPULocal) {
       load->indices = ConvertIndicesToIters(load->indices);
     }
     ir::IRMutator<>::Visit(op, expr);
   }

+  void Visit(const Expr& expr) {
+    Expr expr_ = expr;
+    ir::IRMutator<>::Visit(&expr_, &expr_);
+  }
+
+  void VisitStmt(ir::stmt::Store stmt) override {
+    if (stmt->tensor().as_tensor_ref()->buffer->memory_type ==
+        ir::MemoryType::GPULocal) {
+      stmt->set_indices(ConvertIndicesToIters(stmt->indices()));
+    }
+    Visit(stmt->value());
+  }
+
+  void VisitStmt(ir::stmt::IfThenElse stmt) override {
+    Visit(stmt->condition());
+    VisitBlock(stmt->true_case());
+    if (stmt->false_case().defined()) {
+      VisitBlock(stmt->false_case());
+    }
+  }
+
+  void VisitStmt(ir::stmt::Schedule stmt) override {
+    for (const Expr& value : stmt->iter_values()) {
+      Visit(value);
+    }
+    VisitBlock(stmt->body());
+  }
+
+  void VisitStmt(ir::stmt::For stmt) override {
+    Visit(stmt->min());
+    Visit(stmt->extent());
+    loop_vars_.push_back(stmt->loop_var());
+    VisitBlock(stmt->body());
+    loop_vars_.pop_back();
+  }
+
+  void VisitStmt(ir::stmt::Alloc stmt) override {
+    for (const Expr& extent : stmt->extents()) {
+      Visit(extent);
+    }
+    if (stmt->condition().defined()) {
+      Visit(stmt->condition());
+    }
+    if
(stmt->body().defined()) { + Visit(stmt->body()); + } + } + + void VisitStmt(ir::stmt::Evaluate stmt) override { Visit(stmt->value()); } + + void VisitStmt(ir::stmt::Free stmt) override { Visit(stmt->destination()); } + + void VisitStmt(ir::stmt::Let stmt) override { Visit(stmt->body()); } + + private: std::vector loop_vars_; }; -void TransformLocalIndicesToIters(ir::Expr* expr) { +void TransformLocalIndicesToIters(ir::stmt::BlockRef func_body) { TransformLocalIndicesVisitor transform_local_indices_visitor; - transform_local_indices_visitor(expr); + transform_local_indices_visitor(func_body); } -void EliminateCommonFactorOfLocalIndex(ir::Expr* expr) { - VLOG(4) << "Before EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; - EliminateCommonFactorHelper(expr); - EliminateCommonFactorHelper(expr); - EliminateCommonFactorHelper(expr); +void EliminateCommonFactorOfLocalIndex(ir::stmt::BlockRef func_body) { + VLOG(4) << "Before EliminateCommonFactorOfLocalIndex, func_body = \n" + << func_body; + EliminateCommonFactorHelper(func_body); + EliminateCommonFactorHelper(func_body); + EliminateCommonFactorHelper(func_body); - TransformLocalIndicesToIters(expr); + TransformLocalIndicesToIters(func_body); - VLOG(4) << "After EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; + VLOG(4) << "After EliminateCommonFactorOfLocalIndex, func_body = \n" + << func_body; } } // namespace optim diff --git a/paddle/cinn/optim/eliminate_common_factor_of_local_index.h b/paddle/cinn/optim/eliminate_common_factor_of_local_index.h index e85cfae242a2fd..c7bd9c22524413 100644 --- a/paddle/cinn/optim/eliminate_common_factor_of_local_index.h +++ b/paddle/cinn/optim/eliminate_common_factor_of_local_index.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/stmt.h" namespace cinn { namespace optim { @@ -40,9 +40,9 @@ namespace optim { * e.g., ([i+2, i+3], [i+4, i+6]) -> ([i, i], [i+2, i+3]) * c) Symbolic common factor elimination. * e.g., ([C, 2], [3C, 4]) -> ([1, 2], [3, 4]) - * 3. Transform simplified indices into iterator-based forms. + * 3. Update the IR, replacing original indices with simplified versions. + * 4. Transform local buffer indices into iterator-based forms. * e.g., [i, 0, 0] -> [0, 0, i] - * 4. Update the IR, replacing original indices with simplified versions. * * Key benefits: * 1. Reduces computational overhead in index calculations. @@ -93,7 +93,7 @@ namespace optim { * Output: * local_tensor[0, 0, 0] = global_tensor[i, 0, 0]; */ -void EliminateCommonFactorOfLocalIndex(ir::Expr* expr); +void EliminateCommonFactorOfLocalIndex(ir::stmt::BlockRef func_body); } // namespace optim } // namespace cinn diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index 82eac4839c48e1..020cdc4dade8d5 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -489,8 +489,11 @@ void OptimizeExprGPU(Expr *expr) { // Replace variables that are in range [0, 1) to zero. 
ReplaceUnitVarToZero replace_unit_var_to_zero;
   replace_unit_var_to_zero(expr);
-
-  EliminateCommonFactorOfLocalIndex(expr);
+  VLOG(10) << "After ReplaceUnitVarToZero: \n" << *expr;
+  ir::stmt::BlockRef func_body = ir::ConvertExprBlockToStmtBlock(*expr);
+  EliminateCommonFactorOfLocalIndex(func_body);
+  *expr = ir::ConvertStmtBlockToExprBlock(func_body);
+  VLOG(10) << "After EliminateCommonFactorOfLocalIndex: \n" << *expr;

   ResizeBufferToMaxVarRange(expr);

From 7f2a3e45307d6c07006d5e4b63c8c105b9550ee5 Mon Sep 17 00:00:00 2001
From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com>
Date: Wed, 8 Jan 2025 16:10:22 +0800
Subject: [PATCH 23/57] align_diff (#70613)

---
 python/paddle/distributed/auto_parallel/api.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py
index 7606911531ca21..4b86405418ae2d 100644
--- a/python/paddle/distributed/auto_parallel/api.py
+++ b/python/paddle/distributed/auto_parallel/api.py
@@ -1156,6 +1156,10 @@ def _shard_accumulator(self, param):
             accumulator = self._inner_opt._accumulators[key][target_name]
             if accumulator.is_dist() and not isinstance(accumulator, pir.Value):
                 continue
+
+            if paddle.in_dynamic_mode():
+                origin_accumulator_name = accumulator.name
+
             if self._shard_fn is not None:
                 self._inner_opt._accumulators[key][target_name] = (
                     self._shard_fn(key, param, accumulator)
@@ -1179,12 +1183,10 @@ def _shard_accumulator(self, param):
                         placements=placements,
                     )
                 )
-            if not isinstance(
-                self._inner_opt._accumulators[key][target_name], pir.Value
-            ):
-                self._inner_opt._accumulators[key][target_name].name = (
-                    target_name + "_" + key
-                )
+            if paddle.in_dynamic_mode():
+                self._inner_opt._accumulators[key][
+                    target_name
+                ].name = origin_accumulator_name

     def _reset_placements(self, param):
         if param.is_dist() and isinstance(

From 6d96bed484587dc435e6975abed9e69dd4aaf23f Mon Sep 17 00:00:00 2001
From: co63oc
Date: Wed, 8 Jan 2025 16:14:05 +0800
Subject: [PATCH 24/57] [fluid_ops] clean collective operators part (#70588)

---
 .../fluid/operators/collective/alltoall_op.cc |  71 ----
 .../fluid/operators/collective/alltoall_op.h  |  42 --
 .../collective/c_allreduce_avg_op.cc          |  40 --
 .../collective/c_allreduce_max_op.cc          |  44 ---
 .../collective/c_allreduce_prod_op.cc         |  44 ---
 .../operators/collective/c_broadcast_op.cc    |  59 ---
 .../operators/collective/c_broadcast_op.h     |  77 ----
 .../operators/collective/c_reduce_avg_op.cc   |  39 --
 .../operators/collective/c_reduce_max_op.cc   |  41 --
 .../operators/collective/c_reduce_min_op.cc   |  40 --
 .../fluid/operators/collective/c_reduce_op.h  | 371 ------------------
 .../operators/collective/c_reduce_prod_op.cc  |  41 --
 .../operators/collective/c_reduce_sum_op.cc   |  41 --
 .../operators/collective/global_gather_op.h   |  43 --
 .../operators/collective/global_scatter_op.h  |  43 --
 15 files changed, 1036 deletions(-)
 delete mode 100644 paddle/fluid/operators/collective/alltoall_op.cc
 delete mode 100644 paddle/fluid/operators/collective/alltoall_op.h
 delete mode 100644 paddle/fluid/operators/collective/c_allreduce_avg_op.cc
 delete mode 100644 paddle/fluid/operators/collective/c_allreduce_max_op.cc
 delete mode 100644 paddle/fluid/operators/collective/c_allreduce_prod_op.cc
 delete mode 100644 paddle/fluid/operators/collective/c_broadcast_op.cc
 delete mode 100644 paddle/fluid/operators/collective/c_broadcast_op.h
 delete mode 100644 paddle/fluid/operators/collective/c_reduce_avg_op.cc
 delete mode 100644
paddle/fluid/operators/collective/c_reduce_max_op.cc delete mode 100644 paddle/fluid/operators/collective/c_reduce_min_op.cc delete mode 100644 paddle/fluid/operators/collective/c_reduce_op.h delete mode 100644 paddle/fluid/operators/collective/c_reduce_prod_op.cc delete mode 100644 paddle/fluid/operators/collective/c_reduce_sum_op.cc delete mode 100644 paddle/fluid/operators/collective/global_gather_op.h delete mode 100644 paddle/fluid/operators/collective/global_scatter_op.h diff --git a/paddle/fluid/operators/collective/alltoall_op.cc b/paddle/fluid/operators/collective/alltoall_op.cc deleted file mode 100644 index 1cb8a0b1352842..00000000000000 --- a/paddle/fluid/operators/collective/alltoall_op.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/alltoall_op.h" - -namespace paddle { -namespace operators { - -class AllToAllBaseOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllToAll"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AllToAll"); - int ring_id = ctx->Attrs().Get("ring_id"); - PADDLE_ENFORCE_GE( - ring_id, - 0, - common::errors::InvalidArgument( - "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); - phi::DDim dim = ctx->GetInputDim("X"); - if (dim[0] < 0) dim[0] = -1; - ctx->SetOutputDim("Out", dim); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class AllToAllBaseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) tensor send."); - AddOutput("Out", "(Tensor) the result of alltoall."); - AddAttr("ring_id", "(int default 0) nccl communication ring id.") - .SetDefault(0); - AddAttr( - "use_calc_stream", - "(bool default false) eject CUDA operations to calculation stream.") - .SetDefault(false); - AddComment(R"DOC( -AllToAll Operator -Scatter tensors from all participators to all participators. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(alltoall, - ops::AllToAllBaseOp, - ops::AllToAllBaseOpMaker) diff --git a/paddle/fluid/operators/collective/alltoall_op.h b/paddle/fluid/operators/collective/alltoall_op.h deleted file mode 100644 index 464a53668bd8a5..00000000000000 --- a/paddle/fluid/operators/collective/alltoall_op.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -#if defined(PADDLE_WITH_GLOO) -#include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#endif - -namespace paddle { -namespace operators { - -template -class AllToAllOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(common::errors::Unavailable( - "Do not support alltoall for cpu kernel now.")); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc b/paddle/fluid/operators/collective/c_allreduce_avg_op.cc deleted file mode 100644 index 13d07557f1e7c9..00000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle::framework { -class OpDesc; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CAllReduceAvgOpMaker : public CAllReduceOpMaker { - protected: - std::string GetName() const override { return "Avg"; } -}; - -DECLARE_INPLACE_OP_INFERER(AllreduceAvgInplaceInferer, {"X", "Out"}); - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_avg, - ops::CAllReduceOp, - ops::CAllReduceAvgOpMaker, - ops::AllreduceAvgInplaceInferer) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc deleted file mode 100644 index 3faf360636a769..00000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CAllReduceMaxOpMaker : public CAllReduceOpMaker { - protected: - std::string GetName() const override { return "Max"; } -}; - -DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"}); - -DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceMax, kRedMax) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, - ops::CAllReduceOp, - ops::CAllReduceMaxOpMaker, - ops::AllreduceMaxInplaceInferer) diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc deleted file mode 100644 index 4c2bf9528d854d..00000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CAllReduceProdOpMaker : public CAllReduceOpMaker { - protected: - std::string GetName() const override { return "Prod"; } -}; - -DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"}); - -DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceProd, kRedProd) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, - ops::CAllReduceOp, - ops::CAllReduceProdOpMaker, - ops::AllreduceProdInplaceInferer) diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc deleted file mode 100644 index f1672f6dd04b0d..00000000000000 --- a/paddle/fluid/operators/collective/c_broadcast_op.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_broadcast_op.h" - -namespace paddle::operators { - -class CBroadcastOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) tensor to be broadcasted."); - AddOutput("Out", "(Tensor) the result of broadcast."); - AddAttr("ring_id", "(int default 0) nccl communication ring id.") - .SetDefault(0); - AddAttr("root", "(int default 0) root id for broadcasting.") - .SetDefault(0); - - AddComment(R"DOC( -CBroadcast Operator - -Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#broadcast -)DOC"); - } -}; - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_broadcast, - ops::CBroadcastOp, - ops::CBroadcastOpMaker); diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h deleted file mode 100644 index 79fc593be5da79..00000000000000 --- a/paddle/fluid/operators/collective/c_broadcast_op.h +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/distributed/comm_context_manager.h" - -#if defined(PADDLE_WITH_GLOO) -#include - -#include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#include "paddle/phi/core/distributed/gloo_comm_context.h" -#endif - -namespace paddle { -namespace operators { - -template -class CBroadcastOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_GLOO) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - auto root = ctx.Attr("root"); - - int rid = ctx.Attr("ring_id"); - ctx.device_context().Alloc(out); - - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); - if (comm_context_manager.Has(std::to_string(rid))) { - auto* comm_context = static_cast( - comm_context_manager.Get(std::to_string(rid))); - comm_context->Broadcast(out, *in, root); - } else { - // NOTE: This will be removed after moving this operator to phi. 
- int64_t send_numel = in->numel(); - T* recv_buff = reinterpret_cast(out->data()); - auto gloo = paddle::framework::GlooWrapper::GetInstance(); - PADDLE_ENFORCE_EQ( - gloo->IsInitialized(), - true, - common::errors::PreconditionNotMet( - "You must initialize the gloo environment first to use it.")); - gloo::BroadcastOptions opts(gloo->GetContext()); - opts.setOutput(recv_buff, send_numel); - opts.setRoot(root); - gloo::broadcast(opts); - } -#else - PADDLE_THROW(common::errors::Unavailable( - "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); -#endif - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_avg_op.cc b/paddle/fluid/operators/collective/c_reduce_avg_op.cc deleted file mode 100644 index f8d827a708c004..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_avg_op.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceAvgOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Avg"; } -}; - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_avg, - ops::CReduceOp, - ops::CReduceAvgOpMaker); diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc deleted file mode 100644 index f08b6eda3e18b5..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceMaxOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Max"; } -}; - -DEFINE_C_REDUCE_CPU_KERNEL(CReduceMax, kRedMax) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_max, - ops::CReduceOp, - ops::CReduceMaxOpMaker); diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc deleted file mode 100644 index 87c1197cee6ecf..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceMinOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Min"; } -}; - -DEFINE_C_REDUCE_CPU_KERNEL(CReduceMin, kRedMin) -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_min, - ops::CReduceOp, - ops::CReduceMinOpMaker); diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h deleted file mode 100644 index 49c0f1f52b10e0..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ /dev/null @@ -1,371 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/common/ddim.h" -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/distributed/comm_context_manager.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) -#include "paddle/common/flags.h" -#include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); -#endif - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#include "paddle/phi/core/distributed/nccl_comm_context.h" -#elif defined(PADDLE_WITH_XPU_BKCL) -#include "paddle/fluid/platform/device/xpu/bkcl_helper.h" -#include "paddle/phi/core/distributed/bkcl_comm_context.h" -#endif - -#if defined(PADDLE_WITH_GLOO) -#include - -#include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#endif - -namespace paddle { -namespace operators { - -enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd, kRedAvg }; - -class CReduceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -template -class CReduceOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_GLOO) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - auto root_id = ctx.Attr("root_id"); - - auto place = ctx.GetPlace(); - int64_t send_numel = in->numel(); - const T* send_buff = in->data(); - T* recv_buff = out->mutable_data(in->dims(), place); - auto gloo = paddle::framework::GlooWrapper::GetInstance(); - PADDLE_ENFORCE_EQ( - gloo->IsInitialized(), - true, - common::errors::PreconditionNotMet( - "You must initialize the gloo environment first to use it.")); - gloo::ReduceOptions opts(gloo->GetContext()); - opts.setInput(const_cast(send_buff), send_numel); - opts.setOutput(recv_buff, send_numel); - opts.setRoot(root_id); - switch (red_type) { - case kRedSum: - opts.setReduceFunction( - static_cast( - &gloo::sum)); - break; - case kRedMax: - opts.setReduceFunction( - static_cast( - &gloo::max)); - break; - case kRedMin: - opts.setReduceFunction( - static_cast( - &gloo::min)); - break; - case kRedProd: - opts.setReduceFunction( - static_cast( - &gloo::product)); - break; - default: - PADDLE_ENFORCE_EQ(true, - false, - common::errors::InvalidArgument( - "Invalid reduce type: %d.", red_type)); - } - gloo::reduce(opts); -#else - PADDLE_THROW(common::errors::Unavailable( - "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); -#endif - } -}; - -#define DEFINE_C_REDUCE_CPU_KERNEL(op_name, red_type) \ - template \ - class op_name##CPUKernel : public CReduceOpCPUKernel {}; - -template -class CReduceOpXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_XPU_BKCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - BKCLDataType dtype = 
phi::ToBKCLDataType(in->dtype()); - int64_t numel = in->numel(); - const void* sendbuff = in->data(); - out->Resize(in->dims()); - void* recvbuff = out->mutable_data(place); - - int rid = ctx.Attr("ring_id"); - int root = ctx.Attr("root_id"); - - XPUStream stream = nullptr; - platform::BKCLComm* comm = nullptr; - phi::distributed::BKCLCommContext* comm_ctx = nullptr; - - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "BKCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { // old comm_context - comm = platform::BKCLCommContext::Instance().Get(rid, place); - stream = comm->stream(); - VLOG(3) << "old BKCLCommContext has rid " << rid; - } - if (ctx.Attr("use_calc_stream")) { - auto dev_ctx = phi::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->x_context()->xpu_stream; - } - - BKCLOp bkcl_red_type = BKCL_ADD; - switch (red_type) { - case kRedSum: - bkcl_red_type = BKCL_ADD; - break; - - case kRedMax: - bkcl_red_type = BKCL_MAX; - break; - - case kRedMin: - bkcl_red_type = BKCL_MIN; - break; - - case kRedProd: - bkcl_red_type = BKCL_PRODUCT; - break; - - default: - PADDLE_THROW(common::errors::InvalidArgument("Invalid reduce type: %d", - red_type)); - } - - if (comm_ctx) { - comm_ctx->Reduce(out, *in, bkcl_red_type, root, stream); - } else { - PADDLE_ENFORCE_XPU_SUCCESS(bkcl_reduce(comm->comm(), - sendbuff, - recvbuff, - numel, - dtype, - bkcl_red_type, - root, - stream)); - } -#else - PADDLE_THROW(common::errors::PreconditionNotMet( - "PaddlePaddle should be compiled with XPU.")); -#endif - } -}; - -#define DEFINE_C_REDUCE_XPU_KERNEL(op_name, red_type) \ - template \ - class op_name##XPUKernel : public CReduceOpXPUKernel {}; - -template -class CReduceOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - ncclDataType_t dtype = phi::ToNCCLDataType(in->dtype()); - int64_t numel = in->numel(); - const void* sendbuff = in->data(); - out->Resize(in->dims()); - void* recvbuff = out->mutable_data(place); - - int rid = ctx.Attr("ring_id"); - int root = ctx.Attr("root_id"); - - gpuStream_t stream = nullptr; - platform::NCCLComm* comm = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; - - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. 
" - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "NCCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { // old comm_context - comm = platform::NCCLCommContext::Instance().Get(rid, place); - stream = comm->stream(); - VLOG(3) << "old NCCLCommContext has rid " << rid; - } - if (ctx.Attr("use_calc_stream")) { - // should ExecutionContext for calc stream. - stream = ctx.cuda_device_context().stream(); - } - - ncclRedOp_t nccl_red_type = ncclSum; - switch (red_type) { - case kRedSum: - nccl_red_type = ncclSum; - break; - - case kRedMax: - nccl_red_type = ncclMax; - break; - - case kRedMin: - nccl_red_type = ncclMin; - break; - - case kRedProd: - nccl_red_type = ncclProd; - break; - -#if (NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000) || \ - defined(PADDLE_WITH_HIP) - case kRedAvg: - nccl_red_type = ncclAvg; - break; -#endif - - default: - PADDLE_ENFORCE_EQ( - true, - false, - common::errors::InvalidArgument("red_type must be one of kRedSum, " - "kRedMax, kRedMin, kRedProd.")); - } - - if (comm_ctx) { - comm_ctx->Reduce(out, *in, nccl_red_type, root, stream); - } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduce(sendbuff, - recvbuff, - numel, - dtype, - nccl_red_type, - root, - comm->comm(), - stream)); - } -#else - PADDLE_ENFORCE_EQ( - true, - false, - common::errors::Unavailable("PaddlePaddle should compile with GPU..")); -#endif - } -}; - -#define DEFINE_C_REDUCE_CUDA_KERNEL(op_name, red_type) \ - template \ - class op_name##CUDAKernel : public CReduceOpCUDAKernel {}; - -class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "(Tensor), tensor to be reduced."); - AddOutput("Out", "(Tensor) the reduced result."); - AddAttr("ring_id", "(int default 0) communication ring id.") - .SetDefault(0); - - AddAttr("root_id", "(int default 0) root id.").SetDefault(0); - AddAttr( - "use_calc_stream", - "(bool default false) eject CUDA operations to calculation stream.") - .SetDefault(false); - AddComment(string::Sprintf(R"DOC( -CReduce %s Operator - -Call collective Reduce with reduce type %s. If input and output are -the same variable, in-place reduce will be used. -)DOC", - GetName(), - GetName())); - } - - protected: - virtual std::string GetName() const = 0; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc deleted file mode 100644 index eb2e614405235b..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceProdOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Prod"; } -}; - -DEFINE_C_REDUCE_CPU_KERNEL(CReduceProd, kRedProd) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_prod, - ops::CReduceOp, - ops::CReduceProdOpMaker); diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc deleted file mode 100644 index 3758877d1b993b..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceSumOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Sum"; } -}; - -DEFINE_C_REDUCE_CPU_KERNEL(CReduceSum, kRedSum) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_sum, - ops::CReduceOp, - ops::CReduceSumOpMaker); diff --git a/paddle/fluid/operators/collective/global_gather_op.h b/paddle/fluid/operators/collective/global_gather_op.h deleted file mode 100644 index e6d5c717571df9..00000000000000 --- a/paddle/fluid/operators/collective/global_gather_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class GlobalGatherOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(common::errors::Unavailable( - "Do not support global gather op for cpu kernel now.")); - } -}; - -template -struct GlobalGatherFunctor { - void operator()(const framework::ExecutionContext& ctx); -}; - -template -struct GlobalGatherProcessGroupFunctor { - void operator()(const framework::ExecutionContext& ctx); -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/global_scatter_op.h b/paddle/fluid/operators/collective/global_scatter_op.h deleted file mode 100644 index 70e5d7c2e5d536..00000000000000 --- a/paddle/fluid/operators/collective/global_scatter_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class GlobalScatterOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(common::errors::Unavailable( - "Do not support global scatter op for cpu kernel now.")); - } -}; - -template -struct GlobalScatterFunctor { - void operator()(const framework::ExecutionContext& ctx); -}; - -template -struct GlobalScatterProcessGroupFunctor { - void operator()(const framework::ExecutionContext& ctx); -}; - -} // namespace operators -} // namespace paddle From 31e8c012852282ab442bcd2aa59194ec46f5debc Mon Sep 17 00:00:00 2001 From: Shuhao Liang <50269654+lshpku@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:31:23 +0800 Subject: [PATCH 25/57] [CINN] Implement the new AlignIterSpaceTactic (#70649) --- .../dy_shape_group_scheduler.cc | 2 + .../tactic/align_iter_space_tactic.cc | 227 +++++++++++++----- .../tactic/align_iter_space_tactic.h | 4 +- .../tactic/tile_first_general_tactic.cc | 44 ---- 4 files changed, 175 insertions(+), 102 deletions(-) diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc index e533e35c67663b..758464d5d21857 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc @@ -16,6 +16,7 @@ #include "paddle/cinn/common/cas.h" #include "paddle/cinn/hlir/framework/pir/trivial_op_impl.h" #include "paddle/cinn/ir/group_schedule/config/schedule_config_manager.h" +#include "paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h" #include "paddle/cinn/ir/group_schedule/tactic/compute_at_reduction_tactic.h" #include 
"paddle/cinn/ir/group_schedule/tactic/compute_inline_tactic.h" #include "paddle/cinn/ir/group_schedule/tactic/tile_broadcast_tactic.h" @@ -33,6 +34,7 @@ void DynamicShapeGroupScheduler::Init() { VLOG(4) << "original group func body: \n" << ir_sch_->GetModule().GetExprs()[0]; InitBuckets(); + tactics_.emplace_back(CreateAlignIterSpaceTactic()); tactics_.emplace_back(CreateTileBroadcastTactic()); tactics_.emplace_back(CreateTileFirstGeneralTactic()); tactics_.emplace_back(CreateComputeInlineTactic()); diff --git a/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.cc index dcc72e4a217d82..3476755d2460be 100644 --- a/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// Copyright (c) 2025 CINN Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,89 +13,206 @@ // limitations under the License. #include "paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h" -#include "paddle/cinn/common/cas.h" -#include "paddle/cinn/common/integer_set.h" -#include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_analyzer/ir_analyzer.h" -#include "paddle/cinn/ir/op/ir_operators.h" -#include "paddle/cinn/ir/utils/ir_copy.h" namespace cinn { namespace ir { +namespace { +/** + * Reorder the loops according to the memory-consistent order of input or output + * to make memory access as coalesced as possible. + * + * This tactic uses different alignment policies for Reduce and Trivial: + * 1) Reduce: align with the input, because after reduction, the output data is + * significantly smaller than the input data, so it's more critical to make + * input coalesced. + * 2) Trivial: align with the output, because discrete writes incur higher costs + * than discrete reads for the same volume of data due to the hardware design + * of cache. Therefore, we should ensure coalesced writes in priority. + * + * Note: we reorder spatial and reduce loops seperately, because we need to + * maintain the relative order between spatial and reduce loops, so as for later + * tactics to work properly. Thus, we use two lists sp_loop_perm & rd_loop_perm + * to record the permutation of spatial and reduce loops respectively. + * + * + * Examples: + * 1. Reduce + * Input: + * for (i, 0, 8): # S + * for (j, 0, 32): # S + * for (k, 0, 128): # R + * for (a, 0, 256): # R + * var_1[i, j] += var_0[j, a, k, i] + * Analysis: + * We align Reduce to the input `var_0[j, a, k, i]`. In the indices of var_0, + * the mapping from each index to the loop index is: + * indices[0] = j => loops[1] # S + * indices[1] = a => loops[3] # R + * indices[2] = k => loops[2] # R + * indices[3] = i => loops[0] # S + * To make the indices of var_0 consistent with its original memory layout, we + * need to permute the loops in the order {1, 3, 2, 0}. However, as we reorder + * spatial and reduce loop seperately, we split the permutation into sp & rd, + * getting sp_loop_perm = {1, 0} and rd_loop_perm = {3, 2}. + * Output: + * for (j, 0, 32): # S + * for (i, 0, 8): # S + * for (a, 0, 256): # R + * for (k, 0, 128): # R + * var_1[i, j] += var_0[j, a, k, i] + * + * 2. 
Trivial + * Input: + * for (i, 0, 32): + * for (j, 0, 128): + * for (k, 0, 256): + * var_1[k, i, j] = exp(var_0[j, i, k]) + * Analysis: + * We align Trivial to the output `var_1[k, i, j]`. In the indices of var_1, + * the mapping from each index to the loop index is: + * indices[0] = k => loops[2] + * indices[1] = i => loops[0] + * indices[2] = j => loops[1] + * Like example 1, we should permute the loops in the order {2, 0, 1}. As this + * graph doesn't contain reduce loops, all we get is sp_loop_perm = {2, 0, 1}, + * and rd_loop_perm = {}. + * Output: + * for (k, 0, 256): + * for (i, 0, 32): + * for (j, 0, 128): + * var_1[k, i, j] = exp(var_0[j, i, k]) + */ class AlignIterSpaceTactic final : public ScheduleTactic { public: - void Init(ScheduleContext* context) override; + void Init(ScheduleContext* context, ir::IRSchedule* sch) override; void Apply(ir::IRSchedule* sch, const std::string& block_id) override; std::string TacticName() const override { return "AlignIterSpaceTactic"; } + private: + /** + * Get the common memory-consistent order of loops according to the outputs. + * Returns null if not all outputs share the same order. + */ + std::vector GetCommonOutputLoopPerm(ir::IRSchedule* sch); + private: ScheduleContext* context_; + + // The permutation of spatial and reduce loops, in other to achieve the + // memory-consistent alignment. + std::vector sp_loop_perm_; + std::vector rd_loop_perm_; }; -void AlignIterSpaceTactic::Init(ScheduleContext* context) { +void AlignIterSpaceTactic::Init(ScheduleContext* context, ir::IRSchedule* sch) { context_ = context; -} + sp_loop_perm_.clear(); + rd_loop_perm_.clear(); -void AlignIterSpaceTactic::Apply(ir::IRSchedule* sch, - const std::string& block_id) { - ir::Expr block = sch->GetBlock(block_id); + auto& loop_strides = context_->config.base_info->loop_strides; + auto& reduce_axis = context_->config.base_info->reduce_axis; + std::set reduce_axis_set(reduce_axis.begin(), reduce_axis.end()); - std::vector loops = sch->GetLoops(block_id); - ir::Expr src_total_extent{1}; - for (const auto& loop : loops) { - src_total_extent = src_total_extent * loop.As()->extent; - } - ir::Expr target_sp_extent{1}; - for (const auto& iter : context_->iter_space_info.sp_space) { - target_sp_extent = target_sp_extent * std::get<0>(iter); + if (!loop_strides.empty()) { + // If this is a Reduce, calculate the loop_perm by sorting the loops in the + // descending order of their strides according to the input, then split the + // loop_perm into sp_loop_perm & rd_loop_perm. + std::vector loop_perm(loop_strides.size()); + std::iota(loop_perm.begin(), loop_perm.end(), 0); + std::stable_sort(loop_perm.begin(), loop_perm.end(), [&](int a, int b) { + return loop_strides[a] > loop_strides[b]; + }); + + for (int axis : loop_perm) { + if (reduce_axis_set.count(axis) > 0) { + rd_loop_perm_.push_back(axis); + } else if (loop_strides[axis] != 0) { + sp_loop_perm_.push_back(axis); + } + } + } else { + // If this is a Trvial, calculate the sp_loop_perm according to the output. 
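
(Editor's aside, not part of the patch: the Reduce branch above boils down to a stable sort of loop indices by descending stride, followed by a split into spatial and reduce lists. Below is a minimal self-contained C++ sketch of just that step; the function name and the plain-vector inputs are illustrative assumptions standing in for the corresponding fields of the schedule config.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <set>
#include <utility>
#include <vector>

// Sort loop indices by descending stride (stable, so loops with equal
// strides keep their original relative order), then split the permutation
// into spatial and reduce parts. Zero-stride spatial loops are dropped,
// mirroring the tactic above.
std::pair<std::vector<int>, std::vector<int>> SplitLoopPerm(
    const std::vector<int64_t>& loop_strides,
    const std::set<int>& reduce_axis) {
  std::vector<int> perm(loop_strides.size());
  std::iota(perm.begin(), perm.end(), 0);
  std::stable_sort(perm.begin(), perm.end(), [&](int a, int b) {
    return loop_strides[a] > loop_strides[b];
  });
  std::vector<int> sp_perm, rd_perm;
  for (int axis : perm) {
    if (reduce_axis.count(axis) > 0) {
      rd_perm.push_back(axis);
    } else if (loop_strides[axis] != 0) {
      sp_perm.push_back(axis);
    }
  }
  return {sp_perm, rd_perm};
}

For Example 1 in the doc comment, var_0 has shape [32, 256, 128, 8] over indices [j, a, k, i], so the strides of loops {i, j, k, a} are {1, 262144, 8, 1024} and reduce_axis is {2, 3}; SplitLoopPerm then returns sp = {1, 0} and rd = {3, 2}, matching the derivation above.)
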
+ sp_loop_perm_ = GetCommonOutputLoopPerm(sch); } - ir::Expr target_total_extent = ir_utils::IRCopy(target_sp_extent); - for (const auto& iter : context_->iter_space_info.rb_space) { - target_total_extent = target_total_extent * std::get<0>(iter); + + VLOG(4) << "AlignIterSpaceTactic:\n" + << "sp_loop_perm: " << utils::Join(sp_loop_perm_, ", ") << "\n" + << "rd_loop_perm: " << utils::Join(rd_loop_perm_, ", "); +} + +std::unordered_map GetLoopVarToIndex( + const std::vector& loops) { + std::unordered_map loop_var2index; + for (int i = 0; i < loops.size(); ++i) { + auto* node = loops[i].As(); + loop_var2index[node->loop_var] = i; } + return loop_var2index; +} - common::cas_intervals_t var_intervals; - common::SymbolicExprAnalyzer symbolic_expr_analyzer(var_intervals); - std::optional total_extent_eq = - symbolic_expr_analyzer.ProveEQ(src_total_extent, target_total_extent); - bool need_reorder = false; - for (int i = 0; i < context_->iter_space_info.rb_last_order.size(); ++i) { - if (context_->iter_space_info.rb_last_order[i] != i) { - need_reorder = true; - break; - } +/** + * Check whether this is an effective permutation. + * A permutation is ineffective if it's entirely in ascending order. + */ +bool IsPermutationEffective(const std::vector& perm) { + for (int i = 1; i < perm.size(); ++i) { + if (perm[i - 1] > perm[i]) return true; } + return false; +} - if (total_extent_eq.has_value() && total_extent_eq.value()) { - if (need_reorder) { - sch->Reorder(block_id, context_->iter_space_info.rb_last_order); - } - if (context_->iter_space_info.sp_space.size() < loops.size() - 1) { - loops = sch->GetLoops(block_id); - - // Align the loop in the current block that needs to be aligned with the - // reduce loop in iter_space_info - std::vector rb_loops( - loops.end() - context_->iter_space_info.rb_space.size(), loops.end()); - sch->Fuse(rb_loops); +std::vector AlignIterSpaceTactic::GetCommonOutputLoopPerm( + ir::IRSchedule* sch) { + std::vector common_loop_perm; + + for (auto& block : sch->GetAllBlocks()) { + std::string block_id = ir::analyzer::GetBlockName(block); + if (context_->output_names.count(block_id) == 0) continue; + + auto store = ir::analyzer::GetStoreOfSBlock(block); + auto& indices = store.As()->indices; + std::unordered_map iter_var2iter_value = + ir::analyzer::GetIterVarToValueOfSBlock(block); + std::unordered_map loop_var2index = + GetLoopVarToIndex(sch->GetLoops(block)); + + std::vector loop_perm; + for (auto& index : indices) { + if (index.is_constant()) continue; + if (!index.is_var()) return {}; + ir::Expr iter_value = iter_var2iter_value[index.as_var_ref()]; + if (!iter_value.is_var()) return {}; + ir::Expr loop_var = iter_value.as_var_ref(); + loop_perm.push_back(loop_var2index[loop_var]); } - if (context_->iter_space_info.sp_space.size() > 1) { - // Align the loop in the current block that needs to be aligned with the - // spatial loop in iter_space_info - loops = sch->GetLoops(block_id); - std::vector sp_loops( - loops.begin(), - loops.end() - context_->iter_space_info.rb_space.size()); - sch->Fuse(sp_loops); + + if (common_loop_perm.empty()) { + common_loop_perm = std::move(loop_perm); + } else if (common_loop_perm != loop_perm) { + return {}; } - } else { - sch->Fuse(loops); } + + return common_loop_perm; } +void AlignIterSpaceTactic::Apply(ir::IRSchedule* sch, + const std::string& block_id) { + if (ir::IsReduceInitTensorName(block_id)) return; + if (IsPermutationEffective(sp_loop_perm_)) { + sch->Reorder(block_id, sp_loop_perm_); + } + if 
(IsPermutationEffective(rd_loop_perm_)) { + sch->Reorder(block_id, rd_loop_perm_); + } +} + +} // namespace + std::unique_ptr CreateAlignIterSpaceTactic() { return std::make_unique(); } diff --git a/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h b/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h index 2ac65d114c7f51..12891818120712 100644 --- a/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h +++ b/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h @@ -1,4 +1,4 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// Copyright (c) 2025 CINN Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,10 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #pragma once -#include #include "paddle/cinn/ir/group_schedule/tactic/schedule_tactic.h" namespace cinn { diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc index e71e0052a3803c..1022c97420e7cc 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc @@ -48,7 +48,6 @@ class TileFirstGeneralTactic final : public ScheduleTactic { std::string TacticName() const override { return "TileFirstGeneralTactic"; } private: - void AlignToReduceInput(ir::IRSchedule* sch, const std::string& block_id); void MergeFlattenAxis(ir::IRSchedule* sch, const std::string& block_id); void MergeDiscreteFlattenAxis(ir::IRSchedule* sch, const std::string& block_id); @@ -128,11 +127,6 @@ void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch, if (!can_apply_) return; if (ir::IsReduceInitTensorName(block_id)) return; - AlignToReduceInput(sch, block_id); - VLOG(6) << "After AlignToReduceInput on block: [" << block_id - << "], loop nest:\n" - << sch->GetLoops(block_id)[0]; - if (UseContinuousDataTile(context_->config)) { VLOG(4) << "Using ApplyContinuousDataTile"; ApplyContinuousDataTile(sch, block_id); @@ -293,44 +287,6 @@ void TileFirstGeneralTactic::ApplyContinuousDataTile( SetReduceType(sch, block_id); } -void TileFirstGeneralTactic::AlignToReduceInput(ir::IRSchedule* sch, - const std::string& block_id) { - const auto& loop_strides = context_->config.base_info->loop_strides; - if (loop_strides.empty()) { - return; - } - - std::vector loops = sch->GetLoops(block_id); - std::vector loop_perm(loops.size()); - std::iota(loop_perm.begin(), loop_perm.end(), 0); - - const auto IsReduce = [&](int64_t axis) { - auto& reduce_axis = context_->config.base_info->reduce_axis; - return std::find(reduce_axis.begin(), reduce_axis.end(), axis) != - reduce_axis.end(); - }; - - std::sort(loop_perm.begin(), loop_perm.end(), [&](int64_t a, int64_t b) { - if (IsReduce(a) == IsReduce(b)) { - return loop_strides[a] > loop_strides[b]; - } - return IsReduce(b); - }); - VLOG(4) << "loop_perm: " << utils::Join(loop_perm, ", "); - - // Reorder S/R loops seperately, otherwise reduce_init will be de-inlined. 
- std::vector sp_loops, rd_loops; - for (auto i : loop_perm) { - if (IsReduce(i)) { - rd_loops.push_back(loops[i]); - } else if (loop_strides[i] != 0) { - sp_loops.push_back(loops[i]); - } - } - sch->Reorder(sp_loops); - sch->Reorder(rd_loops); -} - void TileFirstGeneralTactic::MergeFlattenAxis(ir::IRSchedule* sch, const std::string& block_id) { if (vec_flatten_axis_.size() >= 2) { From 18af5d8fcc88821c7ba1f026e8b165000c58c631 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:31:54 +0800 Subject: [PATCH 26/57] =?UTF-8?q?=E3=80=90Bug=20Fix=E3=80=91Fix=20ReduceSu?= =?UTF-8?q?m=20inferMeta=20bug=20(#70660)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix infer bug * fix bug * fix undefined bug --- paddle/cinn/hlir/dialect/operator/ir/ops.yaml | 2 +- .../hlir/dialect/operator/transforms/pd_to_cinn_pass.cc | 9 +++++++-- paddle/phi/infermeta/unary.cc | 3 ++- paddle/phi/infermeta/unary.h | 1 + 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml index 9fe7530e94bd26..4bab4807511538 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml +++ b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml @@ -73,7 +73,7 @@ output : Tensor(out) infer_meta : func : ReduceSumInferMeta - param : [x, axis, keepdim] + param : [x, axis, keepdim, dtype] kernel : func : frobenius_norm param : [x, axis, keepdim] diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 7eb0992d69c454..588febb460498c 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -117,11 +117,16 @@ class SumOpPattern : public pir::OpRewritePattern { auto in = op->operand_source(0); auto in_data_type = in.type().dyn_cast().dtype(); - if (in_data_type.isa() || - in_data_type.isa()) { + + if (dtype != phi::DataType::UNDEFINED && + dtype != paddle::dialect::TransToPhiDataType(in_data_type)) { + in = rewriter.Build(in, dtype).result(0); + } else if (in_data_type.isa() || + in_data_type.isa()) { in = rewriter.Build(in, phi::DataType::INT64) .result(0); } + auto cinn_reduce = rewriter.Build(in, axis, keepdim, dtype); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 11d9ab80a48ef3..c744b699950b64 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3707,12 +3707,13 @@ void ReduceInferMetaBase(const MetaTensor& x, void ReduceSumInferMeta(const MetaTensor& x, const std::vector& axis, bool keep_dim, + DataType dtype, MetaTensor* out) { bool reduce_all = false; if (axis.empty()) { reduce_all = true; } - SumRawInferMeta(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); + SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); } void ReduceInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 7ce6a526829f8a..9a07cee72e0412 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -589,6 +589,7 @@ void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); void ReduceSumInferMeta(const MetaTensor& x, const std::vector& axis, bool keep_dim, + DataType dtype, MetaTensor* out); void ReduceInferMeta(const MetaTensor& x, From 2f34ecf1c2f42fd232a8a6dcd901f6740a989054 Mon Sep 17 00:00:00 2001 From: nizne 
<97940276+nizne9@users.noreply.github.com> Date: Wed, 8 Jan 2025 17:17:17 +0800 Subject: [PATCH 27/57] [BUPT][Paddle Tensor Phase II API Robustness Enhancement] `paddle.linalg.vector_norm` API robustness enhancement (#70499) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix bug of paddle.linalg.vector_norm and add test * Add test case based on feedback from the review --- paddle/phi/infermeta/unary.cc | 13 +---- python/paddle/tensor/linalg.py | 17 ++++-- test/legacy_test/test_norm_all.py | 94 ++++++++++++++++++++++++++++++- 3 files changed, 107 insertions(+), 17 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index c744b699950b64..e6ea59be2365ca 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3320,17 +3320,8 @@ void PNormInferMeta(const MetaTensor& x, auto x_dim = x.dims(); auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(axis, - -x_rank, - errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], R is " - "the rank of Input(X). But received axis: %d, R: %d. " - "Current Input(X)'s shape is=[%s].", - axis, - x_rank, - x_dim)); - PADDLE_ENFORCE_LT(axis, - x_rank, + PADDLE_ENFORCE_EQ((axis >= -x_rank && axis < x_rank) || x_rank == 0, + true, errors::InvalidArgument( "Attr(axis) value should be in range [-R, R-1], R is " "the rank of Input(X). But received axis: %d, R: %d. " diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 93d7d279bf5e76..2c6508200ed1ae 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -672,10 +672,15 @@ def vector_norm_axis_int( if isinstance(axis, list) and len(axis) == 1: axis = axis[0] + if paddle.is_complex(x): + abs_x = paddle.abs(x) + else: + abs_x = x + # when len(axis) == 1, use the original op to calculate if isinstance(axis, int): return vector_norm_axis_int( - x, + abs_x, axis=axis, porder=p, keepdim=keepdim, @@ -686,12 +691,16 @@ def vector_norm_axis_int( # when len(axis) >= 1, calculate by combining other Python apis elif isinstance(axis, list): if p == np.inf or p == -np.inf: - return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name) + return inf_norm( + abs_x, porder=p, axis=axis, keepdim=keepdim, name=name + ) elif p == 0: - return zero_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name) + return zero_norm( + abs_x, porder=p, axis=axis, keepdim=keepdim, name=name + ) else: return vector_norm_axis_tuple( - x, porder=p, axis=axis, keepdim=keepdim, name=name + abs_x, porder=p, axis=axis, keepdim=keepdim, name=name ) diff --git a/test/legacy_test/test_norm_all.py b/test/legacy_test/test_norm_all.py index 70dd38a79e8c8f..cdcada06e06d4a 100644 --- a/test/legacy_test/test_norm_all.py +++ b/test/legacy_test/test_norm_all.py @@ -603,7 +603,7 @@ def check_linalg_vector_static( ) place = base.CPUPlace() exe = base.Executor(place) - np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype) + np_input = np.array(np.random.rand(*shape_x) + 1.0).astype(dtype) expected_result = np_linalg_vector_norm( np_input, porder=p, axis=axis, keepdims=keep_dim ).astype(dtype) @@ -616,7 +616,7 @@ def check_linalg_vector_static( def check_linalg_vector_dygraph( self, p, axis, shape_x, dtype, keep_dim, check_dim=False ): - x_numpy = (np.random.random(shape_x) + 1.0).astype(dtype) + x_numpy = np.array(np.random.random(shape_x) 
+ 1.0).astype(dtype) expected_result = np_linalg_vector_norm( x_numpy, porder=p, axis=axis, keepdims=keep_dim ) @@ -909,6 +909,51 @@ def test_basic(self): keep_dim=keep, check_dim=True, ) + check_linalg_vector_static( + self, + p=2, + axis=None, + shape_x=[], + dtype="float64", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_static( + self, + p=np.inf, + axis=None, + shape_x=[], + dtype="complex64", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_static( + self, + p=-np.inf, + axis=[0, 1, 2, 3], + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_static( + self, + p=np.inf, + axis=2, + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_static( + self, + p=0, + axis=[1, 3], + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) check_linalg_matrix_static( self, p=-np.inf, @@ -1237,6 +1282,51 @@ def test_dygraph(self): keep_dim=keep, check_dim=True, ) + check_linalg_vector_dygraph( + self, + p=2, + axis=None, + shape_x=(), + dtype="float64", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_dygraph( + self, + p=np.inf, + axis=None, + shape_x=[], + dtype="complex64", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_dygraph( + self, + p=-np.inf, + axis=[0, 1, 2, 3], + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_dygraph( + self, + p=np.inf, + axis=2, + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_dygraph( + self, + p=0, + axis=[1, 3], + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) check_linalg_matrix_dygraph( self, p=-np.inf, From 6705ab15edcdede1a4bd53a57af23b64b49e8604 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Wed, 8 Jan 2025 17:17:39 +0800 Subject: [PATCH 28/57] [CodeStyle][Typos][Q-[1-2],R-[1-12]] Fix typos (`qucik`,`quitted`,`runned`,`readed`,`recived`,`recevied`,`recieved`,`reveived`,`recieves`,`recive`,`receving`,`recommand`,`recomplie`,`recored`,`Recusive`,`recusive`,`Recursivly`,`recursivly`,`reduntant`) (#70674) * fix * fix --- _typos.toml | 19 ------------------- .../transforms/check_infer_symbolic_util.cc | 6 +++--- ...e_shape_ops_into_generate_shape_op_pass.cc | 2 +- .../operator/transforms/pd_to_cinn_pass.cc | 2 +- paddle/cinn/hlir/pe/nn.cc | 10 +++++----- paddle/cinn/runtime/cuda/cuda_util.cc | 14 +++++++------- paddle/cinn/runtime/sycl/sycl_util.cc | 10 +++++----- paddle/common/flags.h | 2 +- paddle/common/flags_native.cc | 2 +- .../distributed/common/chunk_allocator.h | 4 ++-- .../distributed/ps/service/brpc_ps_client.cc | 2 +- .../distributed/ps/service/brpc_ps_client.h | 2 +- .../distributed/ps/table/ctr_accessor.cc | 2 +- .../ps/table/ctr_double_accessor.cc | 2 +- .../distributed/ps/table/ctr_dymf_accessor.cc | 2 +- .../distributed/ps/table/sparse_accessor.cc | 2 +- paddle/fluid/framework/channel.h | 2 +- paddle/fluid/framework/dist_multi_trainer.cc | 4 ++-- .../framework/fleet/heter_ps/heter_comm.h | 4 ++-- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 10 +++++----- .../framework/ir/auto_mixed_precision_pass.cc | 2 +- .../ir/xpu/decoder_attention_xpu_fuse_pass.cc | 2 +- .../ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc | 2 +- .../multihead_matmul_roformer_plugin.cu | 2 +- .../plugin/preln_residual_bias_plugin.cu | 2 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 2 +- .../operators/elementwise/elementwise_op.h | 4 
++-- .../generator/get_expected_kernel_func.cc | 2 +- paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/kernels/funcs/blas/blas.cc | 2 +- paddle/phi/kernels/gpu/flash_attn_utils.h | 6 +++--- paddle/scripts/paddle_build.sh | 2 +- python/paddle/decomposition/recompute.py | 8 ++++---- .../auto_parallel/static/helper.py | 4 ++-- .../reshard_funcs/sub_to_global_mesh_func.py | 2 +- .../static/tuner/rule_based_tuner.py | 2 +- .../fleet/utils/tensor_parallel_utils.py | 2 +- python/paddle/tensor/creation.py | 2 +- test/ir/pir/cinn/utils.py | 2 +- 39 files changed, 68 insertions(+), 87 deletions(-) diff --git a/_typos.toml b/_typos.toml index a29bf57b1677b1..81230a2f09629c 100644 --- a/_typos.toml +++ b/_typos.toml @@ -156,25 +156,6 @@ protocal = 'protocal' PROTOCAL = 'PROTOCAL' pyrhon = 'pyrhon' pthon = 'pthon' -qucik = 'qucik' -quitted = 'quitted' -runned = 'runned' -readed = 'readed' -recived = 'recived' -recevied = 'recevied' -reveived = 'reveived' -recieved = 'recieved' -recieves = 'recieves' -recive = 'recive' -receving = 'receving' -recommand = 'recommand' -recomplie = 'recomplie' -recored = 'recored' -Recusive = 'Recusive' -recusive = 'recusive' -recursivly = 'recursivly' -Recursivly = 'Recursivly' -reduntant = 'reduntant' Refered = 'Refered' refered = 'refered' registed = 'registed' diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc index bed943587c1637..1471e041a58493 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc @@ -404,13 +404,13 @@ struct ShapeSignatureGenerator { const DoEachT& DoEach) { if (set_size <= 0) return DoEach(is_subset_flags); - const auto& RecusiveVisit = [&](bool is_subset) { + const auto& RecursiveVisit = [&](bool is_subset) { std::vector current_is_subset_flags(is_subset_flags); current_is_subset_flags.push_back(static_cast(is_subset)); VisitEachSubSet(set_size - 1, current_is_subset_flags, DoEach); }; - RecusiveVisit(true); - RecusiveVisit(false); + RecursiveVisit(true); + RecursiveVisit(false); } std::optional GetConstrainedSymbolNamesList( diff --git a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc index 4917a0dd2aa9d5..345d88301da639 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc @@ -76,7 +76,7 @@ std::vector FindSourceDenseTensorOfDimTensor( [](const symbol::NullShapeOrDataDimExpr& null_shape_or_data) { return false; }}; - // For TensorListShapeOrDataDimExprs case, we should recursivly visit its + // For TensorListShapeOrDataDimExprs case, we should recursively visit its // each dim_expr, which is automatically in next step. 
const auto& NeedTrackUpstream = [&](pir::Value value) -> bool { const auto& sym_shape = ShapeOrDataDimExprs4Value(value); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 588febb460498c..008ef30762ece8 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -1028,7 +1028,7 @@ class SqueezeOpPattern in_shape[i], 1, ::common::errors::PreconditionNotMet( - "sequeze dim MUST be 1, but recive axis [%d] is [%d]", + "squeeze dim MUST be 1, but receive axis [%d] is [%d]", i, in_shape[i])); } diff --git a/paddle/cinn/hlir/pe/nn.cc b/paddle/cinn/hlir/pe/nn.cc index 4954cda7976e0f..61ec4978509dd9 100644 --- a/paddle/cinn/hlir/pe/nn.cc +++ b/paddle/cinn/hlir/pe/nn.cc @@ -104,7 +104,7 @@ Tensor PRelu(const Tensor &A, PADDLE_ENFORCE_EQ(A->shape[axis], slope->shape[0], ::common::errors::InvalidArgument( - "Wrong slope shape: excepted %d but recieved %d.", + "Wrong slope shape: excepted %d but received %d.", A->shape[axis], slope->shape[0])); return Compute( @@ -163,7 +163,7 @@ std::vector Conv2d_winograd_NCHW(const ir::Tensor &input, true, ::common::errors::InvalidArgument( "Filter's output channel size must be divisible by group, but " - "recieved %d as output channel size and %d as group.", + "received %d as output channel size and %d as group.", weights->shape[0] * weights->shape[1], input->shape[1])); @@ -447,7 +447,7 @@ std::vector Conv2d_NCHW(const ir::Tensor &input, true, ::common::errors::InvalidArgument( "Filter's output channel size must be divisible by group, but " - "recieved %d as output channel size and %d as group.", + "received %d as output channel size and %d as group.", weights->shape[0] * weights->shape[1], input->shape[1])); auto res = Compute( @@ -838,7 +838,7 @@ std::vector Conv2d_NHWC(const ir::Tensor &input, true, ::common::errors::InvalidArgument( "Filter's output channel size must be divisible by group, but " - "recieved %d as output channel size and %d as group.", + "received %d as output channel size and %d as group.", weights->shape[0] * weights->shape[1], input->shape[3])); auto res = Compute( @@ -1683,7 +1683,7 @@ std::vector Pool2d(const Tensor &tensor, (tensor->shape.size() == 4U || tensor->shape.size() == 5U), true, ::common::errors::InvalidArgument( - "Pool2d requires tensor's shape_size to be 4 or 5, but recieved %d.", + "Pool2d requires tensor's shape_size to be 4 or 5, but received %d.", tensor->shape.size())); std::vector axis = {height_axis, width_axis}; return PoolImpl(tensor, diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index af0017222231bc..6349a342d93f3f 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -771,7 +771,7 @@ void cinn_call_cudnn_conv2d_forward(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -925,7 +925,7 @@ void cinn_call_cudnn_conv2d_backward_data(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); 
cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1082,7 +1082,7 @@ void cinn_call_cudnn_conv2d_backward_filter(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1236,7 +1236,7 @@ void cinn_call_cudnn_pool2d_forward(void *v_args, num_args, 2, ::common::errors::InvalidArgument( - "Expected number of argruments is 2, but recived %d.", num_args)); + "Expected number of argruments is 2, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1334,7 +1334,7 @@ void cinn_call_cudnn_pool2d_backward(void *v_args, num_args, 4, ::common::errors::InvalidArgument( - "Expected number of argruments is 4, but recived %d.", num_args)); + "Expected number of argruments is 4, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1448,7 +1448,7 @@ void cinn_call_cudnn_softmax_forward(void *v_args, num_args, 2, ::common::errors::InvalidArgument( - "Expected number of argruments is 2, but recived %d.", num_args)); + "Expected number of argruments is 2, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1522,7 +1522,7 @@ void cinn_call_cudnn_softmax_backward(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); diff --git a/paddle/cinn/runtime/sycl/sycl_util.cc b/paddle/cinn/runtime/sycl/sycl_util.cc index 5c14c9ddfdeb6e..7eb7f69bf0474a 100644 --- a/paddle/cinn/runtime/sycl/sycl_util.cc +++ b/paddle/cinn/runtime/sycl/sycl_util.cc @@ -657,7 +657,7 @@ void cinn_call_cnnl_conv2d_forward(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); @@ -790,7 +790,7 @@ void cinn_call_cnnl_conv2d_backward_data(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); @@ -918,7 +918,7 @@ void cinn_call_cnnl_conv2d_backward_filter(void *v_args, num_args, 
3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); @@ -1042,7 +1042,7 @@ void cinn_call_cnnl_pool2d_forward(void *v_args, num_args, 2, ::common::errors::InvalidArgument( - "Expected number of argruments is 2, but recived %d.", num_args)); + "Expected number of argruments is 2, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); @@ -1178,7 +1178,7 @@ void cinn_call_cnnl_pool2d_backward(void *v_args, num_args, 4, ::common::errors::InvalidArgument( - "Expected number of argruments is 4, but recived %d.", num_args)); + "Expected number of argruments is 4, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); diff --git a/paddle/common/flags.h b/paddle/common/flags.h index 006f2fea5355da..3ea201fa97899c 100644 --- a/paddle/common/flags.h +++ b/paddle/common/flags.h @@ -110,7 +110,7 @@ namespace flags { /** * @brief Parse commandline flags. * - * It recieves commandline arguments passed in argc and argv from main function, + * It receives commandline arguments passed in argc and argv from main function, * argv[0] is the program name, and argv[1:] are the commandline arguments * which matching the format "--name=value" or "--name value". After parsing, * the corresponding flag value will be reset. diff --git a/paddle/common/flags_native.cc b/paddle/common/flags_native.cc index 12af71499dec2b..220401e14efec4 100644 --- a/paddle/common/flags_native.cc +++ b/paddle/common/flags_native.cc @@ -368,7 +368,7 @@ bool GetValueFromEnv(const std::string& name, std::string* value) { /** * @brief Set flags from environment variables. * - * It recieves a list of flags name, and will find the corresponding environment + * It receives a list of flags name, and will find the corresponding environment * variables named "FLAGS_name", if found, it will set the environment variable * values to the flags. If error_fatal is true, the program will exit when the * environment variable is not set or the flag is not defined, that is the same diff --git a/paddle/fluid/distributed/common/chunk_allocator.h b/paddle/fluid/distributed/common/chunk_allocator.h index aa708ffccf9c40..21c2ddddf31bc2 100644 --- a/paddle/fluid/distributed/common/chunk_allocator.h +++ b/paddle/fluid/distributed/common/chunk_allocator.h @@ -29,8 +29,8 @@ class ChunkAllocator { std::max(sizeof(void*), sizeof(T)), common::errors::InvalidArgument( "The size of Node is invalid. 
Expected sizeof(Node) == " - "max(sizeof(void*), sizeif(T)).\nBut recieved sizeof(Node) = %u " - "and max(sizeof(void*), sizeif(T)) = %u.", + "max(sizeof(void*), sizeof(T)).\nBut received sizeof(Node) = %u " + "and max(sizeof(void*), sizeof(T)) = %u.", sizeof(Node), std::max(sizeof(void*), sizeof(T)))); _chunk_size = chunk_size; diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index cb38f07dc68ea7..11998020042a9e 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -1920,7 +1920,7 @@ std::future BrpcPsClient::PushDense(const Region *regions, "Invalid dense size." "Expect the sum of current position and data number " "to be equal to or smaller than the size." - "But recieved current position = %lu, data number = " + "But received current position = %lu, data number = " "%lu, size = %lu.", pos, data_num, diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h index 3ce8ffbadfe604..dd3f3293f506ed 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -73,7 +73,7 @@ class DownpourPsClientService : public PsService { client_id, (_client->_client_id), common::errors::PreconditionNotMet( - "Wrong request client's id. Expect to match self. But recieved " + "Wrong request client's id. Expect to match self. But received " "request client's id = %lu and self = %lu.", client_id, (_client->_client_id))); diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index f3cd0c79f62fb5..ee9926f21c1e8a 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -341,7 +341,7 @@ int CtrCommonAccessor::ParseFromString(const std::string& str, float* value) { ret, 6UL, common::errors::InvalidArgument( - "Invalid return value. Expect more than 6. But recieved %d.", ret)); + "Invalid return value. Expect more than 6. But received %d.", ret)); return ret; } diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index 99e3fd4579feb4..34d563bfc8723d 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -347,7 +347,7 @@ int CtrDoubleAccessor::ParseFromString(const std::string& str, float* value) { str_len, 6UL, common::errors::InvalidArgument( - "Invalid string length. Expect more than 6. But recieved %d.", + "Invalid string length. Expect more than 6. But received %d.", str_len)); int show_index = CtrDoubleFeatureValue::ShowIndex(); int click_index = CtrDoubleFeatureValue::ClickIndex(); diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 0c5ea90895f4c5..46dde28fc9fe8c 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -394,7 +394,7 @@ int CtrDymfAccessor::ParseFromString(const std::string& str, float* value) { ret, 7UL, common::errors::InvalidArgument( - "Invalid return value. Expect more than 7. But recieved %d.", ret)); + "Invalid return value. Expect more than 7. 
But received %d.", ret)); return ret; } diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc index a12523b013b9f6..d5bbf950b7cc58 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc @@ -304,7 +304,7 @@ int SparseAccessor::ParseFromString(const std::string& str, float* value) { ret, 6UL, common::errors::InvalidArgument( - "Invalid return value. Expect more than 6. But recieved %d.", ret)); + "Invalid return value. Expect more than 6. But received %d.", ret)); return ret; } diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 1e307558976adf..7f1955079b57b2 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -340,7 +340,7 @@ Channel MakeChannel(const Channel& other) { // NOTE: ChannelReader is a wrapper for quick read channel with a buffer. It // will read a block data from channel, but user can get data one by one. So it // is important to notice that user must call operator>> until false, or call -// get_buffer_remain until false to make sure the buffered data all readed. +// get_buffer_remain until false to make sure the buffered data all read. template class ChannelReader { public: diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc index afca688c01fbcf..023832c5cb40cd 100644 --- a/paddle/fluid/framework/dist_multi_trainer.cc +++ b/paddle/fluid/framework/dist_multi_trainer.cc @@ -114,7 +114,7 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program, thread_num_, common::errors::InvalidArgument( "static_cast(pool.size()) is invalid, " - "expected %d but recieved %d.", + "expected %d but received %d.", thread_num_, static_cast(pool.size()))); for (int i = 0; i < thread_num_; ++i) { @@ -163,7 +163,7 @@ void DistMultiTrainer::Run() { thread_num_, common::errors::InvalidArgument( "static_cast(pool.size()) is invalid, " - "expected %d but recieved %d.", + "expected %d but received %d.", thread_num_, static_cast(pool.size()))); for (int i = 0; i < thread_num_; ++i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index acb71f0e6a2b57..28e9a248342f91 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -351,7 +351,7 @@ class HeterComm { len, common::errors::InvalidArgument( "Invalid size of all keys memory. Expect to be " - "equal to length %d. But recieved %d.", + "equal to length %d. But received %d.", len, all_keys_mem->size())); PADDLE_ENFORCE_GE( @@ -359,7 +359,7 @@ class HeterComm { len * value_bytes, common::errors::InvalidArgument( "Invalid size of all gradients memory. Expect to be equal to " - "length * value bytes %d. But recieved %d.", + "length * value bytes %d. 
But received %d.", len * value_bytes, all_grads_mem->size())); } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index e1469e1ead2dfe..edab7bd80287aa 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -410,7 +410,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task, ranks_vec[i]->size(), common::errors::InvalidArgument( "keys_vec[i]->size() should be equal to " - "ranks_vec[i]->size(), but recieved " + "ranks_vec[i]->size(), but received " "keys_vec[i]->size() is %d, ranks_vec[i]->size() is %d", keys_vec[i]->size(), ranks_vec[i]->size())); @@ -428,7 +428,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task, 0UL, common::errors::InvalidArgument( "ranks_vec[i]->size() should be equal to 0, " - "but recieved %d.", + "but received %d.", ranks_vec[i]->size())); for (size_t j = 0; j < keys_vec[i]->size(); ++j) { auto& key = (*keys_vec[i])[j]; @@ -469,7 +469,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task, total_keys, common::errors::InvalidArgument( "Total shard keys number should be less than or equal to total " - "keys number, but recieved %d as total shard keys number and %d " + "keys number, but received %d as total shard keys number and %d " "as total keys number.", total_shard_keys, total_keys)); @@ -1638,7 +1638,7 @@ void PSGPUWrapper::divide_to_device(std::shared_ptr gpu_task) { nullptr, common::errors::InvalidArgument( "The value of local dimension pointer should not " - "be nullptr but recieved %d at position %d.", + "be nullptr but received %d at position %d.", h_dim_ptrs[pos], pos)); d_dim_ptr[cur + k] = h_dim_ptrs[pos]; @@ -2398,7 +2398,7 @@ void PSGPUWrapper::PullSparse(const phi::Place& place, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - VLOG(0) << "Warning:: recommand use pull_gpups_sparse op instead. This " + VLOG(0) << "Warning:: recommend use pull_gpups_sparse op instead. 
This " "PullSparse is not used."; } diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 4b5d551eafc100..e3c22df825214e 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -1070,7 +1070,7 @@ void AutoMixedPrecisionPass::InsertCastOp() const { cache_kv_outputs.size(), common::errors::InvalidArgument( "Cache inputs should be the same size with cache outputs, but " - "recieved %d as inputs and %d as outputs.", + "received %d as inputs and %d as outputs.", cache_kv_inputs.size(), cache_kv_outputs.size())); for (size_t i = 0; i < cache_kv_inputs.size(); ++i) { diff --git a/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc index cbff317d4383fd..8e56f712cb27d5 100644 --- a/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc @@ -234,7 +234,7 @@ void DecoderAttentionXPUFusePass::ApplyDecoderAttentionXPUFuse( fused_op_desc.SetInput("v", {input_v->Name()}); std::unordered_map> var_quant_scales = GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); - // recored q/k/v max, qk_max, and qkv_max + // recorded q/k/v max, qk_max, and qkv_max std::vector input_max_nodes; if (var_quant_scales.find(input_q->Name()) != var_quant_scales.end() && var_quant_scales.find(input_k->Name()) != var_quant_scales.end() && diff --git a/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc index 5908974d486644..8675d5eedbda13 100644 --- a/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc @@ -253,7 +253,7 @@ void QkQkvAttentionXPUFusePass::ApplyQkQkvAttentionXPUFuse( fused_op_desc.SetInput("v", {input->Name()}); std::unordered_map> var_quant_scales = GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); - // recored q/k/v max, qk_max, and qkv_max + // recorded q/k/v max, qk_max, and qkv_max std::vector input_max_nodes; if (var_quant_scales.find(input->Name()) != var_quant_scales.end() && var_quant_scales.find(qk_matmul_out->Name()) != diff --git a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu index 7d7a771a67eb82..7bd1ca9226fcd0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu @@ -360,7 +360,7 @@ int MultiheadMatmulRoformerPlugin::enqueue( PADDLE_THROW(common::errors::Fatal( "The Ernie(Bert) TensorRT Plugin should be " "complied with CUDA version >= 10.0 when running with fp16. 
" - "Please recomplie it or try to use fp32 by set " + "Please recompile it or try to use fp32 by set " "config.SetTRTDynamicShapeInfo(min_input_shape, " "max_input_shape, opt_input_shape, true")); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu index c1c04bdd80f636..d871bab0823a2c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu @@ -537,7 +537,7 @@ int PrelnResidualBiasPluginDynamic::enqueue( PADDLE_THROW(common::errors::Fatal( "The Ernie(Bert) tensorRT plugin should be " "complied with CUDA version >= 10.0 when running with fp16. " - "Please recomplie it or try to use fp32 by set " + "Please recompile it or try to use fp32 by set " "config.SetTRTDynamicShapeInfo(min_input_shape, " "max_input_shape, opt_input_shape, true")); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 6cba98fb7dd725..f614ca12d046c3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -533,7 +533,7 @@ int QkvToContextPluginDynamic::enqueue( PADDLE_THROW(common::errors::Fatal( "The Ernie(Bert) TensorRT Plugin should be " "complied with CUDA version >= 10.0 when running with fp16. " - "Please recomplie it or try to use fp32 by set " + "Please recompile it or try to use fp32 by set " "config.SetTRTDynamicShapeInfo(min_input_shape, " "max_input_shape, opt_input_shape, true")); #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 1df5f2d05eef16..39a5a76f8bcaa0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -59,7 +59,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { common::errors::InvalidArgument( "For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS" "), Y must be scalar, the size of Y should be 1. " - "But reveived the size of Y = %s.", + "But received the size of Y = %s.", ctx->GetInputDim("Y").size())); PADDLE_ENFORCE_EQ( ctx->GetInputDim("Y")[0], @@ -67,7 +67,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { common::errors::InvalidArgument( "For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS" "), Y must be scalar, the first dimension of Y should be 1. " - "But reveived the first dimension of Y = %s.", + "But received the first dimension of Y = %s.", ctx->GetInputDim("Y")[0])); } else if (ctx->GetInputsVarType("X").front() != framework::proto::VarType::DENSE_TENSOR) { diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index eba9c8ca7c2836..97afddf8e10122 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -275,7 +275,7 @@ phi::KernelKey GetStridedSliceExpectedKernelType( true, common::errors::InvalidArgument( "Place of context is %s. Place of input tensor is %s. 
They " - "are should be same, but reveived different place.", + "are should be same, but received different place.", string::to_string(ctx.device_context().GetPlace()), string::to_string(tensor.place()))); } diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 80bc394fa62492..c00a0141a5ba70 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -6055,7 +6055,7 @@ void MaskedMultiheadAttentionInferMeta(const MetaTensor& x, 0, errors::InvalidArgument( "The num_head of query must be divisible by the num_head of key, but " - "recived num_head of query is %d, and the num_head of key is %d", + "received num_head of query is %d, and the num_head of key is %d", num_head, k_num_head)); PADDLE_ENFORCE_EQ( diff --git a/paddle/phi/kernels/funcs/blas/blas.cc b/paddle/phi/kernels/funcs/blas/blas.cc index ef1d2f2f591955..6117a01a3bb9bc 100644 --- a/paddle/phi/kernels/funcs/blas/blas.cc +++ b/paddle/phi/kernels/funcs/blas/blas.cc @@ -23,7 +23,7 @@ MatDescriptor CreateMatrixDescriptor(const DDim &tensor_dim, tensor_dim.size(), 1, common::errors::InvalidArgument("The tensor dim size should be greater " - "than 1, but reveived dim size is %d", + "than 1, but received dim size is %d", tensor_dim.size())); MatDescriptor retv; if (num_flatten_cols > 1) { diff --git a/paddle/phi/kernels/gpu/flash_attn_utils.h b/paddle/phi/kernels/gpu/flash_attn_utils.h index 42cd09c21e2ddf..d03225f4f290c8 100644 --- a/paddle/phi/kernels/gpu/flash_attn_utils.h +++ b/paddle/phi/kernels/gpu/flash_attn_utils.h @@ -94,14 +94,14 @@ static std::vector GetAttnSparseMaskDims( dtype, DataType::INT32, common::errors::InvalidArgument("dtype of startend_row_indices must be " - "int32, but recieved %d", + "int32, but received %d", dtype)); PADDLE_ENFORCE_GE( rank, 4, common::errors::InvalidArgument( "The number of dimensions of startend_row_indices is expected to " - "be greater or equal to 4, but recieved %d. The shape of " + "be greater or equal to 4, but received %d. 
The shape of " "startend_row_indices is [%s]", rank, origin_dims)); @@ -110,7 +110,7 @@ static std::vector GetAttnSparseMaskDims( common::errors::InvalidArgument( "The sparse_mask_dims[%d] of " "attn_mask_start_row_indices is expected to be " - "equal to %d, but recieved %d.", + "equal to %d, but received %d.", rank - 2, max_seqlen_q, origin_dims[2])); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7090df20d6a5e4..fdddf2c040f583 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1476,7 +1476,7 @@ function collect_failed_tests() { done } -# getting qucik disable ut list +# getting quick disable ut list function get_quickly_disable_ut() { python -m pip install httpx if disable_ut_quickly=$(python ${PADDLE_ROOT}/tools/get_quick_disable_lt.py); then diff --git a/python/paddle/decomposition/recompute.py b/python/paddle/decomposition/recompute.py index 1fea2497284754..effd0882000092 100644 --- a/python/paddle/decomposition/recompute.py +++ b/python/paddle/decomposition/recompute.py @@ -226,7 +226,7 @@ def _get_consumer_ops(op): self.result_value_set.add(result) return consumers - def _get_producer_ops_recursivly(root): + def _get_producer_ops_recursively(root): visited = set() queue = deque() queue.append(root) @@ -240,7 +240,7 @@ def _get_producer_ops_recursivly(root): visited.add(new_op) queue.append(new_op) - def _get_consumer_ops_recursivly(root): + def _get_consumer_ops_recursively(root): visited = set() queue = deque() queue.append(root) @@ -256,8 +256,8 @@ def _get_consumer_ops_recursivly(root): for op in self.ops: if op.name() in self.unrecomputable_ops: - _get_producer_ops_recursivly(op) - _get_consumer_ops_recursivly(op) + _get_producer_ops_recursively(op) + _get_consumer_ops_recursively(op) def _has_unfusible_op_on_any_path(self, op1, op2): no_unfusible_op_on_path = ( diff --git a/python/paddle/distributed/auto_parallel/static/helper.py b/python/paddle/distributed/auto_parallel/static/helper.py index f540d5cd319380..46b8d52a9dc798 100644 --- a/python/paddle/distributed/auto_parallel/static/helper.py +++ b/python/paddle/distributed/auto_parallel/static/helper.py @@ -410,7 +410,7 @@ def init_pir(self, main_program, place): if param is None: continue if param.name not in dy_param_name_to_pir_param_name: - # Release the reduntant params + # Release the redundant params param.get_tensor()._clear() continue if not param._is_initialized(): @@ -493,7 +493,7 @@ def init(self, main_program, place, dist_context): if param is None: continue if param.name not in main_program.global_block().vars: - # Release the reduntant params + # Release the redundant params param.get_tensor()._clear() continue if not param._is_initialized(): diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py index cdb87b3b70477c..8fedb9f8a0c287 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py @@ -73,7 +73,7 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): src_mesh, [src_dist_attr], [src_dist_attr], chunk_id ) else: - # create the buffer on other ranks for receving the data + # create the buffer on other ranks for receiving the data tmp_value = paddle.zeros(dst_type.shape, dst_type.dtype) op = tmp_value.get_defining_op() mesh = 
paddle.distributed.ProcessMesh(other_ranks) diff --git a/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py index 22da88364d3691..2bf40ecc7c97c3 100644 --- a/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py +++ b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py @@ -2695,7 +2695,7 @@ def run_or_quit(self): # Quit if just tune if not self._is_run: self._logger.info( - "The process will be quitted when just tune not run." + "The process will be quit when just tune not run." ) sys.exit() diff --git a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py index 662552119a21ca..1fd43cd5b602b0 100644 --- a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py +++ b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py @@ -326,7 +326,7 @@ def add_extra_synchronization( if params_filter_fn(param): params_to_sync.append(param) logger.info( - "The following param are going to be synchronization everytime the optimizer update phase of the program is runned: " + "The following param are going to be synchronization everytime the optimizer update phase of the program is run: " ) logger.info([p.name for p in params_to_sync]) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 2f8567fb0e14d5..ff90fa0b80a90a 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1089,7 +1089,7 @@ def fill_constant( if out.dtype != dtype: raise TypeError( - "Required out.dtype == dtype if specifying out, but recevied f{out.dtype} != f{dtype}" + "Required out.dtype == dtype if specifying out, but received f{out.dtype} != f{dtype}" ) out = _C_ops.full_(out, shape, value, dtype, place) out.stop_gradient = True diff --git a/test/ir/pir/cinn/utils.py b/test/ir/pir/cinn/utils.py index 62642af979522d..ca1ff888e56490 100644 --- a/test/ir/pir/cinn/utils.py +++ b/test/ir/pir/cinn/utils.py @@ -77,7 +77,7 @@ def check_jit_kernel_number(static_fn, expected_number): def get_jit_kernel_structure_helper(block, map_info, if_op_idx='_0'): """ - Recursivly generate JIT_KERNEL map_info for Static/Dynmaic Shape UT. + Recursively generate JIT_KERNEL map_info for Static/Dynamic Shape UT. 
""" if_count = 0 for op in block.ops: From 37e33e5b6f499159d328762c2faf378b059fd216 Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 8 Jan 2025 17:18:52 +0800 Subject: [PATCH 29/57] Fix (#70679) --- .../tensorrt/plugin/c_allreduce_op_plugin.cu | 2 +- .../tensorrt/plugin/custom_generic_plugin.cu | 14 +++++------ .../plugin/deformable_conv_op_plugin.cu | 2 +- .../tensorrt/plugin/elementwise_op_plugin.cu | 2 +- .../elementwiseadd_transpose_op_plugin.cu | 2 +- .../plugin/fused_token_prune_op_plugin.cu | 2 +- .../tensorrt/plugin/gelu_op_plugin.cu | 2 +- .../tensorrt/plugin/group_norm_op_plugin.cu | 2 +- .../tensorrt/plugin/hard_swish_op_plugin.cu | 2 +- .../tensorrt/plugin/layer_norm_op_plugin.cu | 2 +- .../plugin/layernorm_shift_partition_op.cu | 2 +- .../plugin/merge_layernorm_op_plugin.cu | 2 +- .../tensorrt/plugin/mish_op_plugin.cu | 2 +- .../multihead_matmul_roformer_plugin.cu | 2 +- .../tensorrt/plugin/pool3d_op_plugin.cu | 2 +- .../tensorrt/plugin/pool_op_plugin.cu | 2 +- .../plugin/preln_groupnorm_act_op_plugin.cu | 2 +- .../plugin/preln_residual_bias_plugin.cu | 2 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 2 +- .../tensorrt/plugin/reverse_roll_op_plugin.cu | 2 +- .../plugin/skip_groupnorm_act_op_plugin.cu | 2 +- .../plugin/skip_merge_layernorm_op_plugin.cu | 2 +- .../tensorrt/plugin/swish_op_plugin.cu | 2 +- .../plugin/trans_layernorm_op_plugin.cu | 2 +- ...transformer_input_output_convert_plugin.cu | 2 +- .../common_subexpression_elimination_pass.cc | 6 ++--- .../general/transfer_layout_pass.cc | 24 +++++++++---------- 27 files changed, 46 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu index 0cab9341b09495..d0627ecf950dae 100644 --- a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu @@ -94,7 +94,7 @@ bool CAllReducePluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of CAllReduce plugin shoule not be nullptr.")); + "The input of CAllReduce plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu index d6d76c6b9618ea..73a4462bdef519 100644 --- a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu @@ -48,7 +48,7 @@ void validate(const std::string& op_type, PADDLE_ENFORCE_GE(supports_dtypes.count(datatype), 0, common::errors::InvalidArgument( - "custorm op [%s] has unsupported datatype: [%s], " + "custom op [%s] has unsupported datatype: [%s], " "now only support: [float32, float16, int8, int32].", op_type, datatype)); @@ -56,7 +56,7 @@ void validate(const std::string& op_type, supports_tensor_formats.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s] has unsupported tensor format: [%s], " + "custom op [%s] has unsupported tensor format: [%s], " "now only support: [LINEAR, CHW32, CHW2, HWC8, CHW4, DHWC8(TensorRT " "7.2 and after), HWC16(TensorRT 8.0 and after)].", op_type, @@ -68,7 +68,7 @@ void validate(const std::string& op_type, supports_formats_tmp.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s]: float32 only supports [LINEAR, CHW32], " + "custom op [%s]: float32 only supports [LINEAR, CHW32], " "but got tensor format: [%s], 
", op_type, tensor_format)); @@ -85,7 +85,7 @@ void validate(const std::string& op_type, PADDLE_ENFORCE_GE(supports_formats_tmp.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s]: float16 only supports [LINEAR, " + "custom op [%s]: float16 only supports [LINEAR, " "CHW2, HWC8, CHW4, DHWC8(TensorRT 7.2 and after), " "HWC16(TensorRT 8.0 and after)], " "but got tensor format: [%s], ", @@ -99,7 +99,7 @@ void validate(const std::string& op_type, supports_formats_tmp.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s]: int8 only supports [LINEAR, CHW32, CHW4], " + "custom op [%s]: int8 only supports [LINEAR, CHW32, CHW4], " "but got tensor format: [%s], ", op_type, tensor_format)); @@ -109,7 +109,7 @@ void validate(const std::string& op_type, PADDLE_ENFORCE_GE(supports_formats_tmp.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s]: int32 only supports [LINEAR], " + "custom op [%s]: int32 only supports [LINEAR], " "but got tensor format: [%s], ", op_type, tensor_format)); @@ -320,7 +320,7 @@ bool CustomGenericPlugin::supportsFormatCombination( "supportsFormatCombination config!" "Please use SetTrtSupportsFormatConfig to set.", op_desc_.Type().c_str())); - // generate support format combaination function by config + // generate support format combination function by config size_t input_num = OpMetaInfoHelper::GetInputs(op_info).size(); size_t output_num = OpMetaInfoHelper::GetOutputs(op_info).size(); std::vector>> diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu index df6290fc3ae5f4..1f787c259b0518 100644 --- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu @@ -1172,7 +1172,7 @@ bool DeformableConvPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of groupnorm plugin shoule not be nullptr.")); + "The input of groupnorm plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 3c4c9df2f16f08..82d003cfba293a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -241,7 +241,7 @@ bool ElementwisePluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu index 855c80e18d88f6..aa89ffd4e222d4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu @@ -62,7 +62,7 @@ bool ElementwiseAddTransposePluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of elementwiseadd_transpose " - "plugin shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu 
b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu index 835b222943a9b8..b18a0c2d6d357c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -237,7 +237,7 @@ bool FusedTokenPrunePluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index c1b4aad6d73c06..46628128e3b0a3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -152,7 +152,7 @@ bool GeluPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu index e50be737719945..589ab150ae6fd8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu @@ -475,7 +475,7 @@ bool GroupNormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of groupnorm plugin shoule not be nullptr.")); + "The input of groupnorm plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu index fb328277ab86a4..682929e9d64fb3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu @@ -142,7 +142,7 @@ bool HardSwishPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 2ebce801564457..ebc539e32718fd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -210,7 +210,7 @@ bool LayerNormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of layernorm plugin shoule not be nullptr.")); + "The input of layernorm plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu b/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu index cd5e1ad9032f8e..1190d9d0d08413 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu @@ -554,7 +554,7 @@ bool LayernormShiftPartitionPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of LayernormShiftPartition " - "plugin 
shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/merge_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/merge_layernorm_op_plugin.cu index 2e228ed3d69744..5972f5c05964b9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/merge_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/merge_layernorm_op_plugin.cu @@ -214,7 +214,7 @@ bool MergeLayernormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of MergeLayernorm " - "plugin shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu index 3263880b883b01..a25f218b0feee7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu @@ -180,7 +180,7 @@ bool MishPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of mish plugin shoule not be nullptr.")); + "The input of mish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu index 7bd1ca9226fcd0..8fcf3f520de015 100644 --- a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu @@ -74,7 +74,7 @@ bool MultiheadMatmulRoformerPlugin::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index f80556567431b7..eefc0b2f9e8547 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -329,7 +329,7 @@ bool Pool3DPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index bda2ebcaf853a4..e81114c6f2d7ea 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -260,7 +260,7 @@ bool PoolPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu index ab99587dfec1b7..7da3bdeae03d94 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu @@ -40,7 +40,7 @@ bool 
PrelnGroupnormActPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of prelnGroupnormAct plugin shoule not be nullptr.")); + "The input of prelnGroupnormAct plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu index d871bab0823a2c..6e3334ef5ff3d4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu @@ -291,7 +291,7 @@ bool PrelnResidualBiasPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index f614ca12d046c3..3d443eba031a02 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -236,7 +236,7 @@ bool QkvToContextPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.cu index 6322fa29606864..0fa40fd08e1a99 100644 --- a/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.cu @@ -143,7 +143,7 @@ bool ReverseRollPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of ReverseRoll " - "plugin shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu index 20d13c1c6f8c7d..85ad7d808cccc2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu @@ -40,7 +40,7 @@ bool SkipGroupnormActPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of SkipGroupnormAct plugin shoule not be nullptr.")); + "The input of SkipGroupnormAct plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_merge_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_merge_layernorm_op_plugin.cu index 5171f3ae0475ec..658b9eceb492ea 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_merge_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_merge_layernorm_op_plugin.cu @@ -227,7 +227,7 @@ bool SkipMergeLayernormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of MergeLayernorm " - "plugin shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git 
a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index 9a485b4d1d7c69..e4702b0032c69e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -165,7 +165,7 @@ bool SwishPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu index 30787d118b5414..459998020b62fd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu @@ -206,7 +206,7 @@ bool TransLayerNormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of layernorm plugin shoule not be nullptr.")); + "The input of layernorm plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu index 9e5ff08411cbca..b601b3fd9e3d1d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu @@ -222,7 +222,7 @@ int TransformerInputConvertPlugin::enqueue( B, MaxLength, vector_length / - num_threads); // batches, max sequnce length, input0.dims.d[2]/* + num_threads); // batches, max sequence length, input0.dims.d[2]/* remove_padding_kernel<<>>( input0, output2, output0); // input(no_varlen), pos_id, input(varlen) return cudaGetLastError() != cudaSuccess; diff --git a/paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.cc b/paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.cc index 682109b5784640..52cafa8793a300 100644 --- a/paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.cc +++ b/paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.cc @@ -452,9 +452,9 @@ struct ExpressionEqual { struct ExpressionTable { public: ExpressionTable() = default; - void RegisiterExpression(Expression expr) { + void RegisterExpression(Expression expr) { auto op_info = expr.CalcOpInfo(); - VLOG(7) << "[RegisiterExpression] op " << expr.op()->name() << " [" + VLOG(7) << "[RegisterExpression] op " << expr.op()->name() << " [" << expr.op() << "]" << "\n hash: " << op_info.first << "\n can_be_safe_to_replace: " << std::boolalpha @@ -506,7 +506,7 @@ struct CSEAnalyzer { // Handle the operation auto expr = expression_table->CreateExpression(op); - expression_table->RegisiterExpression(expr); + expression_table->RegisterExpression(expr); auto maybe_same_expression = expression_table->Lookup(expr); if (expr.CanBeSafeToReplace()) { if (!maybe_same_expression.has_value()) { diff --git a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc index 607b48ac4d55ab..780809ca4b410f 100644 --- a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc +++ b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc @@ -212,17 +212,17 @@ struct FlowGraph { Node 
op_node(&op); auto layout_transform_iface = op.dyn_cast(); - const auto& relevate_inputs = + const auto& relevant_inputs = layout_transform_iface ? layout_transform_iface.RelevantInputs(&op) : op.operands_source(); - const auto& relevate_outputs = + const auto& relevant_outputs = layout_transform_iface ? layout_transform_iface.RelevantOutputs(&op) : op.results(); - VLOG(10) << "[BuildGraph]" << op_node << " isz:" << relevate_inputs.size() - << " osz:" << relevate_outputs.size(); + VLOG(10) << "[BuildGraph]" << op_node << " isz:" << relevant_inputs.size() + << " osz:" << relevant_outputs.size(); // add in edge - for (auto& operand : relevate_inputs) { + for (auto& operand : relevant_inputs) { Node operand_node(operand); // the capacity should be set as the out_degree of operand node float weight = 1.0f; @@ -235,7 +235,7 @@ struct FlowGraph { AddEdge(operand_node, op_node, weight, 0.0f, true); } - for (const auto& op_result : relevate_outputs) { + for (const auto& op_result : relevant_outputs) { // we have ssa, so the output must not be processed Node op_result_node(op_result); @@ -275,19 +275,19 @@ struct FlowGraph { auto layout_transform_iface = op.dyn_cast(); - const auto& relevate_inputs = + const auto& relevant_inputs = layout_transform_iface ? layout_transform_iface.RelevantInputs(&op) : op.operands_source(); - const auto& relevate_outputs = + const auto& relevant_outputs = layout_transform_iface ? layout_transform_iface.RelevantOutputs(&op) : op.results(); - for (const auto& op_operand : relevate_inputs) { + for (const auto& op_operand : relevant_inputs) { Node operand_node(op_operand); AddEdge(src_node(), operand_node, THRESHOLD); } - for (const auto& op_result : relevate_outputs) { + for (const auto& op_result : relevant_outputs) { Node op_result_node(op_result); AddEdge(src_node(), op_result_node, THRESHOLD); } @@ -328,11 +328,11 @@ struct FlowGraph { for (auto& op : *(program.block())) { auto layout_transform_iface = op.dyn_cast(); - const auto& relevate_outputs = + const auto& relevant_outputs = layout_transform_iface ? 
layout_transform_iface.RelevantOutputs(&op) : op.results(); - for (const auto& op_result : relevate_outputs) { + for (const auto& op_result : relevant_outputs) { Node op_result_node(op_result); for (auto it = op_result.use_begin(); it != op_result.use_end(); ++it) { auto user_op = it->owner(); From ebc5239a61c17a29807b38e33a994926629943c7 Mon Sep 17 00:00:00 2001 From: zhanghonggeng <43205915+zhanghonggeng@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:19:19 +0800 Subject: [PATCH 30/57] Add matmul_add_act_fuse_pass in inference process (#70663) --- paddle/fluid/inference/api/analysis_predictor.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a3b8a881400a4b..610d1019126cc4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -908,11 +908,13 @@ void AnalysisPredictor::OptimizeInferencePirProgram() { }); // Infer symbol shape for all ops before fused pass fused_op_pm.AddPass(pir::CreateShapeOptimizationPass()); - const std::vector FusedOpPasses{// Operator fusion pass - "map_op_to_another_pass", - "conv2d_bn_fuse_pass", - "conv2d_add_act_fuse_pass", - "conv2d_add_fuse_pass"}; + const std::vector FusedOpPasses{ + // Operator fusion pass + "map_op_to_another_pass", + "conv2d_bn_fuse_pass", + "conv2d_add_act_fuse_pass", + "conv2d_add_fuse_pass", + "matmul_add_act_fuse_pass"}; for (const auto &fused_op : FusedOpPasses) { fused_op_pm.AddPass(pir::PassRegistry::Instance().Get(fused_op)); From 52ebe47398b21b1327f2b9fb0bfe8ba53c0848af Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Wed, 8 Jan 2025 19:15:56 +0800 Subject: [PATCH 31/57] del autosimplify (#70691) --- paddle/cinn/ir/group_schedule/config/group_tile_util.cc | 4 ++-- paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc | 4 ++-- paddle/cinn/ir/ir_analyzer/ir_analyzer.cc | 2 +- paddle/cinn/optim/replace_cross_block_reduction.cc | 2 +- paddle/cinn/optim/replace_mod_to_max.cc | 2 +- paddle/cinn/optim/resize_buffer.cc | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_util.cc b/paddle/cinn/ir/group_schedule/config/group_tile_util.cc index 933cb3a6477565..30ab52b8bb65a7 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_util.cc +++ b/paddle/cinn/ir/group_schedule/config/group_tile_util.cc @@ -58,7 +58,7 @@ std::vector GetVarStrides(ir::Expr load_offset, ir::Expr expr = ir::ir_utils::IRCopy(load_offset); replacer.inspecting_var = var; replacer.IRMutator::Visit(&expr, &expr); - ir::Expr res = common::AutoSimplify(expr); + ir::Expr res = optim::ArithSimplify(expr); if (res.is_constant()) { return res.as_int64(); } @@ -90,7 +90,7 @@ ir::Expr GetLargestLoad(const std::vector& exprs) { for (size_t i = 1; i < tensor->shape.size(); i++) { size = size * tensor->shape[i]; } - return common::AutoSimplify(size); + return optim::ArithSimplify(size); }; ir::Expr res = exprs[0]; diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc index 758464d5d21857..fb0b2cadc6f034 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc @@ -185,8 +185,8 @@ SymbolicPredicate DynamicShapeGroupScheduler::MakeBucketPredicate( } } - sp_extent = common::AutoSimplify(sp_extent); - rd_extent = 
common::AutoSimplify(rd_extent); + sp_extent = optim::ArithSimplify(sp_extent); + rd_extent = optim::ArithSimplify(rd_extent); return {sp_extent, rd_extent}; }(); diff --git a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc index e7e258d6d5a7ca..5b73b0196e7b28 100644 --- a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc +++ b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc @@ -464,7 +464,7 @@ std::vector GetIterValuesOfAccess(ir::Expr load_or_store, for (ir::Expr index : indices) { ir::Expr index_value = ReplaceVarWithExpr( index, s_block->iter_vars, s_block_realize->iter_values); - iter_values.push_back(common::AutoSimplify(index_value)); + iter_values.push_back(optim::ArithSimplify(index_value)); } return iter_values; } diff --git a/paddle/cinn/optim/replace_cross_block_reduction.cc b/paddle/cinn/optim/replace_cross_block_reduction.cc index 452697fd372e3d..5f597e1ef26f1f 100644 --- a/paddle/cinn/optim/replace_cross_block_reduction.cc +++ b/paddle/cinn/optim/replace_cross_block_reduction.cc @@ -30,7 +30,7 @@ namespace { ir::Expr CalcBufferSizeInBytes(const ir::Buffer& buffer) { const ir::Expr numel = buffer->SymbolicNumel(); - return common::AutoSimplify(numel * buffer->dtype.bytes()); + return optim::ArithSimplify(numel * buffer->dtype.bytes()); } std::unordered_set GetReduceVarNames( diff --git a/paddle/cinn/optim/replace_mod_to_max.cc b/paddle/cinn/optim/replace_mod_to_max.cc index 2b723f43638976..f55f8aa68c4e41 100644 --- a/paddle/cinn/optim/replace_mod_to_max.cc +++ b/paddle/cinn/optim/replace_mod_to_max.cc @@ -37,7 +37,7 @@ class ReplaceModToMaxMutator : public ir::IRMutator<> { ir::Mod* node = expr->As(); Expr base = ir::Sub::Make(node->operand(1), Expr(1)); Expr min_expr = ir::Min::Make(node->operand(0), base); - *expr = cinn::common::AutoSimplify(min_expr); + *expr = cinn::optim::ArithSimplify(min_expr); ir::IRMutator<>::Visit(expr, expr); } }; diff --git a/paddle/cinn/optim/resize_buffer.cc b/paddle/cinn/optim/resize_buffer.cc index ab91648f2f96ef..2a09d2f5f841f6 100644 --- a/paddle/cinn/optim/resize_buffer.cc +++ b/paddle/cinn/optim/resize_buffer.cc @@ -184,7 +184,7 @@ class AnalyzeLoopVarRange : public ir::IRMutator<> { 0, ::common::errors::PreconditionNotMet( "Cannot find the extent of var %s", var_name)); - size = common::AutoSimplify(size * var_name_to_extent_.at(var_name)); + size = optim::ArithSimplify(size * var_name_to_extent_.at(var_name)); } return size; @@ -215,7 +215,7 @@ class AnalyzeLoopVarRange : public ir::IRMutator<> { } } ir::Expr tmp = ir::Add::Make(copy, ir::Expr(1)); - ir::Expr simplified = common::AutoSimplify(tmp); + ir::Expr simplified = optim::ArithSimplify(tmp); if (simplified.As()) { ir::Expr lhs = simplified.As()->a(); ir::Expr rhs = simplified.As()->b(); From 9b06852d4a6057ef6fc7f7a4235905e7b8ddafc7 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Wed, 8 Jan 2025 20:48:41 +0800 Subject: [PATCH 32/57] [CodeStyle][Typos][C-71] Fix typos(`creater`,`Creater`) (#70684) * fix * Update _typos.toml --------- Co-authored-by: Nyakku Shigure --- _typos.toml | 6 ++- .../transforms/check_infer_symbolic_util.cc | 4 +- .../transforms/check_infer_symbolic_util.h | 4 +- paddle/cinn/ir/schedule/factorize_reduction.h | 16 +++--- paddle/cinn/ir/schedule/impl/reduction.cc | 18 +++---- paddle/cinn/ir/schedule/ir_schedule_util.h | 4 +- .../fused_multi_transformer_decoder_pass.cc | 12 ++--- .../fused_multi_transformer_encoder_pass.cc | 16 +++--- .../ir/multihead_matmul_fuse_pass.cc | 12 ++--- .../ir/multihead_matmul_roformer_fuse_pass.cc | 4 +- 
.../trt_cross_multihead_matmul_fuse_pass.cc | 4 +- .../trt_flash_multihead_matmul_fuse_pass.cc | 4 +- .../ir/trt_multihead_matmul_fuse_pass.cc | 12 ++--- .../ir/trt_qk_multihead_matmul_fuse_pass.cc | 4 +- .../fluid/inference/api/analysis_predictor.cc | 6 +-- .../convert/flash_multihead_matmul_op.cc | 2 +- .../generic_and_custom_plugin_creater.cc | 13 ++--- .../inference/tensorrt/convert/op_converter.h | 53 ++++++++++--------- paddle/fluid/inference/tensorrt/op_teller.cc | 8 +-- paddle/fluid/inference/tensorrt/op_teller.h | 6 +-- .../tensorrt/plugin/reverse_roll_op_plugin.h | 4 +- .../plugin/test_fused_token_prune_plugin.cc | 2 +- .../tensorrt/plugin/test_split_plugin.cc | 2 +- test/dygraph_to_static/test_cycle_gan.py | 6 +-- 24 files changed, 114 insertions(+), 108 deletions(-) diff --git a/_typos.toml b/_typos.toml index 81230a2f09629c..cfd08daf4c29e2 100644 --- a/_typos.toml +++ b/_typos.toml @@ -7,6 +7,10 @@ extend-exclude = [ # Skip `intermidiate` check in these files "test/cpp/eager/task_tests/CMakeLists.txt", "test/cpp/eager/task_tests/hook_test_intermidiate.cc", + # Skip `creater` check in these files + "paddle/fluid/inference/tensorrt/convert/CMakeLists.txt", + "paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc", + "paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc", ] [default] @@ -41,8 +45,6 @@ pash = 'pash' unpacket = "unpacket" # These words need to be fixed -Creater = 'Creater' -creater = 'creater' fetchs = 'fetchs' Indexs = 'Indexs' indexs = 'indexs' diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc index 1471e041a58493..12ef2ebc4d0fe9 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc @@ -54,7 +54,7 @@ std::ostream& operator<<(std::ostream& stream, } DimExprs4ValueT MakeDimExprs4Value( - pir::Program* program, const PassManagerCreater& CreatePassManager) { + pir::Program* program, const PassManagerCreator& CreatePassManager) { std::shared_ptr pass_manager = CreatePassManager(); pass_manager->AddPass(pir::CreateShapeOptimizationPass()); pass_manager->Run(program); @@ -623,7 +623,7 @@ void CheckProgramDimExprConstraints( } // namespace void CheckInferSymbolicIfNeed(pir::Program* program, - const PassManagerCreater& CreatePassManager) { + const PassManagerCreator& CreatePassManager) { if (!FLAGS_prim_all || !FLAGS_check_infer_symbolic) return; const auto& GraphDimExprs4Value = MakeDimExprs4Value(program, CreatePassManager); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h index d61dd2c6d27f38..1ec72bb4180218 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h @@ -24,10 +24,10 @@ namespace cinn { namespace dialect { namespace ir { -using PassManagerCreater = std::function()>; +using PassManagerCreator = std::function()>; void CheckInferSymbolicIfNeed(pir::Program* program, - const PassManagerCreater& CreatePassManager); + const PassManagerCreator& CreatePassManager); } // namespace ir } // namespace dialect diff --git a/paddle/cinn/ir/schedule/factorize_reduction.h b/paddle/cinn/ir/schedule/factorize_reduction.h index 7c68370d34b818..b330eaf3b1c850 100644 --- 
a/paddle/cinn/ir/schedule/factorize_reduction.h +++ b/paddle/cinn/ir/schedule/factorize_reduction.h @@ -48,9 +48,9 @@ Tensor CreateRFTensor(const Tensor& original_tensor, // Base class to create a new reduce block, // only used for FactorizeReduction schedule primitive. -class ReduceBlockCreater { +class ReduceBlockCreator { public: - ReduceBlockCreater(const Expr& original_block, + ReduceBlockCreator(const Expr& original_block, const std::vector& original_loops, const Expr& rf_loop, const Expr& original_update_stmt, @@ -245,9 +245,9 @@ class LoadReplacer : public ir::IRMutator<> { // Implement class for building Reduction-Factorized block, // only used for FactorizeReduction schedule primitive. -class RFBlockCreater : public ReduceBlockCreater { +class RFBlockCreator : public ReduceBlockCreator { public: - RFBlockCreater(const Expr& original_block, + RFBlockCreator(const Expr& original_block, const std::vector& original_loops, const Expr& rf_loop, const Expr& original_update_stmt, @@ -255,7 +255,7 @@ class RFBlockCreater : public ReduceBlockCreater { const std::map& var2loops, const Expr& bound_check, int rf_axis) - : ReduceBlockCreater(original_block, + : ReduceBlockCreator(original_block, original_loops, rf_loop, original_update_stmt, @@ -391,16 +391,16 @@ class RFBlockCreater : public ReduceBlockCreater { // Implement class for building Writing-Back block, // only used for FactorizeReduction schedule primitive. -class RBBlockCreater : public ReduceBlockCreater { +class RBBlockCreator : public ReduceBlockCreator { public: - RBBlockCreater(const Expr& original_block, + RBBlockCreator(const Expr& original_block, const std::vector& original_loops, const Expr& rf_loop, const Expr& original_update_stmt, const ir::Tensor& rf_tensor, const std::vector& rf_tensor_access_indices, const Var& rf_block_rf_iter_var) - : ReduceBlockCreater(original_block, + : ReduceBlockCreator(original_block, original_loops, rf_loop, original_update_stmt, diff --git a/paddle/cinn/ir/schedule/impl/reduction.cc b/paddle/cinn/ir/schedule/impl/reduction.cc index e9df0c7520fa49..0b517264e9707f 100644 --- a/paddle/cinn/ir/schedule/impl/reduction.cc +++ b/paddle/cinn/ir/schedule/impl/reduction.cc @@ -45,7 +45,7 @@ Expr DyScheduleImpl::Rfactor(const Expr& rf_loop, int rf_axis) { // get root ScheduleBlockRealize Expr root = GetRootBlock(rf_loop); // create all stmts after rfactor transformation - RfCreater rf_create(root, rf_loop, rf_axis); + RfCreator rf_create(root, rf_loop, rf_axis); // return new created rfactor tensor return rf_create.CreateRfAllStmts(); CINN_IR_SCHEDULE_END(this->err_msg_level_); @@ -121,7 +121,7 @@ Expr DyScheduleImpl::FactorizeReduction(const Expr& rf_loop, // Create new blocks and loops. 
Tensor rf_tensor = CreateRFTensor(original_tensor, rf_loop, rf_axis); - RFBlockCreater rf_block_creater(original_block, + RFBlockCreator rf_block_creator(original_block, original_loops, rf_loop, original_update_stmt, @@ -129,18 +129,18 @@ Expr DyScheduleImpl::FactorizeReduction(const Expr& rf_loop, var2loops, Expr(false), rf_axis); - rf_block_creater.CreateBlock(); - RBBlockCreater wb_block_creater(original_block, + rf_block_creator.CreateBlock(); + RBBlockCreator wb_block_creator(original_block, original_loops, rf_loop, original_update_stmt, rf_tensor, - rf_block_creater.rf_tensor_access_indices_, - rf_block_creater.rf_var_); - wb_block_creater.CreateBlock(); + rf_block_creator.rf_tensor_access_indices_, + rf_block_creator.rf_var_); + wb_block_creator.CreateBlock(); - Expr rf_body = rf_block_creater.CreateLoops(); - Expr wb_body = wb_block_creater.CreateLoops( + Expr rf_body = rf_block_creator.CreateLoops(); + Expr wb_body = wb_block_creator.CreateLoops( /* with_init = */ with_write_back_block_init); Expr new_computational_body = Block::Make({rf_body, wb_body}); diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h index d0e102b0050751..6e81ab855e7f8b 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.h +++ b/paddle/cinn/ir/schedule/ir_schedule_util.h @@ -1320,9 +1320,9 @@ struct FindBlockParent : public ir::IRMutator<> { }; // The struct used to create all stmts after rfactor transformation. -struct RfCreater : public ir::IRMutator<> { +struct RfCreator : public ir::IRMutator<> { public: - RfCreater(const Expr& root, const Expr& rf_loop, const int& rf_axis) + RfCreator(const Expr& root, const Expr& rf_loop, const int& rf_axis) : root_(root), rf_loop_(rf_loop), rf_axis_(rf_axis) {} void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc index b0f2b78ca3db0a..2be353d224c6d9 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc @@ -1114,7 +1114,7 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, fused_multi_transformer_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -1548,7 +1548,7 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, @@ -1858,7 +1858,7 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( fused_multi_transformer_fuse_qkv_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -2277,7 +2277,7 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, @@ -2592,7 +2592,7 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( fused_multi_transformer_fuse_qkv_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* 
layer_norm_bias, @@ -3047,7 +3047,7 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc index 3c1dba76fd18c5..bc6a77d0a60e81 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc @@ -1740,7 +1740,7 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, fused_multi_transformer_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -2281,7 +2281,7 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, @@ -2576,7 +2576,7 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( fused_multi_transformer_fuse_qkv_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -3116,7 +3116,7 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( while0, while0, fused_multi_transformer_fuse_qkv_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, @@ -3419,7 +3419,7 @@ int MultiDevicesFusedMultiTransformerEncoderPass::BuildFusion( multi_devices_fused_multi_transformer_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* c_identity, Node* layer_norm, Node* layer_norm_scale, @@ -3904,7 +3904,7 @@ int MultiDevicesFusedMultiTransformerEncoderPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, multi_devices_fused_multi_transformer_pattern) - fuse_creater(input0, + fuse_creator(input0, c_identity0, layer_norm, layer_norm_scale, @@ -4211,7 +4211,7 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( fused_multi_transformer_fuse_qkv_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -4787,7 +4787,7 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( while0, while0, fused_multi_transformer_fuse_qkv_pattern); - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 85a3cad5446d10..244f581c6bdac5 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -49,7 +49,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -195,7 +195,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, 
transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, @@ -861,7 +861,7 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -1081,7 +1081,7 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, @@ -1312,7 +1312,7 @@ int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -1528,7 +1528,7 @@ int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, diff --git a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc index 0c9ba92e4ca6fd..1cfc046b85fb08 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc @@ -399,7 +399,7 @@ int MultiHeadMatmulRoformerFusePass::BuildFusion(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* input_cos, Node* input_sin, Node* mul0, @@ -649,7 +649,7 @@ int MultiHeadMatmulRoformerFusePass::BuildFusion(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, input_cos, input_sin, mul0, diff --git a/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc index 2bb30602dcc3de..c267956e55e73a 100644 --- a/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc @@ -270,7 +270,7 @@ int TrtCrossMultiHeadMatmulFusePass::BuildCrossFusion( name_scope); multihead_pattern(); - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* input1, Node* mul0, Node* mul1, @@ -430,7 +430,7 @@ int TrtCrossMultiHeadMatmulFusePass::BuildCrossFusion( GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, input1, mul0, mul1, diff --git a/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc index 44c19de295f22a..1eedd2fadf484c 100644 --- a/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc @@ -277,7 +277,7 @@ int TrtFlashMultiHeadMatmulFusePass::BuildFlashFusion( name_scope); multihead_pattern(); - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -444,7 +444,7 @@ int TrtFlashMultiHeadMatmulFusePass::BuildFlashFusion( GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc index 5652e54ce1fc81..8fd3882c3b3161 100644 --- a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc +++ 
b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc @@ -46,7 +46,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -192,7 +192,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, @@ -736,7 +736,7 @@ int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -1001,7 +1001,7 @@ int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, @@ -1207,7 +1207,7 @@ int TrtMultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -1423,7 +1423,7 @@ int TrtMultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, diff --git a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc index 1e2a17b5a6ad66..f0f83a53cb2560 100644 --- a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc @@ -259,7 +259,7 @@ int TrtQkMultiHeadMatmulFusePass::BuildQkFusion(Graph* graph, patterns::TrtQKMultiHeadMatmulPattern multihead_pattern(pattern, name_scope); multihead_pattern(); - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* input1, Node* mul0, Node* mul1, @@ -481,7 +481,7 @@ int TrtQkMultiHeadMatmulFusePass::BuildQkFusion(Graph* graph, GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, input1, mul0, mul1, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 610d1019126cc4..8128c45a527255 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -3459,9 +3459,9 @@ USE_TRT_CONVERTER(preln_layernorm_shift_partition) USE_TRT_CONVERTER(merge_layernorm) USE_TRT_CONVERTER(trans_layernorm) USE_TRT_CONVERTER(skip_merge_layernorm) -USE_TRT_CONVERTER(generic_plugin_creater) -USE_TRT_CONVERTER(custom_plugin_creater) -USE_TRT_CONVERTER(custom_generic_plugin_creater) +USE_TRT_CONVERTER(generic_plugin_creator) +USE_TRT_CONVERTER(custom_plugin_creater) // typos: disable-line +USE_TRT_CONVERTER(custom_generic_plugin_creator) USE_TRT_CONVERTER(fuse_eleadd_transpose) USE_TRT_CONVERTER(tanh_shrink) USE_TRT_CONVERTER(logsigmoid) diff --git a/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc index cf0fe2884c4978..afb22dc3b5dace 100644 --- a/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc @@ -218,7 +218,7 @@ class FlashMultiheadMatMulOpConverter : public OpConverter { 
("shuffle_before_multihead_matmul(Output: " + output_name + ")") .c_str()); auto creator = GetPluginRegistry()->getPluginCreator("fMHA_V2", "1"); - assert("fmha_v2 plugin creater must not be null" && creator != nullptr); + assert("fmha_v2 plugin creator must not be null" && creator != nullptr); std::vector fields{}; std::unique_ptr plugin_collection( new nvinfer1::PluginFieldCollection); diff --git a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc index 8b1c825c991016..05b61a8b46254b 100644 --- a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc +++ b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc @@ -160,7 +160,7 @@ class CustomPluginCreater : public OpConverter { } }; -class GenericPluginCreater : public OpConverter { +class GenericPluginCreator : public OpConverter { public: void operator()(const framework::proto::OpDesc &op, const framework::Scope &scope, @@ -245,7 +245,7 @@ class GenericPluginCreater : public OpConverter { } }; -class CustomGenericPluginCreater : public OpConverter { +class CustomGenericPluginCreator : public OpConverter { public: void operator()(const framework::proto::OpDesc &op, const framework::Scope &scope, @@ -334,7 +334,8 @@ class CustomGenericPluginCreater : public OpConverter { } // namespace paddle::inference::tensorrt -REGISTER_TRT_OP_CONVERTER(custom_plugin_creater, CustomPluginCreater); -REGISTER_TRT_OP_CONVERTER(generic_plugin_creater, GenericPluginCreater); -REGISTER_TRT_OP_CONVERTER(custom_generic_plugin_creater, - CustomGenericPluginCreater); +REGISTER_TRT_OP_CONVERTER(custom_plugin_creater, + CustomPluginCreater); // typos: disable-line +REGISTER_TRT_OP_CONVERTER(generic_plugin_creator, GenericPluginCreator); +REGISTER_TRT_OP_CONVERTER(custom_generic_plugin_creator, + CustomGenericPluginCreator); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index c3b0b5e15f40f9..bae972efce7775 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -145,23 +145,24 @@ class OpConverter { } break; - case OpConverterType::GenericPluginCreater: + case OpConverterType::GenericPluginCreator: LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() - << ", now use generic_plugin_creater!"; - it = Registry::Global().Lookup("generic_plugin_creater"); + << ", now use generic_plugin_creator!"; + it = Registry::Global().Lookup("generic_plugin_creator"); break; - case OpConverterType::CustomPluginCreater: + case OpConverterType::CustomPluginCreater: // typos: disable-line LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() - << ", now use custom_plugin_creater!"; - it = Registry::Global().Lookup("custom_plugin_creater"); + << ", now use custom_plugin_creater!"; // typos: disable-line + it = Registry::Global().Lookup( + "custom_plugin_creater"); // typos: disable-line break; - case OpConverterType::CustomGenericPluginCreater: + case OpConverterType::CustomGenericPluginCreator: LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() - << ", now use custom_generic_plugin_creater!"; + << ", now use custom_generic_plugin_creator!"; it = Registry::Global().Lookup( - "custom_generic_plugin_creater"); + "custom_generic_plugin_creator"); break; default: @@ -174,24 +175,24 @@ class OpConverter { common::errors::Unimplemented("no 
OpConverter for optype [%s]", op_desc.Type())); - std::string all_outpus_name = "(Outputs:"; - std::string all_inpus_name = "(Inputs:"; + std::string all_outputs_name = "(Outputs:"; + std::string all_inputs_name = "(Inputs:"; for (auto it1 : op_desc.OutputNames()) { for (auto it2 : op_desc.Output(it1)) { - all_outpus_name += it2; - all_outpus_name += ","; + all_outputs_name += it2; + all_outputs_name += ","; } } - all_outpus_name += ")"; + all_outputs_name += ")"; for (auto it1 : op_desc.InputNames()) { for (auto it2 : op_desc.Input(it1)) { - all_inpus_name += it2; - all_inpus_name += ","; + all_inputs_name += it2; + all_inputs_name += ","; } } - all_inpus_name += ")"; - VLOG(1) << op_desc.Type() << all_inpus_name << all_outpus_name + all_inputs_name += ")"; + VLOG(1) << op_desc.Type() << all_inputs_name << all_outputs_name << "are to be converted to TensorRT layer"; it->SetEngine(engine); @@ -219,8 +220,8 @@ class OpConverter { op_desc.Type())); } - auto* output_itensor = engine->GetITensor(output_name); - engine->SetTensorDynamicRange(output_itensor, out_scale); + auto* output_tensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_tensor, out_scale); VLOG(1) << "Set out scale = " << out_scale << " for tensor " << output_name << "."; } @@ -231,8 +232,8 @@ class OpConverter { float, op_desc.GetAttr("out_" + std::to_string(i) + "_threshold")); std::string output_name = op_desc.Output(op_desc.OutputNames()[i]).front(); - auto* output_itensor = engine->GetITensor(output_name); - engine->SetTensorDynamicRange(output_itensor, out_scale); + auto* output_tensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_tensor, out_scale); VLOG(1) << "Set out scale = " << out_scale << " for tensor " << output_name << "."; } @@ -246,10 +247,10 @@ class OpConverter { for (size_t i = 0; i < inputs_name.size(); i++) { if (op_desc.HasAttr(inputs_name[i])) { std::string input_tensor_name = op_desc.Input(inputs_name[i])[0]; - auto* input_itensor = engine->GetITensor(input_tensor_name); + auto* input_tensor = engine->GetITensor(input_tensor_name); float input_scale = PADDLE_GET_CONST(float, op_desc.GetAttr(inputs_name[i])); - engine->SetTensorDynamicRange(input_itensor, input_scale); + engine->SetTensorDynamicRange(input_tensor, input_scale); VLOG(1) << "Set input tensor scale = " << input_scale << " for tensor: " << input_tensor_name << "."; } @@ -257,10 +258,10 @@ class OpConverter { for (size_t i = 0; i < outputs_name.size(); i++) { if (op_desc.HasAttr(outputs_name[i])) { std::string output_tensor_name = op_desc.Output(outputs_name[i])[0]; - auto* output_itensor = engine->GetITensor(output_tensor_name); + auto* output_tensor = engine->GetITensor(output_tensor_name); float output_scale = PADDLE_GET_CONST(float, op_desc.GetAttr(outputs_name[i])); - engine->SetTensorDynamicRange(output_itensor, output_scale); + engine->SetTensorDynamicRange(output_tensor, output_scale); VLOG(1) << "Set output tensor scale = " << output_scale << " for tensor: " << output_tensor_name << "."; } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 9a21edd52d838a..0356b17d432300 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -3515,7 +3515,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, with_dynamic_shape, forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { - SetOpConverterType(node->Op(), OpConverterType::GenericPluginCreater); + 
SetOpConverterType(node->Op(), OpConverterType::GenericPluginCreator); return true; } auto& custom_plugin_teller = GetCustomPluginTeller(); @@ -3524,7 +3524,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, with_dynamic_shape, forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { - SetOpConverterType(node->Op(), OpConverterType::CustomPluginCreater); + SetOpConverterType( + node->Op(), + OpConverterType::CustomPluginCreater); // typos: disable-line return true; } auto& custom_generic_plugin_teller = GetCustomGenericPluginTeller(); @@ -3533,7 +3535,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, with_dynamic_shape, forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { - SetOpConverterType(node->Op(), OpConverterType::CustomGenericPluginCreater); + SetOpConverterType(node->Op(), OpConverterType::CustomGenericPluginCreator); return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index f955396b9ac119..63e3614e7cc2e2 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -59,9 +59,9 @@ struct Teller { enum class OpConverterType { Default = 0, - GenericPluginCreater, - CustomPluginCreater, - CustomGenericPluginCreater + GenericPluginCreator, + CustomPluginCreater, // typos: disable-line + CustomGenericPluginCreator }; /* * class OpTeller helps to tell whether a fluid diff --git a/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.h index 328b596594006b..24a9d2fe2cd5f9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.h @@ -117,7 +117,7 @@ class ReverseRollPluginDynamic : public DynamicPluginTensorRT { bool with_fp16_; }; -class ReverseRollPluginDynamicCreater : public TensorRTPluginCreator { +class ReverseRollPluginDynamicCreator : public TensorRTPluginCreator { public: const char* getPluginName() const TRT_NOEXCEPT override { return "reverse_roll_dynamic"; @@ -130,7 +130,7 @@ class ReverseRollPluginDynamicCreater : public TensorRTPluginCreator { return new ReverseRollPluginDynamic(serial_data, serial_length); } }; -REGISTER_TRT_PLUGIN_V2(ReverseRollPluginDynamicCreater); +REGISTER_TRT_PLUGIN_V2(ReverseRollPluginDynamicCreator); } // namespace plugin } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc index aed689e8fb44cb..0ec6f3370934d8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc @@ -31,7 +31,7 @@ TEST(fused_token_prune_op_plugin, test_plugin) { plugin.serialize(buf.data()); } -TEST(fused_token_prune_op_plugin, test_plugin_creater) { +TEST(fused_token_prune_op_plugin, test_plugin_creator) { FusedTokenPrunePluginDynamicCreator creator; creator.getFieldNames(); creator.createPlugin("test", nullptr); diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc index 64e55023892c40..1c927ef6949075 100644 --- a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc @@ -50,7 +50,7 @@ TEST(split_op_plugin, test_plugin) { sp_plugin.terminate(); } -TEST(split_op_plugin, 
test_plugin_creater) { +TEST(split_op_plugin, test_plugin_creator) { SplitPluginCreator creator; creator.getFieldNames(); creator.createPlugin("test", nullptr); diff --git a/test/dygraph_to_static/test_cycle_gan.py b/test/dygraph_to_static/test_cycle_gan.py index 6272c4d91d5989..36cb7434db021c 100644 --- a/test/dygraph_to_static/test_cycle_gan.py +++ b/test/dygraph_to_static/test_cycle_gan.py @@ -480,7 +480,7 @@ def pool_image(self, image): return image -def reader_creater(): +def reader_creator(): def reader(): while True: fake_image = np.uint8( @@ -551,8 +551,8 @@ def train(args): A_pool = ImagePool() B_pool = ImagePool() - A_reader = paddle.batch(reader_creater(), args.batch_size)() - B_reader = paddle.batch(reader_creater(), args.batch_size)() + A_reader = paddle.batch(reader_creator(), args.batch_size)() + B_reader = paddle.batch(reader_creator(), args.batch_size)() cycle_gan = paddle.jit.to_static( Cycle_Gan(input_channel=data_shape[1], istrain=True) ) From ca41a7af04458c73403aae5edda976a3d0c50c99 Mon Sep 17 00:00:00 2001 From: Shi Kai Date: Wed, 8 Jan 2025 20:51:02 +0800 Subject: [PATCH 33/57] [Docathon][Add API Legend No.16] Add legend and update doc for atleast_2d -part (#70242) * Add legend and docathon in EN for atleast_2d API * Fix codestyle * Update manipulation.py; test=document_fix --- python/paddle/tensor/manipulation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index bc78d510a91ee3..5f794b2e9fd866 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -5295,6 +5295,19 @@ def atleast_2d(*inputs, name=None): """ Convert inputs to tensors and return views with at least 2 dimensions. Two or higher-dimensional inputs are preserved. + The following diagram illustrates the behavior of atleast_2d on inputs of different dimensions, covering the cases below: + + 1. A 0-dim tensor input. + 2. A 0-dim tensor and a 1-dim tensor input. + 3. A 0-dim tensor and a 3-dim tensor input. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/images/api_legend/atleast_2d.png :width: 600 :alt: legend of atleast_2d API :align: center + + In each case, the function returns the tensor (or a list of tensors) as views with at least 2 dimensions. + Args: inputs (Tensor|list(Tensor)): One or more tensors. The data type is ``float16``, ``float32``, ``float64``, ``int16``, ``int32``, ``int64``, ``int8``, ``uint8``, ``complex64``, ``complex128``, ``bfloat16`` or ``bool``. name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
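A minimal usage sketch of the documented behavior (illustrative only; it assumes nothing beyond the public paddle.atleast_2d API and mirrors the three cases listed above):

    import paddle

    x0 = paddle.to_tensor(1.0)          # 0-dim tensor
    x1 = paddle.to_tensor([1.0, 2.0])   # 1-dim tensor
    x3 = paddle.ones([2, 3, 4])         # 3-dim tensor

    print(paddle.atleast_2d(x0).shape)                   # [1, 1]
    print([t.shape for t in paddle.atleast_2d(x0, x1)])  # [[1, 1], [1, 2]]
    print([t.shape for t in paddle.atleast_2d(x0, x3)])  # [[1, 1], [2, 3, 4]]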
From 0f72fabd513cd54233fcbe8c117f436ff3496c03 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Wed, 8 Jan 2025 21:26:23 +0800 Subject: [PATCH 34/57] del autosimplify 3 (#70695) --- paddle/cinn/optim/vectorize_loops.cc | 10 ++++---- paddle/cinn/runtime/cpu/cblas.cc | 12 +++++----- paddle/cinn/runtime/cpu/onednn_math.cc | 24 +++++++++---------- .../pir/cinn/adt/merge_block_utils_test.cc | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index f4d4f005857c96..e6324d0db2e409 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -170,7 +170,7 @@ class TensorVectorizeTeller : public ir::IRMutator { Expr next_idx = ir::ir_utils::IRCopy(indices.back()); cinn::ir::ir_utils::IrReplaceVarBroadcast( &next_idx, Expr(iter_var_), Expr(i)); - auto gap = cinn::common::AutoSimplify(Expr(next_idx - first_idx)); + auto gap = cinn::optim::ArithSimplify(Expr(next_idx - first_idx)); if (!gap.As() || gap.as_int32() != i) { VLOG(5) << "Tensor:" << tensor->name << " is not accessed sequentially, next:" << next_idx @@ -781,7 +781,7 @@ struct VectorizeLoops_ : public IRMutator { true, ::common::errors::InvalidArgument( "The minimum of forloop should be zero, please check.")); - Expr for_extent = cinn::common::AutoSimplify(forloop->extent); + Expr for_extent = cinn::optim::ArithSimplify(forloop->extent); Simplify(&for_extent); node->extent = for_extent; auto *extent_min = for_extent.As(); @@ -918,7 +918,7 @@ struct VectorizeLoops_ : public IRMutator { inner_for, ::common::errors::InvalidArgument( "Inner_for is nullptr in UnrollCmpFor function.")); - Expr inner_for_extent = cinn::common::AutoSimplify(inner_for->extent); + Expr inner_for_extent = cinn::optim::ArithSimplify(inner_for->extent); Simplify(&inner_for_extent); auto *extent_min = inner_for_extent.As(); if (extent_min) { @@ -951,7 +951,7 @@ struct VectorizeLoops_ : public IRMutator { DeviceAPI::UNK, inner_for->body, inner_for->vectorize_info())}); - Expr new_extent_a = cinn::common::AutoSimplify(le_n->b() + 1); + Expr new_extent_a = cinn::optim::ArithSimplify(le_n->b() + 1); Expr out_for_a = For::Make(outer_for->loop_var, outer_for->min, new_extent_a, @@ -1021,7 +1021,7 @@ struct VectorizeLoops_ : public IRMutator { extent_int % factor == 0 ? 
extent_trunc : extent_trunc + 1; times = cinn::common::make_const(forloop->extent->type(), extent_times); } else { - times = cinn::common::AutoSimplify( + times = cinn::optim::ArithSimplify( Div::Make(forloop->extent, make_const(factor))); Simplify(×); } diff --git a/paddle/cinn/runtime/cpu/cblas.cc b/paddle/cinn/runtime/cpu/cblas.cc index 5b9ed4dbaca76b..adf6bf6fb17db0 100644 --- a/paddle/cinn/runtime/cpu/cblas.cc +++ b/paddle/cinn/runtime/cpu/cblas.cc @@ -151,8 +151,8 @@ CINN_REGISTER_HELPER(cinn_cpu_mkl) { 12UL, ::common::errors::InvalidArgument( "Wrong number of arguments passed in.")); - auto M = cinn::common::AutoSimplify(args[1]); - auto N = cinn::common::AutoSimplify(args[2]); + auto M = cinn::optim::ArithSimplify(args[1]); + auto N = cinn::optim::ArithSimplify(args[2]); std::vector shape; shape.push_back(M); shape.push_back(N); @@ -173,16 +173,16 @@ CINN_REGISTER_HELPER(cinn_cpu_mkl) { A_tensor, ::common::errors::InvalidArgument("expected type is tensor.")); - auto batch_size = cinn::common::AutoSimplify(args[1]); + auto batch_size = cinn::optim::ArithSimplify(args[1]); int32_t batch_size_val = batch_size.as_int32(); - auto M = cinn::common::AutoSimplify(args[2]); - auto N = cinn::common::AutoSimplify(args[3]); + auto M = cinn::optim::ArithSimplify(args[2]); + auto N = cinn::optim::ArithSimplify(args[3]); std::vector shape; int total = 1; for (auto& v : A_tensor->shape) { - auto val = cinn::common::AutoSimplify(v); + auto val = cinn::optim::ArithSimplify(v); PADDLE_ENFORCE_EQ( val.is_constant(), true, diff --git a/paddle/cinn/runtime/cpu/onednn_math.cc b/paddle/cinn/runtime/cpu/onednn_math.cc index 668788c1194e63..e41fc6119ee116 100644 --- a/paddle/cinn/runtime/cpu/onednn_math.cc +++ b/paddle/cinn/runtime/cpu/onednn_math.cc @@ -168,18 +168,18 @@ CINN_REGISTER_HELPER(cinn_cpu_onednn) { 16UL, ::common::errors::InvalidArgument( "Wrong number of arguments passed in.")); - auto N = cinn::common::AutoSimplify(args[0]); - int input_h = cinn::common::AutoSimplify(args[2]).as_int32(); - int input_w = cinn::common::AutoSimplify(args[3]).as_int32(); - auto c_out = cinn::common::AutoSimplify(args[4]); - int filter_h = cinn::common::AutoSimplify(args[6]).as_int32(); - int filter_w = cinn::common::AutoSimplify(args[7]).as_int32(); - int pad_h = cinn::common::AutoSimplify(args[8]).as_int32(); - int pad_w = cinn::common::AutoSimplify(args[9]).as_int32(); - int stride_h = cinn::common::AutoSimplify(args[10]).as_int32(); - int stride_w = cinn::common::AutoSimplify(args[11]).as_int32(); - int dilation_h = cinn::common::AutoSimplify(args[12]).as_int32(); - int dilation_w = cinn::common::AutoSimplify(args[13]).as_int32(); + auto N = cinn::optim::ArithSimplify(args[0]); + int input_h = cinn::optim::ArithSimplify(args[2]).as_int32(); + int input_w = cinn::optim::ArithSimplify(args[3]).as_int32(); + auto c_out = cinn::optim::ArithSimplify(args[4]); + int filter_h = cinn::optim::ArithSimplify(args[6]).as_int32(); + int filter_w = cinn::optim::ArithSimplify(args[7]).as_int32(); + int pad_h = cinn::optim::ArithSimplify(args[8]).as_int32(); + int pad_w = cinn::optim::ArithSimplify(args[9]).as_int32(); + int stride_h = cinn::optim::ArithSimplify(args[10]).as_int32(); + int stride_w = cinn::optim::ArithSimplify(args[11]).as_int32(); + int dilation_h = cinn::optim::ArithSimplify(args[12]).as_int32(); + int dilation_w = cinn::optim::ArithSimplify(args[13]).as_int32(); int out_h = (input_h - ((filter_h - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1; diff --git a/test/cpp/pir/cinn/adt/merge_block_utils_test.cc 
b/test/cpp/pir/cinn/adt/merge_block_utils_test.cc index bb5ba4beefe74c..315ab8941b4965 100644 --- a/test/cpp/pir/cinn/adt/merge_block_utils_test.cc +++ b/test/cpp/pir/cinn/adt/merge_block_utils_test.cc @@ -29,7 +29,7 @@ bool IsBlockForAllEqual(const ForTreeNode& first, const ForTreeNode& second) { const ForTreeNode& second) -> bool { const ir::Expr lhs = first.val->extent(); const ir::Expr rhs = second.val->extent(); - if (cinn::common::AutoSimplify(ir::Sub::Make(lhs, rhs)) != ir::Expr(0)) { + if (lhs != rhs) { return false; } return true; From cfb3a7a7b0dc716178c89270092bc6e6d6a87a01 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 9 Jan 2025 09:16:13 +0800 Subject: [PATCH 35/57] fix bug of convert squeeze to reshape in cinn (#70720) --- .../dialect/operator/transforms/pd_to_cinn_pass.cc | 11 ++++++++--- .../pir_graph_analyzing/shardable_axes_base.cc | 8 ++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 008ef30762ece8..537c6239fd19b2 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -1010,18 +1010,23 @@ class SqueezeOpPattern if (IsDefinedBy(op, 1) && !is_dyshape) { const FullIntArrayOp axis_full_op = CastDefinedTo(op, 1); auto axis_vec = cinn::dialect::ir::GetVectorAttr(axis_full_op, "value"); - std::set axis_set(axis_vec.begin(), axis_vec.end()); - auto in_shape = phi::vectorize(op.operand_source(0) .type() .dyn_cast() .dims()); + const std::set axis_set = [&] { + std::set axis_set; + for (int64_t axis : axis_vec) { + axis_set.insert(axis < 0 ? axis + in_shape.size() : axis); + } + return axis_set; + }(); std::vector output_shape; for (size_t i = 0; i < in_shape.size(); ++i) { - if (!axis_set.count(i)) { + if (!axis_set.count(i) || in_shape[i] != 1) { output_shape.push_back(in_shape[i]); } else { PADDLE_ENFORCE_EQ( diff --git a/paddle/cinn/operator_fusion/pir_graph_analyzing/shardable_axes_base.cc b/paddle/cinn/operator_fusion/pir_graph_analyzing/shardable_axes_base.cc index 03aea53b8ddebd..3c58262e65d6f0 100644 --- a/paddle/cinn/operator_fusion/pir_graph_analyzing/shardable_axes_base.cc +++ b/paddle/cinn/operator_fusion/pir_graph_analyzing/shardable_axes_base.cc @@ -160,7 +160,9 @@ ShardableAxesSignature CreateSignatureForElementWise(pir::Operation* op) { GetCompatibleRank(op->operand_source(i)), ::common::errors::PreconditionNotMet( "Required all inputs rank shall be equal output in " - "elementwise op.")); + "elementwise op : %s [id:%d]", + op->name(), + op->id())); result.inputs.emplace_back(same_axes); } for (int i = 0; i < op->num_results(); ++i) { @@ -168,7 +170,9 @@ ShardableAxesSignature CreateSignatureForElementWise(pir::Operation* op) { GetCompatibleRank(op->result(i)), ::common::errors::PreconditionNotMet( "Required all outputs rank shall be equal each other " - "in elementwise op.")); + "in elementwise op : %s [id:%d]", + op->name(), + op->id())); result.outputs.emplace_back(same_axes); } result.loop = result.outputs.back(); From 84ab826e0f5404845a86b3d077099de7a182f744 Mon Sep 17 00:00:00 2001 From: fangfangssj <99968055+fangfangssj@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:26:36 +0800 Subject: [PATCH 36/57] [CodeStyle][Typos][T-[1-5]] Fix typo(targt, Taget, templat,temporaily,temporily,Temperarily,temporaly,Temperary) (#70722) --- _typos.toml | 8 -------- .../dialect/operator/transforms/pd_to_cinn_pass.cc | 12 
++++++------ paddle/cinn/hlir/framework/pir/fusion_info.cc | 2 +- paddle/cinn/hlir/pe/schedule.cc | 2 +- paddle/fluid/ir_adaptor/translator/CMakeLists.txt | 4 ++-- paddle/phi/kernels/funcs/segmented_array.h | 2 +- python/paddle/nn/functional/loss.py | 8 ++++---- test/dygraph_to_static/test_break_continue.py | 2 +- test/sot/test_step_profiler.py | 2 +- 9 files changed, 17 insertions(+), 25 deletions(-) diff --git a/_typos.toml b/_typos.toml index cfd08daf4c29e2..135388b0344429 100644 --- a/_typos.toml +++ b/_typos.toml @@ -271,14 +271,6 @@ suppport = 'suppport' SWTICH = 'SWTICH' Swith = 'Swith' sysyem = 'sysyem' -targt = 'targt' -Taget = 'Taget' -templat = 'templat' -temporaily = 'temporaily' -temporily = 'temporily' -Temperarily = 'Temperarily' -temporaly = 'temporaly' -Temperary = 'Temperary' tenosr = 'tenosr' iterm = 'iterm' termiante = 'termiante' diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 537c6239fd19b2..1baa9197c19fbd 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -41,21 +41,21 @@ using paddle::dialect::FullOp; namespace { -template +template bool IsDefinedBy(const SourceOpT &op, const size_t idx) { const pir::Operation *defined_op = op->operand_source(idx).defining_op(); - return defined_op && defined_op->isa(); + return defined_op && defined_op->isa(); } -template -TagetOpT CastDefinedTo(const SourceOpT &op, const size_t idx) { - PADDLE_ENFORCE_EQ(IsDefinedBy(op, idx), +template +TargetOpT CastDefinedTo(const SourceOpT &op, const size_t idx) { + PADDLE_ENFORCE_EQ(IsDefinedBy(op, idx), true, ::common::errors::PreconditionNotMet( "Required defined op shall not be nullptr and can cast " "to target type.")); pir::Operation *defined_op = op->operand_source(idx).defining_op(); - return defined_op->dyn_cast(); + return defined_op->dyn_cast(); } template diff --git a/paddle/cinn/hlir/framework/pir/fusion_info.cc b/paddle/cinn/hlir/framework/pir/fusion_info.cc index d445910e0909e7..6d1067cfc52b71 100644 --- a/paddle/cinn/hlir/framework/pir/fusion_info.cc +++ b/paddle/cinn/hlir/framework/pir/fusion_info.cc @@ -236,7 +236,7 @@ std::ostream& operator<<(std::ostream& os, const FusionInfo& fusion_info) { std::vector TopologySort( const OpLoweringGroup& group) { - // NOTE(Aurelius84): Use simplest one-by-one order temporaly. + // NOTE(Aurelius84): Use simplest one-by-one order temporarily. auto* block = group.GetParentBlock(); std::vector ops; ops.reserve(block->size()); diff --git a/paddle/cinn/hlir/pe/schedule.cc b/paddle/cinn/hlir/pe/schedule.cc index dc30364bcda379..fada77826134bf 100644 --- a/paddle/cinn/hlir/pe/schedule.cc +++ b/paddle/cinn/hlir/pe/schedule.cc @@ -211,7 +211,7 @@ int GetArrayPackingFactor(int shape, const cinn::common::Target &target) { int split_base = GetBasicFactor(type, target); int split_factor = 1; - // temporily use shape-1 instead of shape for isl wrong for1 elimination + // temporarily use shape-1 instead of shape for isl wrong for1 elimination int i = split_base * split_base < shape ? 
split_base * split_base : shape; for (; i > 1; i--) { if (shape % i == 0) { diff --git a/paddle/fluid/ir_adaptor/translator/CMakeLists.txt b/paddle/fluid/ir_adaptor/translator/CMakeLists.txt index c8b145c449e37b..7cd1c839845e14 100644 --- a/paddle/fluid/ir_adaptor/translator/CMakeLists.txt +++ b/paddle/fluid/ir_adaptor/translator/CMakeLists.txt @@ -8,7 +8,7 @@ set(sparse_op_yaml ${PADDLE_SOURCE_DIR}/paddle/phi/ops/yaml/sparse_ops.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/ops/yaml/sparse_backward.yaml ) set(op_compat_source_file ${PD_PROGRAM_TRANSLATOR_SOURCE_DIR}/op_compat_info.cc) -set(op_compat_templat_file +set(op_compat_template_file ${PD_PROGRAM_TRANSLATOR_SOURCE_DIR}/op_compat_info.cc.j2) add_custom_command( @@ -17,7 +17,7 @@ add_custom_command( ${PYTHON_EXECUTABLE} ${op_gen_file} --op_compat_yaml_file ${op_compat_yaml_file} --sparse_op_yaml ${sparse_op_yaml} --output_source_file ${op_compat_source_file} - DEPENDS ${op_gen_file} ${op_compat_yaml_file} ${op_compat_templat_file} + DEPENDS ${op_gen_file} ${op_compat_yaml_file} ${op_compat_template_file} VERBATIM) file(GLOB PD_PROGRAM_TRANSLATOR_SRCS "*.cc") diff --git a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h index 24046da52aeeeb..71b7e5f4e52739 100644 --- a/paddle/phi/kernels/funcs/segmented_array.h +++ b/paddle/phi/kernels/funcs/segmented_array.h @@ -171,7 +171,7 @@ struct PointerArraySetter : public ArraySetterBase { // need_alloc : tensor data needs extra buffer or not. // use_cuda_graph: tensor data shall be captured by cuda_graph or not. - // pre_alloc_host_buf: tensor data is temporaily stored by pinned memory or + // pre_alloc_host_buf: tensor data is temporarily stored by pinned memory or // not. PointerArraySetter(const Context& ctx, std::vector* t, diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index cdd734c42a07e0..8dd0e2f0a41ede 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -4510,9 +4510,9 @@ def adaptive_log_softmax_with_loss( Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, 1.14779019) """ - targt_dim = label.dim() + target_dim = label.dim() - if targt_dim == 1: + if target_dim == 1: if input.shape[0] != label.shape[0]: raise ValueError( 'Input and label should have the same size ' @@ -4523,7 +4523,7 @@ def adaptive_log_softmax_with_loss( '1D label tensor expects 2D input tensors, ' f'but found inputs with size {input.shape}' ) - elif targt_dim == 0: + elif target_dim == 0: if input.dim() != 1: raise ValueError( '0D label tensor expects 1D input tensors, ' @@ -4534,7 +4534,7 @@ def adaptive_log_softmax_with_loss( '0D or 1D label tensor expected, ' 'multi-label not supported' ) - is_batched = targt_dim > 0 + is_batched = target_dim > 0 input = input if is_batched else input.unsqueeze(0) label = label if is_batched else label.unsqueeze(0) diff --git a/test/dygraph_to_static/test_break_continue.py b/test/dygraph_to_static/test_break_continue.py index 0d15dac2843711..ad6ade66ac74fe 100644 --- a/test/dygraph_to_static/test_break_continue.py +++ b/test/dygraph_to_static/test_break_continue.py @@ -351,7 +351,7 @@ def init_dygraph_func(self): def test_transformed_static_result(self): self.init_dygraph_func() dygraph_res = self.run_dygraph_mode() - # NOTE(SigureMo): Temperary run the test in sequential run mode to avoid dependency + # NOTE(SigureMo): Temporarily run the test in sequential run mode to avoid dependency # on the execution order of the test cases. 
if use_pir_api(): with exe_sequential_run_guard(True): diff --git a/test/sot/test_step_profiler.py b/test/sot/test_step_profiler.py index 82279b3bc09543..b6c895281959a2 100644 --- a/test/sot/test_step_profiler.py +++ b/test/sot/test_step_profiler.py @@ -43,7 +43,7 @@ def forward(self, x): class TestStepProfilerSmokeTest(unittest.TestCase): - # Temperarily disable this test + # Temporarily disable this test # @sot_step_profiler_guard(True) @strict_mode_guard(False) def test_step_profiler_smoke(self): From a881fcd6a9c74df658354e4d833786a597909a9b Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:38:15 +0800 Subject: [PATCH 37/57] del autosimplify 2 (#70694) --- .../optim/trans_buffer_with_dynamic_shape.cc | 6 ++--- paddle/cinn/optim/transform_gpu_forloop.cc | 12 +++++----- paddle/cinn/optim/transform_polyfor_to_for.cc | 2 +- paddle/cinn/optim/update_buffer_axis_pass.cc | 22 ++----------------- paddle/cinn/optim/var_mod_simplify.cc | 4 ++-- 5 files changed, 14 insertions(+), 32 deletions(-) diff --git a/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc b/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc index 22f92e0290d997..ac030ec4bbfa8c 100644 --- a/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc +++ b/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc @@ -59,8 +59,8 @@ struct Mutator : public ir::IRMutator<>, public ir::stmt::StmtMutator<> { Expr e = expr->as_tensor()->shape[i]; Expr buf_e = buf->shape[i]; if (buf->memory_type == ir::MemoryType::GPULocal) { - e = cinn::common::AutoSimplify(e); - buf_e = cinn::common::AutoSimplify(buf_e); + e = cinn::optim::ArithSimplify(e); + buf_e = cinn::optim::ArithSimplify(buf_e); if (!e.is_constant()) { auto new_shape = ir::ir_utils::IRCopy(e); new_shape = analyzer.UpperBound(new_shape); @@ -86,7 +86,7 @@ struct Mutator : public ir::IRMutator<>, public ir::stmt::StmtMutator<> { auto e = buf->shape.size() > tensor->shape.size() ? 
buf->shape[i] : tensor->shape[i]; if (buf->memory_type == ir::MemoryType::GPULocal) { - e = cinn::common::AutoSimplify(e); + e = cinn::optim::ArithSimplify(e); if (!e.is_constant()) { auto new_shape = ir::ir_utils::IRCopy(e); new_shape = analyzer.UpperBound(new_shape); diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index 020cdc4dade8d5..4012acb2ca10d9 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -317,7 +317,7 @@ class SharedAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -338,7 +338,7 @@ class SharedAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -367,7 +367,7 @@ class LocalAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } } } @@ -388,7 +388,7 @@ class LocalAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -418,7 +418,7 @@ class ReplaceUnitVarToZero : public ir::IRMutator<> { for (auto var_ : loop_var_) { optim::ReplaceVarWithExpr(&indice, ir::Var(var_), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } ir::IRMutator<>::Visit(op, expr); } @@ -434,7 +434,7 @@ class ReplaceUnitVarToZero : public ir::IRMutator<> { for (auto var_ : loop_var_) { optim::ReplaceVarWithExpr(&indice, ir::Var(var_), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } ir::IRMutator<>::Visit(op, expr); diff --git a/paddle/cinn/optim/transform_polyfor_to_for.cc b/paddle/cinn/optim/transform_polyfor_to_for.cc index ab811a792e09a4..99a145d924ff35 100644 --- a/paddle/cinn/optim/transform_polyfor_to_for.cc +++ b/paddle/cinn/optim/transform_polyfor_to_for.cc @@ -136,7 +136,7 @@ struct PolyForWithSimpleConditionToForMutator : public ir::IRMutator { Expr lhs = lt_n ? lt_n->a() : le_n->a(); Expr rhs = lt_n ? lt_n->b() : PlusOneWithMinMax(le_n->b()); - rhs = cinn::common::AutoSimplify(rhs); + rhs = cinn::optim::ArithSimplify(rhs); if (op->is_vectorized()) PADDLE_ENFORCE_EQ( diff --git a/paddle/cinn/optim/update_buffer_axis_pass.cc b/paddle/cinn/optim/update_buffer_axis_pass.cc index b43b7fc8349145..12927b1a971b55 100644 --- a/paddle/cinn/optim/update_buffer_axis_pass.cc +++ b/paddle/cinn/optim/update_buffer_axis_pass.cc @@ -28,24 +28,6 @@ namespace cinn { namespace optim { -bool ExprMathEqual(const Expr& expr1, const Expr& expr2) { - ir::Expr cmp_expr = common::AutoSimplify(ir::Sub::Make(expr1, expr2)); - // This is ugly code since AutoSimplify is not powerful enough. 
Modify it - // after we make auto simplify better - ir::Expr simplified = common::AutoSimplify(cmp_expr); - int count = 0; - while (simplified != cmp_expr) { - cmp_expr = simplified; - simplified = common::AutoSimplify(cmp_expr); - ++count; - // Control dead loop - if (count >= 5) { - break; - } - } - return simplified.is_constant() && simplified.get_constant() == 0; -} - void FormalizeSingleIndex(const ir::Tensor& tensor, std::vector* indices) { if (tensor->shape.size() > 1 && indices->size() == 1) { @@ -56,7 +38,7 @@ void FormalizeSingleIndex(const ir::Tensor& tensor, mul = ir::Mul::Make(tensor->shape[i + 1], mul); ir::Expr div_expr = ir::Div::Make(origin_index_expr, mul); ir::Expr index_expr = ir::Mod::Make(div_expr, tensor->shape[i]); - indices->insert(indices->begin(), common::AutoSimplify(index_expr)); + indices->insert(indices->begin(), optim::ArithSimplify(index_expr)); } } } @@ -150,7 +132,7 @@ class AnalyzeBufferAxis : public ir::IRMutator<> { buffer_name_access_same_index_expr[buffer_name]; for (int i = 0; i < indices.size(); ++i) { if (index_expr.count(i)) { - if (!ExprMathEqual(index_expr[i], GetIndexBindExpr(indices[i]))) { + if (index_expr[i] != GetIndexBindExpr(indices[i])) { index_expr.erase(i); } } diff --git a/paddle/cinn/optim/var_mod_simplify.cc b/paddle/cinn/optim/var_mod_simplify.cc index 7306bc7ff2a506..bab7d7f5877722 100644 --- a/paddle/cinn/optim/var_mod_simplify.cc +++ b/paddle/cinn/optim/var_mod_simplify.cc @@ -86,11 +86,11 @@ struct ReplaceVarWithDivMutator : public ir::IRMutator<> { } // namespace void VarModSimplify(Expr* e) { - *e = cinn::common::AutoSimplify(*e); + *e = cinn::optim::ArithSimplify(*e); ReplaceModWithDivMutator()(e); ReplaceDivWithVarMutator mutator; mutator(e); - *e = cinn::common::AutoSimplify(*e); + *e = cinn::optim::ArithSimplify(*e); auto div_var_map = mutator.div_var_map_; ReplaceVarWithDivMutator()(e, mutator.div_var_map_); } From 62dc1bd4f6ae93e6e7ea2022cb19a7c2ea720598 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 9 Jan 2025 10:38:45 +0800 Subject: [PATCH 38/57] update vlog level (#70692) --- paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc b/paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc index 1844a0e7ed661d..e5f50f9c00f642 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc @@ -666,7 +666,7 @@ bool MakeGenerateShapeOpAttribute( } } if (!has_symbol_binding) { - LOG(WARNING) << "no symbol binding found for dim expr: " << symbol_name; + VLOG(2) << "no symbol binding found for dim expr: " << symbol_name; return false; } } From d59da9fb17156136e1dfe0df31d7d6e436d5ae4d Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:44:35 +0800 Subject: [PATCH 39/57] [CINN] Align initial subgraph order with block.ops (#70719) --- paddle/fluid/pir/transforms/sub_graph_detector.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pir/transforms/sub_graph_detector.cc b/paddle/fluid/pir/transforms/sub_graph_detector.cc index 913617c8e5b30e..57343056ee87aa 100644 --- a/paddle/fluid/pir/transforms/sub_graph_detector.cc +++ b/paddle/fluid/pir/transforms/sub_graph_detector.cc @@ -461,13 +461,14 @@ void SubgraphDetector::MergeSource2Target(const SubGraphPtr& source, SubgraphDetector::SubgraphDetector(pir::Block* block, const 
OpClassifier& classifier) { - // init sort_ops_ in reverse topo order - sort_ops_ = InverselyTopologicalSort(block); - // init op2index_ in topo order + // init sort_ops_ in reverse topo order and op2index_ in topo order int index = 0; for (auto& op : *block) { + sort_ops_.push_back(&op); op2index_[&op] = index++; } + std::reverse(sort_ops_.begin(), sort_ops_.end()); + // construct subgraphs and upstream/downstream relation std::vector subgraph_list; for (const auto& op : sort_ops_) { From f747eefc184b51e85dac650da1bc9ff97253871a Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 10:51:38 +0800 Subject: [PATCH 40/57] [fluid_ops] Replace c_allreduce_sum in python/paddle/nn/clip.py (#70707) --- .../distributed/transpiler/collective.py | 38 +++++++++++-------- python/paddle/nn/clip.py | 14 ++++--- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/python/paddle/distributed/transpiler/collective.py b/python/paddle/distributed/transpiler/collective.py index 2ce07a8db585af..5a408671e34136 100644 --- a/python/paddle/distributed/transpiler/collective.py +++ b/python/paddle/distributed/transpiler/collective.py @@ -352,16 +352,17 @@ def _insert_allreduce_ops(self): ) offset += 1 - # As we search ops reversely, we should insert c_allreduce_sum + # As we search ops reversely, we should insert all_reduce sum # op in the same way to keep the ring_id alternate ring_id = (ring_id + 1) % self.nrings block._insert_op( offset, - type='c_allreduce_sum', - inputs={'X': grad}, - outputs={'Out': grad}, + type='all_reduce', + inputs={'x': grad}, + outputs={'out': grad}, attrs={ 'ring_id': ring_id, + 'reduce_type': paddle.distributed.ReduceOp.SUM, self.op_role_key: OpRole.Backward, }, ) @@ -454,11 +455,12 @@ def _transpile_main_program(self): ring_id = (ring_id + 1) % self.nrings block._insert_op( idx + 3, - type='c_allreduce_sum', - inputs={'X': [param]}, - outputs={'Out': [param]}, + type='all_reduce', + inputs={'x': [param]}, + outputs={'out': [param]}, attrs={ 'ring_id': ring_id, + 'reduce_type': paddle.distributed.ReduceOp.SUM, self.op_role_key: OpRole.Optimize, }, ) @@ -701,10 +703,14 @@ def _insert_fuse_allreduce_ops(self): ring_id = (ring_id + 1) % self.nrings block._insert_op( global_offset, - type='c_allreduce_sum', - inputs={'X': fused_output}, - outputs={'Out': fused_output}, - attrs={'ring_id': ring_id, self.op_role_key: OpRole.Backward}, + type='all_reduce', + inputs={'x': fused_output}, + outputs={'out': fused_output}, + attrs={ + 'ring_id': ring_id, + 'reduce_type': paddle.distributed.ReduceOp.SUM, + self.op_role_key: OpRole.Backward, + }, ) global_offset += 1 @@ -1013,18 +1019,18 @@ def _insert_fuse_allreduce_ops(self): ) break - # insert the allreduce_sum op + # insert the all_reduce sum op for idx, op in enumerate(block.ops): if self._is_optimizer_op(op): for fused_var in fused_vars: block._insert_op( idx, - type='c_allreduce_sum', - inputs={'X': fused_var}, - outputs={'Out': fused_var}, + type='all_reduce', + inputs={'x': fused_var}, + outputs={'out': fused_var}, attrs={ 'ring_id': ring_id, - 'use_calc_stream': False, + 'reduce_type': paddle.distributed.ReduceOp.SUM, self.op_role_key: OpRole.Backward, }, ) diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index 9913063eb946f6..c48d0b358eaac2 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -1018,11 +1018,11 @@ def async_add_n(var_list): ) if self.should_comm_on_shard_dim and self.has_dist_param: - global_norm_dist_var = paddle._C_ops.c_allreduce_sum( - global_norm_dist_var, 
self.sharding_group.id, True, False + global_norm_dist_var = paddle._C_ops.all_reduce( + global_norm_dist_var, self.sharding_group.id, dist.ReduceOp.SUM ) - global_norm_dist_var = paddle._C_ops.c_allreduce_sum( - global_norm_dist_var, self.mp_group.id, True, False + global_norm_dist_var = paddle._C_ops.all_reduce( + global_norm_dist_var, self.mp_group.id, dist.ReduceOp.SUM ) if global_norm_var is None: global_norm_var = global_norm_dist_var @@ -1036,8 +1036,10 @@ def async_add_n(var_list): shape=[1], dtype=sum_dtype, fill_value=0.0 ) if self.should_comm_on_shard_dim and self.has_not_dist_param: - global_norm_not_dist_var = paddle._C_ops.c_allreduce_sum( - global_norm_not_dist_var, self.sharding_group.id, True, False + global_norm_not_dist_var = paddle._C_ops.all_reduce( + global_norm_not_dist_var, + self.sharding_group.id, + dist.ReduceOp.SUM, ) if global_norm_var is None: global_norm_var = global_norm_not_dist_var From aef9f5e1c4be60f9254fbb25904a6f407a9a6387 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 10:53:10 +0800 Subject: [PATCH 41/57] [fluid_ops] collective_global_gather.py remove dynamic_static_unified_comm (#70713) --- test/collective/collective_global_gather.py | 13 ++++--------- test/collective/collective_global_scatter.py | 13 ++++--------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/test/collective/collective_global_gather.py b/test/collective/collective_global_gather.py index 77d5df10c5fdd5..70c1abd6b3e338 100644 --- a/test/collective/collective_global_gather.py +++ b/test/collective/collective_global_gather.py @@ -62,10 +62,8 @@ def run_trainer(self, args): endpoints = args["endpoints"].split(",") rank = args["trainerid"] current_endpoint = args["currentendpoint"] - if args["dynamic_static_unified_comm"]: - paddle.distributed.collective._init_parallel_env(args["backend"]) - else: - paddle.distributed.init_parallel_env() + + paddle.distributed.collective._init_parallel_env(args["backend"]) nranks = 2 if args['backend'] == 'nccl': device_id = int(os.getenv("FLAGS_selected_gpus", "0")) @@ -112,11 +110,8 @@ def run_trainer(self, args): ) if args['static_mode']: - result = ( - self.get_model(train_prog, startup_prog, rank) - if args["dynamic_static_unified_comm"] - else self.get_model(train_prog, startup_prog, rank) - ) + result = self.get_model(train_prog, startup_prog, rank) + fetch_list = [] for elem in result: fetch_list.append(elem.name) diff --git a/test/collective/collective_global_scatter.py b/test/collective/collective_global_scatter.py index 2987c30e34f28d..b63a0e564f09d3 100644 --- a/test/collective/collective_global_scatter.py +++ b/test/collective/collective_global_scatter.py @@ -63,10 +63,8 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["dynamic_static_unified_comm"]: - paddle.distributed.collective._init_parallel_env(args["backend"]) - else: - paddle.distributed.init_parallel_env() + + paddle.distributed.collective._init_parallel_env(args["backend"]) if args['backend'] == 'nccl': device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = base.CUDAPlace( @@ -90,11 +88,8 @@ def run_trainer(self, args): "float32" ) if args['static_mode']: - result = ( - self.get_model(train_prog, startup_prog, rank) - if args["dynamic_static_unified_comm"] - else self.get_model(train_prog, startup_prog, rank) - ) + result = self.get_model(train_prog, startup_prog, rank) + exe = base.Executor(place) exe.run(startup_prog) fetch_list = [] From 
24b0e23a1c891a138773fb51b2f230e85f4ecf6a Mon Sep 17 00:00:00 2001 From: chen2016013 <111894720+chen2016013@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:54:41 +0800 Subject: [PATCH 42/57] open transpose op in auto-recompute (#70711) --- python/paddle/decomposition/recompute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/decomposition/recompute.py b/python/paddle/decomposition/recompute.py index effd0882000092..cca3a5c8a04abe 100644 --- a/python/paddle/decomposition/recompute.py +++ b/python/paddle/decomposition/recompute.py @@ -80,7 +80,7 @@ "pd_op.slice", "pd_op.squeeze", "pd_op.unsqueeze", - # "pd_op.transpose", + "pd_op.transpose", # "pd_op.prod", "pd_op.log", "pd_op.log1p", From 3180ca0123f94b3b15a338553cd10d5289fc8175 Mon Sep 17 00:00:00 2001 From: lizexu123 <39205361+lizexu123@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:59:32 +0800 Subject: [PATCH 43/57] [Paddle TensorRT No.8] pd_op.anchor_generator (#70667) * pd_op.anchor_generator * fix * fix * fix * pd_op.anchor_generator --- .../plugin/anchor_generator_op_plugin.cu | 285 ++++++++++++++++++ .../plugin/anchor_generator_op_plugin.h | 98 ++++++ .../transforms/tensorrt/trt_op_marker_pass.cc | 2 + .../fluid/pybind/manual_static_op_function.h | 17 ++ paddle/fluid/pybind/pybind.cc | 5 + python/paddle/tensorrt/converter.py | 4 + python/paddle/tensorrt/impls/others.py | 60 ++++ test/tensorrt/test_converter_others.py | 54 +++- 8 files changed, 524 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu index f7adaab13d1167..2378e8e11097b7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -667,6 +667,291 @@ nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::deserializePlugin( } #endif +PIRAnchorGeneratorPluginDynamic::PIRAnchorGeneratorPluginDynamic( + const nvinfer1::DataType data_type, + const std::vector& anchor_sizes, + const std::vector& aspect_ratios, + const std::vector& stride, + const std::vector& variances, + const float offset, + const int num_anchors) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + num_anchors_(num_anchors) { + // data_type_ is used to determine the output data type + // data_type_ can only be float32 + // height, width, num_anchors are calculated at configurePlugin + PADDLE_ENFORCE_EQ(data_type_, + nvinfer1::DataType::kFLOAT, + common::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE( + num_anchors_, + 0, + common::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PrepareParamsOnDevice(); +} + +PIRAnchorGeneratorPluginDynamic::~PIRAnchorGeneratorPluginDynamic() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +PIRAnchorGeneratorPluginDynamic::PIRAnchorGeneratorPluginDynamic( + void const* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + 
DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &num_anchors_); + PrepareParamsOnDevice(); +} + +nvinfer1::IPluginV2DynamicExt* PIRAnchorGeneratorPluginDynamic::clone() const + TRT_NOEXCEPT { + auto plugin = new PIRAnchorGeneratorPluginDynamic(data_type_, + anchor_sizes_, + aspect_ratios_, + stride_, + variances_, + offset_, + num_anchors_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs PIRAnchorGeneratorPluginDynamic::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[2]; // feature height + ret.d[1] = inputs[0].d[3]; // feature width + ret.d[2] = exprBuilder.constant(num_anchors_); + ret.d[3] = exprBuilder.constant(4); + return ret; +} + +bool PIRAnchorGeneratorPluginDynamic::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT { + // input can be any, doesn't matter + // anchor generator doesn't read input raw data, only need the shape info + auto type = inOut[pos].type; + auto format = inOut[pos].format; +#if IS_TRT_VERSION_GE(7234) + if (pos == 0) return true; +#else + if (pos == 0) return format == nvinfer1::TensorFormat::kLINEAR; +#endif + return (type == nvinfer1::DataType::kFLOAT && + format == nvinfer1::TensorFormat::kLINEAR); +} + +void PIRAnchorGeneratorPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT {} + +size_t PIRAnchorGeneratorPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT { + return 0; +} + +template +int PIRAnchorGeneratorPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) { + const int height = inputDesc[0].dims.d[2]; + const int width = inputDesc[0].dims.d[3]; + const int box_num = height * width * num_anchors_; + const int block = 512; + const int gen_anchor_grid = (box_num + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + phi::GenAnchors + <<>>(anchors, + aspect_ratios_device, + aspect_ratios_.size(), + anchor_sizes_device, + anchor_sizes_.size(), + stride_device, + stride_.size(), + height, + width, + offset_); + const int var_grid = (box_num * 4 + block - 1) / block; + phi::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num * 4); + return cudaGetLastError() != cudaSuccess; +} + +int PIRAnchorGeneratorPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + assert(outputDesc[0].type == 
nvinfer1::DataType::kFLOAT); + assert(outputDesc[1].type == nvinfer1::DataType::kFLOAT); + return enqueue_impl( + inputDesc, outputDesc, inputs, outputs, workspace, stream); +} + +nvinfer1::DataType PIRAnchorGeneratorPluginDynamic::getOutputDataType( + int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT { + return inputTypes[0]; +} + +const char* PIRAnchorGeneratorPluginDynamic::getPluginType() const + TRT_NOEXCEPT { + return "pir_anchor_generator_plugin_dynamic"; +} + +int PIRAnchorGeneratorPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { + return 2; +} + +int PIRAnchorGeneratorPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } + +void PIRAnchorGeneratorPluginDynamic::terminate() TRT_NOEXCEPT {} + +size_t PIRAnchorGeneratorPluginDynamic::getSerializationSize() const + TRT_NOEXCEPT { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(num_anchors_); + return serialize_size; +} + +void PIRAnchorGeneratorPluginDynamic::serialize(void* buffer) const + TRT_NOEXCEPT { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, num_anchors_); +} + +void PIRAnchorGeneratorPluginDynamic::destroy() TRT_NOEXCEPT {} + +void PIRAnchorGeneratorPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) TRT_NOEXCEPT { + namespace_ = std::string(lib_namespace); +} + +const char* PIRAnchorGeneratorPluginDynamicCreator::getPluginNamespace() const + TRT_NOEXCEPT { + return namespace_.c_str(); +} + +const char* PIRAnchorGeneratorPluginDynamicCreator::getPluginName() const + TRT_NOEXCEPT { + return "pir_anchor_generator_plugin_dynamic"; +} + +const char* PIRAnchorGeneratorPluginDynamicCreator::getPluginVersion() const + TRT_NOEXCEPT { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +PIRAnchorGeneratorPluginDynamicCreator::getFieldNames() TRT_NOEXCEPT { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* PIRAnchorGeneratorPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { + const nvinfer1::PluginField* fields = fc->fields; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int num_anchors = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const nvinfer1::PluginField& f = fc->fields[i]; + const std::string field_name(f.name); + if (field_name.compare("anchor_sizes") == 0) { + const float* data = static_cast(f.data); + anchor_sizes.assign(data, data + f.length); + } else if (field_name.compare("aspect_ratios") == 0) { + const float* data = static_cast(f.data); + aspect_ratios.assign(data, data + f.length); + } else if (field_name.compare("stride") == 0) { + const float* data = static_cast(f.data); + stride.assign(data, data + f.length); + } else if (field_name.compare("variances") == 0) { + const float* data = static_cast(f.data); + variances.assign(data, data + f.length); + } else if (field_name.compare("offset") == 0) { + offset = *static_cast(f.data); + } else if (field_name.compare("num_anchors") == 0) { + num_anchors = *static_cast(f.data); + } else 
{ + assert(false && "unknown plugin field name."); + } + } + return new PIRAnchorGeneratorPluginDynamic(nvinfer1::DataType::kFLOAT, + anchor_sizes, + aspect_ratios, + stride, + variances, + offset, + num_anchors); +} + +nvinfer1::IPluginV2Ext* +PIRAnchorGeneratorPluginDynamicCreator::deserializePlugin( + const char* name, + const void* serial_data, + size_t serial_length) TRT_NOEXCEPT { + auto plugin = new PIRAnchorGeneratorPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h index 72f11c76767ebb..20f145e9095694 100644 --- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h @@ -227,7 +227,105 @@ class AnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { std::string namespace_; nvinfer1::PluginFieldCollection field_collection_; }; + +class PIRAnchorGeneratorPluginDynamic : public DynamicPluginTensorRT { + public: + explicit PIRAnchorGeneratorPluginDynamic( + const nvinfer1::DataType data_type, + const std::vector& anchor_sizes, + const std::vector& aspect_ratios, + const std::vector& stride, + const std::vector& variances, + const float offset, + const int num_anchors); + PIRAnchorGeneratorPluginDynamic(void const* data, size_t length); + ~PIRAnchorGeneratorPluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) // NOLINT + TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const + TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* 
variances_device_; + int num_anchors_; + std::string namespace_; +}; + +class PIRAnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + PIRAnchorGeneratorPluginDynamicCreator() = default; + ~PIRAnchorGeneratorPluginDynamicCreator() override = default; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) + TRT_NOEXCEPT override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginDynamicCreator); +REGISTER_TRT_PLUGIN_V2(PIRAnchorGeneratorPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc index 0ad509a9601882..78eeb58a19133d 100644 --- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc +++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc @@ -94,6 +94,7 @@ DEFINE_GENERAL_PATTERN(Flip, paddle::dialect::FlipOp) DEFINE_GENERAL_PATTERN(Mish, paddle::dialect::MishOp) DEFINE_GENERAL_PATTERN(AssignValue, paddle::dialect::AssignValueOp) DEFINE_GENERAL_PATTERN(AssignValue_, paddle::dialect::AssignValue_Op) +DEFINE_GENERAL_PATTERN(Anchor_Generator, paddle::dialect::AnchorGeneratorOp) DEFINE_GENERAL_PATTERN(Exp, paddle::dialect::ExpOp) DEFINE_GENERAL_PATTERN(Abs, paddle::dialect::AbsOp) DEFINE_GENERAL_PATTERN(Abs_, paddle::dialect::Abs_Op) @@ -2294,6 +2295,7 @@ class TrtOpMarkerPass : public pir::PatternRewritePass { ADD_PATTERN(Mish) ADD_PATTERN(AssignValue) ADD_PATTERN(AssignValue_) + ADD_PATTERN(Anchor_Generator) ADD_PATTERN(Exp) ADD_PATTERN(Abs) ADD_PATTERN(Abs_) diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h index 4363fc6c8630d5..f0cf95ee7f66fb 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -28,6 +28,7 @@ #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/op_callstack_utils.h" #include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/pybind/static_op_function.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/infermeta/spmd_rules/rules.h" @@ -1188,6 +1189,18 @@ static PyObject *fused_gemm_epilogue(PyObject *self, } } +static PyObject *anchor_generator(PyObject *self, + PyObject *args, + PyObject *kwargs) { + if (egr::Controller::Instance().GetCurrentTracer() == nullptr) { + VLOG(6) << "Call static_api_anchor_generator"; + return static_api_anchor_generator(self, args, kwargs); + } else { + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +} + static PyObject *share_var(PyObject *self, PyObject *args, PyObject *kwargs) { try { VLOG(6) << "Add share_var op into program"; @@ -1267,6 +1280,10 @@ static PyMethodDef ManualOpsAPI[] = { (PyCFunction)(void (*)(void))fused_gemm_epilogue, METH_VARARGS | METH_KEYWORDS, "C++ interface function for fused_gemm_epilogue."}, + 
{"anchor_generator", + (PyCFunction)(void (*)(void))anchor_generator, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for anchor_generator."}, {"_run_custom_op", (PyCFunction)(void (*)(void))run_custom_op, METH_VARARGS | METH_KEYWORDS, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 61c0a8e55ecb2f..b59e431a8480d4 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -235,6 +235,7 @@ limitations under the License. */ #include "pybind11/stl.h" #ifdef PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/pir/declare_plugin.h" +#include "paddle/fluid/platform/tensorrt/trt_plugin.h" #endif COMMON_DECLARE_bool(use_mkldnn); @@ -3422,6 +3423,10 @@ All parameter, weight, gradient are variables in Paddle. m.def("clear_shape_info", []() { paddle::framework::CollectShapeManager::Instance().ClearShapeInfo(); }); +#ifdef PADDLE_WITH_TENSORRT + m.def("register_paddle_plugin", + []() { paddle::platform::TrtPluginRegistry::Global()->RegistToTrt(); }); +#endif #if defined(PADDLE_WITH_PSLIB) && !defined(PADDLE_WITH_HETERPS) BindHeterWrapper(&m); diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py index cab46618c4c0ee..3e7b32d400042b 100644 --- a/python/paddle/tensorrt/converter.py +++ b/python/paddle/tensorrt/converter.py @@ -17,6 +17,10 @@ import logging import numpy as np + +import paddle + +paddle.base.core.register_paddle_plugin() import tensorrt as trt import paddle diff --git a/python/paddle/tensorrt/impls/others.py b/python/paddle/tensorrt/impls/others.py index f2f571f6953129..8f9cafbccf758c 100644 --- a/python/paddle/tensorrt/impls/others.py +++ b/python/paddle/tensorrt/impls/others.py @@ -303,6 +303,66 @@ def share_data_converter(network, paddle_op, inputs): return identity_layer.get_output(0) +@converter_registry.register("pd_op.anchor_generator", trt_version="8.x") +def anchor_generator_converter(network, paddle_op, inputs): + inputs = inputs[0] + input_dims = inputs.shape + anchor_sizes = paddle_op.attrs().get("anchor_sizes") + aspect_ratios = paddle_op.attrs().get("aspect_ratios") + stride = paddle_op.attrs().get("stride") + variances = paddle_op.attrs().get("variances") + offset = paddle_op.attrs().get("offset") + num_anchors = len(aspect_ratios) * len(anchor_sizes) + + height = input_dims[1] + width = input_dims[2] + box_num = width * height * num_anchors + data_type = trt.float32 + + plugin_fields = [ + trt.PluginField( + "anchor_sizes", + np.array(anchor_sizes, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "aspect_ratios", + np.array(aspect_ratios, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "stride", + np.array(stride, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "variances", + np.array(variances, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "offset", + np.array(offset, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "num_anchors", + np.array(num_anchors, dtype=np.int32), + trt.PluginFieldType.INT32, + ), + ] + plugin_field_collection = trt.PluginFieldCollection(plugin_fields) + plugin_name = "pir_anchor_generator_plugin_dynamic" + plugin_version = "1" + plugin = get_trt_plugin( + plugin_name, plugin_field_collection, plugin_version + ) + anchor_generator_layer = network.add_plugin_v2([inputs], plugin) + out0 = anchor_generator_layer.get_output(0) + out1 = anchor_generator_layer.get_output(1) + return (out0, out1) + + 
@converter_registry.register("pd_op.affine_channel", trt_version="8.x") def affine_channel_converter(network, paddle_op, inputs): x, scale_weights, bias_weights = inputs diff --git a/test/tensorrt/test_converter_others.py b/test/tensorrt/test_converter_others.py index 0c88733296f262..8b201467137eec 100644 --- a/test/tensorrt/test_converter_others.py +++ b/test/tensorrt/test_converter_others.py @@ -437,7 +437,7 @@ def test_fp16_trt_result(self): self.check_trt_result(precision_mode="fp16") -class TestAffineChannelCas1TRTPattern(TensorRTBaseTest): +class TestAffineChannelCase1TRTPattern(TensorRTBaseTest): def setUp(self): self.python_api = affine_channel self.api_args = { @@ -458,5 +458,57 @@ def test_fp16_trt_result(self): self.check_trt_result(precision_mode="fp16") +def anchor_generator(x, anchor_sizes, aspect_ratios, variances, stride, offset): + return _C_ops.anchor_generator( + x, anchor_sizes, aspect_ratios, variances, stride, offset + ) + + +class TestAnchorGeneratorTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = anchor_generator + self.api_args = { + "x": np.random.random((2, 3, 3, 100)).astype("float32"), + "anchor_sizes": [64.0, 128.0, 256.0], + "aspect_ratios": [0.5, 1, 2], + "variances": [1.0, 1.0, 1.0, 1.0], + "stride": [16.0, 16.0], + "offset": 0.5, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 3, 3, 100]} + self.opt_shape = {"x": [2, 3, 3, 100]} + self.max_shape = {"x": [3, 3, 3, 100]} + + def test_fp32_trt_result(self): + self.check_trt_result() + + def test_fp16_trt_result(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAnchorGeneratorCase1TRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = anchor_generator + self.api_args = { + "x": np.random.random((2, 3, 64, 64)).astype("float32"), + "anchor_sizes": [64.0, 128.0, 256.0], + "aspect_ratios": [0.4, 1.2, 3], + "variances": [0.5, 1.0, 0.5, 1.0], + "stride": [16.0, 32.0], + "offset": 0.8, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 3, 64, 64]} + self.opt_shape = {"x": [2, 3, 64, 64]} + self.max_shape = {"x": [3, 3, 64, 64]} + + def test_fp32_trt_result(self): + self.check_trt_result() + + def test_fp16_trt_result(self): + self.check_trt_result(precision_mode="fp16") + + if __name__ == '__main__': unittest.main() From a0075d2fc1bb5c1be3ee13bf7ad56d7a82056ea4 Mon Sep 17 00:00:00 2001 From: Junjie Zhang <1356732652@qq.com> Date: Thu, 9 Jan 2025 10:59:48 +0800 Subject: [PATCH 44/57] Update CMakeLists.txt (#70683) --- test/legacy_test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index a625ad80b7077d..4da0738db3c47d 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -854,7 +854,7 @@ set_tests_properties(test_imperative_transformer_sorted_gradient PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_optimizer PROPERTIES TIMEOUT 250) set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 250) -set_tests_properties(test_activation_op PROPERTIES TIMEOUT 270) +set_tests_properties(test_activation_op PROPERTIES TIMEOUT 600) set_tests_properties(test_normal PROPERTIES TIMEOUT 120) set_tests_properties(test_cyclic_cifar_dataset PROPERTIES TIMEOUT 120) set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 270) From 6ba4c447fc326a7c905ebf6137588485e072ed81 Mon Sep 17 00:00:00 2001 From: zhangbo9674 
<82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:03:02 +0800 Subject: [PATCH 45/57] [CINN] Delete llvm opt for host code (#70685) * delete code * fix * delete compile host model in cuda * fix --- paddle/cinn/backends/compiler.cc | 1 - paddle/cinn/backends/llvm/execution_engine.cc | 14 +++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc index 0658d0507e4775..644b97757999ce 100644 --- a/paddle/cinn/backends/compiler.cc +++ b/paddle/cinn/backends/compiler.cc @@ -430,7 +430,6 @@ void Compiler::CompileCudaModule(const Module& module, device_fn_name_.emplace_back(kernel_fn_name); } engine_->Link(host_module); - #else CINN_NOT_IMPLEMENTED #endif diff --git a/paddle/cinn/backends/llvm/execution_engine.cc b/paddle/cinn/backends/llvm/execution_engine.cc index ed771ef57ad540..91a32c283c77db 100644 --- a/paddle/cinn/backends/llvm/execution_engine.cc +++ b/paddle/cinn/backends/llvm/execution_engine.cc @@ -171,8 +171,10 @@ std::unique_ptr NaiveObjectCache::getObject( template void ExecutionEngine::Link(const ir::Module &module) { + if (module.functions().size() == 0) { + return; + } utils::RecordEvent("ExecutionEngine Link", utils::EventType::kOrdinary); - auto ir_emitter = std::make_unique(m.get(), b.get()); VLOG(3) << "ir_emitter->Compile(module) Begin"; ir_emitter->Compile(module); @@ -211,6 +213,16 @@ void ExecutionEngine::Link(const ir::Module &module) { } } +template <> +void ExecutionEngine::Link(const ir::Module &module) { + if (module.functions().size() == 0) { + return; + } + utils::RecordEvent("ExecutionEngine Link", utils::EventType::kOrdinary); + auto ir_emitter = std::make_unique(m.get(), b.get()); + ir_emitter->Compile(module); +} + bool ExecutionEngine::AddModule(std::unique_ptr module, std::unique_ptr context) { utils::RecordEvent("ExecutionEngine AddModule", utils::EventType::kOrdinary); From b1c7b888a53dc054da4601732149c62246117fa7 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 11:04:36 +0800 Subject: [PATCH 46/57] [fluid_ops] c_comm_init remove FLAGS_dynamic_static_unified_comm (#70718) --- .../operators/collective/c_comm_init_op.cc | 96 +++++-------------- 1 file changed, 26 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 349a2626bb4f4b..875c7fb41b3416 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -29,17 +29,6 @@ limitations under the License. 
*/ #include "paddle/phi/core/platform/collective_helper.h" #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/phi/core/distributed/nccl_comm_context.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); -#elif defined(PADDLE_WITH_XPU_BKCL) -#include "paddle/phi/core/distributed/bkcl_comm_context.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); -#endif -#if defined(PADDLE_WITH_CUSTOM_DEVICE) -COMMON_DECLARE_bool(dynamic_static_unified_comm); -#endif - #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/store/store_utils.h" @@ -75,46 +64,26 @@ class CCommInitOp : public framework::OperatorBase { device_id = Attr("device_id"); } int rank_id = Attr("rank"); - if (FLAGS_dynamic_static_unified_comm) { - VLOG(3) << "#### use new comm lab ####"; - auto store = phi::distributed::CreateOrGetGlobalTCPStore(); - if (!phi::distributed::CommContextManager::GetInstance().Has( - std::to_string(rid))) { - phi::distributed::CommContextManager::CreateXCCLCommContext( - store, - std::to_string(rid), - phi::CustomPlace(place.GetDeviceType(), device_id), - rank_id, - nranks, - "c_comm_init_op"); - } - return; - } - using UniqueId = phi::ccl::CCLRootId; - using CommContext = platform::XCCLCommContext; + VLOG(3) << "#### use new comm lab ####"; + auto store = phi::distributed::CreateOrGetGlobalTCPStore(); + if (!phi::distributed::CommContextManager::GetInstance().Has( + std::to_string(rid))) { + phi::distributed::CommContextManager::CreateXCCLCommContext( + store, + std::to_string(rid), + phi::CustomPlace(place.GetDeviceType(), device_id), + rank_id, + nranks, + "c_comm_init_op"); + } + return; - VLOG(3) << "#### use old comm lab ####"; - UniqueId* comm_id = var->GetMutable(); - CommContext::Instance(place.GetDeviceType()) - .CreateComm(comm_id, nranks, rank_id, device_id, rid); #else PADDLE_THROW(common::errors::PreconditionNotMet( "PaddlePaddle should compile with custom device.")); #endif } else { -// TODO(wangxi): Put this in the unified header file -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - using UniqueId = ncclUniqueId; - using CommContext = platform::NCCLCommContext; -#elif defined(PADDLE_WITH_XPU_BKCL) - using UniqueId = BKCLUniqueId; - using CommContext = platform::BKCLCommContext; -#else - PADDLE_THROW(common::errors::PreconditionNotMet( - "PaddlePaddle should be compiled with GPU or XPU.")); -#endif - PADDLE_ENFORCE_EQ(place.GetType() == phi::AllocationType::GPU || place.GetType() == phi::AllocationType::XPU, true, @@ -137,33 +106,20 @@ class CCommInitOp : public framework::OperatorBase { } int rank_id = Attr("rank"); #endif -#if defined(PADDLE_WITH_NCCL) - if (FLAGS_dynamic_static_unified_comm) { - VLOG(3) << "#### use new comm lab ####"; - auto store = phi::distributed::CreateOrGetGlobalTCPStore(); - phi::distributed::CommContextManager::SetDeviceId(device_id); - std::string endpoints = Attr("endpoints"); - phi::distributed::CommContextManager::CreateNCCLCommContext( - store, std::to_string(rid), rank_id, nranks, endpoints); - return; - } +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + VLOG(3) << "#### use new comm lab ####"; + auto store = phi::distributed::CreateOrGetGlobalTCPStore(); + phi::distributed::CommContextManager::SetDeviceId(device_id); + std::string endpoints = Attr("endpoints"); + phi::distributed::CommContextManager::CreateNCCLCommContext( + store, std::to_string(rid), rank_id, nranks, 
endpoints); #elif defined(PADDLE_WITH_XPU_BKCL) - if (FLAGS_dynamic_static_unified_comm) { - VLOG(3) << "#### use new comm lab ####"; - auto store = phi::distributed::CreateOrGetGlobalTCPStore(); - phi::distributed::CommContextManager::SetDeviceId(device_id); - std::string endpoints = Attr("endpoints"); - phi::distributed::CommContextManager::CreateBKCLCommContext( - store, std::to_string(rid), rank_id, nranks, endpoints); - return; - } -#endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) - VLOG(3) << "#### use old comm lab ####"; - UniqueId* comm_id = var->GetMutable(); - CommContext::Instance().CreateComm( - comm_id, nranks, rank_id, device_id, rid); + VLOG(3) << "#### use new comm lab ####"; + auto store = phi::distributed::CreateOrGetGlobalTCPStore(); + phi::distributed::CommContextManager::SetDeviceId(device_id); + std::string endpoints = Attr("endpoints"); + phi::distributed::CommContextManager::CreateBKCLCommContext( + store, std::to_string(rid), rank_id, nranks, endpoints); #endif } } From 885318b1a7e9025bb8737c4deefe0f083a47df23 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:29:29 +0800 Subject: [PATCH 47/57] fix bce loss decomp bug (#70724) --- .../decomp_rule/decomp_rule/composite.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h index 357b2434c1f676..3899f19c7b23ae 100644 --- a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h +++ b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h @@ -200,9 +200,19 @@ Tensor reciprocal_decomp(const Tensor& x) { template Tensor bce_loss_decomp(const Tensor& x, const Tensor& label) { - auto one = full_scalar(1, x.dtype(), x.place()); - auto ans = full_scalar(-1, x.dtype(), x.place()) * - (label * log(x) + (one - label) * log(one - x)); + auto org_dtype = x.dtype(); + auto x_mt = ConvertToMT(x); + + auto neg_100 = full_scalar(-100, x_mt.dtype(), x.place()); + auto one = full_scalar(1, x_mt.dtype(), x.place()); + + auto log_x = maximum(log(x_mt), neg_100); + auto log_1_x = maximum(log(one - x_mt), neg_100); + + auto ans = full_scalar(-1, x_mt.dtype(), x.place()) * + (label * log_x + (one - label) * log_1_x); + ans = ConvertToOrig(ans, org_dtype); + return ans; } From 74eb10a376dea51a1fb001c6c491d005b8b13e03 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 14:47:58 +0800 Subject: [PATCH 48/57] [clean old comm] remove FLAGS_dynamic_static_unified_comm in python directory (#70727) --- .../auto_parallel/static/process_group.py | 84 ++++++++----------- .../fleet/base/private_helper_function.py | 42 +--------- .../fleet/meta_optimizers/common.py | 7 -- .../meta_optimizers/sharding_optimizer.py | 6 -- 4 files changed, 35 insertions(+), 104 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/process_group.py b/python/paddle/distributed/auto_parallel/static/process_group.py index 49f893368dccd0..4dba7898f3a160 100644 --- a/python/paddle/distributed/auto_parallel/static/process_group.py +++ b/python/paddle/distributed/auto_parallel/static/process_group.py @@ -160,58 +160,42 @@ def instantiate(self): strategy.nrings = 1 if core.is_compiled_with_cuda(): place = core.CUDAPlace(genv.device_id) - use_new_comm = paddle.get_flags( - "FLAGS_dynamic_static_unified_comm" - )["FLAGS_dynamic_static_unified_comm"] - if use_new_comm: - store = 
core.create_or_get_global_tcp_store() - endpoints_str = "" - for endpoint in strategy.trainer_endpoints: - endpoints_str += endpoint - endpoints_str += f"ring_id:{ring_id}" - endpoints_str_hash = hashlib.md5( - endpoints_str.encode(encoding='UTF-8') - ).hexdigest() - - core.CommContextManager.set_device_id(genv.device_id) - core.CommContextManager.create_nccl_comm_context( - store, - str(ring_id), - strategy.local_rank, - strategy.nranks, - endpoints_str_hash, - ) - else: - core.NCCLParallelContext(strategy, place).init_with_ring_id( - ring_id - ) + store = core.create_or_get_global_tcp_store() + endpoints_str = "" + for endpoint in strategy.trainer_endpoints: + endpoints_str += endpoint + endpoints_str += f"ring_id:{ring_id}" + endpoints_str_hash = hashlib.md5( + endpoints_str.encode(encoding='UTF-8') + ).hexdigest() + + core.CommContextManager.set_device_id(genv.device_id) + core.CommContextManager.create_nccl_comm_context( + store, + str(ring_id), + strategy.local_rank, + strategy.nranks, + endpoints_str_hash, + ) elif core.is_compiled_with_xpu(): place = core.XPUPlace(genv.device_id) - use_new_comm = paddle.get_flags( - "FLAGS_dynamic_static_unified_comm" - )["FLAGS_dynamic_static_unified_comm"] - if use_new_comm: - store = core.create_or_get_global_tcp_store() - endpoints_str = "" - for endpoint in strategy.trainer_endpoints: - endpoints_str += endpoint - endpoints_str += f"ring_id:{ring_id}" - endpoints_str_hash = hashlib.md5( - endpoints_str.encode(encoding='UTF-8') - ).hexdigest() - - core.CommContextManager.set_device_id(genv.device_id) - core.CommContextManager.create_bkcl_comm_context( - store, - str(ring_id), - strategy.local_rank, - strategy.nranks, - endpoints_str_hash, - ) - else: - core.BKCLParallelContext(strategy, place).init_with_ring_id( - ring_id - ) + store = core.create_or_get_global_tcp_store() + endpoints_str = "" + for endpoint in strategy.trainer_endpoints: + endpoints_str += endpoint + endpoints_str += f"ring_id:{ring_id}" + endpoints_str_hash = hashlib.md5( + endpoints_str.encode(encoding='UTF-8') + ).hexdigest() + + core.CommContextManager.set_device_id(genv.device_id) + core.CommContextManager.create_bkcl_comm_context( + store, + str(ring_id), + strategy.local_rank, + strategy.nranks, + endpoints_str_hash, + ) elif genv.device_type in core.get_all_custom_device_type(): place = core.CustomPlace(genv.device_type, genv.device_id) core.XCCLParallelContext(strategy, place).init_with_ring_id( diff --git a/python/paddle/distributed/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py index 0da733c0f24c65..34eb192c106b17 100644 --- a/python/paddle/distributed/fleet/base/private_helper_function.py +++ b/python/paddle/distributed/fleet/base/private_helper_function.py @@ -11,12 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
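The endpoint hashing in process_group.py above gives every rank the same deterministic key when creating a comm context: all ranks hold an identical trainer endpoint list, so hashing it together with the ring id yields one shared identifier for the TCP-store rendezvous. Condensed into a standalone sketch (the function name is illustrative, not part of the patch):

    import hashlib

    def comm_endpoints_hash(trainer_endpoints, ring_id):
        # Concatenate the shared endpoint list plus the ring id, exactly as
        # instantiate() does, and hash it into a fixed-size store key.
        endpoints_str = "".join(trainer_endpoints) + f"ring_id:{ring_id}"
        return hashlib.md5(endpoints_str.encode(encoding="UTF-8")).hexdigest()
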
-import socket -import sys -import time -from contextlib import closing -import paddle __all__ = [] @@ -35,39 +30,4 @@ def wait_server_ready(endpoints): >>> wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) """ - try: - use_new_comm = paddle.get_flags("FLAGS_dynamic_static_unified_comm")[ - "FLAGS_dynamic_static_unified_comm" - ] - except: - use_new_comm = False - - if use_new_comm: - return - assert not isinstance(endpoints, str) - while True: - all_ok = True - not_ready_endpoints = [] - for ep in endpoints: - ip_port = ep.split(":") - with closing( - socket.socket(socket.AF_INET, socket.SOCK_STREAM) - ) as sock: - sock.settimeout(2) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - if hasattr(socket, 'SO_REUSEPORT'): - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) - - result = sock.connect_ex((ip_port[0], int(ip_port[1]))) - if result != 0: - all_ok = False - not_ready_endpoints.append(ep) - if not all_ok: - sys.stderr.write("server not ready, wait 3 sec to retry...\n") - sys.stderr.write( - "not ready endpoints:" + str(not_ready_endpoints) + "\n" - ) - sys.stderr.flush() - time.sleep(3) - else: - break + return diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index 75be5f621d4124..8147a957796e0f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -97,13 +97,6 @@ def _init_communicator( other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) - if rank == 0 and wait_port: - use_new_comm = paddle.get_flags( - "FLAGS_dynamic_static_unified_comm" - )["FLAGS_dynamic_static_unified_comm"] - if not use_new_comm: - wait_server_ready(other_endpoints) - def _add_sync_by_allreduce(block): sync_var = block.create_var( name=unique_name.generate('sync_var'), diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 045befd1f7bd28..07de62d3039f89 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -14,7 +14,6 @@ import os -import paddle from paddle.base import core from paddle.incubate.optimizer import PipelineOptimizer from paddle.static import ( @@ -705,11 +704,6 @@ def minimize_impl( self._recreate_not_persist_param_as_var() self._dump_program_for_debug() - use_new_comm = paddle.get_flags("FLAGS_dynamic_static_unified_comm")[ - "FLAGS_dynamic_static_unified_comm" - ] - if not use_new_comm: - self._wait() return optimize_ops, params_grads def _init_pair_comm(self, pair, ring_id): From dbf9de203ae3588b98edc1337c6563fb7ee5e99f Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 14:48:15 +0800 Subject: [PATCH 49/57] [fluid_ops] c_scatter remove FLAGS_dynamic_static_unified_comm (#70717) --- paddle/phi/kernels/gpu/c_scatter_kernel.cu | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/paddle/phi/kernels/gpu/c_scatter_kernel.cu b/paddle/phi/kernels/gpu/c_scatter_kernel.cu index 4ea62f468e58e9..8598b787d524d7 100644 --- a/paddle/phi/kernels/gpu/c_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/c_scatter_kernel.cu @@ -51,20 +51,8 @@ void CScatterOpCUDAKernel(const Context& dev_ctx, common::errors::InvalidArgument( "The ring_id (%d) for c_scatter_op must be non-negative.", ring_id)); - const auto& comm_context_manager = - 
phi::distributed::CommContextManager::GetInstance(); - - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(ring_id))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(ring_id))); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( From 7040bb18b8a0b9067030f4c7086638b3d819a593 Mon Sep 17 00:00:00 2001 From: bukejiyu <52310069+bukejiyu@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:19:14 +0800 Subject: [PATCH 50/57] [infrence]fix openvino.cmake (#70701) --- cmake/external/openvino.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/openvino.cmake b/cmake/external/openvino.cmake index f08f987cff1b6d..dea1fd4625d0d3 100644 --- a/cmake/external/openvino.cmake +++ b/cmake/external/openvino.cmake @@ -103,7 +103,7 @@ file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/openvino/convert.patch native_convert) set(OPENVINO_PATCH_COMMAND - git checkout -- . && git fetch --depth=1 origin && git + git checkout -- . && git fetch --depth=1 origin ${OPENVINO_COMMIT} && git checkout ${OPENVINO_COMMIT} && patch -Np1 -d ${SOURCE_DIR} < ${native_convert} || true) From 4077efe2b1d0504b7e0bb1b8837b8f8c3f6bb970 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 9 Jan 2025 16:25:01 +0800 Subject: [PATCH 51/57] [Inference]Support some pass use in converter (#70529) * support pass in converter * fix bugs * fix unittest * resolve conflict * resolve unittest * fix unittest * fix unittest * reduce file * perfect comment --- .../fluid/pir/dialect/operator/utils/utils.cc | 8 + .../general/constant_folding_pass.cc | 4 +- paddle/fluid/pybind/pir.cc | 6 + python/paddle/tensorrt/converter.py | 196 +++++++++++------- python/paddle/tensorrt/converter_utils.py | 21 +- python/paddle/tensorrt/export.py | 23 +- python/paddle/tensorrt/impls/common.py | 8 +- python/paddle/tensorrt/impls/conv.py | 3 + python/paddle/tensorrt/impls/creation.py | 12 +- python/paddle/tensorrt/impls/input.py | 5 + python/paddle/tensorrt/impls/manipulation.py | 78 +++---- python/paddle/tensorrt/impls/math.py | 24 +-- python/paddle/tensorrt/impls/others.py | 43 +--- python/paddle/tensorrt/impls/pooling.py | 9 +- python/paddle/tensorrt/impls/search.py | 24 +-- python/paddle/tensorrt/util.py | 25 ++- test/tensorrt/tensorrt_test_base.py | 21 +- test/tensorrt/test_converter_conv.py | 22 ++ test/tensorrt/test_converter_model_bert.py | 1 + 19 files changed, 311 insertions(+), 222 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc index 01e754b6889585..05c3337d4c2c31 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.cc +++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc @@ -66,6 +66,8 @@ enum class AttrType { STRING, + TENSOR_NAME, + NUM_ATTR_TYPES, }; @@ -90,6 +92,8 @@ static inline AttrType GetAttributeType(const pir::Attribute& attr) { return AttrType::DATA_TYPE; } else if (attr.isa()) { return AttrType::PLACE; + } else if (attr.isa()) { + return AttrType::TENSOR_NAME; } else { PADDLE_THROW(common::errors::Unimplemented( "Unsupported ir Attribute type when casting it into " @@ -141,6 +145,10 @@ static std::function GetAttrCast( [](const pir::Attribute& attr) { return 
T{attr.dyn_cast().data()}; }}, + {AttrType::TENSOR_NAME, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast().data()}; + }}, {AttrType::ARRAY, [](const pir::Attribute& attr) { auto attr_vec = attr.dyn_cast().AsVector(); diff --git a/paddle/fluid/pir/transforms/general/constant_folding_pass.cc b/paddle/fluid/pir/transforms/general/constant_folding_pass.cc index 083c3cb9f63317..66669e276ee2ae 100644 --- a/paddle/fluid/pir/transforms/general/constant_folding_pass.cc +++ b/paddle/fluid/pir/transforms/general/constant_folding_pass.cc @@ -47,6 +47,7 @@ #include "paddle/pir/include/core/region.h" #include "paddle/pir/include/core/value.h" #include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_registry.h" #include "paddle/pir/include/pattern_rewrite/frozen_rewrite_pattern_set.h" #include "paddle/pir/include/pattern_rewrite/pattern_match.h" #include "paddle/pir/include/pattern_rewrite/pattern_rewrite_driver.h" @@ -300,7 +301,6 @@ class ConstantFoldingPattern : public pir::RewritePattern { } paddle::framework::InterpreterCore core( place_, {}, kernel_program->block(), scope_, *exe_config_); - core.Run({}); return output_var_names; } @@ -557,3 +557,5 @@ std::unique_ptr CreateConstantFoldingPass() { } } // namespace pir + +REGISTER_IR_PASS(constant_folding_pass, ConstantFoldingPass); diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index d0e2b99ddb991c..335fb3ebc7ff2c 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -2676,6 +2676,12 @@ void BindPassManager(pybind11::module *m) { pass->Set(attr.first, new int(attr.second.cast())); } else if (py::isinstance(attr.second)) { pass->Set(attr.first, new float(attr.second.cast())); + } else if (py::isinstance(attr.second)) { + pass->SetNotOwned(attr.first, + attr.second.cast()); + } else if (py::isinstance(attr.second)) { + pass->Set(attr.first, + new phi::Place(attr.second.cast())); } else { PADDLE_THROW(common::errors::InvalidArgument( "The pass attr is not supported this type.")); diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py index 3e7b32d400042b..2017886fac5218 100644 --- a/python/paddle/tensorrt/converter.py +++ b/python/paddle/tensorrt/converter.py @@ -27,6 +27,7 @@ from paddle import pir from paddle.base.core import clear_shape_info, get_value_shape_range_info from paddle.base.log_helper import get_logger +from paddle.pir.core import _PADDLE_PIR_DTYPE_2_NUMPY_DTYPE from .impls.activation import * # noqa: F403 from .impls.attribute import * # noqa: F403 @@ -75,8 +76,10 @@ def __init__(self, paddle_program, scope, trt_config=None): # save parameters for v in params: name = v.get_defining_op().attrs()["parameter_name"] - weight_array = np.array(self.scope.var(name).get_tensor()) - # weights = trt.Weights(weight_array) + if self.scope.find_var(name) is None: + weight_array = None + else: + weight_array = np.array(self.scope.var(name).get_tensor()) param_dict.update({name: weight_array}) self.param_dict = param_dict @@ -150,6 +153,7 @@ def convert_subgraph_to_trt(self, program, group_op): opt_value_map = {} max_value_map = {} input_names = [] + new_input_values = [] # Because one of the inputs to pd_op.concat is builtin.combine, # during the conversion process using the converter, @@ -172,7 +176,24 @@ def convert_subgraph_to_trt(self, program, group_op): param_name = defining_op.attrs()["parameter_name"] weight = trt.Weights(self.param_dict[param_name]) value_to_trt_tensor[value.id] = weight - input_names.append("") + elif 
defining_op.name() == "builtin.constant": + constant_value_name = defining_op.attrs()["value"] + constant_tensor = self.scope.var( + constant_value_name + ).get_tensor() + out_dtype = np.dtype( + _PADDLE_PIR_DTYPE_2_NUMPY_DTYPE[value.dtype] + ) + if out_dtype == np.dtype("float64"): + out_dtype = np.dtype("float32") + if out_dtype == np.dtype("int64"): + out_dtype = np.dtype("int32") + constant_data = np.array(constant_tensor, dtype=out_dtype) + if len(constant_data) == 0: + value_to_trt_tensor[value.id] = None + else: + constant_tensor = trt.Weights(constant_data) + value_to_trt_tensor[value.id] = constant_tensor else: shape = value.shape dtype = map_dtype(value.dtype.name) @@ -184,6 +205,7 @@ def convert_subgraph_to_trt(self, program, group_op): name=input_name, dtype=dtype, shape=shape ) input_names.append(input_name) + new_input_values.append(value) value_to_trt_tensor[value.id] = input_tensor for op in operations: @@ -196,6 +218,9 @@ def convert_subgraph_to_trt(self, program, group_op): if not source.initialized(): operands.append(None) continue + vec_type = source.type().as_vec_type() + if vec_type is not None and len(vec_type.as_list()) == 0: + continue define_op_name = source.get_defining_op().name() if define_op_name == "builtin.combine": operand_list = [] @@ -242,6 +267,10 @@ def convert_subgraph_to_trt(self, program, group_op): for idx, result in enumerate(op.results()): if result.is_combine(): + # empty vec value condition + if len(result.type().as_vec_type().as_list()) == 0: + results.append(result) + continue used_ops = result.all_used_ops() for use_op in used_ops: if use_op.name() == "builtin.split": @@ -249,6 +278,7 @@ def convert_subgraph_to_trt(self, program, group_op): results.extend(split_outputs) else: results.append(result) + for idx, result in enumerate(results): if idx < len(trt_outs): value_to_trt_tensor[result.id] = trt_outs[idx] @@ -258,83 +288,86 @@ def convert_subgraph_to_trt(self, program, group_op): # Set TRT min/opt/max input shape and the value of shape tensor for i, value in enumerate(origin_input_value): trt_input = value_to_trt_tensor[value.id] - if isinstance(trt_input, trt.Weights): + defining_op_name = value.get_defining_op().name() + if ( + defining_op_name == "builtin.parameter" + or defining_op_name == "builtin.constant" + ): + # constant/parameter condition, needn't get min/opt/max shape continue input_name = trt_input.name - if input_name != "": - _logger.info( - f"set shape of {value}, op is: {value.get_defining_op()}" + _logger.info( + f"set shape of {value}, op is: {value.get_defining_op()}" + ) + min_shape = [] + opt_shape = [] + max_shape = [] + min_value = [] + opt_value = [] + max_value = [] + + value_define_op = value.get_defining_op() + # if the input value is generated by the other trt_engine_op, so the shape is searched by origin value + if ( + value_define_op.name() == "builtin.split" + and value_define_op.operand_source(0).get_defining_op().name() + == "pd_op.tensorrt_engine" + ): + min_shape = self.input_info[value.id]["min_shape"] + opt_shape = self.input_info[value.id]["opt_shape"] + max_shape = self.input_info[value.id]["max_shape"] + if trt_input.is_shape_tensor: + min_value = self.input_info[value.id]["min_value"] + opt_value = self.input_info[value.id]["opt_value"] + max_value = self.input_info[value.id]["max_value"] + else: + min_shape = get_value_shape_range_info( + value, False, paddle.base.core.ShapeMode.kMIN ) - min_shape = [] - opt_shape = [] - max_shape = [] - min_value = [] - opt_value = [] - max_value = [] - - 
value_define_op = value.get_defining_op() - # if the input value is generated by the other trt_engine_op, so the shape is searched by origin value - if ( - value_define_op.name() == "builtin.split" - and value_define_op.operand_source(0) - .get_defining_op() - .name() - == "pd_op.tensorrt_engine" - ): - min_shape = self.input_info[value.id]["min_shape"] - opt_shape = self.input_info[value.id]["opt_shape"] - max_shape = self.input_info[value.id]["max_shape"] - if trt_input.is_shape_tensor: - min_value = self.input_info[value.id]["min_value"] - opt_value = self.input_info[value.id]["opt_value"] - max_value = self.input_info[value.id]["max_value"] - else: - min_shape = get_value_shape_range_info( - value, False, paddle.base.core.ShapeMode.kMIN - ) - opt_shape = get_value_shape_range_info( - value, False, paddle.base.core.ShapeMode.kOPT - ) - max_shape = get_value_shape_range_info( - value, False, paddle.base.core.ShapeMode.kMAX - ) - if trt_input.is_shape_tensor: - min_value = get_value_shape_range_info( - value, True, paddle.base.core.ShapeMode.kMIN - ) - opt_value = get_value_shape_range_info( - value, True, paddle.base.core.ShapeMode.kOPT - ) - max_value = get_value_shape_range_info( - value, True, paddle.base.core.ShapeMode.kMAX - ) - if not trt_input.is_shape_tensor: - _logger.info(f"set min_shape of {value} as {min_shape}") - _logger.info(f"set opt_shape of {value} as {opt_shape}") - _logger.info(f"set max_shape of {value} as {max_shape}") - profile.set_shape( - input_name, min=min_shape, opt=opt_shape, max=max_shape - ) - else: - _logger.info( - f"set min_value of shape input: {value} as {min_value}" - ) - _logger.info( - f"set max_value of shape input: {value} as {opt_value}" + opt_shape = get_value_shape_range_info( + value, False, paddle.base.core.ShapeMode.kOPT + ) + max_shape = get_value_shape_range_info( + value, False, paddle.base.core.ShapeMode.kMAX + ) + + if trt_input.is_shape_tensor: + min_value = get_value_shape_range_info( + value, True, paddle.base.core.ShapeMode.kMIN ) - _logger.info( - f"set opt_value of shape input: {value} as {max_value}" + opt_value = get_value_shape_range_info( + value, True, paddle.base.core.ShapeMode.kOPT ) - profile.set_shape_input( - input_name, min=min_value, opt=opt_value, max=max_value + max_value = get_value_shape_range_info( + value, True, paddle.base.core.ShapeMode.kMAX ) + if not trt_input.is_shape_tensor: + _logger.info(f"set min_shape of {value} as {min_shape}") + _logger.info(f"set opt_shape of {value} as {opt_shape}") + _logger.info(f"set max_shape of {value} as {max_shape}") + profile.set_shape( + input_name, min=min_shape, opt=opt_shape, max=max_shape + ) + else: + _logger.info( + f"set min_value of shape input: {value} as {min_value}" + ) + _logger.info( + f"set max_value of shape input: {value} as {opt_value}" + ) + _logger.info( + f"set opt_value of shape input: {value} as {max_value}" + ) + profile.set_shape_input( + input_name, min=min_value, opt=opt_value, max=max_value + ) - min_shape_map[input_name] = min_shape - opt_shape_map[input_name] = opt_shape - max_shape_map[input_name] = max_shape - min_value_map[input_name] = min_value - opt_value_map[input_name] = opt_value - max_value_map[input_name] = max_value + min_shape_map[input_name] = min_shape + opt_shape_map[input_name] = opt_shape + max_shape_map[input_name] = max_shape + min_value_map[input_name] = min_value + opt_value_map[input_name] = opt_value + max_value_map[input_name] = max_value out_shapes = [] out_names = [] @@ -473,7 +506,7 @@ def 
convert_subgraph_to_trt(self, program, group_op): with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(program): pir.set_insertion_point(group_op) out = paddle._C_ops.tensorrt_engine( - origin_input_value, + new_input_values, trt_params, input_names, out_names, @@ -533,5 +566,20 @@ def convert_program_to_trt(self): orin_out_values[o_i].replace_all_uses_with(new_out[o_i]) self.program.global_block().remove_op(op) + + save_one_parameter = ( + False # We need to keep at least one parameter for save + ) + for op in self.program.global_block().ops: + if op.name() == "builtin.parameter": + if not save_one_parameter: + save_one_parameter = True + continue + if op.results()[0].use_empty(): + self.program.global_block().remove_op(op) + if op.name() == "builtin.constant": + if op.results()[0].use_empty(): + self.program.global_block().remove_op(op) + # Call clear_shape_info to clear the previous shape information clear_shape_info() diff --git a/python/paddle/tensorrt/converter_utils.py b/python/paddle/tensorrt/converter_utils.py index 76ccec354b0c5e..dfb38f13563241 100644 --- a/python/paddle/tensorrt/converter_utils.py +++ b/python/paddle/tensorrt/converter_utils.py @@ -453,6 +453,7 @@ def trt_reduce_to_scalar(network, tensor, dtype=trt.int32): def convert_conv2d(network, paddle_op, inputs): from paddle.tensorrt.util import support_fp32_mix_precision + bias = None if ( paddle_op.name() == "pd_op.conv2d" or paddle_op.name() == "pd_op.depthwise_conv2d" @@ -469,7 +470,8 @@ def convert_conv2d(network, paddle_op, inputs): output_size = None else: raise ValueError("Invalid number of inputs for conv2d_transpose") - + if paddle_op.name() == "pd_op.fused_conv2d_add_act": + input_tensor, filter, bias, _ = inputs input_shape = paddle_op.operands()[0].source().shape filter_shape = paddle_op.operands()[1].source().shape @@ -521,13 +523,14 @@ def convert_conv2d(network, paddle_op, inputs): if ( paddle_op.name() == "pd_op.conv2d" or paddle_op.name() == "pd_op.depthwise_conv2d" + or paddle_op.name() == "pd_op.fused_conv2d_add_act" ): layer = network.add_convolution_nd( input=input_tensor, num_output_maps=n_output, kernel_shape=nv_ksize, kernel=filter, - bias=None, + bias=bias, ) elif ( paddle_op.name() == "pd_op.conv2d_transpose" @@ -564,9 +567,21 @@ def convert_conv2d(network, paddle_op, inputs): return layer.get_output(0) +def get_input_constant_value(paddle_op, inputs, input_index): + input_op = paddle_op.operands()[input_index].source().get_defining_op() + if input_op.name() == "builtin.constant": + return inputs[input_index].numpy().tolist() + elif input_op.name() == "pd_op.full_int_array": + return input_op.attrs()["value"] + elif input_op.name() == "pd_op.full": + return [input_op.attrs()["value"]] + else: + return None + + def add_reduce_layer(network, paddle_op, inputs, op_type): input_tensor = inputs[0] - axis = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axis = get_input_constant_value(paddle_op, inputs, 1) input_shape = paddle_op.operands()[0].source().shape keepdim = paddle_op.attrs()["keepdim"] if network.has_implicit_batch_dimension: diff --git a/python/paddle/tensorrt/export.py b/python/paddle/tensorrt/export.py index 044f58f0041908..126f3086b1d514 100644 --- a/python/paddle/tensorrt/export.py +++ b/python/paddle/tensorrt/export.py @@ -172,6 +172,7 @@ def __init__( precision_mode: PrecisionMode = PrecisionMode.FP32, ops_run_float: str | list | None = None, optimization_level: int | None = 3, + disable_passes: list = [], ) -> None: """ A class for configuring 
TensorRT optimizations. @@ -196,6 +197,8 @@ def __init__( The directory where the optimized model will be saved (default is None). optimization_level (int, optional): Set TensorRT optimization level (default is 3). Only supported in TensorRT versions greater than 8.6. + disable_passes : (str|list, optional): + A list of string representing the names of pass that should not be used for origin program (default is []). Returns: None @@ -226,6 +229,7 @@ def __init__( self.precision_mode = precision_mode self.ops_run_float = ops_run_float self.disable_ops = disable_ops + self.disable_passes = disable_passes self.optimization_level = optimization_level paddle.framework.set_flags( {'FLAGS_trt_min_group_size': min_subgraph_size} @@ -257,18 +261,23 @@ def convert_to_trt(program, trt_config, scope): opt_shape_feed[feed_name[i]] = opt_data max_shape_feed[feed_name[i]] = max_data - # run warmup for collecting shape - program = warmup_shape_infer( + # run pir pass (including trt_op_marker_pass) + program_with_pir = run_pir_pass( program, + partition_mode=False, + disable_passes=trt_config.disable_passes, + scope=scope, + ) + + # run warmup for collecting shape + program = warmup_shape_infer( + program_with_pir, min_shape_feed=min_shape_feed, opt_shape_feed=opt_shape_feed, max_shape_feed=max_shape_feed, scope=scope, ) - # run pir pass (including trt_op_marker_pass) - program_with_pir = run_pir_pass(program, partition_mode=False) - # specify certain operators to be excluded from entering TensorRT if trt_config.disable_ops: forbid_op_lower_trt(program, trt_config.disable_ops) @@ -277,7 +286,9 @@ def convert_to_trt(program, trt_config, scope): mark_builtin_op(program) # run pir pass (including trt_sub_graph_extract_pass) - program_with_pir = run_pir_pass(program, partition_mode=True) + program_with_pir = run_pir_pass( + program, partition_mode=True, scope=scope + ) # Step4: run TRTConverter (would lower group_op into tensorrt_engine_op) converter = PaddleToTensorRTConverter( diff --git a/python/paddle/tensorrt/impls/common.py b/python/paddle/tensorrt/impls/common.py index 77db6ba90ab840..fef1eef69b8328 100644 --- a/python/paddle/tensorrt/impls/common.py +++ b/python/paddle/tensorrt/impls/common.py @@ -17,7 +17,10 @@ import tensorrt as trt from paddle import pir -from paddle.tensorrt.converter_utils import get_shape_tensor_element +from paddle.tensorrt.converter_utils import ( + get_input_constant_value, + get_shape_tensor_element, +) from paddle.tensorrt.register import converter_registry from paddle.tensorrt.util import get_trt_version_list @@ -25,8 +28,7 @@ @converter_registry.register("pd_op.dropout", trt_version="8.x") def dropout_converter(network, paddle_op, inputs): input_x = inputs[0] - p_defining_op = paddle_op.operands()[2].source().get_defining_op() - dropout_prob = p_defining_op.attrs()["value"] + dropout_prob = get_input_constant_value(paddle_op, inputs, 2)[0] downgrade_in_infer = paddle_op.attrs().get("mode") if downgrade_in_infer == "upscale_in_train": diff --git a/python/paddle/tensorrt/impls/conv.py b/python/paddle/tensorrt/impls/conv.py index 55db36b9aa7db1..48b3dee19b58f0 100644 --- a/python/paddle/tensorrt/impls/conv.py +++ b/python/paddle/tensorrt/impls/conv.py @@ -19,6 +19,9 @@ @converter_registry.register("pd_op.depthwise_conv2d", trt_version="8.x") @converter_registry.register("pd_op.conv2d", trt_version="trt_version_ge=8.0") +@converter_registry.register( + "pd_op.fused_conv2d_add_act", trt_version="trt_version_ge=8.0" +) @converter_registry.register("pd_op.conv2d_transpose", 
trt_version="8.x") @converter_registry.register( "pd_op.depthwise_conv2d_transpose", trt_version="8.x" diff --git a/python/paddle/tensorrt/impls/creation.py b/python/paddle/tensorrt/impls/creation.py index 59cdaa4ad025ad..d45f2a15886909 100644 --- a/python/paddle/tensorrt/impls/creation.py +++ b/python/paddle/tensorrt/impls/creation.py @@ -20,6 +20,7 @@ from paddle.tensorrt.converter_utils import ( add_1D_constant_layer, cast_tensor, + get_input_constant_value, resize_to_1d, trt_cast, trt_floor_div, @@ -148,9 +149,8 @@ def full_like_converter(network, paddle_op, inputs): f"cast converter currently doesn't support dtype: {out_dtype}" ) - value_op = paddle_op.operands()[1].source().get_defining_op() - if value_op.name() == "pd_op.full": - fill_value = value_op.attrs()["value"] + fill_value = get_input_constant_value(paddle_op, inputs, 1) + if fill_value is not None: value = network.add_constant( (1,), np.array( @@ -206,9 +206,9 @@ def full_with_tensor_converter(network, paddle_op, inputs): else: shape_tensor_list = [shape_tensor] - shape_op = paddle_op.operands()[1].source().get_defining_op() - if shape_op.name() == "pd_op.full_int_array": - shape_tensor = shape_op.attrs()["value"] + shape_val = get_input_constant_value(paddle_op, inputs, 1) + if shape_val is not None: + shape_tensor = shape_val is_static_shape = True else: shape_tensor = inputs[1] diff --git a/python/paddle/tensorrt/impls/input.py b/python/paddle/tensorrt/impls/input.py index 8098a9d1264612..385958910c8ad4 100644 --- a/python/paddle/tensorrt/impls/input.py +++ b/python/paddle/tensorrt/impls/input.py @@ -47,6 +47,11 @@ def one_hot_converter(network, paddle_op, inputs): values_tensor = add_1D_constant_layer(network, values_data, dtype=np_dtype) + if isinstance(num_classes_tensor, trt.Weights): + num_classes_tensor = network.add_constant( + paddle_op.operands()[1].source().shape, num_classes_tensor + ).get_output(0) + reshape_layer = network.add_shuffle(num_classes_tensor) reshape_layer.reshape_dims = () depth_tensor = reshape_layer.get_output(0) diff --git a/python/paddle/tensorrt/impls/manipulation.py b/python/paddle/tensorrt/impls/manipulation.py index 8f005518d618c7..3f2084ed926bc4 100644 --- a/python/paddle/tensorrt/impls/manipulation.py +++ b/python/paddle/tensorrt/impls/manipulation.py @@ -22,6 +22,7 @@ cast_tensor, fix_negative_indices, get_axes_for_reduce_op, + get_input_constant_value, get_shape_tensor_element, has_dynamic_shape, resize_to_1d, @@ -47,9 +48,8 @@ def reshape_converter(network, paddle_op, inputs): x = inputs[0] is_constant_shape = False - shape_defining_op = paddle_op.operands()[1].source().get_defining_op() - if shape_defining_op.name() == "pd_op.full_int_array": - shape = shape_defining_op.attrs()["value"] + shape = get_input_constant_value(paddle_op, inputs, 1) + if shape is not None: reshape_dim = shape is_constant_shape = True elif isinstance(inputs[1], list): @@ -177,7 +177,7 @@ def concat_converter(network, paddle_op, inputs): axis_tensor = inputs[1] concat_layer = network.add_concatenation(inputs=input_tensors) - axis = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axis = get_input_constant_value(paddle_op, inputs, 1)[0] axis = int(axis) if axis < 0: axis = len(input_tensors[0].shape) + axis @@ -195,7 +195,7 @@ def concat_converter(network, paddle_op, inputs): def unsqueeze_converter(network, paddle_op, inputs): x = inputs[0] input_dims = x.shape - axes = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axes = get_input_constant_value(paddle_op, 
inputs, 1) assert ( len(axes) > 0 ), f"axes size should be > 0 in when convert unsqueeze op in TensorRT, but received len(axes) = {len(axes)}." @@ -250,15 +250,8 @@ def squeeze_converter(network, paddle_op, inputs): input_val = network.add_constant(input_shape, input_val).get_output(0) # Get axis - axis = ( - paddle_op.operands()[1] - .source() - .get_defining_op() - .attrs() - .get("value", []) - ) - - if not axis: + axis = get_input_constant_value(paddle_op, inputs, 1) + if len(axis) == 0: for i in range(input_shape_size): if input_shape[i] == -1: raise RuntimeError( @@ -307,9 +300,8 @@ def expand_converter(network, paddle_op, inputs): rank = len(input_dims) paddle_shape_tensor = paddle_op.operands()[1].source() - shape_tensor_source_op = paddle_shape_tensor.get_defining_op() - if shape_tensor_source_op.name() == "pd_op.full_int_array": - shape = shape_tensor_source_op.attrs()["value"] + shape = get_input_constant_value(paddle_op, inputs, 1) + if shape is not None: shape_tensor = add_1D_constant_layer(network, shape) shape_rank = len(shape) elif paddle_shape_tensor.type().as_vec_type(): @@ -376,8 +368,6 @@ def slice_converter(network, paddle_op, inputs): axes = paddle_op.attrs()["axes"] decrease_axis = paddle_op.attrs().get("decrease_axis") - starts_op = paddle_op.operands()[1].source().get_defining_op() - ends_op = paddle_op.operands()[2].source().get_defining_op() input_shape_tensor = trt_shape(network, input_tensor) input_rank = len(input_tensor.shape) @@ -389,8 +379,8 @@ def slice_converter(network, paddle_op, inputs): get_shape_tensor_element(network, input_shape_tensor, i) ) - if starts_op.name() == "pd_op.full_int_array": - starts = starts_op.attrs()["value"] + starts = get_input_constant_value(paddle_op, inputs, 1) + if starts is not None: assert len(starts) == len( axes ), f"The size of this starts: {len(starts)} must be equal to the axes: {len(axes)}." @@ -422,8 +412,8 @@ def slice_converter(network, paddle_op, inputs): network, starts, idx ) - if ends_op.name() == "pd_op.full_int_array": - ends = ends_op.attrs()["value"] + ends = get_input_constant_value(paddle_op, inputs, 2) + if ends is not None: assert len(ends) == len( axes ), f"The size of this ends: {len(ends)} must be equal to the axes: {len(axes)}." 
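The converters in these hunks all funnel operand handling through the get_input_constant_value helper added to converter_utils.py above, which folds builtin.constant, pd_op.full_int_array, and pd_op.full operands into plain Python lists and returns None for values only known at runtime. The shared dispatch idiom, sketched with a hypothetical helper:

    from paddle.tensorrt.converter_utils import get_input_constant_value

    def read_operand(paddle_op, inputs, index):
        # Constant path: fold the value straight into layer attributes.
        value = get_input_constant_value(paddle_op, inputs, index)
        if value is not None:
            return value, True
        # Dynamic path: keep the operand as a live ITensor in the TRT graph.
        return inputs[index], False
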
@@ -500,9 +490,8 @@ def split_with_num_converter(network, paddle_op, inputs): input_shape_size = len(input_tensor.shape) # Handle the case where axis is of type pir::Value - axis_op = paddle_op.operands()[1].source().get_defining_op() - if axis_op.name() == "pd_op.full": - axis_value = axis_op.attrs()["value"] + axis_value = get_input_constant_value(paddle_op, inputs, 1) + if axis_value is not None: axis_tensor = add_1D_constant_layer(network, axis_value) else: axis_tensor = inputs[1] @@ -576,18 +565,16 @@ def split_converter(network, paddle_op, inputs): input_shape = input_tensor.shape input_shape_size = len(input_shape) - axis_op = paddle_op.operands()[2].source().get_defining_op() - if axis_op.name() == "pd_op.full": - axis_value = axis_op.attrs()["value"] + axis_value = get_input_constant_value(paddle_op, inputs, 2) + if axis_value is not None: axis_tensor = add_1D_constant_layer(network, axis_value) else: axis_tensor = inputs[2] axis_tensor = cast_tensor(network, axis_tensor, trt.int32) # Retrieve and process sections - sections_op = paddle_op.operands()[1].source().get_defining_op() - if sections_op.name() == "pd_op.full_int_array": - sections_value = sections_op.attrs()["value"] + sections_value = get_input_constant_value(paddle_op, inputs, 1) + if sections_value is not None: section_list = [int(s) for s in sections_value] dynamic_sections = False else: @@ -756,9 +743,8 @@ def tile_converter(network, paddle_op, inputs): input_shape_tensor = network.add_shape(input).get_output(0) rank = len(input_shape) - repeat_times_op = paddle_op.operands()[1].source().get_defining_op() - if repeat_times_op.name() == "pd_op.full_int_array": - repeat_times = repeat_times_op.attrs()["value"] + repeat_times = get_input_constant_value(paddle_op, inputs, 1) + if repeat_times is not None: repeat_tensor = add_1D_constant_layer(network, repeat_times) repeat_rank = len(repeat_times) else: @@ -809,19 +795,9 @@ def tile_converter(network, paddle_op, inputs): def strided_slice_converter(network, paddle_op, inputs): input_tensor = inputs[0] axes = paddle_op.attrs()["axes"] - - starts_op = paddle_op.operands()[1].source().get_defining_op() - ends_op = paddle_op.operands()[2].source().get_defining_op() - strides_op = paddle_op.operands()[3].source().get_defining_op() - - if starts_op.name() == "pd_op.full_int_array": - starts = starts_op.attrs()["value"] - - if ends_op.name() == "pd_op.full_int_array": - ends = ends_op.attrs()["value"] - - if strides_op.name() == "pd_op.full_int_array": - strides = strides_op.attrs()["value"] + starts = get_input_constant_value(paddle_op, inputs, 1) + ends = get_input_constant_value(paddle_op, inputs, 2) + strides = get_input_constant_value(paddle_op, inputs, 3) input_shape = input_tensor.shape nchw_input_dims = len(input_shape) @@ -886,10 +862,8 @@ def roll_converter(network, paddle_op, inputs): input_tensor = inputs[0] axis = paddle_op.attrs()["axis"] - shifts_op = paddle_op.operands()[1].source().get_defining_op() - if shifts_op.name() == "pd_op.full_int_array": - shifts = shifts_op.attrs()["value"] - else: + shifts = get_input_constant_value(paddle_op, inputs, 1) + if shifts is None: shifts = inputs[1] axis_size = len(axis) diff --git a/python/paddle/tensorrt/impls/math.py b/python/paddle/tensorrt/impls/math.py index 4247a02fbdfe5f..e260b27281ed2b 100644 --- a/python/paddle/tensorrt/impls/math.py +++ b/python/paddle/tensorrt/impls/math.py @@ -25,6 +25,7 @@ fill_constant_layer, get_axes_for_reduce_op, get_axis_length, + get_input_constant_value, get_shape_tensor_element, 
trt_cast, trt_concat, @@ -63,9 +64,8 @@ def scale_converter(network, paddle_op, inputs): reshape_layer_bias = network.add_shuffle(bias_tensor) reshape_layer_bias.set_input(1, bias_shapes_tensor) - scale_op = paddle_op.operands()[1].source().get_defining_op() - if scale_op.name() == "pd_op.full": - scale = scale_op.attrs()["value"] + scale = get_input_constant_value(paddle_op, inputs, 1) + if scale is not None: has_scale_tensor = False if is_int: scale_tensor = add_1D_constant_layer( @@ -125,7 +125,7 @@ def scale_converter(network, paddle_op, inputs): @converter_registry.register("pd_op.max", trt_version="trt_version_ge=8.0") def max_converter(network, paddle_op, inputs): input_tensor = inputs[0] - axis = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axis = get_input_constant_value(paddle_op, inputs, 1) input_shape = input_tensor.shape keepdim = paddle_op.attrs()["keepdim"] if network.has_implicit_batch_dimension: @@ -171,10 +171,10 @@ def multiply_converter(network, paddle_op, inputs): @converter_registry.register("pd_op.clip", trt_version="8.x") def clip_converter(network, paddle_op, inputs): def _get_constant_or_expand_tensor( - op, constant_inputs, input_shape_tensor, rank + value, constant_inputs, input_shape_tensor, rank ): - if op.name() == "pd_op.full": - value = op.attrs()["value"] + + if value is not None: return fill_constant_layer( network, input_shape_tensor, rank, value, input_tensor.dtype ) @@ -194,15 +194,15 @@ def _get_constant_or_expand_tensor( input_shape_tensor = network.add_shape(input_tensor).get_output(0) # handle min operation - min_op = paddle_op.operands()[1].source().get_defining_op() + min_value = get_input_constant_value(paddle_op, inputs, 1) alpha_t = _get_constant_or_expand_tensor( - min_op, inputs[1], input_shape_tensor, rank + min_value, inputs[1], input_shape_tensor, rank ) # handle max operation - max_op = paddle_op.operands()[2].source().get_defining_op() + max_value = get_input_constant_value(paddle_op, inputs, 2) beta_t = _get_constant_or_expand_tensor( - max_op, inputs[2], input_shape_tensor, rank + max_value, inputs[2], input_shape_tensor, rank ) # run the clip operation @@ -294,7 +294,7 @@ def all_converter(network, paddle_op, inputs): def cumsum_converter(network, paddle_op, inputs): input_tensor = inputs[0] dtype = input_tensor.dtype - axis = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axis = get_input_constant_value(paddle_op, inputs, 1)[0] input_shape = input_tensor.shape rank = len(input_shape) diff --git a/python/paddle/tensorrt/impls/others.py b/python/paddle/tensorrt/impls/others.py index 8f9cafbccf758c..3aff438e0417bc 100644 --- a/python/paddle/tensorrt/impls/others.py +++ b/python/paddle/tensorrt/impls/others.py @@ -21,6 +21,7 @@ from paddle.tensorrt.converter_utils import ( add_1D_constant_layer, fill_constant_layer, + get_input_constant_value, get_shape_tensor_element, get_trt_plugin, trt_concat, @@ -164,43 +165,13 @@ def set_value_converter(network, paddle_op, inputs): paddle_op.name() == "pd_op.set_value" or paddle_op.name() == "pd_op.set_value_" ): - starts = ( - paddle_op.operands()[1] - .source() - .get_defining_op() - .attrs()["value"][0] - ) - ends = ( - paddle_op.operands()[2] - .source() - .get_defining_op() - .attrs()["value"][0] - ) - steps = ( - paddle_op.operands()[3] - .source() - .get_defining_op() - .attrs()["value"][0] - ) + starts = get_input_constant_value(paddle_op, inputs, 1)[0] + ends = get_input_constant_value(paddle_op, inputs, 2)[0] + steps = 
get_input_constant_value(paddle_op, inputs, 3)[0] else: - starts = ( - paddle_op.operands()[2] - .source() - .get_defining_op() - .attrs()["value"][0] - ) - ends = ( - paddle_op.operands()[3] - .source() - .get_defining_op() - .attrs()["value"][0] - ) - steps = ( - paddle_op.operands()[4] - .source() - .get_defining_op() - .attrs()["value"][0] - ) + starts = get_input_constant_value(paddle_op, inputs, 2)[0] + ends = get_input_constant_value(paddle_op, inputs, 3)[0] + steps = get_input_constant_value(paddle_op, inputs, 4)[0] axes = paddle_op.attrs()["axes"][0] input_dims = x.shape diff --git a/python/paddle/tensorrt/impls/pooling.py b/python/paddle/tensorrt/impls/pooling.py index a49c8a8e9026d6..372fd0a1af065d 100644 --- a/python/paddle/tensorrt/impls/pooling.py +++ b/python/paddle/tensorrt/impls/pooling.py @@ -16,6 +16,7 @@ import numpy as np import tensorrt as trt +from paddle.tensorrt.converter_utils import get_input_constant_value from paddle.tensorrt.register import converter_registry @@ -36,12 +37,10 @@ def pool2d_converter(network, paddle_op, inputs): padding_algorithm = paddle_op.attrs().get("padding_algorithm", "EXPLICIT") if not paddle_op.attrs().get("kernel_size") and len(inputs) == 2: - full_int_op = paddle_op.operands()[1].source().get_defining_op() - if full_int_op.name() == "pd_op.full_int_array": - kernel_size = full_int_op.attrs().get("value", [1, 1]) - else: + kernel_size = get_input_constant_value(paddle_op, inputs, 1) + if kernel_size is None: raise Exception( - "The defining op of kernel size must be pd_op.full_int_array" + "The defining op of kernel size must be builtin.constant/pd_op.full_int_array" ) else: kernel_size = paddle_op.attrs().get("kernel_size", [1, 1]) diff --git a/python/paddle/tensorrt/impls/search.py b/python/paddle/tensorrt/impls/search.py index de9100297b1681..100514e88ef3df 100644 --- a/python/paddle/tensorrt/impls/search.py +++ b/python/paddle/tensorrt/impls/search.py @@ -16,6 +16,7 @@ import tensorrt as trt from paddle.tensorrt.converter_utils import ( + get_input_constant_value, get_shape_tensor_element, squeeze_trt, trt_cast, @@ -41,13 +42,7 @@ def argmax_converter(network, paddle_op, inputs): x = inputs[0] input_dims = x.shape rank = len(input_dims) - axis = int( - paddle_op.operands()[1] - .source() - .get_defining_op() - .attrs() - .get("value", -1) - ) + axis = int(get_input_constant_value(paddle_op, inputs, 1)[0]) keepdims = paddle_op.attrs()["keepdims"] if axis < 0: @@ -84,13 +79,7 @@ def argmin_converter(network, paddle_op, inputs): x = inputs[0] input_dims = x.shape rank = len(input_dims) - axis = int( - paddle_op.operands()[1] - .source() - .get_defining_op() - .attrs() - .get("value", -1) - ) + axis = int(get_input_constant_value(paddle_op, inputs, 1)[0]) keepdims = paddle_op.attrs()["keepdims"] if axis < 0: @@ -171,11 +160,10 @@ def topk_converter(network, paddle_op, inputs): largest = paddle_op.attrs().get("largest", True) flag = trt.TopKOperation.MAX if largest else trt.TopKOperation.MIN - k_op = paddle_op.operands()[1].source().get_defining_op() - if k_op.name() == "pd_op.full": - k = k_op.attrs()["value"] - else: + k_list = get_input_constant_value(paddle_op, inputs, 1) + if k_list is None: raise NotImplementedError("Dynamic k is not supported in TensorRT.") + k = k_list[0] input_rank = len(input_shape) expand_to_2d = input_rank == 1 diff --git a/python/paddle/tensorrt/util.py b/python/paddle/tensorrt/util.py index fbabef8c6178d5..de286d9bfa9ac0 100644 --- a/python/paddle/tensorrt/util.py +++ b/python/paddle/tensorrt/util.py @@ 
-49,20 +49,40 @@ def map_dtype(pd_dtype): raise TypeError(f"Unsupported dtype: {pd_dtype}") -def run_pir_pass(program, partition_mode=False): +def run_pir_pass(program, partition_mode=False, disable_passes=[], scope=None): pm = pir.PassManager(opt_level=4) pm.enable_print_statistics() paddle.base.libpaddle.pir.infer_symbolic_shape_pass(pm, program) + if scope is None: + scope = paddle.static.global_scope() + place = paddle.CUDAPlace(0) passes = [ {'trt_op_marker_pass': {}}, + { + 'constant_folding_pass': { + "__place__": place, + "__param_scope__": scope, + } + }, + {'conv2d_add_fuse_pass': {}}, + {'trt_op_marker_pass': {}}, # for fusion op ] if partition_mode: passes = [{'trt_sub_graph_extract_pass': {}}] for pass_item in passes: for pass_name, pass_attr in pass_item.items(): + if pass_name in disable_passes: + continue pm.add_pass(pass_name, pass_attr) pm.run(program) + + # delete unused op + for op in program.global_block().ops: + if op.name() == "builtin.constant" or op.name() == "builtin.parameter": + if op.results()[0].use_empty(): + program.global_block().remove_op(op) + return program @@ -198,10 +218,13 @@ def weight_to_tensor(network, paddle_value, trt_tensor, use_op_name): "pd_op.batch_norm_", "pd_op.layer_norm", "pd_op.depthwise_conv2d_transpose", + "pd_op.fused_conv2d_add_act", "pd_op.affine_channel", ] if use_op_name in forbid_cast_op: return trt_tensor + if paddle_value.get_defining_op().name() == "builtin.constant": + return trt_tensor input_shape = paddle_value.shape if type(trt_tensor) == trt.Weights: return network.add_constant(input_shape, trt_tensor).get_output(0) diff --git a/test/tensorrt/tensorrt_test_base.py b/test/tensorrt/tensorrt_test_base.py index a8fc090d00bb00..48a8673b2dd6ef 100755 --- a/test/tensorrt/tensorrt_test_base.py +++ b/test/tensorrt/tensorrt_test_base.py @@ -43,6 +43,9 @@ def __init__(self, methodName='runTest'): self.max_shape = None self.target_marker_op = "" self.dynamic_shape_data = {} + self.disable_passes = [ + "constant_folding_pass", + ] def create_fake_program(self): if self.python_api is None: @@ -257,6 +260,14 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): max_shape_data[feed_name] = np.random.randn( *self.max_shape[feed_name] ).astype(self.api_args[feed_name].dtype) + + # run pir pass(including some constant fold pass, dead code elimination pass, fusion pass and trt_op_marker_pass) + main_program = run_pir_pass( + main_program, + partition_mode=False, + disable_passes=self.disable_passes, + ) + scope = paddle.static.global_scope() main_program = warmup_shape_infer( main_program, @@ -265,15 +276,11 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): max_shape_feed=max_shape_data, scope=scope, ) - for op in main_program.global_block().ops[::-1]: # Remove all invalid fetch op if op.name() == "pd_op.fetch": main_program.global_block().remove_op(op) - # run pir pass(including some fusion pass and trt_op_marker_pass) - main_program = run_pir_pass(main_program, partition_mode=False) - # Adding marker labels to builtin ops facilitates convert processing, but they ultimately do not enter the TensorRT subgraph. 
mark_builtin_op(main_program) @@ -331,7 +338,11 @@ def check_marker(self, expected_result): main_program, startup_program, fetch_list = ( self.create_fake_program() ) - main_program = run_pir_pass(main_program, partition_mode=False) + main_program = run_pir_pass( + main_program, + partition_mode=False, + disable_passes=self.disable_passes, + ) marker_result = False for op in main_program.global_block().ops: if op.name() == self.target_marker_op: diff --git a/test/tensorrt/test_converter_conv.py b/test/tensorrt/test_converter_conv.py index 4c6d5c0d212341..e723cad045a66b 100644 --- a/test/tensorrt/test_converter_conv.py +++ b/test/tensorrt/test_converter_conv.py @@ -41,6 +41,7 @@ def setUp(self): self.min_shape = {"x": [1, 3, 8, 8]} self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} + self.disable_passes = ['constant_folding_pass', 'conv2d_add_fuse_pass'] def test_trt_result_fp16(self): self.check_trt_result(precision_mode="fp16") @@ -61,6 +62,7 @@ def setUp(self): self.min_shape = {"x": [1, 3, 8, 8]} self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} + self.disable_passes = ['constant_folding_pass', 'conv2d_add_fuse_pass'] def test_trt_result(self): self.check_trt_result() @@ -79,6 +81,7 @@ def setUp(self): self.min_shape = {"x": [1, 3, 8, 8]} self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} + self.disable_passes = ['constant_folding_pass', 'conv2d_add_fuse_pass'] def test_trt_result(self): self.check_trt_result() @@ -313,5 +316,24 @@ def test_trt_result(self): self.check_trt_result() +class TestFusedConv2dAddActTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = conv2d_wrapper + self.api_args = { + "x": np.random.random([2, 3, 8, 8]).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 3, 8, 8]} + self.opt_shape = {"x": [2, 3, 8, 8]} + self.max_shape = {"x": [10, 3, 8, 8]} + self.disable_passes = [] + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + def test_trt_result_fp32(self): + self.check_trt_result() + + if __name__ == '__main__': unittest.main() diff --git a/test/tensorrt/test_converter_model_bert.py b/test/tensorrt/test_converter_model_bert.py index d2f163757935bb..1435a396668fd6 100644 --- a/test/tensorrt/test_converter_model_bert.py +++ b/test/tensorrt/test_converter_model_bert.py @@ -46,6 +46,7 @@ def test_paddle_to_tensorrt_conversion_bert(self): # Create a TensorRTConfig with inputs as a required field. 
trt_config = TensorRTConfig(inputs=[input_config]) trt_config.disable_ops = "pd_op.dropout" + trt_config.disable_passes = ['constant_folding_pass'] # Step1.1: get original results(for tests only) output_var = program.global_block().ops[-1].result(0) From 93cae2bcb0bb99c819fb6ce1ae6f991b08049a68 Mon Sep 17 00:00:00 2001 From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com> Date: Thu, 9 Jan 2025 17:42:40 +0800 Subject: [PATCH 52/57] fix pir pass of moe global mesh tensor (#70715) --- .../distributed/auto_parallel/static/pir_pass.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py index 958c0c112109b0..65a2aed0f50a5d 100644 --- a/python/paddle/distributed/auto_parallel/static/pir_pass.py +++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py @@ -440,6 +440,20 @@ def prune_op(block): elif op.name() == "cf.yield": continue elif op.name() == "pd_op.pylayer": + # if the pylayer op is not on the current rank, we should delete it + is_cur_rank = False + for pylayer_block in list(op.blocks())[::-1]: + for sub_block_op in pylayer_block.ops: + if ( + sub_block_op.dist_attr + and cur_rank + in sub_block_op.dist_attr.process_mesh.process_ids + ): + is_cur_rank = True + break + if not is_cur_rank: + op.erase() + continue for pylayer_block in list(op.blocks())[::-1]: prune_op(pylayer_block) # update pylayer op's inputs From 8d120573b105a5fc3ca0a3e7214ab96ebd62ebc5 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 9 Jan 2025 18:17:50 +0800 Subject: [PATCH 53/57] [dygraph] refine dygraph backward error info (#70709) * refine dygraph backward error info --- .../eager/accumulation/accumulation_node.h | 6 + .../generator/eager_gen.py | 3 +- paddle/fluid/eager/backward.cc | 342 ++++++++++-------- paddle/fluid/framework/op_call_stack.cc | 34 ++ paddle/fluid/framework/op_call_stack.h | 5 + paddle/fluid/pybind/eager.cc | 1 + paddle/fluid/pybind/eager_math_op_patch.cc | 43 +++ paddle/fluid/pybind/eager_method.cc | 3 + paddle/fluid/pybind/eager_py_layer.cc | 1 + paddle/fluid/pybind/eager_utils.cc | 14 + 10 files changed, 296 insertions(+), 156 deletions(-) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index e58df3eee65555..114e65048c5371 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -19,6 +19,8 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/utils/test_macros.h" +COMMON_DECLARE_int32(call_stack_level); + namespace egr { class TEST_API GradNodeAccumulation : public GradNodeBase { @@ -30,6 +32,10 @@ class TEST_API GradNodeAccumulation : public GradNodeBase { weak_grad_ = meta->WeakGrad(); } + if (FLAGS_call_stack_level == 3) { + this->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); + } + SetDefaultGradInOutMeta(); } diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index cd86ee75562363..31523fb1ae8d02 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -485,7 +485,7 @@ class {} : public egr::GradNodeBase {{ // Node Construction {} // Set for forward trace - if (FLAGS_check_nan_inf) {{ + if (FLAGS_check_nan_inf || FLAGS_call_stack_level == 3) {{ 
grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); }} // SetAttributes if needed @@ -590,6 +590,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/imperative/amp_utils.h" COMMON_DECLARE_bool(check_nan_inf); +COMMON_DECLARE_int32(call_stack_level); COMMON_DECLARE_string(tensor_operants_mode); COMMON_DECLARE_bool(use_stride_kernel); {} diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 16c6aa07a9543a..ddc1a43ba73f50 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -18,6 +18,7 @@ #include "paddle/phi/core/memory/stats.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" +COMMON_DECLARE_int32(call_stack_level); namespace egr { std::unordered_map getInDegreeMap( @@ -254,177 +255,208 @@ std::vector RunBackward( while (!queue.empty()) { GradNodeBase* node = queue.front(); VLOG(3) << "Preparing GradNode:" << node->name() << " addr:" << node; - - if (queue.size() > 1 && node_in_degree_map[node] != 0) { + try { + if (queue.size() > 1 && node_in_degree_map[node] != 0) { + queue.pop_front(); + continue; + } queue.pop_front(); - continue; - } - queue.pop_front(); - // Run node: This is where Hook happens - auto node_input_buffer_iter = node_input_buffers_dict.find(node); - PADDLE_ENFORCE_NE( - node_input_buffer_iter, - node_input_buffers_dict.end(), - common::errors::Fatal( - "Unable to find next node in the GradTensorHolder \n" - "Trying to run Node without configuring its GradTensorHolder.")); - - std::unique_ptr node_input_buffer = - std::move(node_input_buffer_iter->second); - - // Check input - EnforceGradNodeHasInput(node); - - VLOG(7) << "Run Backward Kernel with GradTensorHolder."; - - // This 'Global_XXXGradNode' record event is different with - // 'Local_XXXGradNode' event. - // * 'Global_XXXGradNode' will not only cover execution time of this - // function, but also include gradient - // accumulation when the output(s) of corresponding forward OP are shared - // by other OP(s), which may have extra overhead of accumulation than - // 'Local_XXXGradNode'. - // * 'Local_XXXGradNode' will only cover execution time of GradNode - // function. - phi::RecordEvent grad_node_record_event( - "Global_" + std::string((*node).name()), - phi::TracerEventType::Operator, - 1); - - // Run Pre Backward Node and get outputs - paddle::small_vector, kSlotSmallVectorSize> - grad_output_tensors = (*node)( - node_input_buffer->Buffers(), create_graph, is_general_grad); - - if (!inputs.empty() && is_general_grad) { - GeneralGrad::Instance().SetResultForEndingNodes(grad_output_tensors, - node); - } - - // retain_grad or not - if (!retain_graph) { - VLOG(3) - << "retain_graph is false, need to clear the TensorWrapper of nodes."; - node->ClearTensorWrappers(); - } + // Run node: This is where Hook happens + auto node_input_buffer_iter = node_input_buffers_dict.find(node); + PADDLE_ENFORCE_NE( + node_input_buffer_iter, + node_input_buffers_dict.end(), + common::errors::Fatal( + "Unable to find next node in the GradTensorHolder \n" + "Trying to run Node without configuring its GradTensorHolder.")); + + std::unique_ptr node_input_buffer = + std::move(node_input_buffer_iter->second); + + // Check input + EnforceGradNodeHasInput(node); + + VLOG(7) << "Run Backward Kernel with GradTensorHolder."; + + // This 'Global_XXXGradNode' record event is different with + // 'Local_XXXGradNode' event. 
+ // * 'Global_XXXGradNode' will not only cover execution time of this + // function, but also include gradient + // accumulation when the output(s) of corresponding forward OP are + // shared by other OP(s), which may have extra overhead of accumulation + // than 'Local_XXXGradNode'. + // * 'Local_XXXGradNode' will only cover execution time of GradNode + // function. + phi::RecordEvent grad_node_record_event( + "Global_" + std::string((*node).name()), + phi::TracerEventType::Operator, + 1); + + // Run Pre Backward Node and get outputs + paddle::small_vector, kSlotSmallVectorSize> + grad_output_tensors = (*node)( + node_input_buffer->Buffers(), create_graph, is_general_grad); + + if (!inputs.empty() && is_general_grad) { + GeneralGrad::Instance().SetResultForEndingNodes(grad_output_tensors, + node); + } - // TODO(jiabin): Should we erase it or find a more efficient way. - node_input_buffers_dict.erase(node_input_buffer_iter); + // retain_grad or not + if (!retain_graph) { + VLOG(3) << "retain_graph is false, need to clear the TensorWrapper of " + "nodes."; + node->ClearTensorWrappers(); + } - // Prepare GradTensorHolder for next node - const paddle::small_vector, kSlotSmallVectorSize>& - metas = node->OutputMeta(); - PADDLE_ENFORCE(metas.size() == grad_output_tensors.size() || metas.empty(), - common::errors::Fatal( - "Number of edges should be either empty ( for leaf node " - ") or the same as number of output grad tensors, but we " - "got edges size is: %d, grad_output size is: %d", - metas.size(), - grad_output_tensors.size())); - - for (size_t i = 0; i < metas.size(); i++) { - for (size_t j = 0; j < metas[i].size(); j++) { - const Edge& edge = metas[i][j].GetEdge(); - if (!edge.IsInitialized()) { - continue; - } - auto edge_rank = edge.GetEdgeRankInfo(); - // Since we make edge has as same rank as bwd outputs, we indexing them - // with the same rank(i, j) - auto next_node_shared = edge.GetMutableGradNode(); - VLOG(3) << "Node: " << node->name() << " addr:" << node - << ", Found pending node: " << next_node_shared->name() - << " addr: " << next_node_shared.get(); - // Next node could be nullptr if it is leaf tensor with no - // AccumulationNode attached - // Or it could also originated from dispensable inputs - if (!next_node_shared || !next_node_shared.get() || - grad_output_tensors[i].empty()) { - continue; - } + // TODO(jiabin): Should we erase it or find a more efficient way. + node_input_buffers_dict.erase(node_input_buffer_iter); - PADDLE_ENFORCE_LT( - j, - grad_output_tensors[i].size(), - common::errors::Fatal( - "Rank of grad_output_tensors should be less than " - "grad_output_tensors[i].size(), which is: %d. This error may " - "indicate autoprune or autograd api error. 
", - grad_output_tensors.size())); - paddle::Tensor& grad_output_tensor = grad_output_tensors[i][j]; - - if ((!grad_output_tensor.defined() || - !grad_output_tensor.has_allocation())) { - VLOG(7) << "We get grad_output_tensor with slot: " << i - << ", rank: " << j - << " as undefined tensor or without allocation."; - } + // Prepare GradTensorHolder for next node + const paddle::small_vector, + kSlotSmallVectorSize>& metas = + node->OutputMeta(); + PADDLE_ENFORCE( + metas.size() == grad_output_tensors.size() || metas.empty(), + common::errors::Fatal( + "Number of edges should be either empty ( for leaf node " + ") or the same as number of output grad tensors, but we " + "got edges size is: %d, grad_output size is: %d", + metas.size(), + grad_output_tensors.size())); + + for (size_t i = 0; i < metas.size(); i++) { + for (size_t j = 0; j < metas[i].size(); j++) { + const Edge& edge = metas[i][j].GetEdge(); + if (!edge.IsInitialized()) { + continue; + } + auto edge_rank = edge.GetEdgeRankInfo(); + // Since we make edge has as same rank as bwd outputs, we indexing + // them with the same rank(i, j) + auto next_node_shared = edge.GetMutableGradNode(); + VLOG(3) << "Node: " << node->name() << " addr:" << node + << ", Found pending node: " << next_node_shared->name() + << " addr: " << next_node_shared.get(); + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached + // Or it could also originated from dispensable inputs + if (!next_node_shared || !next_node_shared.get() || + grad_output_tensors[i].empty()) { + continue; + } - VLOG(7) << "Get Edge and grad_output_tensor with slot: " << i - << ", rank: " << j - << " 's name is: " << grad_output_tensor.name(); - - auto* next_node = next_node_shared.get(); - if (!node_input_buffers_dict.count(next_node)) { - const auto& input_meta = next_node->InputMeta(); - auto grad_tensor_holder = - std::make_unique(input_meta); - VLOG(7) << "Construct GradTensorHolder for grad node: " - << next_node->name(); - node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); - } + PADDLE_ENFORCE_LT( + j, + grad_output_tensors[i].size(), + common::errors::Fatal( + "Rank of grad_output_tensors should be less than " + "grad_output_tensors[i].size(), which is: %d. This error may " + "indicate autoprune or autograd api error. ", + grad_output_tensors.size())); + paddle::Tensor& grad_output_tensor = grad_output_tensors[i][j]; + + if ((!grad_output_tensor.defined() || + !grad_output_tensor.has_allocation())) { + VLOG(7) << "We get grad_output_tensor with slot: " << i + << ", rank: " << j + << " as undefined tensor or without allocation."; + } - VLOG(3) << "Sum or Move grad inputs for edge slot: " << edge_rank.first - << ", rank: " << edge_rank.second; - - node_input_buffers_dict[next_node]->add(edge_rank.first, - edge_rank.second, - grad_output_tensor, - create_graph); - - // Update queue - node_in_degree_map[next_node]--; - VLOG(7) << next_node->name() - << " ref_cnt is: " << node_in_degree_map[next_node]; - - PADDLE_ENFORCE( - node_in_degree_map[next_node] >= 0, - common::errors::Fatal( - "Detected in-degree value smaller than zero. 
For Node: %s" - "Node's in-degree cannot be negative.", - next_node->name())); - - auto add_next_node_func = [&queue](GradNodeBase* next_node) { - if (dynamic_cast(next_node)) { - queue.push_front(next_node); - } else { - queue.push_back(next_node); + VLOG(7) << "Get Edge and grad_output_tensor with slot: " << i + << ", rank: " << j + << " 's name is: " << grad_output_tensor.name(); + + auto* next_node = next_node_shared.get(); + if (!node_input_buffers_dict.count(next_node)) { + const auto& input_meta = next_node->InputMeta(); + auto grad_tensor_holder = + std::make_unique(input_meta); + VLOG(7) << "Construct GradTensorHolder for grad node: " + << next_node->name(); + node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } - }; - if (node_in_degree_map[next_node] == 0) { - if (force_sequential_nodes_set.count(next_node)) { - if (force_sequential_nodes_queue.front() == next_node) { - force_sequential_nodes_queue.pop_front(); - add_next_node_func(next_node); - while (ready_force_sequential_nodes.count( - force_sequential_nodes_queue.front())) { - ready_force_sequential_nodes.erase( - force_sequential_nodes_queue.front()); - add_next_node_func(force_sequential_nodes_queue.front()); + + VLOG(3) << "Sum or Move grad inputs for edge slot: " + << edge_rank.first << ", rank: " << edge_rank.second; + + node_input_buffers_dict[next_node]->add(edge_rank.first, + edge_rank.second, + grad_output_tensor, + create_graph); + + // Update queue + node_in_degree_map[next_node]--; + VLOG(7) << next_node->name() + << " ref_cnt is: " << node_in_degree_map[next_node]; + + PADDLE_ENFORCE( + node_in_degree_map[next_node] >= 0, + common::errors::Fatal( + "Detected in-degree value smaller than zero. For Node: %s" + "Node's in-degree cannot be negative.", + next_node->name())); + + auto add_next_node_func = [&queue](GradNodeBase* next_node) { + if (dynamic_cast(next_node)) { + queue.push_front(next_node); + } else { + queue.push_back(next_node); + } + }; + if (node_in_degree_map[next_node] == 0) { + if (force_sequential_nodes_set.count(next_node)) { + if (force_sequential_nodes_queue.front() == next_node) { force_sequential_nodes_queue.pop_front(); + add_next_node_func(next_node); + while (ready_force_sequential_nodes.count( + force_sequential_nodes_queue.front())) { + ready_force_sequential_nodes.erase( + force_sequential_nodes_queue.front()); + add_next_node_func(force_sequential_nodes_queue.front()); + force_sequential_nodes_queue.pop_front(); + } + } else { + ready_force_sequential_nodes.insert(next_node); + continue; } } else { - ready_force_sequential_nodes.insert(next_node); - continue; + add_next_node_func(next_node); } - } else { - add_next_node_func(next_node); } } } + paddle::memory::LogDeviceMemoryStats(place, std::string((*node).name())); + } catch (::common::enforce::EnforceNotMet& ex) { + if (FLAGS_call_stack_level == 3) { + paddle::framework::InsertCallStackInfoDygraph( + node->name(), {node->GetForwardTrace()}, &ex); + } + + LOG(WARNING) << "While running Node (" << node->name() + << ") raises an EnforceNotMet exception"; + throw ex; + } catch (std::exception& ex) { + LOG(WARNING) << "While running Node (" << node->name() + << ") raises a std::exception: " + << common::demangle(typeid(ex).name()); + if (FLAGS_call_stack_level == 3) { + LOG(WARNING) << "Node (" << node->name() + << ")'s forward call stack is :" << node->GetForwardTrace() + << std::endl; + } + std::rethrow_exception(std::current_exception()); + } catch (...) 
{ + LOG(WARNING) << "While running Node (" << node->name() + << ") raises an unknown exception"; + if (FLAGS_call_stack_level == 3) { + LOG(WARNING) << "Node (" << node->name() + << ")'s forward call stack is :" << node->GetForwardTrace() + << std::endl; + } + std::rethrow_exception(std::current_exception()); } - paddle::memory::LogDeviceMemoryStats(place, std::string((*node).name())); } VLOG(7) << "Run Backward Final hook size: " diff --git a/paddle/fluid/framework/op_call_stack.cc b/paddle/fluid/framework/op_call_stack.cc index f7b60af104747d..8d765d0c1becbf 100644 --- a/paddle/fluid/framework/op_call_stack.cc +++ b/paddle/fluid/framework/op_call_stack.cc @@ -74,6 +74,40 @@ void InsertCallStackInfo(const std::string &type, exception->set_error_str(sout.str()); } +void InsertCallStackInfoDygraph( + const std::string &node_name, + const std::vector &forward_callstack_str, + platform::EnforceNotMet *exception) { + const std::vector *callstack = &forward_callstack_str; + std::ostringstream sout; + // Step 1. Construct python call stack string + if (callstack) { + if (FLAGS_call_stack_level > 1) { + sout << "\n\n Forward Traceback (most recent call last):"; + } else { + sout << "In user code:\n"; + } + for (auto &line : *callstack) { + sout << "\n " << line; + } + } + VLOG(1) << exception->error_str(); + // Step 2. Construct final call stack & append error op name + if (FLAGS_call_stack_level > 1) { + sout << exception->what(); + } else { + // If callstack exists, use err_str_ instead sub_err_str_ + if (callstack) { + sout << "\n\n"; + sout << InsertIndentationIntoEachLine(exception->error_str()); + } else { + sout << exception->simple_error_str(); + } + } + sout << " [GradNode < " << node_name << " > error]"; + exception->set_error_str(sout.str()); +} + void InsertCallStackInfo(const std::string &type, const std::vector &callstack_attr_str, platform::EnforceNotMet *exception) { diff --git a/paddle/fluid/framework/op_call_stack.h b/paddle/fluid/framework/op_call_stack.h index 9f9ecd14ef8be7..3be29cb4585967 100644 --- a/paddle/fluid/framework/op_call_stack.h +++ b/paddle/fluid/framework/op_call_stack.h @@ -31,6 +31,11 @@ void InsertCallStackInfo(const std::string &type, const std::vector &callstack_attr_str, platform::EnforceNotMet *exception); +void InsertCallStackInfoDygraph( + const std::string &type, + const std::vector &callstack_attr_str, + platform::EnforceNotMet *exception); + // only append error op for exception message void AppendErrorOpHint(const std::string &type, platform::EnforceNotMet *exception); diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 74585c3131cc91..dba28787c1acf6 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -827,6 +827,7 @@ Tensor is the basic data structure in PaddlePaddle. There are some ways to creat * **/ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); // set a flag to record use kwargs or not bool flag_kwargs = false; if (kwargs) flag_kwargs = true; diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 2abecf91708ce9..cb1f600bed32b0 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -180,6 +180,7 @@ paddle::Tensor CallScalarFunction(const paddle::Tensor& self_tensor, double other, std::string op_type) { paddle::Tensor ret; + SetPythonStack(); // scale_ad_func need sclar and bias with float type. 
if (op_type == "add" || op_type == "radd") { ret = scale_ad_func(self_tensor, phi::Scalar(1.0), other, true); @@ -223,6 +224,7 @@ void TypePromotionForZeroDimTensor(std::string func, promote_type = self_tensor.dtype(); } } + SetPythonStack(); if (self_tensor.dtype() != promote_type) { eager_gil_scoped_release guard; self_tensor = cast_ad_func(self_tensor, promote_type); @@ -243,6 +245,9 @@ static PyObject* tensor__add__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__add__method"; + + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -338,6 +343,8 @@ static PyObject* tensor__sub__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__sub__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -432,6 +439,8 @@ static PyObject* tensor__rsub__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__rsub__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -525,6 +534,8 @@ static PyObject* tensor__mul__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__mul__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -628,6 +639,8 @@ static PyObject* tensor__div__method(TensorObject* self, VLOG(6) << "Running Eager tensor__div__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -720,6 +733,8 @@ static PyObject* tensor__rdiv__method(TensorObject* self, VLOG(6) << "Running Eager tensor__rdiv__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -806,6 +821,8 @@ static PyObject* tensor__gt__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__gt__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -897,6 +914,8 @@ static PyObject* tensor__ge__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__ge__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -988,6 +1007,8 @@ static PyObject* tensor__mod__method(TensorObject* self, VLOG(6) << "Running Eager tensor__mod__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1079,6 +1100,8 @@ static PyObject* tensor__rmod__method(TensorObject* self, VLOG(6) << "Running Eager tensor__rmod__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1170,6 +1193,8 @@ static PyObject* tensor__matmul__method(TensorObject* self, VLOG(6) << "Running Eager tensor__matmul__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1295,6 +1320,8 @@ static PyObject* tensor__rmatmul__method(TensorObject* self, VLOG(6) << "Running Eager tensor__rmatmul__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1420,6 +1447,8 @@ static PyObject* tensor__lt__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__lt__method"; + SetPythonStack(); + // Set Device ID auto 
place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1511,6 +1540,8 @@ static PyObject* tensor__le__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__le__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1601,6 +1632,8 @@ static PyObject* tensor__floordiv__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__floordiv__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1691,6 +1724,8 @@ static PyObject* tensor__rfloordiv__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__rfloordiv__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1782,6 +1817,8 @@ static PyObject* tensor__pow__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__pow__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1877,6 +1914,8 @@ static PyObject* tensor__rpow__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__rpow__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1970,6 +2009,8 @@ static PyObject* tensor__ne__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__ne__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -2061,6 +2102,8 @@ static PyObject* tensor__eq__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__eq__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 6cffcf68330af7..c121df6b78a31a 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1421,6 +1421,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); PyObject* _index = PyTuple_GET_ITEM(args, 0); VLOG(4) << "Call new indexing strategy _getitem_dygraph"; @@ -1691,6 +1692,7 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); VLOG(4) << "Call new indexing strategy _setitem_dygraph"; PyObject* _index = PyTuple_GET_ITEM(args, 0); @@ -2006,6 +2008,7 @@ static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); int64_t hook_id = 0; if (egr::EagerUtils::IsLeafTensor(self->tensor)) { VLOG(6) << "Register hook for leaf tensor: " << self->tensor.name(); diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index c695c5357e0bdc..269af549b4d132 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -134,6 +134,7 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); VLOG(6) << "Begin run PyLayer apply..."; PyObject* backward_function = PyObject_GetAttrString(cls, "_backward_function"); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index f4f1500189c94c..beb67068106d50 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ 
b/paddle/fluid/pybind/eager_utils.cc @@ -51,6 +51,7 @@ limitations under the License. */ COMMON_DECLARE_bool(check_nan_inf); COMMON_DECLARE_int32(check_nan_inf_level); +COMMON_DECLARE_int32(call_stack_level); using egr::ConvertToDistTensor; @@ -269,6 +270,19 @@ void SetPythonStack() { std::string last = str + egr::Controller::Instance().GetPythonStack(); egr::Controller::Instance().SetPythonStack(last); } + + if (FLAGS_call_stack_level == 3) { + VLOG(4) << "this is SetPythonStack"; + pybind11::gil_scoped_acquire gil; + PyObject* mod = PyImport_ImportModule("traceback"); + PyObject* traceback_list = PyObject_CallMethod(mod, "format_stack", ""); + std::string str = ""; + for (Py_ssize_t i = 0; i < PyList_Size(traceback_list); i++) { + PyObject* line = PyList_GetItem(traceback_list, i); + str += py::str(PyUnicode_AsUTF8(line)); + } + egr::Controller::Instance().SetPythonStack(str); + } } std::shared_ptr CastPyArg2JitFunction(PyObject* obj, From 4e3f03112da476baee72c4a5009cfc8bee0ccb3b Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 18:42:08 +0800 Subject: [PATCH 54/57] [clean old comm] remove dynamic_static_unified_comm in test directory (#70729) --- test/collective/collective_allgather_api.py | 7 +++---- test/collective/test_collective_alltoall_api.py | 3 +-- test/legacy_test/test_collective_api_base.py | 7 +------ test/legacy_test/test_collective_base.py | 11 ++--------- test/xpu/collective_allgather_api.py | 7 +++---- test/xpu/test_collective_api_base.py | 7 +------ test/xpu/test_collective_base_xpu.py | 12 +++--------- 7 files changed, 14 insertions(+), 40 deletions(-) diff --git a/test/collective/collective_allgather_api.py b/test/collective/collective_allgather_api.py index a502e5a6dad50d..e6d8aaa6c0084c 100644 --- a/test/collective/collective_allgather_api.py +++ b/test/collective/collective_allgather_api.py @@ -98,10 +98,9 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["use_comm_context"] or args["dynamic_static_unified_comm"]: - paddle.distributed.collective._init_parallel_env(args["backend"]) - else: - paddle.distributed.init_parallel_env() + + paddle.distributed.collective._init_parallel_env(args["backend"]) + if args['backend'] == 'nccl': device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = base.CUDAPlace( diff --git a/test/collective/test_collective_alltoall_api.py b/test/collective/test_collective_alltoall_api.py index 5c3bb4c056006b..464014ad5cc8c0 100644 --- a/test/collective/test_collective_alltoall_api.py +++ b/test/collective/test_collective_alltoall_api.py @@ -39,7 +39,7 @@ def test_alltoall_nccl_with_new_comm(self): "alltoall", "nccl", dtype=dtype, - need_envs={"FLAGS_dynamic_static_unified_comm": "true"}, + need_envs={}, ) def test_alltoall_nccl_with_new_comm_pir(self): @@ -57,7 +57,6 @@ def test_alltoall_nccl_with_new_comm_pir(self): "nccl", dtype=dtype, need_envs={ - "FLAGS_dynamic_static_unified_comm": "true", "FLAGS_enable_pir_in_executor": "1", }, ) diff --git a/test/legacy_test/test_collective_api_base.py b/test/legacy_test/test_collective_api_base.py index 5f4b1e71540b65..6ebf194da385aa 100644 --- a/test/legacy_test/test_collective_api_base.py +++ b/test/legacy_test/test_collective_api_base.py @@ -125,9 +125,7 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args['static_mode'] and ( - args["use_comm_context"] or args["dynamic_static_unified_comm"] - ): + if args['static_mode']: 
paddle.distributed.collective._init_parallel_env(args["backend"]) else: paddle.distributed.init_parallel_env() @@ -188,9 +186,6 @@ def runtime_main(test_class, col_type): args["dtype"] = os.getenv("DTYPE") args["reduce_type"] = os.getenv("REDUCE_TYPE") args["use_comm_context"] = bool(int(os.getenv("USE_COMM_CONTEXT", "0"))) - args["dynamic_static_unified_comm"] = bool( - os.getenv("FLAGS_dynamic_static_unified_comm", "true").lower() == "true" - ) model.run_trainer(args) diff --git a/test/legacy_test/test_collective_base.py b/test/legacy_test/test_collective_base.py index 9e570ec31ba961..bbdfd402dd5919 100644 --- a/test/legacy_test/test_collective_base.py +++ b/test/legacy_test/test_collective_base.py @@ -111,12 +111,8 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["dynamic_static_unified_comm"]: - _init_parallel_env("nccl") - else: - self.initCommunicator( - startup_prog, rank, nranks, True, current_endpoint, endpoints - ) + + _init_parallel_env("nccl") self.rank = rank result = self.get_model(train_prog, startup_prog) @@ -146,9 +142,6 @@ def runtime_main(test_class, col_type, sub_type): args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT") args["col_type"] = col_type args["dtype"] = os.getenv("DTYPE") - args["dynamic_static_unified_comm"] = bool( - int(os.getenv("FLAGS_dynamic_static_unified_comm", "1")) - ) model.run_trainer(args) diff --git a/test/xpu/collective_allgather_api.py b/test/xpu/collective_allgather_api.py index b4995ee1d08e0f..7f3c397bffa256 100644 --- a/test/xpu/collective_allgather_api.py +++ b/test/xpu/collective_allgather_api.py @@ -100,10 +100,9 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["use_comm_context"] or args["dynamic_static_unified_comm"]: - paddle.distributed.collective._init_parallel_env(args["backend"]) - else: - paddle.distributed.init_parallel_env() + + paddle.distributed.collective._init_parallel_env(args["backend"]) + if args['backend'] == 'nccl': device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = base.CUDAPlace( diff --git a/test/xpu/test_collective_api_base.py b/test/xpu/test_collective_api_base.py index a43a3e5b6df202..098f8c4c7dafb5 100644 --- a/test/xpu/test_collective_api_base.py +++ b/test/xpu/test_collective_api_base.py @@ -125,9 +125,7 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args['static_mode'] and ( - args["use_comm_context"] or args["dynamic_static_unified_comm"] - ): + if args['static_mode']: paddle.distributed.collective._init_parallel_env(args["backend"]) else: paddle.distributed.init_parallel_env() @@ -187,9 +185,6 @@ def runtime_main(test_class, col_type): args["dtype"] = os.getenv("DTYPE") args["reduce_type"] = os.getenv("REDUCE_TYPE") args["use_comm_context"] = bool(int(os.getenv("USE_COMM_CONTEXT", "0"))) - args["dynamic_static_unified_comm"] = bool( - os.getenv("FLAGS_dynamic_static_unified_comm", "true").lower() == "true" - ) model.run_trainer(args) diff --git a/test/xpu/test_collective_base_xpu.py b/test/xpu/test_collective_base_xpu.py index 3fed84ecbb6f14..1cd2e0e44f394a 100644 --- a/test/xpu/test_collective_base_xpu.py +++ b/test/xpu/test_collective_base_xpu.py @@ -140,12 +140,9 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["dynamic_static_unified_comm"]: - _init_parallel_env("bkcl") - else: - 
self.initCommunicator( - startup_prog, rank, nranks, True, current_endpoint, endpoints - ) + + _init_parallel_env("bkcl") + self.rank = rank np_dtype = DataTypeCast(args["dtype"]) result = self.get_model(train_prog, startup_prog, np_dtype) @@ -174,9 +171,6 @@ def runtime_main(test_class, col_type, sub_type): args["col_type"] = col_type args["dtype"] = os.getenv("DTYPE") args["batch_size"] = os.getenv("BATCH_SIZE") - args["dynamic_static_unified_comm"] = bool( - int(os.getenv("FLAGS_dynamic_static_unified_comm", "1")) - ) model.run_trainer(args) From f70042ab551078635fa5b28738aca23a6f298cd9 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 18:43:12 +0800 Subject: [PATCH 55/57] [clean old comm][fluid_ops]c_allreduce_op.h (#70732) --- .../operators/collective/c_allreduce_op.h | 85 ++++++++----------- .../operators/collective/c_gen_bkcl_id_op.cc | 18 ---- .../operators/collective/c_gen_nccl_id_op.cc | 18 ---- .../operators/collective/c_wait_comm_op.cc | 40 +++------ .../operators/collective/c_wait_compute_op.cc | 40 +++------ .../operators/collective/recv_v2_op.cu.cc | 49 ++++------- .../operators/collective/send_v2_op.cu.cc | 49 ++++------- 7 files changed, 98 insertions(+), 201 deletions(-) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index b6db792077a362..57c4a7061df834 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -29,7 +29,6 @@ limitations under the License. */ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/common/flags.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -180,30 +179,24 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "BKCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { - comm = platform::BKCLCommContext::Instance().Get(rid, place); - stream = comm->stream(); - VLOG(3) << "old BKCLCommContext has rid " << rid; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + common::errors::Unavailable( + "BKCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has rid " << rid; + if (ctx.Attr("use_calc_stream")) { auto dev_ctx = phi::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->x_context()->xpu_stream; @@ -325,30 +318,24 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "NCCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { - comm = platform::NCCLCommContext::Instance().Get(rid, place); - stream = comm->stream(); - VLOG(3) << "old NCCLCommContext has rid " << rid; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + common::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has rid " << rid; + if (ctx.Attr("use_calc_stream")) { // should not use global ctx for calc stream. // auto dev_ctx = phi::DeviceContextPool::Instance().Get(place); diff --git a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc index 324cdde5175c4e..3479562f93ae55 100644 --- a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/gen_comm_id_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { @@ -63,30 +62,13 @@ class CGenBKCLIdOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const phi::Place& dev_place) const override { - int rank = Attr("rank"); - int ring_id = Attr("ring_id"); - std::function func = [&](size_t i) -> std::string { return Output("Out"); }; - std::string endpoint = Attr("endpoint"); - std::vector bkcl_ids; bkcl_ids.resize(1); - if (!FLAGS_dynamic_static_unified_comm) { - int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); - if (rank == 0) { - GenBKCLID(&bkcl_ids); - std::vector endpoint_list = - Attr>("other_endpoints"); - platform::SendBroadCastCommID(endpoint_list, &bkcl_ids, ring_id); - } else { - platform::RecvBroadCastCommID(server_fd, endpoint, &bkcl_ids, ring_id); - } - } - CopyBKCLIDToVar(bkcl_ids, func, scope); } }; diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 5004439695097f..beda7cf0c1377b 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/gen_comm_id_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); namespace paddle::operators { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -58,30 +57,13 @@ class CGenNCCLIdOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const phi::Place& dev_place) const override { - int rank = Attr("rank"); - int ring_id = Attr("ring_id"); - std::function func = [&](size_t i) -> std::string { return Output("Out"); }; - std::string endpoint = Attr("endpoint"); - std::vector nccl_ids; nccl_ids.resize(1); - if (!FLAGS_dynamic_static_unified_comm) { - int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); - if (rank == 0) { - GenNCCLID(&nccl_ids); - std::vector endpoint_list = - Attr>("other_endpoints"); - platform::SendBroadCastCommID(endpoint_list, &nccl_ids, ring_id); - } else { - platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids, ring_id); - } - } - CopyNCCLIDToVar(nccl_ids, func, scope); } }; diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index ce9387d5aea183..8226f6d1d495e2 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -22,7 +22,6 @@ class Scope; #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif namespace paddle::operators { @@ -56,31 +55,20 @@ class CWaitCommOp : public framework::OperatorBase { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. 
" - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(ring_id))); - phi::distributed::NCCLCommContext* comm_ctx = - static_cast( - comm_context_manager.Get(std::to_string(ring_id))); - comm_stream = comm_ctx->GetStream(); - event = comm_ctx->GetComputeEvent(); - VLOG(3) << "new comm_context_manager has rid " << ring_id; - } else { - comm_stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); - - event = platform::NCCLCommContext::Instance() - .Get(ring_id, place) - ->comm_event(); - VLOG(3) << "old NCCLCommContext has rid " << ring_id; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + phi::distributed::NCCLCommContext* comm_ctx = + static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + comm_stream = comm_ctx->GetStream(); + event = comm_ctx->GetComputeEvent(); + VLOG(3) << "new comm_context_manager has rid " << ring_id; // comm_stream-->event-->compute_stream #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 4d8a5f158c679b..234832a6c46059 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -22,7 +22,6 @@ class Scope; #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif namespace paddle::operators { @@ -56,31 +55,20 @@ class CWaitComputeOp : public framework::OperatorBase { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(ring_id))); - phi::distributed::NCCLCommContext* comm_ctx = - static_cast( - comm_context_manager.Get(std::to_string(ring_id))); - comm_stream = comm_ctx->GetStream(); - event = comm_ctx->GetComputeEvent(); - VLOG(3) << "new comm_context_manager has rid " << ring_id; - } else { - comm_stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); - - event = platform::NCCLCommContext::Instance() - .Get(ring_id, place) - ->compute_event(); - VLOG(3) << "old NCCLCommContext has rid " << ring_id; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + phi::distributed::NCCLCommContext* comm_ctx = + static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + comm_stream = comm_ctx->GetStream(); + event = comm_ctx->GetComputeEvent(); + VLOG(3) << "new comm_context_manager has rid " << ring_id; // compute_stream-->event-->comm_stream #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index d6fbfdf6f4eee9..1888e2204a66ca 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/fluid/distributed/collective/process_group.h" @@ -175,37 +174,23 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { const auto &comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "NCCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { - comm = platform::NCCLCommContext::Instance().Get(rid, place); - PADDLE_ENFORCE_LT( - peer, - comm->nranks(), - common::errors::InvalidArgument("The value of peer (%d) you set must " - "be less than comm->nranks (%d).", - peer, - comm->nranks())); - stream = comm->stream(); - VLOG(3) << "old NCCLCommContext has rid " << rid; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + common::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has rid " << rid; if (ctx.Attr("use_calc_stream")) { // should ExecutionContext for calc stream. diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 10ff7108cab23b..d0c0c48cfd75a7 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/phi/api/include/tensor.h" @@ -167,37 +166,23 @@ class SendOpV2CUDAKernel : public framework::OpKernel { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "NCCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { - comm = platform::NCCLCommContext::Instance().Get(rid, place); - PADDLE_ENFORCE_LT( - peer, - comm->nranks(), - common::errors::InvalidArgument("The value of peer (%d) you set must " - "be less than comm->nranks (%d).", - peer, - comm->nranks())); - stream = comm->stream(); - VLOG(3) << "old NCCLCommContext has rid " << rid; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + common::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has rid " << rid; if (ctx.Attr("use_calc_stream")) { // should ExecutionContext for calc stream. 
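The two commits above ([PATCH 54/57] and [PATCH 55/57]) remove the FLAGS_dynamic_static_unified_comm switch entirely: the collective kernels now always resolve their communicator from CommContextManager by ring_id, and the tests initialize that context up front instead of branching on the flag. A minimal sketch of the initialization pattern the updated tests rely on, assuming a multi-rank job started with paddle.distributed.launch (the helper is the same private one the tests call; outside a distributed launch it would fail):

import paddle
from paddle.distributed import collective

def init_trainer(backend="nccl"):
    # Registers a communication context that the C++ kernels above now
    # look up unconditionally via CommContextManager and the ring_id attr.
    collective._init_parallel_env(backend)
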
From 2f47c31ea51b64034bdf269063f74ff78f86e259 Mon Sep 17 00:00:00 2001
From: co63oc <co63oc@users.noreply.github.com>
Date: Thu, 9 Jan 2025 18:43:42 +0800
Subject: [PATCH 56/57] [fluid_ops] sharding_optimizer.py replace c_broadcast
 (#70705)

---
 .../fleet/meta_optimizers/sharding_optimizer.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index 07de62d3039f89..50a8d35a4526c7 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -1707,7 +1707,7 @@ def _initialization_broadcast(self):
         # offload and optimize_cast will insert broadcast op
         broadcast_params = set()
         for op in startup_block.ops:
-            if op.type == 'c_broadcast':
+            if op.type == 'broadcast':
                 broadcast_params.add(op.desc.output_arg_names()[0])
 
         for param in params_name:
@@ -1723,13 +1723,12 @@ def _initialization_broadcast(self):
 
         for ring in rings:
             startup_block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
+                type='broadcast',
+                inputs={'x': param},
+                outputs={'out': param},
                 attrs={
                     'ring_id': ring,
                     'root': 0,
-                    'use_calc_stream': True,
                     OP_ROLE_KEY: OpRole.Forward,
                 },
             )

From 266e3cd2f1c73e092b4bb037f9106b73e76e8ebc Mon Sep 17 00:00:00 2001
From: zty-king <129518799+zty-king@users.noreply.github.com>
Date: Thu, 9 Jan 2025 18:55:29 +0800
Subject: Support a flexible model-layer allocation strategy for unbalanced
 VPP scheduling (#70230)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
---
 .../auto_parallel/static/pir_pass.py          | 39 +++++++++++++------
 .../pir/vpp_pass_unittest_pir.py              | 19 ++++++++-
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py
index 65a2aed0f50a5d..7f6238a223acb7 100644
--- a/python/paddle/distributed/auto_parallel/static/pir_pass.py
+++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py
@@ -1066,12 +1066,6 @@ def complete_chunk_id(dist_program, startup_program, pipeline_strategy):
         dist_program.global_block().ops, seg_method
     )
     ops = dist_program.global_block().ops
-
-    assert (len(seg_struct_names) % num_chunks == 0) or (
-        (len(seg_struct_names) + 1) % num_chunks == 0
-        and (len(seg_struct_names) + 1) // num_chunks != 1
-    ), f"The number of layers[{seg_method}] ({len(seg_struct_names)}) should be divisible by part number ({num_chunks}),or ({len(seg_struct_names)} + 1) should be divisible by {num_chunks} and not equal to {num_chunks}."
-
     # Step2: analysis whether the pp_stage is non-decreasing among segments
     # 1. if non_use_custom_mesh is True, the ops' process_mesh will be changed by vpp strategy
    # 2. if non_use_custom_mesh is False, the ops's process_mesh will not be changed.
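The assert deleted in the hunk above is what forced a (nearly) balanced layout: the number of segmented layers had to divide evenly into num_chunks, or do so after adding one virtual layer. A small standalone sketch of that old rule, with illustrative layer counts:

# Sketch of the balance rule the removed assert encoded; num_chunks is
# pp_degree * vpp_degree as in complete_chunk_id, values are illustrative.
def old_layout_ok(num_layers, num_chunks):
    return (num_layers % num_chunks == 0) or (
        (num_layers + 1) % num_chunks == 0
        and (num_layers + 1) // num_chunks != 1
    )

assert old_layout_ok(8, 4)      # evenly divisible
assert old_layout_ok(7, 4)      # (7 + 1) divisible by 4
assert not old_layout_ok(3, 4)  # (3 + 1) // 4 == 1 is rejected
assert not old_layout_ok(5, 4)  # unbalanced layouts were rejected

The hunk below replaces this global rule with per-stage layer counts derived from the process_mesh the user actually assigned to each layer.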
@@ -1080,20 +1074,41 @@ def complete_chunk_id(dist_program, startup_program, pipeline_strategy):
     # Step3: Get op index boundary, pp_stage, chunk_id, struct_names of each segment
     seg_pp_stages = [i % pp_degree for i in range(num_chunks)]
     seg_chunk_ids = [i // pp_degree for i in range(num_chunks)]
-    seg_layer_num = [0] * num_chunks
-    for j in range(0, len(seg_struct_names)):
-        i = j % num_chunks
-        seg_layer_num[i] = seg_layer_num[i] + 1
     seg_parts = [0]
-
+    last_struct_name = None
+    stage_ids = (
+        []
+    )  # stage_ids[i] represents the stage number assigned to the i-th layer.
     for idx, op in enumerate(ops):
         if len(seg_parts) == len(seg_struct_names):
             break
         struct_name = _extract_seg_method(op, seg_method)
+        if op.dist_attr is not None and last_struct_name != struct_name:
+            pp_stage = get_pp_stage_by_process_mesh(
+                op.dist_attr.process_mesh, pp_degree
+            )
+            if pp_stage is not None:
+                stage_ids.append(pp_stage)
+        last_struct_name = struct_name
         if struct_name == seg_struct_names[len(seg_parts)]:
             seg_parts.append(idx)
     seg_parts.append(len(ops))
-
+    pp_stage_layer_nums = [0] * pp_degree
+    for i in stage_ids:
+        pp_stage_layer_nums[i] = pp_stage_layer_nums[i] + 1
+    assert all(
+        value >= vpp_degree for value in pp_stage_layer_nums
+    ), "The number of layers on each pp_stage must not be less than the vpp_degree in the pp_stage to ensure that each chunk contains at least one layer."
+    seg_layer_num = [0] * num_chunks
+    for pp_stage in range(
+        0, pp_degree
+    ):  # Each pp_stage is assigned a number of layers based on user intent.
+        pp_stage_layer_num = pp_stage_layer_nums[pp_stage]
+        for i in range(0, pp_stage_layer_num):
+            # The pp_stage uses a Round robin scheduling algorithm to allocate layers one by one.
+            virtual_chunk_id = i % vpp_degree
+            real_chunk_id = (virtual_chunk_id) * pp_degree + pp_stage
+            seg_layer_num[real_chunk_id] = seg_layer_num[real_chunk_id] + 1
     # Step4: Set the process_mesh of each op
     seg_id = 0
     reshard_ops = []
diff --git a/test/auto_parallel/pir/vpp_pass_unittest_pir.py b/test/auto_parallel/pir/vpp_pass_unittest_pir.py
index cae24c9453cff1..97a4f703769265 100644
--- a/test/auto_parallel/pir/vpp_pass_unittest_pir.py
+++ b/test/auto_parallel/pir/vpp_pass_unittest_pir.py
@@ -103,6 +103,7 @@ def __init__(
         initializer_range=0.02,
         manual=True,
         hidden_layer=4,
+        random_shard=False,
     ):
         super().__init__()
 
@@ -116,6 +117,10 @@ def __init__(
         self.layer_to_mesh = [PP_MESH_0] * (
             hidden_layer - hidden_layer // 2
         ) + [PP_MESH_1] * (hidden_layer // 2)
+        if random_shard:
+            self.layer_to_mesh = [PP_MESH_0] * (4) + [PP_MESH_1] * (
+                hidden_layer - 4
+            )
 
         self.layers = nn.LayerList(
             [
@@ -221,11 +226,14 @@ def run_pipeline(
         enable_send_recv_overlap=False,
         batch_size=BATCH_SIZE,
         hidden_layer=4,
+        random_shard=False,
     ):
         self.init()
         strategy = apply_pass(schedule_mode, acc_step, enable_send_recv_overlap)
 
-        model = MLPLayer(manual=manual, hidden_layer=hidden_layer)
+        model = MLPLayer(
+            manual=manual, hidden_layer=hidden_layer, random_shard=random_shard
+        )
         opt = paddle.optimizer.AdamW(
             learning_rate=0.00001, parameters=model.parameters()
         )
@@ -276,6 +284,15 @@ def test_pp_pass(self):
             schedule_mode="VPP", acc_step=4, manual=False, hidden_layer=7
         )
         self.check_result(Tail_removed_loss_vpp, loss_vpp)
+        # random-shard-vpp
+        Random_shards_vpp = self.run_pipeline(
+            schedule_mode="VPP",
+            acc_step=4,
+            manual=False,
+            hidden_layer=7,
+            random_shard=True,
+        )
+        self.check_result(Random_shards_vpp, loss_vpp)
 
     def check_result(self, loss1, loss2):
         return np.array_equal(loss1, loss2)
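The new loop in complete_chunk_id assigns each stage's layers to its virtual-pipeline chunks round-robin, so only the per-stage counts must reach vpp_degree; the global layer count no longer has to be balanced. A standalone sketch of that mapping, using the shapes from the random_shard test above (pp_degree=2, vpp_degree=2, 7 layers split 4/3 across the two meshes; the split values are illustrative):

# Mirrors the round-robin assignment added in complete_chunk_id.
pp_degree, vpp_degree = 2, 2
num_chunks = pp_degree * vpp_degree
pp_stage_layer_nums = [4, 3]  # layers per stage, as counted from stage_ids

seg_layer_num = [0] * num_chunks
for pp_stage in range(pp_degree):
    for i in range(pp_stage_layer_nums[pp_stage]):
        virtual_chunk_id = i % vpp_degree
        real_chunk_id = virtual_chunk_id * pp_degree + pp_stage
        seg_layer_num[real_chunk_id] += 1

print(seg_layer_num)  # [2, 2, 2, 1]

Chunk ids are stage-major within each virtual round (real_chunk_id = virtual_chunk_id * pp_degree + pp_stage), which agrees with the seg_pp_stages and seg_chunk_ids enumerations earlier in the function.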