From 6de5ead7da2e997071fafaeb9d870be1f5fb429a Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Tue, 7 Jan 2025 19:19:07 +0800
Subject: [PATCH 01/57] [Paddle TensorRT No.6-7] Add pd_op.affine_channel,
 pd_op.numel converter (#70507)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix

* fix

* fix

* Add opt_shape support in converter.py

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix
---
 .../new_executor/collect_shape_manager.cc     | 69 ++++++++++-------
 .../transforms/tensorrt/trt_op_marker_pass.cc | 30 ++++++++
 python/paddle/tensorrt/converter.py           |  1 -
 python/paddle/tensorrt/export.py              |  5 +-
 python/paddle/tensorrt/impls/manipulation.py  | 10 +++
 python/paddle/tensorrt/impls/others.py        | 51 +++++++++++++
 python/paddle/tensorrt/util.py                |  9 ++-
 test/tensorrt/tensorrt_test_base.py           | 27 ++++++-
 test/tensorrt/test_converter_activation.py    | 22 ++++++
 test/tensorrt/test_converter_attribute.py     |  2 +
 test/tensorrt/test_converter_common.py        | 11 +++
 test/tensorrt/test_converter_conv.py          | 12 +++
 test/tensorrt/test_converter_creation.py      | 11 +++
 test/tensorrt/test_converter_input.py         |  2 +
 test/tensorrt/test_converter_linalg.py        |  9 ++-
 test/tensorrt/test_converter_logic.py         | 29 ++++++++
 test/tensorrt/test_converter_manipulation.py  | 74 ++++++++++++++++++-
 test/tensorrt/test_converter_math.py          | 33 +++++++++
 test/tensorrt/test_converter_norm.py          |  4 +
 test/tensorrt/test_converter_ops.py           |  2 +
 test/tensorrt/test_converter_others.py        | 66 ++++++++++++++++-
 test/tensorrt/test_converter_pooling.py       |  9 ++-
 test/tensorrt/test_converter_search.py        | 12 +++
 test/tensorrt/test_converter_stat.py          |  2 +
 test/tensorrt/test_converter_vision.py        |  1 +
 25 files changed, 467 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/collect_shape_manager.cc b/paddle/fluid/framework/new_executor/collect_shape_manager.cc
index 02c4aaae5dfe5c..053a4055779b95 100644
--- a/paddle/fluid/framework/new_executor/collect_shape_manager.cc
+++ b/paddle/fluid/framework/new_executor/collect_shape_manager.cc
@@ -34,7 +34,9 @@ void CollectShapeManager::CollectShapeInfo(
     auto *var = scope->FindVar(var_name);
     if (!var || !var->IsType<phi::DenseTensor>()) continue;
     auto tensor = var->Get<phi::DenseTensor>();
-    if (!tensor.initialized()) continue;
+    if (!tensor.initialized() && !instr->NoNeedBuffer().count(input.first)) {
+      continue;
+    }
     paddle::platform::DeviceContextPool &pool =
         paddle::platform::DeviceContextPool::Instance();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -124,36 +126,53 @@ void CollectShapeManager::StatisticShapeRangeInfo() {
   for (auto const &it : shape_data) {
     auto val = it.first;
     auto shapes = it.second;
+
     std::vector<int32_t> min_shape(shapes[0].begin(), shapes[0].end());
     std::vector<int32_t> max_shape(shapes[0].begin(), shapes[0].end());
     std::vector<int32_t> opt_shape(shapes[0].begin(), shapes[0].end());
+
+    // Applicable to scenarios where min/opt/max are explicitly specified.
+    if (shapes.size() == 3) {
+      for (size_t d = 0; d < shapes[0].size(); ++d) {
+        std::vector<int32_t> dim_values;
+        for (const auto &shape : shapes) {
+          dim_values.push_back(shape[d]);
+        }
+        std::sort(dim_values.begin(), dim_values.end());
+        min_shape[d] = dim_values[0];
+        opt_shape[d] = dim_values[1];
+        max_shape[d] = dim_values[2];
+      }
+      min_data[val] = min_shape;
+      max_data[val] = max_shape;
+      opt_data[val] = opt_shape;
+    } else {
+      // Suitable for scenarios where shapes are automatically collected.
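+      // Hypothetical worked example (editorial, not from the original
+      // patch): if dimension d was recorded as {1, 4, 4} across runs, the
+      // code below yields min_shape[d] = 1, max_shape[d] = 4, and
+      // opt_shape[d] = 4, the most frequent value returned by ShapeMaxFreq.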
+      auto ShapeMaxFreq =
+          [](const std::map<int32_t, int32_t> &m) -> int32_t {
+        std::vector<std::pair<int32_t, int32_t>> counter;
+        for (auto &it : m) counter.emplace_back(it);
+        std::sort(counter.begin(),
+                  counter.end(),
+                  [](std::pair<int32_t, int32_t> &a,
+                     std::pair<int32_t, int32_t> &b) {
+                    return a.second > b.second;
+                  });
+        return counter[0].first;
+      };
-    auto ShapeMaxFreq =
-        [](const std::map<int32_t, int32_t> &m) -> int32_t {
-      std::vector<std::pair<int32_t, int32_t>> counter;
-      for (auto &it : m) counter.emplace_back(it);
-      std::sort(counter.begin(),
-                counter.end(),
-                [](std::pair<int32_t, int32_t> &a,
-                   std::pair<int32_t, int32_t> &b) {
-                  return a.second > b.second;
-                });
-      return counter[0].first;
-    };
-
-    for (size_t d = 0; d < shapes[0].size(); ++d) {
-      std::map<int32_t, int32_t> counter;
-      for (auto &shape : shapes) {
-        counter[shape[d]] += 1;
-        if (shape[d] < min_shape[d]) min_shape[d] = shape[d];
-        if (shape[d] > max_shape[d]) max_shape[d] = shape[d];
+      for (size_t d = 0; d < shapes[0].size(); ++d) {
+        std::map<int32_t, int32_t> counter;
+        for (auto &shape : shapes) {
+          counter[shape[d]] += 1;
+          if (shape[d] < min_shape[d]) min_shape[d] = shape[d];
+          if (shape[d] > max_shape[d]) max_shape[d] = shape[d];
+        }
+        opt_shape[d] = ShapeMaxFreq(counter);
       }
-      opt_shape[d] = ShapeMaxFreq(counter);
+      min_data[val] = min_shape;
+      max_data[val] = max_shape;
+      opt_data[val] = opt_shape;
     }
-
-    min_data[val] = min_shape;
-    max_data[val] = max_shape;
-    opt_data[val] = opt_shape;
   }
 };
 extract_min_max_opt(min_shapes_, max_shapes_, opt_shapes_, shape_info_);
diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
index 7c2aad2caefdda..c67bd5d012973b 100644
--- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
+++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
@@ -94,6 +94,7 @@ DEFINE_GENERAL_PATTERN(Flip, paddle::dialect::FlipOp)
 DEFINE_GENERAL_PATTERN(Mish, paddle::dialect::MishOp)
 DEFINE_GENERAL_PATTERN(AssignValue, paddle::dialect::AssignValueOp)
 DEFINE_GENERAL_PATTERN(AssignValue_, paddle::dialect::AssignValue_Op)
+DEFINE_GENERAL_PATTERN(Numel, paddle::dialect::NumelOp)
 #undef DEFINE_GENERAL_PATTERN

 // Add ReduceCommonOpPattern base class to simplify code
@@ -2191,6 +2192,33 @@ class InstanceNormOpPattern
   }
 };

+class AffineChannelOpPattern
+    : public pir::OpRewritePattern<paddle::dialect::AffineChannelOp> {
+ public:
+  using pir::OpRewritePattern<
+      paddle::dialect::AffineChannelOp>::OpRewritePattern;
+  bool MatchAndRewrite(paddle::dialect::AffineChannelOp op,
+                       pir::PatternRewriter &rewriter) const override {
+    if (op->HasAttribute(kCanRunTrtAttr) &&
+        op->attribute<pir::BoolAttribute>(kCanRunTrtAttr).data()) {
+      return false;
+    }
+    if (!op->HasAttribute("data_layout")) {
+      VLOG(3) << "pd_op.affine_channel must have a data_layout attribute";
+      return false;
+    }
+    pir::Value x = op.operand_source(0);
+    auto x_shape = pir::GetShapeFromValue(x);
+    if (x_shape.size() == 2) {
+      VLOG(3) << "the rank of x in pd_op.affine_channel cannot be 2";
+      return false;
+    }
+
+    op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true));
+    return true;
+  }
+};
+
 class TrtOpMarkerPass : public pir::PatternRewritePass {
  public:
   TrtOpMarkerPass() : pir::PatternRewritePass("trt_op_marker_pass", 2) {}
@@ -2245,6 +2273,7 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ADD_PATTERN(Mish)
     ADD_PATTERN(AssignValue)
     ADD_PATTERN(AssignValue_)
+    ADD_PATTERN(Numel)
 #if IS_TRT_VERSION_GE(8600)
     ADD_PATTERN(Layer_norm)
 #endif
@@ -2321,6 +2350,7 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
+    ps.Add(std::make_unique<AffineChannelOpPattern>(context));
     return ps;
   }
 };
diff --git 
a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py index 2fe7416a646623..cab46618c4c0ee 100644 --- a/python/paddle/tensorrt/converter.py +++ b/python/paddle/tensorrt/converter.py @@ -294,7 +294,6 @@ def convert_subgraph_to_trt(self, program, group_op): max_shape = get_value_shape_range_info( value, False, paddle.base.core.ShapeMode.kMAX ) - if trt_input.is_shape_tensor: min_value = get_value_shape_range_info( value, True, paddle.base.core.ShapeMode.kMIN diff --git a/python/paddle/tensorrt/export.py b/python/paddle/tensorrt/export.py index 1a36ce3aff74d9..044f58f0041908 100644 --- a/python/paddle/tensorrt/export.py +++ b/python/paddle/tensorrt/export.py @@ -248,17 +248,20 @@ def convert_to_trt(program, trt_config, scope): with paddle.pir_utils.IrGuard(): min_shape_feed = {} max_shape_feed = {} + opt_shape_feed = {} for i, input_instance in enumerate(trt_config.inputs): # get fake inputs - min_data, _, max_data = input_instance.generate_input_data() + min_data, opt_data, max_data = input_instance.generate_input_data() program_with_output = program.list_vars()[-1] min_shape_feed[feed_name[i]] = min_data + opt_shape_feed[feed_name[i]] = opt_data max_shape_feed[feed_name[i]] = max_data # run warmup for collecting shape program = warmup_shape_infer( program, min_shape_feed=min_shape_feed, + opt_shape_feed=opt_shape_feed, max_shape_feed=max_shape_feed, scope=scope, ) diff --git a/python/paddle/tensorrt/impls/manipulation.py b/python/paddle/tensorrt/impls/manipulation.py index 3ec25417953fb1..8f005518d618c7 100644 --- a/python/paddle/tensorrt/impls/manipulation.py +++ b/python/paddle/tensorrt/impls/manipulation.py @@ -947,3 +947,13 @@ def roll_converter(network, paddle_op, inputs): ) return layer.get_output(0) + + +@converter_registry.register("pd_op.numel", trt_version="8.x") +def numel_converter(network, paddle_op, inputs): + input_tensor = inputs[0] + shape_tensor = network.add_shape(input_tensor).get_output(0) + layer = network.add_reduce( + shape_tensor, trt.ReduceOperation.PROD, axes=1, keep_dims=False + ) + return layer.get_output(0) diff --git a/python/paddle/tensorrt/impls/others.py b/python/paddle/tensorrt/impls/others.py index da386091ebcf92..f2f571f6953129 100644 --- a/python/paddle/tensorrt/impls/others.py +++ b/python/paddle/tensorrt/impls/others.py @@ -301,3 +301,54 @@ def share_data_converter(network, paddle_op, inputs): identity_layer = network.add_identity(x) return identity_layer.get_output(0) + + +@converter_registry.register("pd_op.affine_channel", trt_version="8.x") +def affine_channel_converter(network, paddle_op, inputs): + x, scale_weights, bias_weights = inputs + data_layout = paddle_op.attrs().get("data_layout") + + if data_layout == "NCHW": + channel_axis = 1 + x_input = x + elif data_layout == "NHWC": + # Permute NHWC to NCHW + shuffle_layer1 = network.add_shuffle(x) + shuffle_layer1.first_transpose = (0, 3, 1, 2) + x_input = shuffle_layer1.get_output(0) + channel_axis = 1 + else: + raise ValueError(f"affine_channel: Unsupported layout: {data_layout}") + + if not isinstance(scale_weights, trt.Weights): + raise TypeError("affine_channel requires scale as trt.Weights") + if not isinstance(bias_weights, trt.Weights): + raise TypeError("affine_channel requires bias as trt.Weights") + + if scale_weights.size != bias_weights.size: + raise ValueError( + f"affine_channel: scale.size({scale_weights.size}) != bias.size({bias_weights.size})" + ) + + power_array = np.ones((scale_weights.size,), dtype=np.float32) + power_weights = trt.Weights(power_array) + 
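+    # Editorial note (an assumption based on TensorRT's IScaleLayer
+    # semantics, not stated in this patch): in CHANNEL mode the layer
+    # computes (x * scale + shift) ** power per channel, so the all-ones
+    # power above reduces it to x * scale + bias, which is exactly the
+    # affine_channel definition.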
+ layer = network.add_scale_nd( + input=x_input, + mode=trt.ScaleMode.CHANNEL, + shift=bias_weights, + scale=scale_weights, + power=power_weights, + channel_axis=channel_axis, + ) + if not layer: + raise RuntimeError("affine_channel: add_scale_nd failed.") + + out_tensor = layer.get_output(0) + + if data_layout == "NHWC": + shuffle_layer2 = network.add_shuffle(out_tensor) + shuffle_layer2.first_transpose = (0, 2, 3, 1) + out_tensor = shuffle_layer2.get_output(0) + + return out_tensor diff --git a/python/paddle/tensorrt/util.py b/python/paddle/tensorrt/util.py index cba02fb3997622..fbabef8c6178d5 100644 --- a/python/paddle/tensorrt/util.py +++ b/python/paddle/tensorrt/util.py @@ -94,7 +94,9 @@ def predict_program(program, feed_data, fetch_var_list, scope=None): return output -def warmup_shape_infer(program, min_shape_feed, max_shape_feed, scope=None): +def warmup_shape_infer( + program, min_shape_feed, opt_shape_feed, max_shape_feed, scope=None +): paddle.framework.set_flags({"FLAGS_enable_collect_shape": True}) with paddle.pir_utils.IrGuard(): with paddle.static.program_guard(program): @@ -103,6 +105,9 @@ def warmup_shape_infer(program, min_shape_feed, max_shape_feed, scope=None): for _ in range(1): executor.run(program, feed=min_shape_feed, scope=scope) + for _ in range(1): + executor.run(program, feed=opt_shape_feed, scope=scope) + # Run the program with input_data_max_shape (fake max_shape input) for _ in range(1): executor.run(program, feed=max_shape_feed, scope=scope) @@ -120,6 +125,7 @@ def warmup_shape_infer(program, min_shape_feed, max_shape_feed, scope=None): ) ) paddle.framework.set_flags({"FLAGS_enable_collect_shape": False}) + return exe_program @@ -192,6 +198,7 @@ def weight_to_tensor(network, paddle_value, trt_tensor, use_op_name): "pd_op.batch_norm_", "pd_op.layer_norm", "pd_op.depthwise_conv2d_transpose", + "pd_op.affine_channel", ] if use_op_name in forbid_cast_op: return trt_tensor diff --git a/test/tensorrt/tensorrt_test_base.py b/test/tensorrt/tensorrt_test_base.py index 03e51b100fcbaa..a8fc090d00bb00 100755 --- a/test/tensorrt/tensorrt_test_base.py +++ b/test/tensorrt/tensorrt_test_base.py @@ -39,6 +39,7 @@ def __init__(self, methodName='runTest'): self.api_args = None self.program_config = None self.min_shape = None + self.opt_shape = None self.max_shape = None self.target_marker_op = "" self.dynamic_shape_data = {} @@ -62,6 +63,7 @@ def create_fake_program(self): ].items(): if ( feed_name in self.min_shape.keys() + and feed_name in self.opt_shape.keys() and feed_name in self.max_shape.keys() ): input_shape_without_dynamic_dim = ( @@ -89,11 +91,15 @@ def create_fake_program(self): api_args[feed_name] = new_list_args else: empty_min_max_shape = ( - self.min_shape is None or self.max_shape is None + self.min_shape is None + or self.max_shape is None + or self.opt_shape is None ) + if ( not empty_min_max_shape and feed_name in self.min_shape.keys() + and feed_name in self.opt_shape.keys() and feed_name in self.max_shape.keys() ): # dynamic shape condition @@ -181,6 +187,7 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): output_expected = self.run_program(main_program, fetch_list) min_shape_data = dict() # noqa: C408 + opt_shape_data = dict() # noqa: C408 max_shape_data = dict() # noqa: C408 for feed_name in self.program_config["feed_list"]: if self.api_args[feed_name] is None: @@ -190,11 +197,13 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): if ( feed_name not in self.min_shape.keys() and feed_name not in 
self.max_shape.keys() + and feed_name not in self.opt_shape.keys() ): for sub_feed_name, sub_feed_value in self.api_args[ feed_name ].items(): min_shape_data[sub_feed_name] = sub_feed_value + opt_shape_data[sub_feed_name] = sub_feed_value max_shape_data[sub_feed_name] = sub_feed_value continue else: @@ -206,6 +215,11 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): ).astype( self.api_args[feed_name][sub_feed_name].dtype ) + opt_shape_data[sub_feed_name] = np.random.randn( + *self.opt_shape[feed_name][i] + ).astype( + self.api_args[feed_name][sub_feed_name].dtype + ) max_shape_data[sub_feed_name] = np.random.randn( *self.max_shape[feed_name][i] ).astype( @@ -216,8 +230,10 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): if ( feed_name not in self.min_shape.keys() and feed_name not in self.max_shape.keys() + and feed_name not in self.opt_shape.keys() ): min_shape_data[feed_name] = self.api_args[feed_name] + opt_shape_data[feed_name] = self.api_args[feed_name] max_shape_data[feed_name] = self.api_args[feed_name] continue else: @@ -225,6 +241,9 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): min_shape_data[feed_name] = self.dynamic_shape_data[ feed_name ](self.min_shape[feed_name]) + opt_shape_data[feed_name] = self.dynamic_shape_data[ + feed_name + ](self.opt_shape[feed_name]) max_shape_data[feed_name] = self.dynamic_shape_data[ feed_name ](self.max_shape[feed_name]) @@ -232,6 +251,9 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): min_shape_data[feed_name] = np.random.randn( *self.min_shape[feed_name] ).astype(self.api_args[feed_name].dtype) + opt_shape_data[feed_name] = np.random.randn( + *self.opt_shape[feed_name] + ).astype(self.api_args[feed_name].dtype) max_shape_data[feed_name] = np.random.randn( *self.max_shape[feed_name] ).astype(self.api_args[feed_name].dtype) @@ -239,6 +261,7 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): main_program = warmup_shape_infer( main_program, min_shape_feed=min_shape_data, + opt_shape_feed=opt_shape_data, max_shape_feed=max_shape_data, scope=scope, ) @@ -262,7 +285,7 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): input = Input( min_input_shape=self.min_shape, - optim_input_shape=self.min_shape, + optim_input_shape=self.opt_shape, max_input_shape=self.max_shape, ) trt_config = TensorRTConfig(inputs=[input]) diff --git a/test/tensorrt/test_converter_activation.py b/test/tensorrt/test_converter_activation.py index 268dc1e592e073..2e95b50e20e95e 100644 --- a/test/tensorrt/test_converter_activation.py +++ b/test/tensorrt/test_converter_activation.py @@ -29,6 +29,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [1]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -44,6 +45,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [1]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -58,6 +60,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [1, 3], "y": [1, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -72,6 +75,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [1, 3], "y": [1, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def 
test_trt_result(self): @@ -84,6 +88,7 @@ def setUp(self): self.api_args = {"x": np.random.randn(3).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [1]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -96,6 +101,7 @@ def setUp(self): self.api_args = {"x": np.random.randn(3).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -111,6 +117,7 @@ def setUp(self): self.api_args = {"x": np.random.randn(3).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [1]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -125,6 +132,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -139,6 +147,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -153,6 +162,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -167,6 +177,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -184,6 +195,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -198,6 +210,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -212,6 +225,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result_fp16(self): @@ -231,6 +245,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -246,6 +261,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [1, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -261,6 +277,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -275,6 +292,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -289,6 +307,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -303,6 +322,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -317,6 +337,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4, 2]} 
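+        # TensorRT requires min_shape <= opt_shape <= max_shape element-wise
+        # for every dynamic input.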
+ self.opt_shape = {"x": [2, 3, 4, 2]} self.max_shape = {"x": [5, 3, 4, 2]} def test_trt_result(self): @@ -331,6 +352,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_attribute.py b/test/tensorrt/test_converter_attribute.py index ff4defcf70187a..cdb647857804a2 100644 --- a/test/tensorrt/test_converter_attribute.py +++ b/test/tensorrt/test_converter_attribute.py @@ -28,6 +28,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -42,6 +43,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_common.py b/test/tensorrt/test_converter_common.py index dce25797b26e09..58f65e49b31802 100644 --- a/test/tensorrt/test_converter_common.py +++ b/test/tensorrt/test_converter_common.py @@ -44,6 +44,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [1, 2, 3]} self.max_shape = {"x": [10, 2, 3]} def test_trt_result(self): @@ -60,6 +61,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [1, 2, 3]} self.max_shape = {"x": [10, 2, 3]} def test_trt_result(self): @@ -126,6 +128,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -155,6 +158,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 6, 10, 3]} + self.opt_shape = {"x": [2, 6, 10, 3]} self.max_shape = {"x": [12, 6, 10, 3]} def test_trt_result(self): @@ -182,6 +186,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "OutSize"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -214,6 +219,7 @@ def setUp(self): "feed_list": ["x", "OutSize", "SizeTensor", "Scale"] } self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -243,6 +249,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 6, 10, 3]} + self.opt_shape = {"x": [2, 6, 10, 3]} self.max_shape = {"x": [12, 6, 10, 3]} def test_trt_result(self): @@ -274,6 +281,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "SizeTensor"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -302,6 +310,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -314,6 +323,7 @@ def setUp(self): self.api_args = {"x": np.random.random([2, 3, 6, 10]).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): @@ -331,6 +341,7 @@ def setUp(self): 
self.api_args = {"x": np.random.random([2, 3, 6, 10]).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3, 6, 10]} + self.opt_shape = {"x": [2, 3, 6, 10]} self.max_shape = {"x": [12, 3, 6, 10]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_conv.py b/test/tensorrt/test_converter_conv.py index 4dd17c977caf88..4c6d5c0d212341 100644 --- a/test/tensorrt/test_converter_conv.py +++ b/test/tensorrt/test_converter_conv.py @@ -39,6 +39,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 8, 8]} + self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} def test_trt_result_fp16(self): @@ -58,6 +59,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 8, 8]} + self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} def test_trt_result(self): @@ -75,6 +77,7 @@ def setUp(self): self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 8, 8]} + self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} def test_trt_result(self): @@ -132,6 +135,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 5, 5]} + self.opt_shape = {"x": [2, 3, 5, 5]} self.max_shape = {"x": [4, 3, 5, 5]} def test_trt_result(self): @@ -154,6 +158,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 5, 5]} + self.opt_shape = {"x": [2, 3, 5, 5]} self.max_shape = {"x": [4, 3, 5, 5]} def test_trt_result(self): @@ -176,6 +181,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 5, 5]} + self.opt_shape = {"x": [2, 3, 5, 5]} self.max_shape = {"x": [4, 3, 5, 5]} @@ -205,6 +211,7 @@ def setUp(self): self.api_args = {"x": np.random.random([3, 2, 8, 8]).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -221,6 +228,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -238,6 +246,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -262,6 +271,7 @@ def setUp(self): self.api_args = {"x": np.random.random([3, 2, 8, 8]).astype("float32")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -279,6 +289,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): @@ -295,6 +306,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 8, 8]} + self.opt_shape = {"x": [3, 2, 8, 8]} self.max_shape = {"x": [10, 2, 8, 8]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_creation.py b/test/tensorrt/test_converter_creation.py index 1478a67cc8bbf6..8c1623d1a2ebad 100644 --- a/test/tensorrt/test_converter_creation.py +++ b/test/tensorrt/test_converter_creation.py @@ -27,6 +27,7 @@ def setUp(self): self.api_args = 
{"shape": [3, 2], "fill_value": 1.0} self.program_config = {"feed_list": []} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): @@ -41,6 +42,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [3, 2]} def test_trt_result(self): @@ -96,6 +98,7 @@ def test_trt_result(self): self.api_args = api_args self.program_config = {"feed_list": ["x"]} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} self.check_trt_result() @@ -110,6 +113,7 @@ def setUp(self): } self.program_config = {"feed_list": []} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): @@ -125,6 +129,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "output"]} self.min_shape = {"x": [1, 2], "output": [1, 2]} + self.opt_shape = {"x": [2, 2], "output": [2, 2]} self.max_shape = {"x": [3, 2], "output": [3, 2]} def test_trt_result(self): @@ -140,6 +145,7 @@ def setUp(self): } self.program_config = {"feed_list": ["input"]} self.min_shape = {"input": [1, 2]} + self.opt_shape = {"input": [3, 2]} self.max_shape = {"input": [5, 2]} def test_trt_result(self): @@ -155,6 +161,7 @@ def setUp(self): } self.program_config = {"feed_list": ["input"]} self.min_shape = {"input": [1, 2]} + self.opt_shape = {"input": [3, 2]} self.max_shape = {"input": [5, 2]} def test_trt_result(self): @@ -170,6 +177,7 @@ def setUp(self): } self.program_config = {"feed_list": ["input"]} self.min_shape = {"input": [1, 2]} + self.opt_shape = {"input": [3, 2]} self.max_shape = {"input": [5, 2]} def test_trt_result(self): @@ -185,6 +193,7 @@ def setUp(self): } self.program_config = {"feed_list": ["input", "fill_value"]} self.min_shape = {"input": [1, 2]} + self.opt_shape = {"input": [3, 2]} self.max_shape = {"input": [5, 2]} def test_trt_result(self): @@ -201,6 +210,7 @@ def setUp(self): } self.program_config = {"feed_list": ["value", "shape"]} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): @@ -217,6 +227,7 @@ def setUp(self): } self.program_config = {"feed_list": ["value"]} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_input.py b/test/tensorrt/test_converter_input.py index 945ff2133efd1b..c4f0254c8b4dcb 100644 --- a/test/tensorrt/test_converter_input.py +++ b/test/tensorrt/test_converter_input.py @@ -35,6 +35,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "num_classes"]} self.min_shape = {"x": [1, 1]} + self.opt_shape = {"x": [3, 1]} self.max_shape = {"x": [6, 1]} def test_trt_result(self): @@ -58,6 +59,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1]} + self.opt_shape = {"x": [3, 1]} self.max_shape = {"x": [6, 1]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_linalg.py b/test/tensorrt/test_converter_linalg.py index 28162d1da0359b..910ffffcdd5448 100644 --- a/test/tensorrt/test_converter_linalg.py +++ b/test/tensorrt/test_converter_linalg.py @@ -31,10 +31,11 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3, 2]} + self.opt_shape = {"x": [1, 3], "y": [3, 2]} self.max_shape = {"x": [5, 3], "y": [3, 2]} def test_trt_result(self): - self.check_trt_result() + self.check_trt_result(rtol=1e-3, atol=1e-3) class TestTransposeTRTPattern(TensorRTBaseTest): @@ -46,6 +47,7 @@ def setUp(self): } self.program_config = 
{"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -61,6 +63,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 2, 3], "y": [1, 3, 2]} + self.opt_shape = {"x": [1, 2, 3], "y": [1, 3, 2]} self.max_shape = {"x": [5, 2, 3], "y": [5, 3, 2]} def test_trt_result(self): @@ -76,6 +79,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -91,6 +95,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -106,6 +111,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): @@ -121,6 +127,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 4]} + self.opt_shape = {"x": [1, 3, 4]} self.max_shape = {"x": [5, 3, 4]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_logic.py b/test/tensorrt/test_converter_logic.py index cfc3375c3896c0..30c920c4137439 100644 --- a/test/tensorrt/test_converter_logic.py +++ b/test/tensorrt/test_converter_logic.py @@ -29,6 +29,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result(self): @@ -44,6 +45,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -59,6 +61,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result(self): @@ -74,6 +77,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -89,6 +93,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -104,6 +109,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -119,6 +125,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -134,6 +141,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} def test_trt_result(self): @@ -149,6 +157,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def 
test_trt_result_fp16(self): @@ -167,6 +176,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 5], "y": [1, 5]} + self.opt_shape = {"x": [2, 5], "y": [1, 5]} self.max_shape = {"x": [10, 5], "y": [1, 5]} def test_trt_result_fp16(self): @@ -185,6 +195,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result_fp16(self): @@ -203,6 +214,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [2, 1], "y": [2, 3]} + self.opt_shape = {"x": [2, 1], "y": [2, 3]} self.max_shape = {"x": [2, 1], "y": [2, 3]} def test_trt_result_fp16(self): @@ -221,6 +233,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result_fp16(self): @@ -238,6 +251,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result_fp16(self): @@ -255,6 +269,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result_fp16(self): @@ -275,6 +290,7 @@ def test_trt_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} self.check_trt_result() @@ -285,6 +301,7 @@ def test_trt_diff_shape_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [4, 3], "y": [3]} self.check_trt_result() @@ -298,6 +315,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result(self): @@ -313,6 +331,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [5, 3], "y": [3]} def test_trt_result(self): @@ -327,6 +346,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -341,6 +361,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result_fp16(self): @@ -375,6 +396,7 @@ def test_trt_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} self.check_trt_result() @@ -385,6 +407,7 @@ def test_trt_diff_shape_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [4, 3], "y": [3]} self.check_trt_result() @@ -414,6 +437,7 @@ def test_trt_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} 
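+        # With this patch, check_trt_result warms up the program with
+        # min/opt/max feeds so shape collection records all three shapes.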
self.check_trt_result() @@ -424,6 +448,7 @@ def test_trt_diff_shape_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [4, 3], "y": [3]} self.check_trt_result() @@ -450,6 +475,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [2, 3]} def test_trt_result(self): @@ -462,6 +488,7 @@ def setUp(self): self.api_args = {"x": np.random.random([2]).astype("bool")} self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [2]} def test_trt_result(self): @@ -479,6 +506,7 @@ def test_trt_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1], "y": [1]} + self.opt_shape = {"x": [2], "y": [2]} self.max_shape = {"x": [5], "y": [5]} self.check_trt_result() @@ -489,6 +517,7 @@ def test_trt_diff_shape_result(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [3]} + self.opt_shape = {"x": [2, 3], "y": [3]} self.max_shape = {"x": [4, 3], "y": [3]} self.check_trt_result() diff --git a/test/tensorrt/test_converter_manipulation.py b/test/tensorrt/test_converter_manipulation.py index 6b38f0b91a5b09..595091a928bb82 100644 --- a/test/tensorrt/test_converter_manipulation.py +++ b/test/tensorrt/test_converter_manipulation.py @@ -30,6 +30,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [5, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): @@ -45,6 +46,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [5, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): @@ -60,6 +62,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [5, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): @@ -79,6 +82,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [[1, 3], [1, 3], [1, 2]]} + self.opt_shape = {"x": [[5, 3], [5, 3], [5, 2]]} self.max_shape = {"x": [[5, 3], [5, 3], [5, 2]]} def test_trt_result(self): @@ -95,6 +99,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 1, 19]} + self.opt_shape = {"x": [10, 1, 1, 19]} self.max_shape = {"x": [10, 1, 1, 19]} def test_trt_result(self): @@ -110,6 +115,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [6, 3]} self.max_shape = {"x": [6, 3]} def test_trt_result(self): @@ -125,6 +131,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "shape"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [6, 3]} self.max_shape = {"x": [6, 3]} def test_trt_result(self): @@ -148,6 +155,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 6, 64, 64]} + self.opt_shape = {"x": [4, 6, 64, 64]} self.max_shape = {"x": [8, 6, 64, 64]} def test_trt_result(self): @@ -163,6 +171,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {} + self.opt_shape = {} self.max_shape = {} def test_trt_result(self): @@ -180,6 +189,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [2, 6, 64, 64]} + self.opt_shape = {"x": [4, 6, 64, 64]} 
self.max_shape = {"x": [8, 6, 64, 64]} def test_trt_result(self): @@ -197,6 +207,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [4, 3]} self.max_shape = {"x": [4, 3]} def test_trt_result(self): @@ -214,6 +225,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "starts", "ends"]} self.min_shape = {"x": [3, 4, 5, 6]} + self.opt_shape = {"x": [6, 4, 5, 6]} self.max_shape = {"x": [6, 4, 5, 6]} def test_trt_result(self): @@ -230,6 +242,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [3, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -246,6 +259,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "axis"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [3, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -262,7 +276,8 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "axis"]} self.min_shape = {"x": [1, 2]} - self.max_shape = {"x": [1, 2]} + self.opt_shape = {"x": [1, 2]} + self.max_shape = {"x": [3, 2]} def test_trt_result(self): self.check_trt_result() @@ -278,6 +293,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -294,6 +310,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -310,6 +327,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "axis"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -330,6 +348,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "num_or_sections"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -346,6 +365,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "num_or_sections", "axis"]} self.min_shape = {"x": [1, 9, 5]} + self.opt_shape = {"x": [2, 9, 5]} self.max_shape = {"x": [3, 9, 5]} def test_trt_result(self): @@ -365,6 +385,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [[1, 2], [1, 2], [1, 2]]} + self.opt_shape = {"x": [[2, 2], [2, 2], [2, 2]]} self.max_shape = {"x": [[3, 2], [3, 2], [3, 2]]} def test_trt_result(self): @@ -384,6 +405,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [[1, 2], [1, 2], [1, 2]]} + self.opt_shape = {"x": [[2, 2], [2, 2], [2, 2]]} self.max_shape = {"x": [[3, 2], [3, 2], [3, 2]]} def test_trt_result(self): @@ -399,6 +421,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [2, 2, 3]} def test_trt_result(self): @@ -414,6 +437,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "repeat_times"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [2, 2, 3]} def test_trt_result(self): @@ -429,6 +453,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [2, 2, 3]} def test_trt_result(self): @@ -447,6 +472,7 @@ def setUp(self): } 
self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -465,6 +491,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -483,6 +510,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -501,7 +529,8 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 56, 56, 128]}
-        self.max_shape = {"x": [1, 56, 56, 128]}
+        self.opt_shape = {"x": [3, 56, 56, 128]}
+        self.max_shape = {"x": [3, 56, 56, 128]}
 
     def test_trt_result(self):
         self.check_trt_result()
@@ -522,7 +551,8 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 56, 56, 128]}
-        self.max_shape = {"x": [1, 56, 56, 128]}
+        self.opt_shape = {"x": [3, 56, 56, 128]}
+        self.max_shape = {"x": [3, 56, 56, 128]}
 
     def test_trt_result(self):
         self.check_trt_result()
@@ -538,6 +568,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -554,6 +585,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -570,6 +602,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x", "shift"]}
         self.min_shape = {"x": [1, 4, 10]}
+        self.opt_shape = {"x": [2, 4, 10]}
         self.max_shape = {"x": [5, 4, 10]}
 
     def test_trt_result(self):
@@ -585,6 +618,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 1, 28]}
+        self.opt_shape = {"x": [2, 1, 28]}
         self.max_shape = {"x": [5, 1, 28]}
 
     def test_trt_result(self):
@@ -600,11 +634,45 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
         self.min_shape = {"x": [1, 1, 28]}
+        self.opt_shape = {"x": [2, 1, 28]}
         self.max_shape = {"x": [5, 1, 28]}
 
     def test_trt_result(self):
         self.check_trt_result()
 
+
+class TestNumelTRTCase1Pattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.numel
+        self.api_args = {
+            "x": np.random.randn(2, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [1, 3]}
+        self.opt_shape = {"x": [2, 3]}
+        self.max_shape = {"x": [5, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_fp16_result(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestNumelTRTCase2Pattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.numel
+        self.api_args = {
+            "x": np.random.randn(1, 2, 33, 33).astype("int64"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [1, 2, 33, 33]}
+        self.opt_shape = {"x": [2, 2, 33, 33]}
+        self.max_shape = {"x": [5, 2, 33, 33]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/tensorrt/test_converter_math.py b/test/tensorrt/test_converter_math.py
index cdb9858f738d07..3783615ddbde1c 100644
--- a/test/tensorrt/test_converter_math.py
+++ b/test/tensorrt/test_converter_math.py
@@ -29,6 +29,7 @@ def setUp(self):
         }
         self.program_config = {"feed_list": ["x"]}
self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -44,6 +45,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -59,6 +61,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -74,6 +77,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -89,6 +93,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -104,6 +109,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result_fp16(self): @@ -130,6 +136,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -155,6 +162,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -170,6 +178,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -185,6 +194,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4, 6]} + self.opt_shape = {"x": [2, 4, 6]} self.max_shape = {"x": [5, 4, 6]} def test_trt_result(self): @@ -200,6 +210,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4, 6]} + self.opt_shape = {"x": [2, 4, 6]} self.max_shape = {"x": [5, 4, 6]} def test_trt_result(self): @@ -216,6 +227,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): @@ -232,6 +244,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): @@ -248,6 +261,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): @@ -264,6 +278,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): @@ -279,6 +294,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [5, 2, 3]} def test_trt_result_fp16(self): 
@@ -297,6 +313,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [5, 2, 3]} def test_trt_result_fp16(self): @@ -315,6 +332,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [5, 2, 3]} def test_trt_result_fp16(self): @@ -333,6 +351,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2, 3]} + self.opt_shape = {"x": [2, 2, 3]} self.max_shape = {"x": [5, 2, 3]} def test_trt_result(self): @@ -348,6 +367,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -371,6 +391,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3], "y": [1, 3]} + self.opt_shape = {"x": [2, 3], "y": [2, 3]} self.max_shape = {"x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -385,6 +406,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -399,6 +421,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -419,6 +442,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -436,6 +460,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -456,6 +481,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "min", "max"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -474,6 +500,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "min", "max"]} self.min_shape = {"x": [1, 4]} + self.opt_shape = {"x": [2, 4]} self.max_shape = {"x": [5, 4]} def test_trt_result(self): @@ -489,6 +516,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4], "y": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4], "y": [5, 3, 4]} def test_trt_result_fp16(self): @@ -507,6 +535,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [4]} + self.opt_shape = {"x": [2, 3, 4], "y": [4]} self.max_shape = {"x": [5, 3, 4], "y": [4]} def test_trt_result_fp16(self): @@ -537,6 +566,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4], "y": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4], "y": [5, 3, 4]} def test_trt_result_fp16(self): @@ -555,6 +585,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4], "y": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4], "y": [5, 3, 4]} def test_trt_result_fp16(self): @@ -573,6 +604,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} 
self.min_shape = {"x": [1, 3, 4], "y": [4]} + self.opt_shape = {"x": [2, 3, 4], "y": [4]} self.max_shape = {"x": [5, 3, 4], "y": [4]} def test_trt_result_fp16(self): @@ -603,6 +635,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "y"]} self.min_shape = {"x": [1, 3, 4], "y": [1, 3, 4]} + self.opt_shape = {"x": [2, 3, 4], "y": [2, 3, 4]} self.max_shape = {"x": [5, 3, 4], "y": [5, 3, 4]} def test_trt_result_fp16(self): diff --git a/test/tensorrt/test_converter_norm.py b/test/tensorrt/test_converter_norm.py index 9144a64386395a..d33880c73d9c21 100644 --- a/test/tensorrt/test_converter_norm.py +++ b/test/tensorrt/test_converter_norm.py @@ -33,6 +33,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [2, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -53,6 +54,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "weight", "bias"]} self.min_shape = {"x": [1, 2, 1, 3]} + self.opt_shape = {"x": [2, 2, 1, 3]} self.max_shape = {"x": [5, 2, 1, 3]} def test_trt_result(self): @@ -69,6 +71,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "weight", "bias"]} self.min_shape = {"x": [1, 2, 1]} + self.opt_shape = {"x": [2, 2, 1]} self.max_shape = {"x": [5, 2, 1]} def test_trt_result(self): @@ -85,6 +88,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "weight", "bias"]} self.min_shape = {"x": [1, 2, 1, 3]} + self.opt_shape = {"x": [2, 2, 1, 3]} self.max_shape = {"x": [5, 2, 1, 3]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_ops.py b/test/tensorrt/test_converter_ops.py index 8bc188e3e5514b..544fca80fbecc0 100644 --- a/test/tensorrt/test_converter_ops.py +++ b/test/tensorrt/test_converter_ops.py @@ -28,6 +28,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): @@ -42,6 +43,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} self.max_shape = {"x": [10, 3]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_others.py b/test/tensorrt/test_converter_others.py index a26b5546c9a719..0c88733296f262 100644 --- a/test/tensorrt/test_converter_others.py +++ b/test/tensorrt/test_converter_others.py @@ -66,6 +66,7 @@ def setUp(self): } self.program_config = {"feed_list": ["bboxes", "scores"]} self.min_shape = {"bboxes": [1, 5, 4], "scores": [1, 4, 5]} + self.opt_shape = {"bboxes": [2, 5, 4], "scores": [2, 4, 5]} self.max_shape = {"bboxes": [3, 5, 4], "scores": [3, 4, 5]} def test_trt_result(self): @@ -170,6 +171,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -193,7 +195,8 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} - self.max_shape = {"x": [20, 2]} + self.opt_shape = {"x": [2, 2]} + self.max_shape = {"x": [5, 2]} def test_trt_result(self): self.check_marker(expected_result=False) @@ -216,6 +219,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -239,6 +243,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape 
= {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -262,6 +267,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -285,6 +291,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "starts"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -307,6 +314,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 2]} + self.opt_shape = {"x": [2, 2]} self.max_shape = {"x": [20, 2]} def test_trt_result(self): @@ -329,6 +337,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "values"]} self.min_shape = {"x": [1, 3, 3], "values": [1, 2, 3]} + self.opt_shape = {"x": [2, 3, 3], "values": [2, 2, 3]} self.max_shape = {"x": [4, 3, 3], "values": [4, 2, 3]} def test_trt_result(self): @@ -352,6 +361,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "values"]} self.min_shape = {"x": [1, 3, 3], "values": [1, 2, 3]} + self.opt_shape = {"x": [2, 3, 3], "values": [2, 2, 3]} self.max_shape = {"x": [4, 3, 3], "values": [4, 2, 3]} def test_trt_result(self): @@ -374,6 +384,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "values"]} self.min_shape = {"x": [1, 3, 3], "values": [1, 2, 3]} + self.opt_shape = {"x": [2, 3, 3], "values": [2, 2, 3]} self.max_shape = {"x": [4, 3, 3], "values": [4, 2, 3]} def test_trt_result(self): @@ -388,11 +399,64 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [4, 3, 5]} + self.opt_shape = {"x": [5, 3, 5]} self.max_shape = {"x": [6, 3, 5]} def test_trt_result(self): self.check_trt_result() +def affine_channel(x, scale_shape, bias_shape, layout): + scale = paddle.static.create_parameter( + shape=scale_shape, dtype='float32', name="scale" + ) + bias = paddle.static.create_parameter( + shape=bias_shape, dtype='float32', name="bias" + ) + return _C_ops.affine_channel(x, scale, bias, layout) + + +class TestAffineChannelTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = affine_channel + self.api_args = { + "x": np.random.random((2, 100, 3, 3)).astype("float32"), + "scale_shape": [100], + "bias_shape": [100], + "layout": "NCHW", + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 100, 3, 3]} + self.opt_shape = {"x": [2, 100, 3, 3]} + self.max_shape = {"x": [3, 100, 3, 3]} + + def test_fp32_trt_result(self): + self.check_trt_result() + + def test_fp16_trt_result(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAffineChannelCas1TRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = affine_channel + self.api_args = { + "x": np.random.random((2, 3, 3, 100)).astype("float32"), + "scale_shape": [100], + "bias_shape": [100], + "layout": "NHWC", + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 3, 3, 100]} + self.opt_shape = {"x": [2, 3, 3, 100]} + self.max_shape = {"x": [3, 3, 3, 100]} + + def test_fp32_trt_result(self): + self.check_trt_result() + + def test_fp16_trt_result(self): + self.check_trt_result(precision_mode="fp16") + + if __name__ == '__main__': unittest.main() diff --git a/test/tensorrt/test_converter_pooling.py b/test/tensorrt/test_converter_pooling.py index 32523ba4c27e96..5219b71df28d47 100644 --- a/test/tensorrt/test_converter_pooling.py +++ b/test/tensorrt/test_converter_pooling.py @@ -56,6 +56,7 @@ def 
setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [1, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -80,6 +81,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [1, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -104,6 +106,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [1, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -128,6 +131,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 2, 3]} + self.opt_shape = {"x": [1, 1, 2, 3]} self.max_shape = {"x": [5, 1, 2, 3]} def test_trt_result(self): @@ -152,6 +156,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 1, 5, 5]} + self.opt_shape = {"x": [1, 1, 5, 5]} self.max_shape = {"x": [5, 1, 5, 5]} def test_trt_result(self): @@ -176,6 +181,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 16, 56, 56]} + self.opt_shape = {"x": [1, 16, 56, 56]} self.max_shape = {"x": [5, 16, 56, 56]} def test_trt_result(self): @@ -200,7 +206,8 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 5, 5]} - self.max_shape = {"x": [2, 3, 5, 5]} # 动态批次大小,宽度保持为 1 + self.opt_shape = {"x": [1, 3, 5, 5]} + self.max_shape = {"x": [2, 3, 5, 5]} def test_trt_result(self): self.check_trt_result() diff --git a/test/tensorrt/test_converter_search.py b/test/tensorrt/test_converter_search.py index 2665caee450dd5..dffb0348bd64b6 100644 --- a/test/tensorrt/test_converter_search.py +++ b/test/tensorrt/test_converter_search.py @@ -29,6 +29,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -89,6 +90,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -150,6 +152,7 @@ def setUp(self): } self.program_config = {"feed_list": ["condition", "x", "y"]} self.min_shape = {"condition": [1, 3], "x": [1, 3], "y": [1, 3]} + self.opt_shape = {"condition": [2, 3], "x": [2, 3], "y": [2, 3]} self.max_shape = {"condition": [5, 3], "x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -165,6 +168,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -180,6 +184,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -195,6 +200,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -226,6 +232,7 @@ def setUp(self): } self.program_config = {"feed_list": ["condition", "x", "y"]} self.min_shape = {"condition": [1, 3], "x": [1, 3], "y": [1, 3]} + self.opt_shape = {"condition": [2, 3], "x": [2, 3], "y": [2, 3]} self.max_shape = {"condition": [5, 3], "x": [5, 3], "y": [5, 3]} def test_trt_result(self): @@ -241,6 +248,7 @@ def setUp(self): } 
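# TensorRT optimization profiles require min_shape <= opt_shape <= max_shape for every dimension of a dynamic input.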
self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -256,6 +264,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -272,6 +281,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1]} + self.opt_shape = {"x": [2]} self.max_shape = {"x": [5]} def test_trt_result(self): @@ -288,6 +298,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "index"]} self.min_shape = {"x": [1, 3, 3], "index": [1]} + self.opt_shape = {"x": [2, 3, 3], "index": [2]} self.max_shape = {"x": [5, 3, 3], "index": [5]} def test_trt_result_fp16(self): @@ -307,6 +318,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "index"]} self.min_shape = {"x": [1, 3, 3], "index": [1]} + self.opt_shape = {"x": [2, 3, 3], "index": [2]} self.max_shape = {"x": [5, 3, 3], "index": [5]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_stat.py b/test/tensorrt/test_converter_stat.py index 4ea43f9bbb2f6c..9a7e8d19c4cc99 100644 --- a/test/tensorrt/test_converter_stat.py +++ b/test/tensorrt/test_converter_stat.py @@ -30,6 +30,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3]} + self.opt_shape = {"x": [2, 3]} self.max_shape = {"x": [5, 3]} def test_trt_result(self): @@ -46,6 +47,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x"]} self.min_shape = {"x": [1, 3, 2]} + self.opt_shape = {"x": [2, 3, 2]} self.max_shape = {"x": [5, 3, 2]} def test_trt_result(self): diff --git a/test/tensorrt/test_converter_vision.py b/test/tensorrt/test_converter_vision.py index 59d735311eaf6a..62b0b14f49ae7a 100644 --- a/test/tensorrt/test_converter_vision.py +++ b/test/tensorrt/test_converter_vision.py @@ -40,6 +40,7 @@ def setUp(self): } self.program_config = {"feed_list": ["x", "grid"]} self.min_shape = {"x": [1, 1, 3, 3], "grid": [1, 3, 4, 2]} + self.opt_shape = {"x": [1, 1, 3, 3], "grid": [1, 3, 4, 2]} self.max_shape = {"x": [5, 1, 3, 3], "grid": [5, 3, 4, 2]} From 5718f746d945449b5bb9a8615e0b1f06aff09475 Mon Sep 17 00:00:00 2001 From: Zhou Xin Date: Tue, 7 Jan 2025 19:47:14 +0800 Subject: [PATCH 02/57] [CINN][Backend Pass Update No.10] Update ReplaceCrossThreadReduction pass (#70592) * Update replaceCrossThreadReduction * Add visit logics for IfThenElse stmt * Refine test and exception message * Leverage help function Mutate to refactor CrossThreadReductionReplacer --- paddle/cinn/ir/schedule/ir_schedule_util.cc | 12 ++ paddle/cinn/ir/schedule/ir_schedule_util.h | 7 + .../optim/replace_cross_thread_reduction.cc | 199 ++++++++++-------- .../optim/replace_cross_thread_reduction.h | 11 +- .../replace_cross_thread_reduction_test.cc | 26 ++- 5 files changed, 152 insertions(+), 103 deletions(-) diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index 19bed9130494dd..316854db08ebed 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -110,6 +110,18 @@ int GetLoopExtent(const Expr& loop) { return static_cast(loop.As()->extent.get_constant()); } +int GetLoopExtent(const ir::stmt::For loop) { + PADDLE_ENFORCE_EQ( + cinn::common::is_zero(loop->min()), + true, + ::common::errors::InvalidArgument("For node's min should be zero.")); + 
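+  // The extent must be a compile-time constant before it can be narrowed to an int.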
PADDLE_ENFORCE_EQ(loop->extent().is_constant(), + true, + ::common::errors::InvalidArgument( + "For node's extent should be constant.")); + return static_cast(loop->extent().get_constant()); +} + void SetCudaAxisInfo(ir::LoweredFunc lowered_func) { auto CannotProveLT = [](const ir::Expr& lhs, const ir::Expr& rhs) -> bool { std::vector exprs{rhs, lhs}; diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h index 7ec7e4f96f4a2c..576a7448147e6e 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.h +++ b/paddle/cinn/ir/schedule/ir_schedule_util.h @@ -65,6 +65,13 @@ Tensor GetReadTensor(const Expr& block, int index); */ int GetLoopExtent(const Expr& loop); +/** + * \brief Given a For node, return its extent as int. + * @param loop The given For node + * @return The extent of For node + */ +int GetLoopExtent(const ir::stmt::For loop); + /** * \brief Given a vector of Exprs, return whether they contain a var with * specific name. diff --git a/paddle/cinn/optim/replace_cross_thread_reduction.cc b/paddle/cinn/optim/replace_cross_thread_reduction.cc index 487214c08c6b90..947911eeef30c4 100644 --- a/paddle/cinn/optim/replace_cross_thread_reduction.cc +++ b/paddle/cinn/optim/replace_cross_thread_reduction.cc @@ -26,7 +26,9 @@ #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/ir_printer.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/ir/utils/stmt_converter.h" #include "paddle/cinn/lang/compute.h" +#include "paddle/cinn/pass/pass_manager.h" namespace cinn { namespace optim { @@ -40,27 +42,17 @@ struct BufferCmp { }; thread_local std::set shm_buffer_; -struct CrossThreadReductionReplacer : public ir::IRMutator<> { +struct CrossThreadReductionReplacer { void operator()(ir::LoweredFunc fn) { Visit(fn.As()); } private: - bool CanReplace(const ir::ScheduleBlockRealize* block_realize) { - const ir::ScheduleBlock* schedule_block = - block_realize->schedule_block.As(); - - PADDLE_ENFORCE_NOT_NULL( - schedule_block, - ::common::errors::PreconditionNotMet( - "The schedule block pointer in CanReplace must not be null.")); - - if (block_realize->schedule_block.As()->name.substr( - 0, 4) == "root") { + bool CanReplace(const ir::stmt::Schedule block) { + if (block->name().substr(0, 4) == "root") { return false; } - const std::vector& iter_values = block_realize->iter_values; - const std::vector& iter_vars = schedule_block->iter_vars; - ir::Expr body = schedule_block->body; + const std::vector& iter_values = block->iter_values(); + const std::vector& iter_vars = block->iter_vars(); std::unordered_set reduce_var_names; for (int i = 0; i < iter_values.size(); ++i) { @@ -72,23 +64,22 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { if (x->as_var()) { reduce_var_names.insert(x->as_var()->name); } - return false; }); } - auto IsThreadBindOnReduceAxis = [&](const ir::For* for_node) { - return reduce_var_names.count(for_node->loop_var->name) > 0 && + auto IsThreadBindOnReduceAxis = [&](const ir::stmt::For& for_node) { + return reduce_var_names.count(for_node->loop_var()->name) > 0 && for_node->is_gpu_thread_binded(); }; std::vector thread_binded_reduce_loop_indices; bool is_thread_binded_inner_loop = false; for (int i = 0; i < cur_loops_.size(); ++i) { - bool is_thread_bind_on_reduce = - IsThreadBindOnReduceAxis(cur_loops_[i].As()); - if (is_thread_bind_on_reduce && ir::GetLoopExtent(cur_loops_[i]) == 1) + bool is_thread_bind_on_reduce = IsThreadBindOnReduceAxis(cur_loops_[i]); + if (is_thread_bind_on_reduce && 
ir::GetLoopExtent(cur_loops_[i]) == 1) { return false; + } if (is_thread_binded_inner_loop || is_thread_bind_on_reduce) { if (ir::GetLoopExtent(cur_loops_[i]) > 1024) { return false; @@ -115,7 +106,7 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { int GetBlockSize() const { int block_size = 1; for (auto& loop : cur_loops_) { - if (loop->as()->is_gpu_thread_binded()) { + if (loop->is_gpu_thread_binded()) { block_size *= ir::GetLoopExtent(loop); } } @@ -123,13 +114,14 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { } template - void ReplaceByContinuousReduceExternCall(ir::Expr* store, bool return_warp) { - auto* node = store->As()->value.As(); + void ReplaceByContinuousReduceExternCall(ir::stmt::Store store, + bool return_warp) { + auto* node = store->value().As(); PADDLE_ENFORCE_NOT_NULL( node, ::common::errors::InvalidArgument("The node must not be null.")); auto& operand = node->b(); std::string reduce_func_name = hlir::pe::CrossThreadReduceExternalFuncName( - store->As()->value, operand.template As()->tensor); + store->value(), operand.template As()->tensor); auto tmp_dtype = operand.template As()->tensor.as_tensor()->type(); auto tmp_buffer = ir::_Buffer_::Make( @@ -138,18 +130,18 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { tmp_buffer->dtype = tmp_dtype; tmp_buffer->memory_type = ir::MemoryType::GPUShared; shm_buffer_.insert(tmp_buffer); - store->As()->value = lang::CallExtern( - reduce_func_name, {node->b(), tmp_buffer, ir::Expr(return_warp)}); + store->set_value(lang::CallExtern( + reduce_func_name, {node->b(), tmp_buffer, ir::Expr(return_warp)})); } template - void ReplaceByDiscreteReduceExternCall(ir::Expr* store) { - auto* node = store->As()->value.As(); + void ReplaceByDiscreteReduceExternCall(ir::stmt::Store store) { + auto* node = store->value().As(); PADDLE_ENFORCE_NOT_NULL( node, ::common::errors::InvalidArgument("The node must not be null.")); auto& operand = node->b(); std::string reduce_func_name = hlir::pe::DiscreteReduceExternalFuncName( - store->As()->value, operand.template As()->tensor); + store->value(), operand.template As()->tensor); auto tmp_dtype = operand.template As()->tensor.as_tensor()->type(); auto tmp_buffer = ir::_Buffer_::Make( @@ -158,12 +150,12 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { tmp_buffer->dtype = tmp_dtype; tmp_buffer->memory_type = ir::MemoryType::GPUShared; shm_buffer_.insert(tmp_buffer); - store->As()->value = - lang::CallExtern(reduce_func_name, {node->b(), tmp_buffer}); + store->set_value( + lang::CallExtern(reduce_func_name, {node->b(), tmp_buffer})); } template - void ReplaceByReduceExternCall(ir::Expr* store, + void ReplaceByReduceExternCall(ir::stmt::Store store, const ir::ReduceMethod& method) { std::visit(cinn::adt::match{ [&](const ir::NoneReduceMethod&) { @@ -181,10 +173,11 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { method); } - void Visit(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - void Visit(ir::_LoweredFunc_* fn) override { - ir::IRMutator<>::Visit(fn); + void Visit(ir::_LoweredFunc_* fn) { + ir::stmt::Mutate( + fn->body_block, + [&](ir::stmt::StmtRef stmt) { PreCall(stmt); }, + [&](ir::stmt::StmtRef stmt) { PostCall(stmt); }); if (std::find_if(fn->temp_bufs.begin(), fn->temp_bufs.end(), [&](const ir::Buffer& buf) -> bool { @@ -198,74 +191,98 @@ struct CrossThreadReductionReplacer : public ir::IRMutator<> { shm_buffer_.clear(); } - void Visit(const ir::ScheduleBlockRealize* expr, ir::Expr* op) override { - if 
(!CanReplace(expr)) { - VLOG(6) << "Can't replace cross thread reduction: " << *op; - IRMutator::Visit(expr, op); - return; + void PreCall(ir::stmt::StmtRef stmt) { + switch (stmt->stmt_type()) { + case ir::StmtNodeTy::Schedule: + VisitStmt(stmt.as()); + break; + case ir::StmtNodeTy::For: + cur_loops_.push_back(stmt.as()); + break; + default: + break; } - VLOG(6) << "Can replace cross thread reduction: " << *op; + } - const ir::ScheduleBlock* schedule_block = - expr->schedule_block.As(); - PADDLE_ENFORCE_NOT_NULL( - schedule_block, - ::common::errors::PreconditionNotMet( - "The schedule block pointer in Visit must not be null.")); - ir::Expr original_update_body = schedule_block->body; - ir::Expr original_update_stmt; - PADDLE_ENFORCE_EQ(original_update_body.As() || - original_update_body.As(), - true, - ::common::errors::InvalidArgument( - "The type of original_update_body is incorrect." - "Expected type is Block or Store.")); - if (original_update_body.As()) { - PADDLE_ENFORCE_EQ( - original_update_body.As()->stmts.size(), - 1, - ::common::errors::InvalidArgument( - "The size of stmts is incorrect." - "Expected size is 1, but receive %d.", - original_update_body.As()->stmts.size())); - original_update_stmt = original_update_body.As()->stmts[0]; - } else if (original_update_body.As()) { - original_update_stmt = original_update_body; + void PostCall(ir::stmt::StmtRef stmt) { + switch (stmt->stmt_type()) { + case ir::StmtNodeTy::For: + cur_loops_.pop_back(); + break; + default: + break; } - -#define REPLACE_TO_EXTERNAL_CALL(Op) \ - if (original_update_stmt.As()->value.As()) { \ - ReplaceByReduceExternCall(&original_update_stmt, \ - schedule_block->reduce_method); \ } - REPLACE_TO_EXTERNAL_CALL(ir::Add) - REPLACE_TO_EXTERNAL_CALL(ir::Mul) - REPLACE_TO_EXTERNAL_CALL(ir::Max) - REPLACE_TO_EXTERNAL_CALL(ir::Min) - REPLACE_TO_EXTERNAL_CALL(ir::And) - REPLACE_TO_EXTERNAL_CALL(ir::Or) -#undef REPLACE_TO_EXTERNAL_CALL - - VLOG(6) << "Replace cross thread reduction: " << *op; - - IRMutator::Visit(expr, op); - } + void VisitStmt(ir::stmt::Schedule stmt) { + if (!CanReplace(stmt)) { + return; + } + ir::stmt::BlockRef original_update_body = stmt->body(); - void Visit(const ir::For* expr, ir::Expr* op) override { - cur_loops_.push_back(*op); - IRMutator::Visit(expr, op); - cur_loops_.pop_back(); + ir::stmt::Store original_update_stmt; + PADDLE_ENFORCE_EQ(original_update_body->stmts().size(), + 1, + ::common::errors::InvalidArgument( + "The size of statements is incorrect." 
+ "Expected size is 1, but receive %d.", + original_update_body->stmts().size())); + PADDLE_ENFORCE_EQ(original_update_body->stmts()[0].isa(), + true, + ::common::errors::InvalidArgument( + "The stmt in schedule's body should be store " + "statement, but get %s.", + original_update_body->stmts()[0]->stmt_type())); + original_update_stmt = + original_update_body->stmts()[0].as(); + + switch (original_update_stmt->value()->node_type()) { + case cinn::ir::IrNodeTy::Add: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::Mul: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::Max: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::Min: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::And: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + case cinn::ir::IrNodeTy::Or: + ReplaceByReduceExternCall(original_update_stmt, + stmt->reduce_method()); + break; + default: + PADDLE_THROW(::common::errors::InvalidArgument( + "The node type is not supported in cross thread reduction.")); + } } private: - std::vector cur_loops_; + std::vector cur_loops_; }; } // namespace void ReplaceCrossThreadReduction(ir::LoweredFunc fn) { - CrossThreadReductionReplacer()(fn); + FuncPassManager manager; + manager.AddPass(std::make_unique()); + manager.Run(fn); +} + +LogicalResult ReplaceCrossThreadReductionPass::Run(ir::LoweredFunc func) { + CrossThreadReductionReplacer replacer; + replacer(func); + return LogicalResult::success(); } } // namespace optim diff --git a/paddle/cinn/optim/replace_cross_thread_reduction.h b/paddle/cinn/optim/replace_cross_thread_reduction.h index 9de7bfba8e1aed..16d83d384ff8d3 100644 --- a/paddle/cinn/optim/replace_cross_thread_reduction.h +++ b/paddle/cinn/optim/replace_cross_thread_reduction.h @@ -16,14 +16,21 @@ * This file implements the strategy to remove the unnecessary nested block. */ #pragma once -#include -#include "paddle/cinn/common/common.h" #include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/pass/pass.h" namespace cinn { namespace optim { +class ReplaceCrossThreadReductionPass : public FuncPass { + public: + ReplaceCrossThreadReductionPass() + : FuncPass("replace_cross_thread_reduction") {} + + LogicalResult Run(ir::LoweredFunc func) override; +}; + /** * Replace cross thread reduction to external call. 
*/ diff --git a/paddle/cinn/optim/replace_cross_thread_reduction_test.cc b/paddle/cinn/optim/replace_cross_thread_reduction_test.cc index dd304a43213f5f..a7d3fc6c9f973b 100644 --- a/paddle/cinn/optim/replace_cross_thread_reduction_test.cc +++ b/paddle/cinn/optim/replace_cross_thread_reduction_test.cc @@ -63,19 +63,25 @@ TEST(CrossThreadReductionReplacer, basic) { EXPECT_EQ(utils::GetStreamCnt(new_func->body), utils::Trim(R"ROC({ ScheduleBlock(root) { - thread_bind[blockIdx.x] for (i, 0, 64) { - ScheduleBlock(B__reduce_init) + thread_bind[blockIdx.x] for (i, 0, 64) { - i0 = axis.bind(i) - B__reduce_init[i0] = 0.00000000f - } - thread_bind[threadIdx.x] for (reduce_j, 0, 128) - { - ScheduleBlock(B) + ScheduleBlock(B__reduce_init) + { + i0 = axis.bind(i) + { + B__reduce_init[i0] = 0.00000000f + } + } + thread_bind[threadIdx.x] for (reduce_j, 0, 128) { - i0_0, i1 = axis.bind(i, reduce_j) - B[i0_0] = cinn_partial_block_reduce_sum_fp32_internal_shm(A[i0_0, i1], _Buffer_(shm32__fp32_reduce), false) + ScheduleBlock(B) + { + i0_0, i1 = axis.bind(i, reduce_j) + { + B[i0_0] = cinn_partial_block_reduce_sum_fp32_internal_shm(A[i0_0, i1], _Buffer_(shm32__fp32_reduce), false) + } + } } } } From 63527fb67da56d975895f0468ece7bf2881c4919 Mon Sep 17 00:00:00 2001 From: Zhou Xin Date: Tue, 7 Jan 2025 19:53:10 +0800 Subject: [PATCH 03/57] [CINN][Backend Pass Update No.8] Update rearrange_load_instruction pass (#70437) * update rearrange_load_ins_pass * Implement rearrange_laod_ins as a FuncPass * Leverage Visit to implement ContainsStmtInStmt * Remove commentted code --- paddle/cinn/optim/CMakeLists.txt | 2 +- paddle/cinn/optim/optimize.cc | 8 +- .../cinn/optim/rearrange_load_instruction.cc | 305 ------------ .../optim/rearrange_load_instruction_pass.cc | 449 ++++++++++++++++++ ...on.h => rearrange_load_instruction_pass.h} | 11 +- 5 files changed, 464 insertions(+), 311 deletions(-) delete mode 100644 paddle/cinn/optim/rearrange_load_instruction.cc create mode 100644 paddle/cinn/optim/rearrange_load_instruction_pass.cc rename paddle/cinn/optim/{rearrange_load_instruction.h => rearrange_load_instruction_pass.h} (94%) diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index 25b9d5032b6555..d95c6e1d238401 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -36,7 +36,7 @@ gather_srcs( if_fusion_pass.cc merge_block_utils.cc eliminate_common_global_memory_read.cc - rearrange_load_instruction.cc + rearrange_load_instruction_pass.cc check_tensor_buffer_map.cc longlong2int_pass.cc vectorize_for_trans.cc diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc index 7f7815ba2b9670..fec6877220b8c7 100644 --- a/paddle/cinn/optim/optimize.cc +++ b/paddle/cinn/optim/optimize.cc @@ -31,7 +31,7 @@ #include "paddle/cinn/optim/lower_function_call_bind_vars.h" #include "paddle/cinn/optim/lower_intrin.h" #include "paddle/cinn/optim/map_extern_call.h" -#include "paddle/cinn/optim/rearrange_load_instruction.h" +#include "paddle/cinn/optim/rearrange_load_instruction_pass.h" #include "paddle/cinn/optim/remove_schedule_block_pass.h" #include "paddle/cinn/optim/replace_const_param_to_integer.h" #include "paddle/cinn/optim/replace_cross_block_reduction.h" @@ -114,7 +114,7 @@ ir::LoweredFunc Optimize(ir::LoweredFunc fn, // Simplify already contains CastSimplify Simplify(&copied->body); - VLOG(10) << "After Optimize Simplify:" << copied; + VLOG(4) << "After Optimize Simplify:" << copied; BlockPassManager pass_manager; 
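  // Block-level passes run in the order they are added to the manager.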
pass_manager.AddPass(CreateIfFusionPass()); @@ -122,7 +122,9 @@ ir::LoweredFunc Optimize(ir::LoweredFunc fn, target.arch.Match( [&](common::NVGPUArch) { - RearrangeLoadInstruction(&copied->body); + FuncPassManager func_pass_manager; + func_pass_manager.AddPass(CreateRearrangeLoadInstructionPass()); + func_pass_manager.Run(copied); VLOG(4) << "After Optimize RearrangeLoadInstruction:" << copied; }, [](auto) {}); diff --git a/paddle/cinn/optim/rearrange_load_instruction.cc b/paddle/cinn/optim/rearrange_load_instruction.cc deleted file mode 100644 index 9c3b8b067e1f58..00000000000000 --- a/paddle/cinn/optim/rearrange_load_instruction.cc +++ /dev/null @@ -1,305 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/cinn/optim/rearrange_load_instruction.h" - -#include -#include "paddle/cinn/ir/ir_analyzer/ir_analyzer.h" -#include "paddle/cinn/ir/ir_mutator.h" -#include "paddle/cinn/ir/ir_printer.h" - -PD_DECLARE_bool(cinn_enable_rearrange_load); - -namespace cinn { -namespace optim { -namespace { - -constexpr int MaxRearrangeLoadNum = 8; - -template -bool ContainsExprNode(const ir::Expr& expr) { - auto res = ir::ir_utils::CollectIRNodes( - expr, - [](const ir::Expr* x) { return x->As(); }, - /* uniq_target = */ true); - return !res.empty(); -} - -/** - * Calculate the buffer size as a constant. For dynamic dims, since they are - * difficult to compare, we just estimate them to be 32. - * Note: this is a heuristic optimization, so the exact number is not very - * important. - */ -int64_t EstimateBufferSize(const ir::Buffer& buffer) { - int64_t size = 1; - for (auto& dim_size : buffer->shape) { - if (dim_size.is_constant()) { - size *= dim_size.as_int64(); - } else { - size *= 32; - } - } - return size; -} - -std::vector SortLoadsByBufferSizes( - const std::unordered_map& load_map, - std::vector load_list) { - // Calculate the buffer sizes of loads (with estimation). - std::map buffer_size_map; - for (auto& [_, load_expr] : load_map) { - auto& buffer = load_expr->As()->tensor.as_tensor()->buffer; - if (buffer_size_map.count(buffer)) { - continue; - } - buffer_size_map[buffer] = EstimateBufferSize(buffer); - } - - const auto GetBufferSize = [&](const std::string& key) { - auto& buffer = load_map.at(key)->As()->tensor.as_tensor()->buffer; - return buffer_size_map[buffer]; - }; - - // Sort loads by their buffer sizes from large to small. - // Note: we use stable sort here, because for equal-size loads, we want to - // keep their original order. 
- std::stable_sort(load_list.begin(), - load_list.end(), - [&](const std::string& key1, const std::string& key2) { - return GetBufferSize(key1) > GetBufferSize(key2); - }); - return load_list; -} - -struct LoadCollector : public ir::IRMutator<> { - explicit LoadCollector(const std::set& locally_defined_buffers) - : locally_defined_buffers_(locally_defined_buffers) {} - - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - private: - // Collect loads that meet the following criteria: - // 1) It is loading from global memory. Local loads are simply register reads - // and do not require rearrangement. - // 2) The value being loaded is not defined locally by a previous store. In - // such cases, the value resides in a register rather than in memory, thus - // doesn't need rearrangement. This criteria also prevents data-dependency - // harzards. - // 3) It doesn't contains indirect indices (i.e. loads within indices). - // Indirect indices are hard to manage and are seldom seem, so we choose - // not to handle them. - void Visit(const ir::Load* op, ir::Expr* expr) override { - auto& buffer = op->tensor.as_tensor()->buffer; - if (buffer->memory_type != ir::MemoryType::Heap) { - return; - } - if (locally_defined_buffers_.count(buffer) > 0) { - return; - } - for (auto& index_expr : op->indices) { - if (ContainsExprNode(index_expr)) { - return; - } - } - std::string key = utils::GetStreamCnt(*expr); - CollectLoad(key, expr); - } - - // Handle Select as a special op. - // Since Select evaluates only one of its two branches, we can rearrange a - // load in Select only if the load appears in both branches, otherwise we may - // violate the control dependency. - void Visit(const ir::Select* op, ir::Expr* expr) override { - auto* node = expr->As(); - ir::IRMutator<>::Visit(&node->condition, &node->condition); - - LoadCollector true_collector(locally_defined_buffers_); - true_collector(&node->true_value); - LoadCollector false_collector(locally_defined_buffers_); - false_collector(&node->false_value); - - for (auto& key : true_collector.load_list_) { - if (false_collector.load_map_.count(key) > 0) { - CollectLoad(key, true_collector.load_map_[key]); - } - } - } - - void CollectLoad(const std::string& key, const ir::Expr* expr) { - auto [_, is_first] = load_map_.emplace(key, expr); - if (is_first) { - load_list_.push_back(key); - } - } - - public: - // map from the signatures of loads to the load nodes - std::unordered_map load_map_; - // list of the signatures of loads in the order they are visited - std::vector load_list_; - - private: - const std::set& locally_defined_buffers_; -}; - -struct LoadReplacer : public ir::IRMutator<> { - explicit LoadReplacer(const std::unordered_map& var_map) - : var_map_(var_map) {} - - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - private: - void Visit(const ir::Load* op, ir::Expr* expr) override { - std::string key = utils::GetStreamCnt(*expr); - if (var_map_.count(key) > 0) { - *expr = Expr(var_map_.at(key)); - } - } - - const std::unordered_map& var_map_; -}; - -struct RearrangeLoadInstructionMutator : public ir::IRMutator<> { - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - private: - // A block is a leaf block if it is inside at least one loop, and all of its - // stmts are schedule blocks. 
- bool IsLeafBlock(const ir::Block& block) { - if (parent_loops_.empty()) { - return false; - } - for (auto& stmt : block.stmts) { - if (!stmt.As()) { - return false; - } - auto* node = stmt.As() - ->schedule_block.As(); - if (node->name.substr(0, 4) == "root") { - return false; - } - } - return true; - } - - // Local buffer initialization is like: - // var_1_local[0] = var_1[blockIdx.x], - // where the lhs is a local buffer and the rhs is a single load. - bool IsLocalBufferInit(const ir::Store& store) { - auto& store_buffer = store.tensor.as_tensor()->buffer; - return store_buffer->memory_type == ir::MemoryType::GPULocal && - store.value.As(); - } - - void DoRearrangeLoadInstruction(ir::Block* block) { - // Step 1. Collect loads in each schedule block under this block. - // Requirements: - // 1) The schedule block cannot contain IfThenElse, or we will violate the - // control dependency. Schedule blocks that have IfThenElse usually don't - // benefit from rearranging loads, so it's ok to skip them. - // 2) The schedule block is not local buffer initialization, because when - // initializing the local buffer with a load, we have already rearranged - // that load. - // 3) There are more constrains on the loads to collect, see LoadCollector - // for details. - LoadCollector collector(locally_defined_buffers_); - for (auto& stmt : block->stmts) { - ir::Expr store = ir::analyzer::GetStoreOfSBlock(stmt); - auto* store_node = store.As(); - if (ContainsExprNode(stmt)) continue; - if (IsLocalBufferInit(*store_node)) continue; - collector(&store_node->value); - } - - // Step 2. Sort the loads by their buffer sizes from large to small, and - // only keep the first `MaxRearrangeLoadNum` loads. - // Performance concerns: - // 1) Larger buffers need more time to access, so we should issue their - // corresponding loads earlier. - // 2) Rearranged loads will consume registers, so we should set a limit - // to prevent register overflow. - std::vector load_list = - SortLoadsByBufferSizes(collector.load_map_, collector.load_list_); - if (load_list.size() > MaxRearrangeLoadNum) { - load_list.resize(MaxRearrangeLoadNum); - } - - // Step 3. Create loads with Let at the beginning of the block. - std::vector new_stmts; - std::unordered_map var_map; - for (auto& key : load_list) { - auto* load_expr = collector.load_map_[key]; - auto* tensor = load_expr->As()->tensor.as_tensor(); - ir::Var local_var = ir::Var(common::UniqName(tensor->name + "_local"), - tensor->buffer->dtype); - ir::Expr let_expr = ir::Let::Make(local_var, *load_expr); - new_stmts.push_back(let_expr); - var_map[key] = local_var; - } - - // Step 4. Replace loads in schedule blocks with the above Let vars. 
- LoadReplacer replacer(var_map); - for (auto& stmt : block->stmts) { - replacer(&stmt); - new_stmts.push_back(stmt); - } - block->stmts = std::move(new_stmts); - } - - void Visit(const ir::Block* op, ir::Expr* expr) override { - auto* node = expr->As(); - ir::IRMutator<>::Visit(op, expr); - if (IsLeafBlock(*op)) { - DoRearrangeLoadInstruction(node); - } - } - - void Visit(const ir::ScheduleBlockRealize* op, ir::Expr* expr) override { - auto* block_node = op->schedule_block.As(); - if (block_node->name.substr(0, 4) == "root") { - ir::IRMutator<>::Visit(op, expr); - return; - } - for (auto& buffer_range : block_node->write_buffers) { - auto& write_buffer = buffer_range.As()->buffer; - locally_defined_buffers_.insert(write_buffer.as_buffer_ref()); - } - } - - void Visit(const ir::For* op, ir::Expr* expr) override { - parent_loops_.push_back(op); - ir::IRMutator<>::Visit(op, expr); - parent_loops_.pop_back(); - } - - private: - // Buffers whose values are defined locally inside this function. - // Note: even if a buffer is allocated on global memory, its value may be - // assigned locally. If so, it also belongs to this set. - std::set locally_defined_buffers_; - - std::vector parent_loops_; -}; - -} // namespace - -void RearrangeLoadInstruction(Expr* expr) { - if (!FLAGS_cinn_enable_rearrange_load) return; - RearrangeLoadInstructionMutator mutator; - mutator(expr); -} - -} // namespace optim -} // namespace cinn diff --git a/paddle/cinn/optim/rearrange_load_instruction_pass.cc b/paddle/cinn/optim/rearrange_load_instruction_pass.cc new file mode 100644 index 00000000000000..366c077306d7a9 --- /dev/null +++ b/paddle/cinn/optim/rearrange_load_instruction_pass.cc @@ -0,0 +1,449 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/cinn/optim/rearrange_load_instruction_pass.h" +#include "paddle/cinn/common/cinn_value.h" +#include "paddle/cinn/ir/buffer.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/ir/ir_mutator.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/ir_visitor.h" +#include "paddle/cinn/ir/stmt_visitors.h" +#include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/stmt_converter.h" +#include "paddle/phi/core/enforce.h" + +PD_DECLARE_bool(cinn_enable_rearrange_load); + +namespace cinn { +namespace optim { + +using ir::stmt::Alloc; +using ir::stmt::BlockRef; +using ir::stmt::Evaluate; +using ir::stmt::For; +using ir::stmt::Free; +using ir::stmt::IfThenElse; +using ir::stmt::Let; +using ir::stmt::Schedule; +using ir::stmt::StmtRef; +using ir::stmt::Store; + +namespace { +constexpr int MaxRearrangeLoadNum = 8; + +template +bool ContainsExprNodeInExpr(const ir::Expr& expr) { + auto res = ir::ir_utils::CollectIRNodes( + expr, + [](const ir::Expr* x) { return x->As(); }, + /* uniq_target = */ true); + return !res.empty(); +} + +template +bool ContainsStmtInStmt(const StmtRef& stmt) { + bool found = false; + auto CheckStmt = [&found](const StmtRef& stmt) { + if (!found && stmt.isa()) { + found = true; + } + }; + ir::stmt::Visit(stmt, CheckStmt, [](const StmtRef&) {}); + return found; +} + +/** + * Calculate the buffer size as a constant. For dynamic dims, since they are + * difficult to compare, we just estimate them to be 32. + * Note: this is a heuristic optimization, so the exact number is not very + * important. + */ +int64_t EstimateBufferSize(const ir::Buffer& buffer) { + int64_t size = 1; + for (auto& dim_size : buffer->shape) { + if (dim_size.is_constant()) { + size *= dim_size.as_int64(); + } else { + size *= 32; + } + } + return size; +} + +std::vector SortLoadsByBufferSizes( + const std::unordered_map& load_map, + std::vector load_list) { + // Calculate the buffer sizes of loads (with estimation). + std::map buffer_size_map; + for (auto& [_, load_expr] : load_map) { + auto& buffer = load_expr->As()->tensor.as_tensor()->buffer; + if (buffer_size_map.count(buffer)) { + continue; + } + buffer_size_map[buffer] = EstimateBufferSize(buffer); + } + + const auto GetBufferSize = [&](const std::string& key) { + auto& buffer = load_map.at(key)->As()->tensor.as_tensor()->buffer; + return buffer_size_map[buffer]; + }; + + // Sort loads by their buffer sizes from large to small. + // Note: we use stable sort here, because for equal-size loads, we want to + // keep their original order. + std::stable_sort(load_list.begin(), + load_list.end(), + [&](const std::string& key1, const std::string& key2) { + return GetBufferSize(key1) > GetBufferSize(key2); + }); + return load_list; +} + +struct LoadCollector : public ir::IRMutator<> { + explicit LoadCollector(const std::set& locally_defined_buffers) + : locally_defined_buffers_(locally_defined_buffers) {} + + void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + + private: + // Collect loads that meet the following criteria: + // 1) It is loading from global memory. Local loads are simply register + // reads and do not require rearrangement. + // 2) The value being loaded is not defined locally by a previous store. In + // such cases, the value resides in a register rather than in memory, + // thus doesn't need rearrangement. This criteria also prevents + // data-dependency harzards. + // 3) It doesn't contains indirect indices (i.e. loads within indices). 
+ // Indirect indices are hard to manage and are seldom seen, so we choose + // not to handle them. + void Visit(const ir::Load* op, ir::Expr* expr) override { + auto& buffer = op->tensor.as_tensor()->buffer; + if (buffer->memory_type != ir::MemoryType::Heap) { + return; + } + if (locally_defined_buffers_.count(buffer) > 0) { + return; + } + for (auto& index_expr : op->indices) { + if (ContainsExprNodeInExpr(index_expr)) { + return; + } + } + std::string key = utils::GetStreamCnt(*expr); + CollectLoad(key, expr); + } + + // Handle Select as a special op. + // Since Select evaluates only one of its two branches, we can rearrange a + // load in Select only if the load appears in both branches, otherwise we + // may violate the control dependency. + void Visit(const ir::Select* op, ir::Expr* expr) override { + auto* node = expr->As(); + ir::IRMutator<>::Visit(&node->condition, &node->condition); + + LoadCollector true_collector(locally_defined_buffers_); + true_collector(&node->true_value); + LoadCollector false_collector(locally_defined_buffers_); + false_collector(&node->false_value); + + for (auto& key : true_collector.load_list_) { + if (false_collector.load_map_.count(key) > 0) { + CollectLoad(key, true_collector.load_map_[key]); + } + } + } + + void CollectLoad(const std::string& key, const ir::Expr* expr) { + auto [_, is_first] = load_map_.emplace(key, expr); + if (is_first) { + load_list_.push_back(key); + } + } + + public: + // map from the signatures of loads to the load nodes + std::unordered_map load_map_; + // list of the signatures of loads in the order they are visited + std::vector load_list_; + + private: + const std::set& locally_defined_buffers_; +}; + +struct LoadReplacer : public ir::IRMutator<>, public ir::stmt::StmtMutator<> { + explicit LoadReplacer(const std::unordered_map& var_map) + : var_map_(var_map) {} + + void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + + void operator()(StmtRef stmt) { ir::stmt::StmtMutator<>::VisitStmt(stmt); } + + private: + void Visit(const ir::Load* op, ir::Expr* expr) override { + std::string key = utils::GetStreamCnt(*expr); + if (var_map_.count(key) > 0) { + *expr = Expr(var_map_.at(key)); + } + } + + void VisitStmt(ir::stmt::Let stmt) override { + if (stmt->body().defined()) { + Expr body = stmt->body(); + ir::IRMutator<>::Visit(&body, &body); + stmt->set_body(body); + } + } + + void VisitStmt(ir::stmt::Store stmt) override { + auto* tensor = stmt->tensor().as_tensor(); + + std::vector new_indices = stmt->indices(); + for (Expr& index : new_indices) { + ir::IRMutator<>::Visit(&index, &index); + } + stmt->set_indices(new_indices); + + Expr tensor_expr = stmt->tensor(); + ir::IRMutator<>::Visit(&tensor_expr, &tensor_expr); + stmt->set_tensor(tensor_expr); + + Expr value = stmt->value(); + ir::IRMutator<>::Visit(&value, &value); + stmt->set_value(value); + } + + void VisitStmt(ir::stmt::For stmt) override { + Expr min = stmt->min(); + ir::IRMutator<>::Visit(&min, &min); + Expr extent = stmt->extent(); + ir::IRMutator<>::Visit(&extent, &extent); + VisitBlock(stmt->body()); + ir::Expr loop_var = stmt->loop_var(); + ir::IRMutator<>::Visit(&loop_var, &loop_var); + stmt->set_loop_var(loop_var); + } + + void VisitStmt(ir::stmt::IfThenElse stmt) override { + Expr condition = stmt->condition(); + ir::IRMutator<>::Visit(&condition, &condition); + ir::stmt::BlockRef true_case = stmt->true_case(); + VisitBlock(true_case); + stmt->set_true_case(true_case); + if (stmt->false_case().defined()) { + ir::stmt::BlockRef false_case =
stmt->false_case(); + VisitBlock(false_case); + stmt->set_false_case(false_case); + } + } + + void VisitStmt(ir::stmt::Schedule stmt) override { + std::vector vars = stmt->iter_vars(); + for (ir::Var& var : vars) { + if (var->lower_bound.defined()) { + ir::IRMutator<>::Visit(&var->lower_bound, &var->lower_bound); + } + if (var->upper_bound.defined()) { + ir::IRMutator<>::Visit(&var->upper_bound, &var->upper_bound); + } + } + std::vector new_read_buffers = stmt->read_buffers(); + for (Expr& read_buffer : new_read_buffers) { + ir::IRMutator<>::Visit(&read_buffer, &read_buffer); + } + stmt->set_read_buffers(new_read_buffers); + + std::vector new_write_buffers = stmt->write_buffers(); + for (Expr& write_buffer : new_write_buffers) { + ir::IRMutator<>::Visit(&write_buffer, &write_buffer); + } + stmt->set_write_buffers(new_write_buffers); + VisitBlock(stmt->body()); + } + + void VisitStmt(ir::stmt::Alloc stmt) override { return; } + + void VisitStmt(ir::stmt::Free stmt) override { return; } + + void VisitStmt(ir::stmt::Evaluate) override { return; } + + const std::unordered_map& var_map_; +}; + +struct RearrangeLoadInstructionMutator : public ir::stmt::StmtMutator<> { + void operator()(BlockRef block) { VisitBlock(block); } + + private: + // A block is a leaf block if it is inside at least one loop, and all of its + // stmts are schedule blocks. + bool IsLeafBlock(BlockRef block) { + if (parent_loops_.empty()) return false; + for (StmtRef stmt : block->stmts()) { + if (!stmt.isa()) return false; + Schedule schedule_stmt = stmt.as(); + if (schedule_stmt->name().substr(0, 4) == "root") return false; + } + return true; + } + + // Local buffer initialization is like: + // var_1_local[0] = var_1[blockIdx.x], + // where the lhs is a local buffer and the rhs is a single load. + bool IsLocalBufferInit(Store store_stmt) { + const ir::Buffer& store_buffer = store_stmt->tensor().as_tensor()->buffer; + return store_buffer->memory_type == ir::MemoryType::GPULocal && + store_stmt->value().As(); + } + + void DoRearrangeLoadInstruction(BlockRef block) { + auto GetStoreOfScheduleStmt = [](Schedule schedule_stmt) -> Store { + bool found = false; + Store ret; + for (StmtRef stmt : schedule_stmt->body()->stmts()) { + if (stmt.isa()) { + PADDLE_ENFORCE(found == false, + ::common::errors::InvalidArgument( + "One schedule statement should only have one " + "store statement.")); + found = true; + ret = stmt.as(); + } + } + PADDLE_ENFORCE(found == true, + ::common::errors::InvalidArgument( + "One schedule statement should have one store " + "statement, but not found.")); + return ret; + }; + + // Step 1. Collect loads in each schedule block under this block. + // Requirements: + // 1) The schedule block cannot contain IfThenElse, or we will violate the + // control dependency. Schedule blocks that have IfThenElse usually + // don't benefit from rearranging loads, so it's ok to skip them. + // 2) The schedule block is not local buffer initialization, because when + // initializing the local buffer with a load, we have already + // rearranged that load. + // 3) There are more constraints on the loads to collect, see LoadCollector + // for details. + LoadCollector collector(locally_defined_buffers_); + for (StmtRef stmt : block->stmts()) { + if (ContainsStmtInStmt(stmt)) continue; + if (!stmt.isa()) continue; + Schedule schedule_stmt = stmt.as(); + Store store_stmt = GetStoreOfScheduleStmt(schedule_stmt); + if (IsLocalBufferInit(store_stmt)) continue; + collector(const_cast(&store_stmt->value())); + } + + // Step 2.
Sort the loads by their buffer sizes from large to small, and + // only keep the first `MaxRearrangeLoadNum` loads. + // Performance concerns: + // 1) Larger buffers need more time to access, so we should issue their + // corresponding loads earlier. + // 2) Rearranged loads will consume registers, so we should set a limit + // to prevent register overflow. + std::vector load_list = + SortLoadsByBufferSizes(collector.load_map_, collector.load_list_); + if (load_list.size() > MaxRearrangeLoadNum) { + load_list.resize(MaxRearrangeLoadNum); + } + + // Step 3. Create loads with Let at the beginning of the block. + std::vector new_stmts; + std::unordered_map var_map; + for (std::string& key : load_list) { + const ir::Expr* load_expr = collector.load_map_[key]; + const auto tensor = load_expr->As()->tensor.as_tensor(); + ir::Var local_var = ir::Var(common::UniqName(tensor->name + "_local"), + tensor->buffer->dtype); + Let let_stmt = Let(local_var, *load_expr); + new_stmts.push_back(let_stmt); + var_map[key] = local_var; + } + + // Step 4. Replace loads in schedule blocks with the above Let vars. + LoadReplacer replacer(var_map); + for (StmtRef stmt : block->stmts()) { + if (stmt.isa()) { + replacer(stmt); + } + new_stmts.push_back(stmt); + } + block->set_stmts(new_stmts); + } + + void VisitBlock(BlockRef block) override { + ir::stmt::StmtMutator<>::VisitBlock(block); + if (IsLeafBlock(block)) { + DoRearrangeLoadInstruction(block); + } + } + + void VisitStmt(Schedule stmt) override { + if (stmt->name().substr(0, 4) == "root") { + ir::stmt::StmtMutator<>::VisitBlock(stmt->body()); + return; + } + for (auto& buffer_range : stmt->write_buffers()) { + auto& write_buffer = buffer_range.As()->buffer; + locally_defined_buffers_.insert(write_buffer.as_buffer_ref()); + } + } + + void VisitStmt(For stmt) override { + parent_loops_.push_back(stmt); + VisitBlock(stmt->body()); + parent_loops_.pop_back(); + } + + void VisitStmt(IfThenElse stmt) override { + ir::stmt::BlockRef true_case = stmt->true_case(); + VisitBlock(true_case); + stmt->set_true_case(true_case); + if (stmt->false_case().defined()) { + ir::stmt::BlockRef false_case = stmt->false_case(); + VisitBlock(false_case); + stmt->set_false_case(false_case); + } + } + + void VisitStmt(Let stmt) override { return; } + void VisitStmt(Store stmt) override { return; } + void VisitStmt(Alloc stmt) override { return; } + void VisitStmt(Free stmt) override { return; } + void VisitStmt(Evaluate stmt) override { return; } + + private: + std::set locally_defined_buffers_; + std::vector parent_loops_; +}; +} // namespace + +LogicalResult cinn::optim::RearrangeLoadInstructionPass::Run( + ir::LoweredFunc func) { + if (FLAGS_cinn_enable_rearrange_load) { + BlockRef body = func->body_block; + RearrangeLoadInstructionMutator mutator; + mutator(body); + } + return LogicalResult::success(); +} + +std::unique_ptr CreateRearrangeLoadInstructionPass() { + return std::make_unique(); +} +} // namespace optim +} // namespace cinn diff --git a/paddle/cinn/optim/rearrange_load_instruction.h b/paddle/cinn/optim/rearrange_load_instruction_pass.h similarity index 94% rename from paddle/cinn/optim/rearrange_load_instruction.h rename to paddle/cinn/optim/rearrange_load_instruction_pass.h index 4c4d2b10728528..0128234a6ffa2e 100644 --- a/paddle/cinn/optim/rearrange_load_instruction.h +++ b/paddle/cinn/optim/rearrange_load_instruction_pass.h @@ -13,10 +13,17 @@ // limitations under the License. 
#pragma once -#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/pass/pass.h" + +PD_DECLARE_bool(cinn_enable_rearrange_load); namespace cinn { namespace optim { +class RearrangeLoadInstructionPass : public FuncPass { + public: + RearrangeLoadInstructionPass() : FuncPass("rearrange_load_instruction") {} + LogicalResult Run(ir::LoweredFunc func) override; +}; /* * Rearrange global memory loads in front of expressions to optimize the @@ -149,7 +156,7 @@ namespace optim { * branch of Select, `var_3[k]` in ScheduleBlock(var_4) has data dependency * with ScheduleBlock(var_3); none of them can be rearranged. */ -void RearrangeLoadInstruction(Expr *expr); +std::unique_ptr CreateRearrangeLoadInstructionPass(); } // namespace optim } // namespace cinn From 299ff2b35b01557492f8702bef1e157af8fd9c9c Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 7 Jan 2025 21:13:17 +0800 Subject: [PATCH 04/57] fix generator shape int63 to int32 bug (#70658) --- paddle/cinn/hlir/op/elementwise.cc | 1 + paddle/cinn/hlir/pe/elementwise.cc | 9 ++++++++- paddle/cinn/hlir/pe/elementwise.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/cinn/hlir/op/elementwise.cc b/paddle/cinn/hlir/op/elementwise.cc index 8f23a6b32b913a..db2a65b68c9c0d 100644 --- a/paddle/cinn/hlir/op/elementwise.cc +++ b/paddle/cinn/hlir/op/elementwise.cc @@ -1267,6 +1267,7 @@ std::shared_ptr StrategyForGenerateShapeSymbolic( symbol_bindings, output_dim_exprs, output_shapes[0], + out_type, tensor_name); std::vector res; res.push_back(CINNValue(out)); diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc index de0d8b63d872ec..8e16bd6a8c6d19 100644 --- a/paddle/cinn/hlir/pe/elementwise.cc +++ b/paddle/cinn/hlir/pe/elementwise.cc @@ -351,6 +351,7 @@ ir::Tensor GenerateShape(const std::vector& inputs, const cinn::dialect::SymbolBindings& symbol_bindings, const std::vector& output_dim_exprs, const std::vector& out_shape, + const std::vector& out_type, const std::string& name) { if (output_dim_exprs.size() != 1) { VLOG(4) << "pe::GenerateShape will return a meaningless tensor when " @@ -365,7 +366,13 @@ ir::Tensor GenerateShape(const std::vector& inputs, auto res = Compute( ToCinnExprs(out_shape), [=, &converter](const std::vector& indice) { - return converter.ConvertToIrExpr(output_dim_exprs[0]); + auto dim_expr = converter.ConvertToIrExpr(output_dim_exprs[0]); + + if (out_type[0] == type_of()) { + dim_expr = ir::Cast::Make(type_of(), dim_expr); + } + + return dim_expr; }, name); return res; diff --git a/paddle/cinn/hlir/pe/elementwise.h b/paddle/cinn/hlir/pe/elementwise.h index fd58bd39146f5a..5d7dd55416e3fb 100644 --- a/paddle/cinn/hlir/pe/elementwise.h +++ b/paddle/cinn/hlir/pe/elementwise.h @@ -165,6 +165,7 @@ ir::Tensor GenerateShape( const cinn::dialect::SymbolBindings& symbol_bindings, const std::vector& output_dim_exprs, const std::vector& out_shape, + const std::vector& out_type, const std::string& name = UniqName("T_Generate_Shape_out")); // This operator checks if all x and y satisfy the condition: |x - y| <= atol + From 1b1d815f7349fa3350a1e165f72755dc6fda4cde Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Tue, 7 Jan 2025 21:30:48 +0800 Subject: [PATCH 05/57] =?UTF-8?q?=E3=80=90CINN=E3=80=91Fix=20ir=20simplify?= =?UTF-8?q?=20bug=20(#70654)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update y layout * fix bug --- paddle/cinn/optim/ir_simplify.cc | 9 
+++++++++ 1 file changed, 9 insertions(+) diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc index 562b06686eaa4e..396e4b6e5c0697 100644 --- a/paddle/cinn/optim/ir_simplify.cc +++ b/paddle/cinn/optim/ir_simplify.cc @@ -266,6 +266,7 @@ struct SimplifyLogicalMutator : public ir::ExprMutator<> { } void Visit(const ir::Not* op, Expr* expr) override { + VLOG(7) << "Begin Visit Not op: " << *expr; auto* node = expr->As(); auto v = node->v(); ir::ExprMutator<>::Visit(&v, &v); @@ -273,19 +274,27 @@ struct SimplifyLogicalMutator : public ir::ExprMutator<> { case ir::IrNodeTy::IntImm: case ir::IrNodeTy::UIntImm: *expr = common::IsZero(v) ? Expr(true) : Expr(false); + return; case ir::IrNodeTy::Not: *expr = v.As()->v(); + return; case ir::IrNodeTy::LE: *expr = ir::GT::Make(v->operand(0), v->operand(1)); + return; case ir::IrNodeTy::LT: *expr = ir::GE::Make(v->operand(0), v->operand(1)); + return; case ir::IrNodeTy::GE: *expr = ir::LT::Make(v->operand(0), v->operand(1)); + return; case ir::IrNodeTy::GT: *expr = ir::LE::Make(v->operand(0), v->operand(1)); + return; default: + VLOG(7) << "End Visit Not op: " << *expr; return; } + VLOG(7) << "End Visit Not op: " << *expr; } }; From 80c376f3d0e0918a93819cafa304c27eb335fad1 Mon Sep 17 00:00:00 2001 From: Xinyi Li Date: Wed, 8 Jan 2025 09:14:34 +0800 Subject: [PATCH 06/57] [PIR][oneDNN] Optimize bfloat16 placement logic (#70630) * optimize placement logic * fix format * fix copyright * reduce repetitive match --- .../transforms/onednn/cpu_bfloat16_pass.cc | 14 +- .../onednn/cpu_bfloat16_placement_pass.cc | 139 +++++------------- .../onednn/cpu_special_ops_bf16_pass.cc | 23 +-- .../onednn/cpu_special_ops_bf16_pass.h | 2 +- .../onednn/onednn_placement_pass.cc | 3 +- .../onednn/test_cpu_bfloat16_pir_pass.py | 3 +- 6 files changed, 54 insertions(+), 130 deletions(-) diff --git a/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_pass.cc b/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_pass.cc index 80ed42414cdbaf..c1a3d4eea3dfdf 100644 --- a/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -96,9 +96,6 @@ class CpuBfloat16Pattern : public paddle::drr::DrrPatternBase { op_attrs.emplace("keepdim", pat.Attr("keepdim")); op_attrs.emplace("dtype", pat.Attr("dtype")); - } else if (bfloat16_ops_ == "onednn_op.concat") { - op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); - op_attrs.emplace("use_quantizer", pat.Attr("use_quantizer")); } else if (bfloat16_ops_ == "onednn_op.reshape_" || bfloat16_ops_ == "onednn_op.reshape") { op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); @@ -224,11 +221,7 @@ class CpuBfloat16DequantPattern : public paddle::drr::DrrPatternBase { paddle::drr::SourcePattern pat = ctx->SourcePattern(); std::unordered_map op_attrs; - if (bfloat16_ops_ == "onednn_op.concat") { - op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); - op_attrs.emplace("use_quantizer", pat.Attr("use_quantizer")); - - } else if (bfloat16_ops_ == "onednn_op.conv2d") { + if (bfloat16_ops_ == "onednn_op.conv2d") { op_attrs.emplace("strides", pat.Attr("strides")); op_attrs.emplace("paddings", pat.Attr("paddings")); op_attrs.emplace("padding_algorithm", pat.Attr("padding_algorithm")); @@ -272,9 +265,6 @@ class CpuBfloat16DequantPattern : public paddle::drr::DrrPatternBase { op_attrs.emplace("keepdim", pat.Attr("keepdim")); op_attrs.emplace("dtype", pat.Attr("dtype")); - } else if (bfloat16_ops_ == "onednn_op.concat") { - op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); - op_attrs.emplace("use_quantizer", pat.Attr("use_quantizer")); } else if (bfloat16_ops_ == "onednn_op.reshape_" || bfloat16_ops_ == "onednn_op.reshape") { op_attrs.emplace("mkldnn_data_type", pat.Attr("mkldnn_data_type")); diff --git a/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_placement_pass.cc index 649389585915d7..ce0f873be31c74 100644 --- a/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/cpu_bfloat16_placement_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
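The placement hunks below add ConcatOp to the allowlist, reject any op whose mkldnn_data_type is already set to something other than "float32" (to avoid re-matching), and extend the operand check to pir::VectorType: an op may be marked bfloat16 only if every operand is a float DenseTensor or a flat VectorType whose elements are all float DenseTensors (nested vectors are rejected). A rough, runnable Python rendering of that operand check; the type stubs stand in for the pir C++ API:

    from dataclasses import dataclass

    @dataclass
    class DenseTensorType:
        dtype: str                      # e.g. "float32", "bfloat16", "int8"

    @dataclass
    class VectorType:
        elements: list                  # element types of the combined value

    def is_float(dtype):
        return dtype in ("float16", "float32", "float64")

    def inputs_can_be_bf16(operand_types):
        for t in operand_types:
            if isinstance(t, VectorType):
                # Flat vectors only (e.g. concat's combined inputs); a nested
                # VectorType element blocks the rewrite.
                for elem in t.elements:
                    if not isinstance(elem, DenseTensorType) or not is_float(elem.dtype):
                        return False
            elif isinstance(t, DenseTensorType):
                if not is_float(t.dtype):
                    return False
            else:
                return False
        return True

    # A vector of float tensors (concat-style input) is now accepted.
    assert inputs_can_be_bf16([VectorType([DenseTensorType("float32")] * 3)])
    assert not inputs_can_be_bf16([VectorType([DenseTensorType("int8")])])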
@@ -59,6 +59,7 @@ class OneDNNBf16PlacementPattern : public pir::RewritePattern { if (!op->isa() && !op->isa() && !op->isa() && + !op->isa() && !op->isa() && !op->isa() && !op->isa() && @@ -100,7 +101,8 @@ class OneDNNBf16PlacementPattern : public pir::RewritePattern { auto mkldnn_data_type = op_attr.at("mkldnn_data_type") .dyn_cast() .AsString(); - if (mkldnn_data_type == "int8") { + // Reduce repetitive match + if (mkldnn_data_type != "float32") { return false; } } @@ -143,14 +145,28 @@ class OneDNNBf16PlacementPattern : public pir::RewritePattern { } pir::Type type = op->operand_type(i); if (!type) continue; - if (!type.isa()) { - // We skip pir::VectorType - // TODO(Lirong, Xinyi): Support pir::VectorType in bf16 - return false; - } - pir::Type op_dtype = pir::GetDataTypeFromValue(value); - // Only float input can be converted to bfloat16 - if (!op_dtype.isa()) { + if (type.isa()) { + // Support pir::VectorType in bf16 + // Special op will do detailed check in its pattern + pir::VectorType vector_type = value.type().dyn_cast(); + for (size_t idx = 0; idx < static_cast(vector_type.size()); + idx++) { + auto input_type = + vector_type[idx].isa(); + // We don't precess nested VectorType + if (!input_type) return false; + pir::Type input_dtype = + vector_type[idx] + .dyn_cast() + .dtype(); + // Only float input can be converted to bfloat16 + if (!input_dtype.isa()) return false; + } + } else if (type.isa()) { + pir::Type op_dtype = pir::GetDataTypeFromValue(value); + // Only float input can be converted to bfloat16 + if (!op_dtype.isa()) return false; + } else { return false; } } @@ -211,6 +227,7 @@ class RemoveOrphanedPattern : public pir::RewritePattern { if (!op->isa() && !op->isa() && !op->isa() && + !op->isa() && !op->isa() && !op->isa() && !op->isa() && @@ -292,15 +309,17 @@ class RemoveOrphanedPattern : public pir::RewritePattern { } } } else { - // The first op in graph - return false; + // The first op in graph should be treated as prev_fp32 = true + prev_fp32 = true; } + size_t num_useops = 0; for (uint32_t i = 0; i < op->num_results(); i++) { if (!op->result(i) || !op->result(i).type()) { continue; } auto next_op_list = pir::GetUseOpsForOutput(op, i); + num_useops += next_op_list.size(); for (auto const& [next_op, op_index] : next_op_list) { // Some ops do not need to be processed std::string next_op_name = next_op->name(); @@ -325,6 +344,10 @@ class RemoveOrphanedPattern : public pir::RewritePattern { } } + // Check if it's the last op on graph. 
If it is, this op can be seen as a + // fp32 op down here + if (num_useops == 0) next_fp32 = true; + return prev_fp32 && next_fp32; } @@ -354,97 +377,6 @@ class RemoveOrphanedPattern : public pir::RewritePattern { } }; -class RemoveUnsupportedOpPattern : public pir::RewritePattern { - public: - explicit RemoveUnsupportedOpPattern(pir::IrContext* context) - : pir::RewritePattern(MatchAnyOpTypeTag(), - 1 /*benefit*/, - context, - {} /*generated_names*/) {} - - bool Match(pir::Operation* op) const override { // NOLINT - if (!op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa() && - !op->isa()) { - return false; - } - auto op_attr = op->attributes(); - if (op_attr.find("mkldnn_data_type") != op_attr.end()) { - auto mkldnn_data_type = op_attr.at("mkldnn_data_type") - .dyn_cast() - .AsString(); - if (mkldnn_data_type != "bfloat16") { - return false; - } - } - - uint32_t num_operands = op->num_operands(); - for (uint32_t i = 0; i < num_operands; i++) { - auto* pre_op = pir::GetDefiningOpForInput(op, i); - if (pre_op->HasAttribute("mkldnn_data_type")) { - return false; - } - } - - return true; - } - - void Rewrite(pir::Operation* op, - pir::PatternRewriter& rewriter) const override { // NOLINT - std::string target_op_name = op->name(); - auto op_info = - pir::IrContext::Instance()->GetRegisteredOpInfo(target_op_name); - if (op_info) { - std::vector op_item_inner_output_types; - for (size_t i = 0; i < op->num_results(); ++i) { - op_item_inner_output_types.push_back(op->result_type(i)); - } - auto attributes = op->attributes(); - if (attributes.find("mkldnn_data_type") != attributes.end()) { - attributes["mkldnn_data_type"] = - pir::StrAttribute::get(pir::IrContext::Instance(), "float32"); - } - pir::Operation* op_item_inner = rewriter.Build(op->operands_source(), - attributes, - op_item_inner_output_types, - op_info); - rewriter.ReplaceOp(op, op_item_inner->results()); - } - } -}; - class OneDNNPlacementBf16Pass : public pir::PatternRewritePass { public: OneDNNPlacementBf16Pass() @@ -454,7 +386,6 @@ class OneDNNPlacementBf16Pass : public pir::PatternRewritePass { pir::RewritePatternSet ps(context); ps.Add(context); ps.Add(context); - ps.Add(context); return ps; } diff --git a/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.cc b/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.cc index eb586c40c16773..22179947f25afe 100644 --- a/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
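With placement now marking concat as bfloat16 up front, the ConcatBf16QuantizePattern hunks below invert the earlier guard: the pattern only fires when mkldnn_data_type is already "bfloat16", and it bails out if any input of the preceding combine is produced by onednn_op.quantize, so the rewrite does not fire twice. A small Python sketch of that guard, using a made-up IR value stub:

    # Hypothetical stub: each SSA value records the op that produced it.
    class Value:
        def __init__(self, producer_name):
            self.producer_name = producer_name

    def needs_quantize_rewrite(op_dtype, combine_inputs):
        if op_dtype != "bfloat16":      # placement pass must have marked the op
            return False
        for v in combine_inputs:
            if v.producer_name == "onednn_op.quantize":
                return False            # already rewritten once; do not fire again
        return True

    inputs = [Value("pd_op.conv2d"), Value("pd_op.relu")]
    assert needs_quantize_rewrite("bfloat16", inputs)
    inputs[0] = Value("onednn_op.quantize")
    assert not needs_quantize_rewrite("bfloat16", inputs)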
@@ -59,18 +59,15 @@ class ConcatBf16QuantizePattern auto onednn_data_type = op_attributes.at("mkldnn_data_type") .dyn_cast() .AsString(); - if (onednn_data_type == "bfloat16") return false; - op_attributes["mkldnn_data_type"] = rewriter.str_attr("bfloat16"); + if (onednn_data_type != "bfloat16") return false; auto combine_inputs = pre_op.inputs(); for (size_t idx = 0; idx < combine_inputs.size(); idx++) { - auto type = pre_op->operand_type(idx); - // Currently we only process case where elements are all DenseTensor(s) - if (!type.isa()) return false; - // All Tensors should be fp32 - auto dtype = pir::GetDataTypeFromValue(pre_op->operand_source(idx)); - if (!dtype.isa()) return false; + // Check if it's already quantized + auto pre_pre_op = pir::GetDefiningOpForInput(pre_op, idx); + if (pre_pre_op && pre_pre_op->name() == "onednn_op.quantize") + return false; } pir::IrContext *ctx = rewriter.ir_context(); @@ -95,6 +92,7 @@ class ConcatBf16QuantizePattern quant_op->result(0).set_type(new_type); new_combine_inputs[idx] = quant_op.output(); } + // Create new combine pir::CombineOp new_combine = rewriter.Build(new_combine_inputs); @@ -146,7 +144,12 @@ class CPUSpecialOpsBf16Pass : public pir::PatternRewritePass { auto concat_bf16_quant_pattern = std::make_unique( - context, benefit--, std::vector{}); + context, + benefit--, + std::vector{ + paddle::onednn::dialect::QuantizeOp::name(), + paddle::onednn::dialect::DequantizeOp::name(), + }); ps.Add(std::move(concat_bf16_quant_pattern)); return ps; diff --git a/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.h b/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.h index 9dcf771121c24b..781858f00e0a5d 100644 --- a/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.h +++ b/paddle/fluid/pir/transforms/onednn/cpu_special_ops_bf16_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc b/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc index c4f6c4824ecdd4..fd26907815c1e8 100644 --- a/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/onednn_placement_pass.cc @@ -1,5 +1,4 @@ -// REGISTER_IR_PASS(onednn_placement_pass, OneDNNPlacementPass); -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/test/ir/pir/fused_pass/onednn/test_cpu_bfloat16_pir_pass.py b/test/ir/pir/fused_pass/onednn/test_cpu_bfloat16_pir_pass.py index 734611b5fe52ff..d8de881c364980 100644 --- a/test/ir/pir/fused_pass/onednn/test_cpu_bfloat16_pir_pass.py +++ b/test/ir/pir/fused_pass/onednn/test_cpu_bfloat16_pir_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
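The test hunk below reflects the resulting pass ordering: onednn_placement_pass first moves ops onto the oneDNN dialect, cpu_bfloat16_placement_pass then marks the ops that may run in bfloat16, and cpu_special_ops_bf16_pass finally inserts the quantize/dequantize ops around special cases such as concat. In these PIR pass tests the pipeline is configured as a list of single-entry dicts, e.g.:

    # Order matters: the special-ops pass relies on the bfloat16 marking done
    # by the placement pass one step earlier.
    pass_attr_list = [
        {'onednn_placement_pass': {}},        # convert ops to the oneDNN dialect
        {'cpu_bfloat16_placement_pass': {}},  # mark bf16-capable ops
        {'cpu_special_ops_bf16_pass': {}},    # insert quantize/dequantize
    ]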
@@ -1143,6 +1143,7 @@ def build_ir_program(self): out = paddle.assign(out) self.pass_attr_list = [ {'onednn_placement_pass': {}}, + {'cpu_bfloat16_placement_pass': {}}, {'cpu_special_ops_bf16_pass': {}}, ] self.feeds = { From fbc9a6a3755b6a35aef4593fb7e9af5ffccb6788 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Wed, 8 Jan 2025 10:26:15 +0800 Subject: [PATCH 07/57] [CodeStyle][Typos][F-[12-17],F-[19-24],F-[26-28]] Fix typo(`Flattend`,`flattend`,`flattern`,`Flattern`,`filpped`,`flaot`,`follwed`,`folowing`,`formater`,`formating`,`foramt`,`formt`,`formate`,`forwad`,`forwrad`,`forword`,`founf`,`framwork`,`frequence`,`fron`,`fullfill`) (#70646) --------- Co-authored-by: Nyakku Shigure --- CONTRIBUTING.md | 4 +- _typos.toml | 21 ---- paddle/cinn/common/ir_util.h | 2 +- paddle/cinn/runtime/cuda/cuda_util.cc | 2 +- paddle/fluid/framework/data_transform.cc | 2 +- .../framework/new_executor/pir_interpreter.cc | 2 +- paddle/fluid/inference/tensorrt/op_teller.cc | 4 +- .../tensorrt/plugin/custom_generic_plugin.cu | 6 +- .../tensorrt/plugin/gelu_op_plugin.cu | 2 +- .../multiary_infer_sym.cc | 2 +- paddle/phi/kernels/cpu/unique_kernel.cc | 18 +-- paddle/phi/kernels/funcs/math_cuda_utils.h | 2 +- paddle/phi/kernels/funcs/unique_functor.h | 56 +++++----- paddle/phi/kernels/gpu/rms_norm_funcs.h | 2 +- .../phi/kernels/gpu/rms_norm_grad_kernel.cu | 2 +- .../kernels/gpu/unique_consecutive_functor.h | 4 +- paddle/phi/kernels/gpu/unique_kernel.cu | 104 +++++++++--------- paddle/phi/kernels/gpudnn/conv_gpudnn_base.h | 2 +- python/paddle/amp/grad_scaler.py | 16 +-- .../hybrid_parallel_gradscaler.py | 6 +- .../passes/auto_parallel_sharding.py | 2 +- .../pipeline_zero_bubble.py | 6 +- python/paddle/incubate/asp/utils.py | 14 +-- .../paddle/io/dataloader/dataloader_iter.py | 2 +- python/paddle/text/datasets/imikolov.py | 3 +- test/legacy_test/test_gather_op.py | 2 +- test/legacy_test/test_lstm_op.py | 4 +- tools/gen_ut_cmakelists.py | 4 +- 28 files changed, 138 insertions(+), 158 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d0c06e6ccf443f..8f03b35783a5ff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,7 +39,7 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful- pre-commit install ``` - Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python. + Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python. Once installed, `pre-commit` checks the style of code and documentation in every commit. 
We will see something like the following when you run `git commit`: @@ -52,7 +52,7 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful- Check for broken symlinks................................................Passed Detect Private Key...................................(no files to check)Skipped Fix End of Files.....................................(no files to check)Skipped - clang-formater.......................................(no files to check)Skipped + clang-format.........................................(no files to check)Skipped [my-cool-stuff c703c041] add test file 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 233 diff --git a/_typos.toml b/_typos.toml index 169423520b98d5..2d6bcfacf8f928 100644 --- a/_typos.toml +++ b/_typos.toml @@ -46,27 +46,6 @@ dobule = 'dobule' Dowloading = 'Dowloading' downsteram = 'downsteram' fetchs = 'fetchs' -Flattend = 'Flattend' -flattend = 'flattend' -flattern = 'flattern' -Flattern = 'Flattern' -filpped = 'filpped' -flaot = 'flaot' -follwed = 'follwed' -folowing = 'folowing' -formater = 'formater' -formating = 'formating' -foramt = 'foramt' -formate = 'formate' -formt = 'formt' -forwrad = 'forwrad' -forwad = 'forwad' -forword = 'forword' -founf = 'founf' -framwork = 'framwork' -frequence = 'frequence' -fron = 'fron' -fullfill = 'fullfill' Indexs = 'Indexs' indexs = 'indexs' indiates = 'indiates' diff --git a/paddle/cinn/common/ir_util.h b/paddle/cinn/common/ir_util.h index 724be629e6e93e..cbfe072d307016 100644 --- a/paddle/cinn/common/ir_util.h +++ b/paddle/cinn/common/ir_util.h @@ -191,7 +191,7 @@ inline void UnpackReduction(const ir::IndexExpr &expr, FLeaf fleaf) { } /*! - * \brief Flattern the expression into a vector of expressions splited by `Add` + * \brief Flatten the expression into a vector of expressions splited by `Add` * or `Mul`. * * For example (Add): diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index a0c12732a4ad5d..af0017222231bc 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -1742,7 +1742,7 @@ void cinn_call_cholesky_nvgpu(void *v_args, cinn_buffer_t *x = args[0].operator cinn_buffer_t *(); cinn_buffer_t *out = args[1].operator cinn_buffer_t *(); // In cuSOLVER, dense matrix stores in COL_MAJOR, thus FILL_MODE needs to be - // filpped. See also: + // flipped. See also: // https://docs.nvidia.com/cuda/cusolver/index.html#matrix-dense-format cublasFillMode_t uplo = upper ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index c8cf06fe27aec8..71d1ae8047105b 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -173,7 +173,7 @@ phi::GetKernelTypeForVarContext BuildGetKernelTypeForVarContext( if (has_infer_varkernel_fn) { for (auto &attr : fluid_attrs) { switch (attr.second.index()) { - case 3: // string type in framwork::Attribute + case 3: // string type in framework::Attribute (*phi_attrs)[attr.first] = PADDLE_GET_CONST(std::string, attr.second); break; default: diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 75def437deafda..287ca3fb178ea5 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -979,7 +979,7 @@ void PirInterpreter::BuildInstruction() { } std::string PirInterpreter::DebugInstructions() { - // log formate: var[101] = pd_op.relu(var[100]) or for inplace op var[100] = + // log format: var[101] = pd_op.relu(var[100]) or for inplace op var[100] = // pd_op.relu_(var[100]) std::stringstream ss; ss << "{outputs}" diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index d0becae8c45ed6..15f2fba66b1932 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -3467,9 +3467,9 @@ struct CustomGenericPluginTeller : public Teller { "SetTrtInferShapeFn."; return false; } - auto& trt_supports_formate_config = + auto& trt_supports_format_config = OpMetaInfoHelper::GetTrtSupportsFormatConfig(op_info); - if (trt_supports_formate_config.empty()) { + if (trt_supports_format_config.empty()) { VLOG(3) << op_type << " has no trt supportsFormatCombination config. 
Please set by " diff --git a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu index af5db479f10592..d6d76c6b9618ea 100644 --- a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu @@ -311,9 +311,9 @@ bool CustomGenericPlugin::supportsFormatCombination( auto& op_meta_info_map = OpMetaInfoMap::Instance(); const auto& meta_info_map = op_meta_info_map.GetMap(); auto& op_info = meta_info_map.at(op_desc_.Type()).front(); - auto& supports_formate_config = + auto& supports_format_config = OpMetaInfoHelper::GetTrtSupportsFormatConfig(op_info); - PADDLE_ENFORCE_NE(supports_formate_config.empty(), + PADDLE_ENFORCE_NE(supports_format_config.empty(), true, common::errors::InvalidArgument( "The %s op has no tensorrt plugin " @@ -325,7 +325,7 @@ bool CustomGenericPlugin::supportsFormatCombination( size_t output_num = OpMetaInfoHelper::GetOutputs(op_info).size(); std::vector>> format_combinations; - for (auto& config : supports_formate_config) { + for (auto& config : supports_format_config) { auto format_combination = parseConfig(op_desc_.Type(), config); PADDLE_ENFORCE_EQ(input_num + output_num, format_combination.size(), diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index f5369eb691c69e..c1b4aad6d73c06 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -77,7 +77,7 @@ __device__ half do_tanh(half a) { return __float2half(tmp); } -// the kernel below is not aligned with fluid fp32 forwrad ones, use it for +// the kernel below is not aligned with fluid fp32 forward ones, use it for // fp16. template __global__ void no_exact_gelu_kernel( diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 3f2c8397a61415..9809acfb576b71 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -2631,7 +2631,7 @@ bool GroupNormOpInferSymbolicShape( channel_idx = 1; } else { PADDLE_THROW(common::errors::Unimplemented( - "GroupNorm only suport NHWC and NCHW data formt")); + "GroupNorm only suport NHWC and NCHW data format")); } symbol::DimExpr channel_dim = x_shape.shape()[channel_idx]; diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc index e3be49af16ed3c..8a0b9046a15b84 100644 --- a/paddle/phi/kernels/cpu/unique_kernel.cc +++ b/paddle/phi/kernels/cpu/unique_kernel.cc @@ -83,15 +83,15 @@ void UniqueRawKernel(const Context& context, if (axis.empty()) { phi::VisitDataTypeTiny( dtype, - phi::funcs::UniqueFlattendTensorFunctor(context, - x, - out, - indices, - index, - counts, - return_index, - return_inverse, - return_counts)); + phi::funcs::UniqueFlattenedTensorFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); } else { int axis_value = axis[0]; axis_value = (axis_value == -1) ? 
(x.dims().size() - 1) : axis_value; diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h index a5aaa1310b16db..f14b2af8c72609 100644 --- a/paddle/phi/kernels/funcs/math_cuda_utils.h +++ b/paddle/phi/kernels/funcs/math_cuda_utils.h @@ -298,7 +298,7 @@ __inline__ __device__ T PartialWarpReduceMin(T val, warp_mask_t lane_mask) { T warp_val = __shfl_sync(lane_mask, val, 0, warpSize); #else T warp_val = __shfl( - val, 0, warpSize); // To fullfill the data in each thread of this warp. + val, 0, warpSize); // To fulfill the data in each thread of this warp. #endif warp_val = val; diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h index 8d62a0c5255e46..4365f1a5f4cfe6 100644 --- a/paddle/phi/kernels/funcs/unique_functor.h +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -130,15 +130,15 @@ static bool Equal(const DenseTensor& a, const DenseTensor& b) { } template -static void UniqueFlattendTensor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* count, - bool return_index, - bool return_inverse, - bool return_counts) { +static void UniqueFlattenedTensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts) { const InT* in_data = in.data(); std::set unique(in_data, in_data + in.numel()); out->Resize(common::make_ddim({static_cast(unique.size())})); @@ -327,7 +327,7 @@ static void UniqueDim(const Context& context, } template -struct UniqueFlattendTensorFunctor { +struct UniqueFlattenedTensorFunctor { const Context& ctx_; /* */ const DenseTensor& in_; DenseTensor* out_; @@ -338,15 +338,15 @@ struct UniqueFlattendTensorFunctor { const bool return_inverse_; const bool return_counts_; - UniqueFlattendTensorFunctor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* count, - bool return_index, - bool return_inverse, - bool return_counts) + UniqueFlattenedTensorFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* count, + bool return_index, + bool return_inverse, + bool return_counts) : ctx_(context), in_(in), out_(out), @@ -359,15 +359,15 @@ struct UniqueFlattendTensorFunctor { template void apply() const { - UniqueFlattendTensor(ctx_, - in_, - out_, - indices_, - index_, - count_, - return_index_, - return_inverse_, - return_counts_); + UniqueFlattenedTensor(ctx_, + in_, + out_, + indices_, + index_, + count_, + return_index_, + return_inverse_, + return_counts_); } }; diff --git a/paddle/phi/kernels/gpu/rms_norm_funcs.h b/paddle/phi/kernels/gpu/rms_norm_funcs.h index 2954d593014a6c..db6a137a02d386 100644 --- a/paddle/phi/kernels/gpu/rms_norm_funcs.h +++ b/paddle/phi/kernels/gpu/rms_norm_funcs.h @@ -14,7 +14,7 @@ limitations under the License. */ /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ -/*This code is copied fron NVIDIA apex: +/*This code is copied from NVIDIA apex: * https://github.com/NVIDIA/apex * with minor changes. 
*/ diff --git a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu index 5be55226813646..342737e9b20bd5 100644 --- a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ -/*This code is copied fron NVIDIA apex: +/*This code is copied from NVIDIA apex: * https://github.com/NVIDIA/apex * with minor changes. */ diff --git a/paddle/phi/kernels/gpu/unique_consecutive_functor.h b/paddle/phi/kernels/gpu/unique_consecutive_functor.h index dae83a45a8e917..f094da335f396d 100644 --- a/paddle/phi/kernels/gpu/unique_consecutive_functor.h +++ b/paddle/phi/kernels/gpu/unique_consecutive_functor.h @@ -32,7 +32,7 @@ namespace phi { -// The core logic of computing Unique Consecutive for a flattend Tensor +// The core logic of computing Unique Consecutive for a flattened Tensor template struct UniqueConsecutiveFlattenedCUDAFunctor { const Context& ctx_; diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index 341483e57d56b4..e08aa5bece3bc4 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -106,21 +106,21 @@ struct BinaryNotEqual { } }; -// The core logic of computing Unique for a flattend DenseTensor +// The core logic of computing Unique for a flattened DenseTensor template static typename std::enable_if< !std::is_same::value && !std::is_same::value>::type -UniqueFlattendCUDATensor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* counts, - bool return_index, - bool return_inverse, - bool return_counts, - int64_t num_input) { +UniqueFlattenedCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + int64_t num_input) { // 0. Prepration auto equal = thrust::equal_to(); auto not_equal = thrust::not_equal_to(); @@ -242,21 +242,21 @@ UniqueFlattendCUDATensor(const Context& context, } } -// The core logic of computing Unique for a flattend DenseTensor +// The core logic of computing Unique for a flattened DenseTensor template static typename std::enable_if< std::is_same::value || std::is_same::value>::type -UniqueFlattendCUDATensor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* counts, - bool return_index, - bool return_inverse, - bool return_counts, - int64_t num_input) { +UniqueFlattenedCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts, + int64_t num_input) { // 1. 
Sort indices DenseTensor in_resize; in_resize.ShareDataWith(in); @@ -526,9 +526,9 @@ static void UniqueDimsCUDATensor(const Context& context, } } -// functor for processing a flattend DenseTensor +// functor for processing a flattened DenseTensor template -struct UniqueFlattendCUDAFunctor { +struct UniqueFlattenedCUDAFunctor { const Context& ctx_; const DenseTensor& in_; DenseTensor* out_; @@ -539,15 +539,15 @@ struct UniqueFlattendCUDAFunctor { const bool return_inverse_; const bool return_counts_; - UniqueFlattendCUDAFunctor(const Context& context, - const DenseTensor& in, - DenseTensor* out, - DenseTensor* indices, - DenseTensor* index, - DenseTensor* counts, - bool return_index, - bool return_inverse, - bool return_counts) + UniqueFlattenedCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + DenseTensor* indices, + DenseTensor* index, + DenseTensor* counts, + bool return_index, + bool return_inverse, + bool return_counts) : ctx_(context), in_(in), out_(out), @@ -560,16 +560,16 @@ struct UniqueFlattendCUDAFunctor { template void apply() const { - UniqueFlattendCUDATensor(ctx_, - in_, - out_, - indices_, - index_, - counts_, - return_index_, - return_inverse_, - return_counts_, - in_.numel()); + UniqueFlattenedCUDATensor(ctx_, + in_, + out_, + indices_, + index_, + counts_, + return_index_, + return_inverse_, + return_counts_, + in_.numel()); } }; @@ -650,15 +650,15 @@ void UniqueRawKernel(const Context& context, if (axis.empty()) { phi::VisitDataTypeTiny( dtype, - UniqueFlattendCUDAFunctor(context, - x, - out, - indices, - index, - counts, - return_index, - return_inverse, - return_counts)); + UniqueFlattenedCUDAFunctor(context, + x, + out, + indices, + index, + counts, + return_index, + return_inverse, + return_counts)); } else { // 'axis' is required. 
int axis_value = axis[0]; diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h index 5b55aa8f70394a..a21ed28d839a4a 100644 --- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h @@ -120,7 +120,7 @@ struct ConvArgsBase { // groups int group; - // data foramt + // data format GPUDNNDataLayout data_layout; ConvArgsBase(const HandleT& h, diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 4ba1524a307d9d..c371918e3f0e4f 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -181,7 +181,7 @@ def __init__( self._scale = paddle.to_tensor( np.array([self._init_loss_scaling]).astype(np.float32) ) - self._cache_founf_inf = None + self._cache_found_inf = None self._optimizer_states = defaultdict(_refresh_optimizer_state) def scale(self, var: Tensor) -> Tensor: @@ -335,13 +335,13 @@ def minimize( optimizer._set_auxiliary_var('found_inf', self._found_inf) optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) # TODO: Fix to _cache_found_inf after PaddleNLP update - self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + self._cache_found_inf = optimizer._get_auxiliary_var('found_inf') else: if self._found_inf: - self._cache_founf_inf = True + self._cache_found_inf = True else: optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) - self._cache_founf_inf = False + self._cache_found_inf = False if self._use_dynamic_loss_scaling: # update the scale @@ -462,7 +462,7 @@ def _update(self): if not self._enable: return - if self._cache_founf_inf: + if self._cache_found_inf: self._incr_count = 0 self._decr_count = self._decr_count + 1 if self._decr_count == self._decr_every_n_nan_or_inf: @@ -846,13 +846,13 @@ def step(self, optimizer: Optimizer) -> None: if hasattr(optimizer, "_set_auxiliary_var"): optimizer._set_auxiliary_var('found_inf', self._found_inf) optimizer.step() - self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + self._cache_found_inf = optimizer._get_auxiliary_var('found_inf') else: if self._found_inf: - self._cache_founf_inf = True + self._cache_found_inf = True else: optimizer.step() - self._cache_founf_inf = False + self._cache_found_inf = False optimizer_state["state"] = OptimizerState.STEPPED diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py index 358c6023e6c6f7..c9a684ae807be4 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py @@ -45,13 +45,13 @@ def minimize(self, optimizer, *args, **kwargs): optimizer._set_auxiliary_var('found_inf', self._found_inf) optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) # TODO: Fix to _cache_found_inf after PaddleNLP update - self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf') + self._cache_found_inf = optimizer._get_auxiliary_var('found_inf') else: if self._found_inf: - self._cache_founf_inf = True + self._cache_found_inf = True else: optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) - self._cache_founf_inf = False + self._cache_found_inf = False if self._use_dynamic_loss_scaling: self._update() diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py 
b/python/paddle/distributed/passes/auto_parallel_sharding.py index 4b7814af7f53ea..c6315c78ad4617 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -196,7 +196,7 @@ def _apply_single_impl(self, main_program, startup_program, context): # NOTE Multi / Sub-Block Support # we assume that only parameter are present and partitioned in main_block, # there is NO new param in sub_block, and all params in sub_block follows the same - # partition as main_block. the above constraint fullfill the 3 most common use-cases in Paddle sub_block: + # partition as main_block. the above constraint fulfill the 3 most common use-cases in Paddle sub_block: # 1. subblock for lr scheduler # 2. sub-block uses the same or partial network of main-block, e.g. GPT3 generation model # 3. sub-block used for double backward diff --git a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_zero_bubble.py b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_zero_bubble.py index d6025d80e5e7c8..112373cebcd404 100644 --- a/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_zero_bubble.py +++ b/python/paddle/distributed/passes/pipeline_scheduler_pass/pipeline_zero_bubble.py @@ -604,11 +604,11 @@ def _insert_jobs_after_backward_start( break # Step3: Insert forward jobs after backward_b - forword_insert_order = range(self.num_stage) + forward_insert_order = range(self.num_stage) if self.num_model_chunks % 2: - forword_insert_order = range(self.num_stage - 1, -1, -1) + forward_insert_order = range(self.num_stage - 1, -1, -1) - for stage_id in forword_insert_order: + for stage_id in forward_insert_order: for chunk_id in range(self.num_model_chunks - 1, -1, -1): if self._can_schedule_f_task(stage_id, chunk_id): while ( diff --git a/python/paddle/incubate/asp/utils.py b/python/paddle/incubate/asp/utils.py index 408c3d3a6b0866..1fef294dc41826 100644 --- a/python/paddle/incubate/asp/utils.py +++ b/python/paddle/incubate/asp/utils.py @@ -220,14 +220,14 @@ def get_mask_1d(mat: npt.NDArray[Any], n: int, m: int) -> npt.NDArray[Any]: """ mat_flatten, shape = _reshape_1d(mat, m) - mask_flattern = np.ones_like(mat_flatten) + mask_flatten = np.ones_like(mat_flatten) mask = np.ones_like(mat) for i in range(mat_flatten.shape[0]): sub_mat = mat_flatten[i] min_order_indices = np.argsort(np.absolute(sub_mat)) - mask_flattern[i, min_order_indices[:n].tolist()] = 0 - mask_flattern = mask_flattern.reshape(shape) - mask[:, :] = mask_flattern[:, : mat.shape[1]] + mask_flatten[i, min_order_indices[:n].tolist()] = 0 + mask_flatten = mask_flatten.reshape(shape) + mask[:, :] = mask_flatten[:, : mat.shape[1]] return mask @@ -486,13 +486,13 @@ def get_mask_2d_best(mat: npt.NDArray[Any], n: int, m: int) -> npt.NDArray[Any]: patterns = _compute_valid_2d_patterns(n, m) mat_flatten, shape = _reshape_2d(mat, m) - mask_flattern = np.ones_like(mat_flatten).reshape(-1, m, m) + mask_flatten = np.ones_like(mat_flatten).reshape(-1, m, m) pmax = np.argmax( np.matmul(mat_flatten, patterns.reshape(patterns.shape[0], m * m).T), axis=1, ) - mask_flattern[:] = patterns[pmax[:]] + mask_flatten[:] = patterns[pmax[:]] mask = np.empty(shape) curr_idx = 0 @@ -500,7 +500,7 @@ def get_mask_2d_best(mat: npt.NDArray[Any], n: int, m: int) -> npt.NDArray[Any]: row_end = row_start + m for col_start in range(0, shape[1], m): col_end = col_start + m - mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx] + mask[row_start:row_end, 
col_start:col_end] = mask_flatten[curr_idx] curr_idx += 1 return mask[: mat.shape[0], : mat.shape[1]] diff --git a/python/paddle/io/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py index 836c0b40224c6f..8b3ba314388eab 100644 --- a/python/paddle/io/dataloader/dataloader_iter.py +++ b/python/paddle/io/dataloader/dataloader_iter.py @@ -52,7 +52,7 @@ # layers processing) after iterate **the first few data** in # distributed launch mode, distributed launch will call # terminate() to kill main process on each devices, but thread -# is still iterating to fullfill blocking queue caches, which +# is still iterating to fulfill blocking queue caches, which # may cause thread error `terminate called without an active # exception` for terminate is a strong signal and `__del__` # of DataLoader may not be called, so we add a global link to diff --git a/python/paddle/text/datasets/imikolov.py b/python/paddle/text/datasets/imikolov.py index df7b4383f6318f..6691b1fd6ef5c1 100644 --- a/python/paddle/text/datasets/imikolov.py +++ b/python/paddle/text/datasets/imikolov.py @@ -43,7 +43,7 @@ class Imikolov(Dataset): data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'. window_size(int): sliding window size for 'NGRAM' data. Default -1. mode(str): 'train' 'test' mode. Default 'train'. - min_word_freq(int): minimal word frequence for building word dictionary. Default 50. + min_word_freq(int): minimal word frequencies for building word dictionary. Default 50. download(bool): whether to download dataset automatically if :attr:`data_file` is not set. Default True @@ -54,6 +54,7 @@ class Imikolov(Dataset): .. code-block:: python + >>> # doctest: +TIMEOUT(60) >>> import paddle >>> from paddle.text.datasets import Imikolov diff --git a/test/legacy_test/test_gather_op.py b/test/legacy_test/test_gather_op.py index c4ebe86af2ad97..d8227134d6b5d2 100644 --- a/test/legacy_test/test_gather_op.py +++ b/test/legacy_test/test_gather_op.py @@ -471,7 +471,7 @@ def config(self): class TestOutOfRangeError(unittest.TestCase): - def test_dygraph_forwad_and_backward(self): + def test_dygraph_forward_and_backward(self): with dygraph_guard(): x = paddle.randn([100, 3]).cpu() x.stop_gradient = False diff --git a/test/legacy_test/test_lstm_op.py b/test/legacy_test/test_lstm_op.py index 2f3f3fe4ed683e..fca6d226e90705 100644 --- a/test/legacy_test/test_lstm_op.py +++ b/test/legacy_test/test_lstm_op.py @@ -207,7 +207,7 @@ def test_check_output(self): self.check_output(atol=1e-8, check_dygraph=False) def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. + # TODO(qingqing) remove following lines after the check_grad is refined. N = len(self.lod[0]) self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros((N, self.D)).astype( @@ -259,7 +259,7 @@ def test_check_grad(self): # self.use_peepholes = True # def test_check_grad(self): -# # TODO(qingqing) remove folowing lines after the check_grad is refined. +# # TODO(qingqing) remove following lines after the check_grad is refined. 
# N = len(self.lod[0]) # self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') # self.outputs['BatchCellPreAct'] = np.zeros( diff --git a/tools/gen_ut_cmakelists.py b/tools/gen_ut_cmakelists.py index f64f065c19da65..50819aa9a0de58 100644 --- a/tools/gen_ut_cmakelists.py +++ b/tools/gen_ut_cmakelists.py @@ -238,7 +238,7 @@ def process_dist_port_num(self, port_num): re.compile("^[0-9]+$").search(port_num) and int(port_num) > 0 or port_num.strip() == "" - ), f"""port_num must be foramt as a positive integer or empty, but this port_num is '{port_num}'""" + ), f"""port_num must be format as a positive integer or empty, but this port_num is '{port_num}'""" port_num = port_num.strip() if len(port_num) == 0: return 0 @@ -270,7 +270,7 @@ def _init_dist_ut_ports_from_cmakefile(self, cmake_file_name): break name = lines[k - 1].strip() - # matcg right tets name format, the name must start with 'test_' follwed bu at least one char of + # matcg right tets name format, the name must start with 'test_' followed bu at least one char of # '0-9'. 'a-z'. 'A-Z' or '_' assert re.compile("^test_[0-9a-zA-Z_]+").search( name From 6b69d206f5e1cc914610faee31700f1adde07aaf Mon Sep 17 00:00:00 2001 From: Chandler <44045446+BaolanChen@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:33:43 +0800 Subject: [PATCH 08/57] [CodeStyle][Typos][D-[37-44]] Fix typos (`dito`,`devide`,`documention`,`doens`,`doen`,`dobule`,`doubel`,`dowloading`,`downsteram`) (#70642) * Typos fix D37-44 * Typos fix D37-44 changes * merge changes --- _typos.toml | 9 --------- paddle/cinn/operator_fusion/graph_transformer/matcher.h | 6 +++--- paddle/fluid/inference/tensorrt/op_teller.cc | 2 +- .../fluid/operators/controlflow/control_flow_op_helper.h | 2 +- paddle/fluid/operators/elementwise/elementwise_mul_op.cc | 2 +- paddle/fluid/operators/elementwise/elementwise_sub_op.cc | 2 +- paddle/fluid/pir/dialect/op_generator/op_gen.py | 2 +- paddle/phi/kernels/funcs/values_vectors_functor.h | 1 - paddle/scripts/paddle_build.sh | 2 +- paddle/utils/string/printf.h | 2 +- .../static/tuner/to_distributed_api_patterns.py | 2 +- python/paddle/nn/clip.py | 2 +- .../hybrid_strategy/to_distributed_api_for_llama.py | 2 +- test/ir/pir/cinn/llama_test_model.py | 2 +- test/legacy_test/test_cond.py | 2 +- 15 files changed, 15 insertions(+), 25 deletions(-) diff --git a/_typos.toml b/_typos.toml index 2d6bcfacf8f928..5355bd0657d18b 100644 --- a/_typos.toml +++ b/_typos.toml @@ -36,15 +36,6 @@ unpacket = "unpacket" # These words need to be fixed Creater = 'Creater' creater = 'creater' -dito = 'dito' -devide = 'devide' -documention = 'documention' -doens = 'doens' -doen = 'doen' -doubel = 'doubel' -dobule = 'dobule' -Dowloading = 'Dowloading' -downsteram = 'downsteram' fetchs = 'fetchs' Indexs = 'Indexs' indexs = 'indexs' diff --git a/paddle/cinn/operator_fusion/graph_transformer/matcher.h b/paddle/cinn/operator_fusion/graph_transformer/matcher.h index 80c205529009b1..36352e81a2f24c 100644 --- a/paddle/cinn/operator_fusion/graph_transformer/matcher.h +++ b/paddle/cinn/operator_fusion/graph_transformer/matcher.h @@ -285,9 +285,9 @@ struct LeafReshapeConnectionMatcher { struct NotAllElementWiseDownstreamMatcher { bool operator()(const PatternGraph& graph, const PatternNodePtr& node) { size_t count = 0; - for (const auto& downsteram : node->downstream()) { - if (StmtPatternGraphMatcher()(graph, downsteram)) { - auto ops = std::get(downsteram->stmt_pattern()).ops(); + for (const auto& downstream : node->downstream()) { + if (StmtPatternGraphMatcher()(graph, 
downstream)) { + auto ops = std::get(downstream->stmt_pattern()).ops(); bool is_elementwise = std::all_of(ops.begin(), ops.end(), [](pir::Operation* op) { return GetOpPatternKind(op) == hlir::framework::kElementWise; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 15f2fba66b1932..9a21edd52d838a 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -2383,7 +2383,7 @@ struct SimpleOpTypeSetTeller : public Teller { // conv3d_transpose if (op_type == "conv3d_transpose") { - // trt doen't support output_padding when < 8406 + // trt doesn't support output_padding when < 8406 // output_padding is usually set when stride > 1 #if !IS_TRT_VERSION_GE(8400) if (desc.HasAttr("output_padding")) { diff --git a/paddle/fluid/operators/controlflow/control_flow_op_helper.h b/paddle/fluid/operators/controlflow/control_flow_op_helper.h index 188aa87c2bf9fb..52039c1049b958 100644 --- a/paddle/fluid/operators/controlflow/control_flow_op_helper.h +++ b/paddle/fluid/operators/controlflow/control_flow_op_helper.h @@ -121,7 +121,7 @@ static void AssignZeroToParentScope( PADDLE_ENFORCE_EQ(input_tensors.size(), outside_tensors->size(), common::errors::InvalidArgument( - "DenseTensorArray outside_var %s doen't have same " + "DenseTensorArray outside_var %s doesn't have same " "size as input_var %s.", outside_grad_name, input_name)); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index fcbded2b78adbb..844a6e5c750a0d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -139,7 +139,7 @@ class ElementwiseMulCompositeDoubleGradOpMaker -1, common::errors::InvalidArgument( "We only support axis = -1 in composite " - "add_doubel_grad but we got: ", + "add_double_grad but we got: ", axis)); // get output diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index cce90902bd7c02..e2126db86e7e3d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -125,7 +125,7 @@ class ElementwiseSubCompositeDoubleGradOpMaker -1, common::errors::InvalidArgument( "We only support axis = -1 in composite " - "subtract_doubel_grad but we got: ", + "subtract_double_grad but we got: ", axis)); paddle::Tensor* grad_out_grad = this->GetOutputPtr(&grad_out_grad_t); diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index a45964841116aa..3e33fe2205d618 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -342,7 +342,7 @@ class {TEST_API} {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{ 'int': 'pir::Int32Attribute', 'int64_t': 'pir::Int64Attribute', 'float': 'pir::FloatAttribute', - 'dobule': 'pir::DoubleAttribute', + 'double': 'pir::DoubleAttribute', 'bool': 'pir::BoolAttribute', } diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index d95b28fc59718f..b14f36aebb7cb0 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -467,7 +467,6 @@ struct MatrixEighFunctor { "When has_vectors is true," "the eigenvectors needs to be calculated," "so the eigenvectors 
must be provided.")); - // input_trans = dito.Transpose(input_trans); input_trans = phi::TransposeLast2Dim(dev_ctx, input_trans); eigen_vectors->ShareDataWith(input_trans); } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d0c0ed8db6b8f7..7090df20d6a5e4 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3674,7 +3674,7 @@ function distribute_test() { parallel_fa_unit echo "End FA tests" - echo "Dowloading ...." + echo "Downloading ...." cd ${work_dir} wget https://paddlenlp.bj.bcebos.com/wheels/PaddleNLP_stable_paddle.tar.gz --no-proxy tar -zvxf PaddleNLP_stable_paddle.tar.gz diff --git a/paddle/utils/string/printf.h b/paddle/utils/string/printf.h index f4576c6bc4aa54..f2c87fb5e8ed31 100644 --- a/paddle/utils/string/printf.h +++ b/paddle/utils/string/printf.h @@ -54,7 +54,7 @@ // weekday, month, day, hour, min); // // 2. High-performance -- most printed strings are not too long and -// doens't need dynamic memory allocation. Many StringPrintf +// doesn't need dynamic memory allocation. Many StringPrintf // implementations doesn't enforce type-safe, but are // high-performance, including // diff --git a/python/paddle/distributed/auto_parallel/static/tuner/to_distributed_api_patterns.py b/python/paddle/distributed/auto_parallel/static/tuner/to_distributed_api_patterns.py index 4887be8b757412..f0f564663d5e12 100644 --- a/python/paddle/distributed/auto_parallel/static/tuner/to_distributed_api_patterns.py +++ b/python/paddle/distributed/auto_parallel/static/tuner/to_distributed_api_patterns.py @@ -563,7 +563,7 @@ def apply( key_states = paddle.transpose(key_states, [0, 2, 1, 3]) value_states = paddle.transpose(value_states, [0, 2, 1, 3]) - # matmul and devide by sqrt(head_dim) + # matmul and divide by sqrt(head_dim) attn_weights = paddle.matmul( query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]), diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index f0ccab9ff068f1..9913063eb946f6 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -1407,7 +1407,7 @@ def set_gradient_clip(clip, param_list=None, program=None): "We recommend a new strategy: set 'grad_clip' " "when initializing the 'optimizer'. " "This method can reduce the mistakes, please " - "refer to documention of 'optimizer'." + "refer to documentation of 'optimizer'." 
) if not isinstance(clip, ClipGradBase): diff --git a/test/auto_parallel/hybrid_strategy/to_distributed_api_for_llama.py b/test/auto_parallel/hybrid_strategy/to_distributed_api_for_llama.py index 2fe3a039b635be..da5402ed1031e5 100644 --- a/test/auto_parallel/hybrid_strategy/to_distributed_api_for_llama.py +++ b/test/auto_parallel/hybrid_strategy/to_distributed_api_for_llama.py @@ -117,7 +117,7 @@ def scaled_dot_product_attention( key_states = paddle.transpose(key_states, [0, 2, 1, 3]) value_states = paddle.transpose(value_states, [0, 2, 1, 3]) - # matmul and devide by sqrt(head_dim) + # matmul and divide by sqrt(head_dim) attn_weights = paddle.matmul( query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]) ) diff --git a/test/ir/pir/cinn/llama_test_model.py b/test/ir/pir/cinn/llama_test_model.py index 4761aa6f649246..250ce96d7c2f72 100644 --- a/test/ir/pir/cinn/llama_test_model.py +++ b/test/ir/pir/cinn/llama_test_model.py @@ -187,7 +187,7 @@ def scaled_dot_product_attention( key_states = paddle.transpose(key_states, [0, 2, 1, 3]) value_states = paddle.transpose(value_states, [0, 2, 1, 3]) - # matmul and devide by sqrt(head_dim) + # matmul and divide by sqrt(head_dim) attn_weights = paddle.matmul( query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2]) ) diff --git a/test/legacy_test/test_cond.py b/test/legacy_test/test_cond.py index bc9c61545cb473..d966db3587f4ae 100644 --- a/test/legacy_test/test_cond.py +++ b/test/legacy_test/test_cond.py @@ -471,7 +471,7 @@ def test_extremely_simple_net_with_op_in_condition(self): main_program, fetch_list=[out, b, a.grad_name, b.grad_name] ) # Note: fill_constant has loss of precision, you have to assertEqual - # with values doens't lose precision in float-point number. + # with values doesn't lose precision in float-point number. 
self.assertEqual(ret[0][0], ret[1][0]) self.assertEqual(ret[2][0], 0.0) self.assertEqual(ret[3][0], 1.0) From e8c33cdff9f32e72f6b09b92317e7f2a3f2928ef Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Wed, 8 Jan 2025 10:34:31 +0800 Subject: [PATCH 09/57] [CodeStyle][Typos][I-[12-14],I-[16-20]] Fix typos (`indiates`,`indeces`,`inferrence`,`infering`,`imformation`,`infomation`,`informations`,`Infor`,`infor`,`inheritted`,`initilization`) (#70650) * fix * fix --- _typos.toml | 18 +++++++----------- .../operator/transforms/lowering_pass/utils.cc | 2 +- paddle/cinn/hlir/framework/pir/utils.cc | 2 +- paddle/cinn/ir/ir.h | 2 +- paddle/cinn/poly/stage.h | 4 ++-- .../fluid/distributed/ps/service/brpc_utils.cc | 2 +- .../eager_generated/backwards/scale_node.h | 2 +- .../interpreter/interpreter_util.cc | 2 +- .../tensorrt/convert/emb_eltwise_layernorm.cc | 2 +- .../convert/preln_emb_eltwise_layernorm.cc | 2 +- .../prompt_tuning_emb_eltwise_layernorm.cc | 2 +- .../fluid/operators/generator/generate_op.py | 2 +- paddle/fluid/pybind/tensor.cc | 2 +- paddle/phi/infermeta/unary.cc | 2 +- paddle/phi/kernels/funcs/seq2col.h | 4 ++-- paddle/phi/kernels/impl/einsum_impl.h | 2 +- .../kernels/sparse/cpu/elementwise_kernel.cc | 6 +++--- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 2 +- paddle/pir/include/core/ir_context.h | 6 +++--- .../auto_parallel/static/auto_align_tool.py | 4 ++-- .../distributed/auto_parallel/static/engine.py | 4 ++-- .../paddle/distributed/fleet/utils/log_util.py | 2 +- python/paddle/distributed/rpc/rpc.py | 2 +- python/paddle/framework/io_utils.py | 18 +++++++++--------- .../test_align_tool_deprecated.py | 6 ++---- tools/parallel_UT_rule.py | 2 -- 26 files changed, 48 insertions(+), 56 deletions(-) diff --git a/_typos.toml b/_typos.toml index 5355bd0657d18b..a29bf57b1677b1 100644 --- a/_typos.toml +++ b/_typos.toml @@ -12,6 +12,13 @@ extend-exclude = [ [default] # Ignore 1-3 letter words, refer to https://github.com/crate-ci/typos/issues/1079 extend-ignore-words-re = ["^[a-zA-Z]{1,3}$"] +# refer to https://github.com/crate-ci/typos/blob/master/docs/reference.md#example-configurations +extend-ignore-re = [ + # Ignore lines by `# typos: disable-line` + "(?Rm)^.*(#|//)\\s*typos:\\s*disable-line$", + # Ignore block by `# typos: off` and `# typos: on` + "(?s)(#|//)\\s*typos:\\s*off.*?\\n\\s*(#|//)\\s*typos:\\s*on" +] [default.extend-words] # PaddlePaddle specific words @@ -39,19 +46,8 @@ creater = 'creater' fetchs = 'fetchs' Indexs = 'Indexs' indexs = 'indexs' -indiates = 'indiates' -indeces = 'indeces' -inferrence = 'inferrence' Infered = 'Infered' infered = 'infered' -infering = 'infering' -informations = 'informations' -imformation = 'imformation' -infomation = 'infomation' -Infor = 'Infor' -infor = 'infor' -inheritted = 'inheritted' -initilization = 'initilization' initilized = 'initilized' initalized = 'initalized' initalize = 'initalize' diff --git a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc index 59d00770c5e753..b1273e42868024 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc @@ -117,7 +117,7 @@ OpLoweringGroupPtr BuildOpLoweringGroup(pir::Operation* fusion_op_ptr) { if (FLAGS_cinn_enable_map_expr) { cinn::adt::TryGenerateMapExprFromGroup(group); } - // Rebuild other informations + // Rebuild other information // TODO(zhangyuqin1998): Do we need group.master_ops? 
return group; } diff --git a/paddle/cinn/hlir/framework/pir/utils.cc b/paddle/cinn/hlir/framework/pir/utils.cc index 69482f296531a7..fb3d754e669e45 100644 --- a/paddle/cinn/hlir/framework/pir/utils.cc +++ b/paddle/cinn/hlir/framework/pir/utils.cc @@ -95,7 +95,7 @@ std::string GetDebugInfo(const std::unordered_set& names) { return debug_info; } -// OpTransInfo contains informations used to detect subgraphs +// OpTransInfo contains information used to detect subgraphs // supported by the CINN compiler. class OpTransInfo { using DeParamCondT = diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index 94d6000da798ad..9cfad3ba57ca8c 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -848,7 +848,7 @@ struct ForBase { BindInfo bind_info_; }; -/// LLVM loop unroll metadata infomation +/// LLVM loop unroll metadata information struct LLVMForLoopMeta { enum UnrollMode { DefaultUnroll, FullyUnroll, NoUnroll }; diff --git a/paddle/cinn/poly/stage.h b/paddle/cinn/poly/stage.h index b15d0149ed426a..7653bd8e5bfee9 100644 --- a/paddle/cinn/poly/stage.h +++ b/paddle/cinn/poly/stage.h @@ -57,7 +57,7 @@ struct StageForloopInfo { ir::DeviceAPI device; }; -//! Store the informations about some other tensor `compute_at` this tensor. +//! Store the information about some other tensor `compute_at` this tensor. struct ComputeAtInfo { ComputeAtInfo(const std::string& consumer_tensor_name, const std::string& producer_tensor_name, @@ -84,7 +84,7 @@ struct ComputeAtInfo { }; /** - * Meta infomation for tensor. + * Meta information for tensor. */ struct TensorScheduleMeta { //! Store the information of all the other producer tensors `compute_at` this diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 21ce06030c71f3..a1645302f0bfba 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -321,7 +321,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) { if (nullptr == hp) { LOG(ERROR) << "Brpc Start failed, ip_port= " << ip_port - << " , Error infomation: " << hstrerror(h_errno); + << " , Error information: " << hstrerror(h_errno); } int i = 0; diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index e2036bc8363d87..377a8354afde43 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -18,7 +18,7 @@ #include "paddle/fluid/eager/tensor_wrapper.h" /* - Each Operation has a specific GradNode inheritted from GradNodeBase + Each Operation has a specific GradNode inherited from GradNodeBase A specific GradNode defines 1. Input Tensors 2. overrides operator() to perform actual backward computations diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 43267f1babb4a6..5b446605af2cc7 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -859,7 +859,7 @@ void BuildOpFuncList(const phi::Place& place, op->Attr(kAllKernelsMustComputeRuntimeShape))) { RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); // TODO(Aurelius84): In case of control flow ops, they are NOT - // inheritted from OperatorWithKernel. 
+ // inherited from OperatorWithKernel. op_with_kernel->Info().infer_shape_(&infer_shape_ctx); } } diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index d1efdc4cddc2fc..8b67d0df3ff011 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -92,7 +92,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { output_fp16, 1, common::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " + "Only Precision::KHalf(fp16) is supported when inferring " "ernie(bert) model with config.EnableVarseqlen(). " "But Precision::KFloat32 is setted.")); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index ee6eaa1730fa23..f1a3b64cbd0f75 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -145,7 +145,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { output_fp16, 1, common::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " + "Only Precision::KHalf(fp16) is supported when inferring " "ernie(bert) model with config.EnableVarseqlen(). " "But Precision::KFloat32 is setted.")); diff --git a/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc index fd935b27393c22..47b9386ee2b621 100644 --- a/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc @@ -91,7 +91,7 @@ class PromptTuningEmbEltwiseLayerNormOpConverter : public OpConverter { output_fp16, 1, common::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " + "Only Precision::KHalf(fp16) is supported when inferring " "ernie(bert) model with config.EnableVarseqlen(). " "But Precision::KFloat32 is setted.")); diff --git a/paddle/fluid/operators/generator/generate_op.py b/paddle/fluid/operators/generator/generate_op.py index 47ac091598eafc..a680f716ac58a4 100644 --- a/paddle/fluid/operators/generator/generate_op.py +++ b/paddle/fluid/operators/generator/generate_op.py @@ -309,7 +309,7 @@ def add_grad_op_compat_name(grad_op_item, args_name_map): if new_op_name != op_name: forward_op_item['op_name'] = op_name - # add complex promote infomation + # add complex promote information if "complex_promote" in op_args: forward_op_item["complex_promote"] = op_args["complex_promote"] if has_backward: diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index c0dce7d167371d..1d290729a54401 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -878,7 +878,7 @@ void BindTensor(pybind11::module &m) { // NOLINT Returns: tuple: contains ipc name, data size, data type, - tensor dims and lod imformation. + tensor dims and lod information. Examples: .. 
code-block:: python diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 115bc417a4ff1c..11d9ab80a48ef3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1242,7 +1242,7 @@ void EigvalshInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out) { - // collect the following informations to prepare einsum. + // collect the following information to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); std::vector label2perms(inputs.size(), LabelMap(-1)); diff --git a/paddle/phi/kernels/funcs/seq2col.h b/paddle/phi/kernels/funcs/seq2col.h index 14665ada7b4a8a..656c96a8bfed69 100644 --- a/paddle/phi/kernels/funcs/seq2col.h +++ b/paddle/phi/kernels/funcs/seq2col.h @@ -35,7 +35,7 @@ struct Seq2ColFunctor { /* Convert sequences to frames. - 1. Dimension infomation: + 1. Dimension information: Sequences Frames (N, seq_length) -> (N, frame_length, n_frames) @@ -105,7 +105,7 @@ struct Col2SeqFunctor { /* Accumulate output gradient d_out to d_x. - 1. Dimension infomation: + 1. Dimension information: d_out d_x (N, frame_length, n_frames) -> (N, seq_length) diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 9c73be86c05689..d8b0826ba75746 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -603,7 +603,7 @@ void EinsumKernelImpl(const Context& dev_ctx, VLOG(5) << " inputs [ " << i << " ].shape=" << i->dims(); } ValidationCheck(equation); - // collect the following informations to prepare einsum. + // collect the following information to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); std::vector label2perms(inputs.size(), LabelMap(-1)); diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc index 3b04652701835a..004f22c66804e5 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc @@ -240,11 +240,11 @@ void ElementWiseCooKernelImpl(const Context& dev_ctx, common::make_ddim( {static_cast(sparse_dim), static_cast(nnz)}), DataLayout::NCHW); - auto indeces_dim = common::vectorize( + auto indices_dim = common::vectorize( slice_ddim(x.values().dims(), 1, x.values().dims().size())); - indeces_dim.insert(indeces_dim.begin(), nnz); + indices_dim.insert(indices_dim.begin(), nnz); DenseTensorMeta values_meta( - x.dtype(), common::make_ddim(indeces_dim), DataLayout::NCHW); + x.dtype(), common::make_ddim(indices_dim), DataLayout::NCHW); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 7fd99c9166ba21..a049ee03047284 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -109,7 +109,7 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, /** * @brief: update the out index and indices * unique_keys: save the index of the output feature list - * unique_values: indiates the index of key before deduplication + * unique_values: indicates the index of key before deduplication * out_indexs: indicates the position of the output index in the rulebook * rulebook_len: indicates the length of rulebook * out_dims: indicates the output dims diff 
--git a/paddle/pir/include/core/ir_context.h b/paddle/pir/include/core/ir_context.h index 50ce178531673a..1e8d70b3b08e63 100644 --- a/paddle/pir/include/core/ir_context.h +++ b/paddle/pir/include/core/ir_context.h @@ -105,7 +105,7 @@ class IR_API IrContext { AbstractAttribute *GetRegisteredAbstractAttribute(TypeId id); /// - /// \brief Register an op infomation to IrContext + /// \brief Register an op information to IrContext /// void RegisterOpInfo(Dialect *dialect, TypeId op_id, @@ -118,12 +118,12 @@ class IR_API IrContext { void (*verify_region)(Operation *)); /// - /// \brief Get registered operation infomation. + /// \brief Get registered operation information. /// OpInfo GetRegisteredOpInfo(const std::string &name); /// - /// \brief Get registered operation infomation map. + /// \brief Get registered operation information map. /// const OpInfoMap ®istered_op_info_map(); diff --git a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py index ec64c7e7b0e708..fc37b09b1599aa 100644 --- a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py +++ b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py @@ -402,7 +402,7 @@ def find_diff_vars(fixed_vars_map, query_vars_map): return diff_var_name_list @staticmethod - def diff_informations(right_dir, wrong_dir): + def diff_information(right_dir, wrong_dir): """ Find the corresponding operator according to the variable name. """ @@ -448,7 +448,7 @@ def diff_informations(right_dir, wrong_dir): return diff_ops_varname_dict @staticmethod - def diff_informations_from_dirs(right_dirs, wrong_dirs): + def diff_information_from_dirs(right_dirs, wrong_dirs): right_vars_list = [] right_program_list = [] right_dist_attr_map = {} diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index db31165134b15b..f06c935c2f2c29 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -1338,12 +1338,12 @@ def _initialize(self, mode, init_parameters=True): ) if self._in_pir_mode: - # FIXME(ljz) avoid shared same tensro more than once in different mode + # FIXME(ljz) avoid shared same tensor more than once in different mode if mode != "train": return # TODO(2024-Q2) # 1. unify random control - # 2. initilization of non-parameter buffer + # 2. initialization of non-parameter buffer # 3. run startup program for pir # 4. lazy init adaption # 5. amp init adaption diff --git a/python/paddle/distributed/fleet/utils/log_util.py b/python/paddle/distributed/fleet/utils/log_util.py index 7c8dea70ba8708..13e8bceae97654 100644 --- a/python/paddle/distributed/fleet/utils/log_util.py +++ b/python/paddle/distributed/fleet/utils/log_util.py @@ -157,7 +157,7 @@ def check_memory_usage(msg=""): mem_msg = f"checking pinned memory usage {msg}:" for key in mem_dict: mem_msg += f"\n{key}: {mem_dict[key]}GB" - logger.infor(mem_msg) + logger.info(mem_msg) if hasattr(paddle.device, 'cpu') and hasattr( paddle.device.cpu, 'max_memory_allocated' diff --git a/python/paddle/distributed/rpc/rpc.py b/python/paddle/distributed/rpc/rpc.py index 4b6e80c8320dff..04c10039eb4de5 100644 --- a/python/paddle/distributed/rpc/rpc.py +++ b/python/paddle/distributed/rpc/rpc.py @@ -365,7 +365,7 @@ class `WorkerInfo` with attribute `name`, `rank`, `ip` and `port`. def get_all_worker_infos() -> list[WorkerInfo]: """ - Get all worker informations. 
+ Get all worker information. Returns: List[WorkerInfo]. diff --git a/python/paddle/framework/io_utils.py b/python/paddle/framework/io_utils.py index b6b6aac0188294..258b8d1ff65b84 100644 --- a/python/paddle/framework/io_utils.py +++ b/python/paddle/framework/io_utils.py @@ -215,7 +215,7 @@ def _pickle_loads_mac(path, f): def _pack_loaded_dict(load_obj): if isinstance(load_obj, dict): - unpack_info = 'UnpackBigParamInfor@@' + unpack_info = 'UnpackBigParamInfor@@' # typos: disable-line if unpack_info in load_obj: removes = [] for key, value in load_obj[unpack_info].items(): @@ -233,7 +233,7 @@ def _pack_loaded_dict(load_obj): def _unpack_saved_dict(saved_obj, protocol): temp_saved_obj = {} - unpack_infor = {} + unpack_info = {} # When pickle protocol=2 or protocol=3 the serialized object cannot be larger than 4G. if 1 < protocol < 4: if isinstance(saved_obj, dict): @@ -244,9 +244,9 @@ def _unpack_saved_dict(saved_obj, protocol): ) num_element = np.prod(value.shape) if num_element > MAX_NUMBER_OF_ELEMENT: - unpack_infor[key] = {} - unpack_infor[key]["OriginShape"] = value.shape - unpack_infor[key]["slices"] = [] + unpack_info[key] = {} + unpack_info[key]["OriginShape"] = value.shape + unpack_info[key]["slices"] = [] value = value.flatten() for i in range( int( @@ -256,20 +256,20 @@ def _unpack_saved_dict(saved_obj, protocol): ) ): part_name = key + "@@." + str(i) - unpack_infor[key]["slices"].append(part_name) + unpack_info[key]["slices"].append(part_name) temp_saved_obj[part_name] = value[ i * MAX_NUMBER_OF_ELEMENT : MAX_NUMBER_OF_ELEMENT * (i + 1) ] - if unpack_infor: - for key, value in unpack_infor.items(): + if unpack_info: + for key, value in unpack_info.items(): if key in saved_obj: saved_obj.pop(key) for part in value['slices']: saved_obj[part] = temp_saved_obj[part] - saved_obj['UnpackBigParamInfor@@'] = unpack_infor + saved_obj['UnpackBigParamInfor@@'] = unpack_info # typos: disable-line return saved_obj diff --git a/test/deprecated/auto_parallel/test_align_tool_deprecated.py b/test/deprecated/auto_parallel/test_align_tool_deprecated.py index 85e5482ae5e0cb..b83f45d4c61457 100644 --- a/test/deprecated/auto_parallel/test_align_tool_deprecated.py +++ b/test/deprecated/auto_parallel/test_align_tool_deprecated.py @@ -97,10 +97,8 @@ def test_align_tool(self): os.mkdir("./serial") align_tool.save("./serial", vars, fetch_list) break - AutoAlignTool.diff_informations("./serial", "./serial") - AutoAlignTool.diff_informations_from_dirs( - ["./serial"], ["./serial"] - ) + AutoAlignTool.diff_information("./serial", "./serial") + AutoAlignTool.diff_information_from_dirs(["./serial"], ["./serial"]) break print("test auto parallel align tool successfully!") diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index da905e8f04cf2f..6a28d63c017e95 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -89,7 +89,6 @@ 'test_sampling_id_op', 'test_nce', 'graph_helper_test', - 'test_static_shape_inferrence_for_shape_tensor', 'test_layer_norm_mkldnn_op', 'test_fleet_launch_async', 'test_multi_gru_fuse_pass', @@ -1570,7 +1569,6 @@ 'test_sysconfig', 'test_sync_batch_norm_pass', 'test_switch', - 'test_static_shape_inferrence_for_shape_tensor', 'test_static_analysis', 'test_squared_mat_sub_fuse_pass', 'test_spawn_and_init_parallel_env', From 29c3d915e0864022c31fc9830264ed7096e4616b Mon Sep 17 00:00:00 2001 From: fangfangssj <99968055+fangfangssj@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:05:00 +0800 Subject: [PATCH 10/57] [HEU][Paddle TensorRT No.69-72,74-85] Add 
UnaryOp converter (#70535) * add converter * fix * add marker * fix * fix --- .../transforms/tensorrt/trt_op_marker_pass.cc | 79 +++-- python/paddle/tensorrt/converter_utils.py | 30 +- python/paddle/tensorrt/impls/ops.py | 35 ++- test/tensorrt/CMakeLists.txt | 2 +- test/tensorrt/test_converter_ops.py | 288 ++++++++++++++++++ 5 files changed, 397 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc index c67bd5d012973b..0ad509a9601882 100644 --- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc +++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc @@ -94,7 +94,24 @@ DEFINE_GENERAL_PATTERN(Flip, paddle::dialect::FlipOp) DEFINE_GENERAL_PATTERN(Mish, paddle::dialect::MishOp) DEFINE_GENERAL_PATTERN(AssignValue, paddle::dialect::AssignValueOp) DEFINE_GENERAL_PATTERN(AssignValue_, paddle::dialect::AssignValue_Op) +DEFINE_GENERAL_PATTERN(Exp, paddle::dialect::ExpOp) +DEFINE_GENERAL_PATTERN(Abs, paddle::dialect::AbsOp) +DEFINE_GENERAL_PATTERN(Abs_, paddle::dialect::Abs_Op) +DEFINE_GENERAL_PATTERN(Sin, paddle::dialect::SinOp) +DEFINE_GENERAL_PATTERN(Cos, paddle::dialect::CosOp) +DEFINE_GENERAL_PATTERN(Sinh, paddle::dialect::SinhOp) +DEFINE_GENERAL_PATTERN(Cosh, paddle::dialect::CoshOp) +DEFINE_GENERAL_PATTERN(Asinh, paddle::dialect::AsinhOp) +DEFINE_GENERAL_PATTERN(Acosh, paddle::dialect::AcoshOp) +DEFINE_GENERAL_PATTERN(Atanh, paddle::dialect::AtanhOp) +DEFINE_GENERAL_PATTERN(Ceil, paddle::dialect::CeilOp) +DEFINE_GENERAL_PATTERN(Rsqrt, paddle::dialect::RsqrtOp) +DEFINE_GENERAL_PATTERN(Reciprocal, paddle::dialect::ReciprocalOp) +DEFINE_GENERAL_PATTERN(Erf, paddle::dialect::ErfOp) +DEFINE_GENERAL_PATTERN(Sign, paddle::dialect::SignOp) +DEFINE_GENERAL_PATTERN(Round, paddle::dialect::RoundOp) DEFINE_GENERAL_PATTERN(Numel, paddle::dialect::NumelOp) + #undef DEFINE_GENERAL_PATTERN // Add ReduceCommonOpPattern base class to simplify code @@ -267,8 +284,30 @@ class ActOpPattern : public pir::OpRewritePattern { using TanhOpPattern = ActOpPattern; using CeluOpPattern = ActOpPattern; using TanhShrinkOpPattern = ActOpPattern; -using LogicalNotOpPattern = ActOpPattern; -using LogicalNot_OpPattern = ActOpPattern; + +template +class Logical_NotOpPattern : public pir::OpRewritePattern { + public: + using pir::OpRewritePattern::OpRewritePattern; + bool MatchAndRewrite(OpType op, + pir::PatternRewriter &rewriter) const override { + if (op->HasAttribute(kCanRunTrtAttr) && + op->template attribute(kCanRunTrtAttr).data()) { + return false; + } + pir::Value x = op.operand_source(0); + auto x_dtype = pir::GetDataTypeFromValue(x); + if (!x_dtype.isa()) { + VLOG(3) << " logical_not op only support bool input in tensorrt."; + return false; + } + op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true)); + return true; + } +}; +using LogicalNotOpPattern = Logical_NotOpPattern; +using LogicalNot_OpPattern = + Logical_NotOpPattern; class Pool2dOpPattern : public pir::OpRewritePattern { @@ -538,24 +577,6 @@ class ArangeOpPattern } }; -class SignOpPattern : public pir::OpRewritePattern { - public: - using pir::OpRewritePattern::OpRewritePattern; - bool MatchAndRewrite(paddle::dialect::SignOp op, - pir::PatternRewriter &rewriter) const override { - if (op->HasAttribute(kCanRunTrtAttr) && - op->attribute(kCanRunTrtAttr).data()) { - return false; - } -#if IS_TRT_VERSION_LT(8200) - VLOG(3) << "sign op is only supported by tensorrt8.2 above "; - return false; -#endif - 
op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true));
-    return true;
-  }
-};
-
 class GroupNormOpPattern
     : public pir::OpRewritePattern {
  public:
@@ -2273,6 +2294,22 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ADD_PATTERN(Mish)
     ADD_PATTERN(AssignValue)
     ADD_PATTERN(AssignValue_)
+    ADD_PATTERN(Exp)
+    ADD_PATTERN(Abs)
+    ADD_PATTERN(Abs_)
+    ADD_PATTERN(Sin)
+    ADD_PATTERN(Cos)
+    ADD_PATTERN(Sinh)
+    ADD_PATTERN(Cosh)
+    ADD_PATTERN(Asinh)
+    ADD_PATTERN(Acosh)
+    ADD_PATTERN(Atanh)
+    ADD_PATTERN(Ceil)
+    ADD_PATTERN(Rsqrt)
+    ADD_PATTERN(Reciprocal)
+    ADD_PATTERN(Erf)
+    ADD_PATTERN(Sign)
+    ADD_PATTERN(Round)
     ADD_PATTERN(Numel)
 #if IS_TRT_VERSION_GE(8600)
     ADD_PATTERN(Layer_norm)
 #endif
@@ -2283,7 +2320,6 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
-    ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
     ps.Add(std::make_unique(context));
diff --git a/python/paddle/tensorrt/converter_utils.py b/python/paddle/tensorrt/converter_utils.py
index 5735b33ad42ace..76ccec354b0c5e 100644
--- a/python/paddle/tensorrt/converter_utils.py
+++ b/python/paddle/tensorrt/converter_utils.py
@@ -686,6 +686,29 @@ def squeeze_trt(network, input_tensor, axes):
 def unary_op_converter(network, paddle_op, inputs):
     from paddle.tensorrt import PrecisionMode
 
+    ops_type_map = {
+        "pd_op.sqrt": [trt.UnaryOperation.SQRT],
+        "pd_op.sqrt_": [trt.UnaryOperation.SQRT],
+        "pd_op.floor": [trt.UnaryOperation.FLOOR],
+        "pd_op.exp": [trt.UnaryOperation.EXP],
+        "pd_op.abs": [trt.UnaryOperation.ABS],
+        "pd_op.abs_": [trt.UnaryOperation.ABS],
+        "pd_op.sin": [trt.UnaryOperation.SIN],
+        "pd_op.cos": [trt.UnaryOperation.COS],
+        "pd_op.sinh": [trt.UnaryOperation.SINH],
+        "pd_op.cosh": [trt.UnaryOperation.COSH],
+        "pd_op.asinh": [trt.UnaryOperation.ASINH],
+        "pd_op.acosh": [trt.UnaryOperation.ACOSH],
+        "pd_op.atanh": [trt.UnaryOperation.ATANH],
+        "pd_op.ceil": [trt.UnaryOperation.CEIL],
+        "pd_op.reciprocal": [trt.UnaryOperation.RECIP],
+        "pd_op.erf": [trt.UnaryOperation.ERF],
+        "pd_op.sign": [trt.UnaryOperation.SIGN],
+        "pd_op.round": [trt.UnaryOperation.ROUND],
+        "pd_op.logical_not": [trt.UnaryOperation.NOT],
+        "pd_op.rsqrt": [trt.UnaryOperation.SQRT, trt.UnaryOperation.RECIP],
+    }
+
     input_tensor = inputs[0]
     layer = None
     org_type = input_tensor.dtype
@@ -707,9 +730,10 @@ def unary_op_converter(network, paddle_op, inputs):
             identity_layer.set_output_type(0, trt.float16)
             input_tensor = identity_layer.get_output(0)
 
-    if paddle_op.name() in ["pd_op.logical_not", "pd_op.logical_not_"]:
-        layer = network.add_unary(input_tensor, trt.UnaryOperation.NOT)
-        input_tensor = layer.get_output(0)
+    if paddle_op.name() in ops_type_map:
+        for trt_op in ops_type_map[paddle_op.name()]:
+            layer = network.add_unary(input_tensor, trt_op)
+            input_tensor = layer.get_output(0)
     else:
         raise NotImplementedError(
             f"Unsupported unary operation: {paddle_op.name()}"
diff --git a/python/paddle/tensorrt/impls/ops.py b/python/paddle/tensorrt/impls/ops.py
index 6416cb96e6af38..7370f10edc1eeb 100644
--- a/python/paddle/tensorrt/impls/ops.py
+++ b/python/paddle/tensorrt/impls/ops.py
@@ -11,21 +11,32 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
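# A hedged sketch of what the registrations below delegate to (names come
# from the converter_utils.py hunk above; `network` and `input_tensor` are
# assumed to be the usual TRT network and tensor handles): every op in
# ops_type_map is lowered by chaining its listed TRT unary layers in order,
# so a one-entry op emits a single layer while rsqrt becomes
# reciprocal(sqrt(x)):
#
#   for trt_op in [trt.UnaryOperation.SQRT, trt.UnaryOperation.RECIP]:
#       layer = network.add_unary(input_tensor, trt_op)
#       input_tensor = layer.get_output(0)  # SQRT's output feeds RECIP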
-import tensorrt as trt +from paddle.tensorrt.converter_utils import unary_op_converter from paddle.tensorrt.register import converter_registry -ops_type_map = { - "pd_op.sqrt": trt.UnaryOperation.SQRT, - "pd_op.sqrt_": trt.UnaryOperation.SQRT, - "pd_op.floor": trt.UnaryOperation.FLOOR, -} - @converter_registry.register("pd_op.sqrt", trt_version="trt_version_ge=8.0") @converter_registry.register("pd_op.sqrt_", trt_version="trt_version_ge=8.0") -@converter_registry.register("pd_op.floor", trt_version="8.x") -def sqrt_converter(network, paddle_op, inputs): - input_tensor = inputs[0] - layer = network.add_unary(input_tensor, ops_type_map[paddle_op.name()]) - return layer.get_output(0) +@converter_registry.register("pd_op.floor", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.exp", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.abs", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.abs_", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.sin", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.cos", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.sinh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.cosh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.asinh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.acosh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.atanh", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.ceil", trt_version="trt_version_ge=8.0") +@converter_registry.register( + "pd_op.reciprocal", trt_version="trt_version_ge=8.0" +) +@converter_registry.register("pd_op.erf", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.rsqrt", trt_version="trt_version_ge=8.0") +@converter_registry.register("pd_op.sign", trt_version="trt_version_ge=8.2") +@converter_registry.register("pd_op.round", trt_version="trt_version_ge=8.2") +def UnaryOpConverter(network, paddle_op, inputs): + layer_output = unary_op_converter(network, paddle_op, inputs) + return layer_output diff --git a/test/tensorrt/CMakeLists.txt b/test/tensorrt/CMakeLists.txt index 4735dc6def3345..201a1e02f2f3f7 100644 --- a/test/tensorrt/CMakeLists.txt +++ b/test/tensorrt/CMakeLists.txt @@ -14,7 +14,7 @@ if(NOT WIN32 AND TENSORRT_FOUND) set_tests_properties(test_converter_conv PROPERTIES TIMEOUT "300") set_tests_properties(test_export PROPERTIES TIMEOUT "500") set_tests_properties(test_converter_norm PROPERTIES TIMEOUT "300") - set_tests_properties(test_converter_ops PROPERTIES TIMEOUT "300") + set_tests_properties(test_converter_ops PROPERTIES TIMEOUT "500") set_tests_properties(test_converter_stat PROPERTIES TIMEOUT "300") set_tests_properties(test_converter_math PROPERTIES TIMEOUT "300") set_tests_properties(test_converter_activation PROPERTIES TIMEOUT "300") diff --git a/test/tensorrt/test_converter_ops.py b/test/tensorrt/test_converter_ops.py index 544fca80fbecc0..155a93d2827a19 100644 --- a/test/tensorrt/test_converter_ops.py +++ b/test/tensorrt/test_converter_ops.py @@ -34,6 +34,9 @@ def setUp(self): def test_trt_result(self): self.check_trt_result() + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + class TestFloorFloatTRTPattern(TensorRTBaseTest): def setUp(self): @@ -49,6 +52,291 @@ def setUp(self): def test_trt_result(self): self.check_trt_result() + def test_trt_result_fp16(self): + 
self.check_trt_result(precision_mode="fp16") + + +class TestExpFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.exp + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAbsFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.abs + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAbsIntTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.abs + self.api_args = { + "x": np.random.randn(7, 3).astype("int64"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + +class TestSinFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.sin + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestCosFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.cos + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestSinhFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.sinh + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestCoshFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.cosh + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + self.check_trt_result() + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAsinhFloatTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = paddle.asinh + self.api_args = { + "x": np.random.randn(7, 3).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [3, 3]} + self.opt_shape = {"x": [7, 3]} + self.max_shape = {"x": [10, 3]} + + def test_trt_result(self): + 
self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestAcoshFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.acosh
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestCeilFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.ceil
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestRsqrtFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.rsqrt
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestReciprocalFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.reciprocal
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestErfFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.erf
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestSignFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.sign
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
+
+class TestSignIntTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.sign
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("int64"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x": [7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+
+class TestRoundFloatTRTPattern(TensorRTBaseTest):
+    def setUp(self):
+        self.python_api = paddle.round
+        self.api_args = {
+            "x": np.random.randn(7, 3).astype("float32"),
+        }
+        self.program_config = {"feed_list": ["x"]}
+        self.min_shape = {"x": [3, 3]}
+        self.opt_shape = {"x":
[7, 3]}
+        self.max_shape = {"x": [10, 3]}
+
+    def test_trt_result(self):
+        self.check_trt_result()
+
+    def test_trt_result_fp16(self):
+        self.check_trt_result(precision_mode="fp16")
+
 
 if __name__ == '__main__':
     unittest.main()

From 1cb01143febde6bd5bcc914bb4f3d2c6a0f30019 Mon Sep 17 00:00:00 2001
From: Junjie Zhang <1356732652@qq.com>
Date: Wed, 8 Jan 2025 11:06:37 +0800
Subject: [PATCH 11/57] [SCU][Paddle Tensor Phase 2: API 0-size Tensor support
 No.46] paddle.linalg.solve supports 0-size Tensor (#70575)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* support_0size

* fix codestyle

* Update solve_kernel_impl.h

* update

* fix codestyle

* Update test_solve_op.py

* Update test_solve_op.py

* Update test_solve_op.py

---
 paddle/phi/kernels/impl/solve_kernel_impl.h | 34 +++++++++
 test/legacy_test/test_solve_op.py | 82 +++++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/paddle/phi/kernels/impl/solve_kernel_impl.h b/paddle/phi/kernels/impl/solve_kernel_impl.h
index 52d9cd131ba3ed..bbe63896fc4d91 100644
--- a/paddle/phi/kernels/impl/solve_kernel_impl.h
+++ b/paddle/phi/kernels/impl/solve_kernel_impl.h
@@ -195,6 +195,40 @@ void SolveKernel(const Context& dev_ctx,
                  const DenseTensor& x,
                  const DenseTensor& y,
                  DenseTensor* out) {
+  if (x.numel() == 0 || y.numel() == 0) {
+    auto x_dims = x.dims();
+    auto y_dims = y.dims();
+    std::vector<int64_t> out_dims;
+    if (y_dims.size() == 1) {
+      out_dims =
+          std::vector<int64_t>(x_dims.Get(), x_dims.Get() + x_dims.size() - 2);
+      out_dims.push_back(y_dims[y_dims.size() - 1]);
+    } else {
+      // broadcast
+      std::vector<int64_t> x_shape(x_dims.Get(), x_dims.Get() + x_dims.size() - 2);
+      std::vector<int64_t> y_shape(y_dims.Get(), y_dims.Get() + y_dims.size() - 2);
+      auto x_it = x_shape.rbegin();
+      auto y_it = y_shape.rbegin();
+      while (x_it != x_shape.rend() || y_it != y_shape.rend()) {
+        int x_dim = (x_it != x_shape.rend()) ? *x_it : 1;
+        int y_dim = (y_it != y_shape.rend()) ?
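// Worked example of the broadcast above (editorial sketch; shapes taken from
// the new 0-size tests further down): x: [10, 0, 0], y: [10, 0, 0]. The batch
// dims are everything but the last two, so x_shape = {10} and y_shape = {10};
// the loop takes max(10, 10) = 10 (a 0 on either side would propagate as 0),
// and y's trailing (0, 0) is appended afterwards, so out is resized to
// [10, 0, 0] and no actual factorization is performed.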
*y_it : 1; + if (x_dim == 0 || y_dim == 0) { + out_dims.push_back(0); + } else { + out_dims.push_back(std::max(x_dim, y_dim)); + } + if (x_it != x_shape.rend()) ++x_it; + if (y_it != y_shape.rend()) ++y_it; + } + std::reverse(out_dims.begin(), out_dims.end()); + out_dims.insert(out_dims.end(), + y_dims.Get() + y_dims.size() - 2, + y_dims.Get() + y_dims.size()); + } + out->Resize(phi::make_ddim(out_dims)); + dev_ctx.template Alloc(out); + return; + } linalg_solve(dev_ctx, x, y, out); } diff --git a/test/legacy_test/test_solve_op.py b/test/legacy_test/test_solve_op.py index 71ca1b5accae53..874f49ce5b3124 100644 --- a/test/legacy_test/test_solve_op.py +++ b/test/legacy_test/test_solve_op.py @@ -923,5 +923,87 @@ def test_dygraph(self): print("The mat is singular") +class TestSolveOpAPIZeroDimCase(unittest.TestCase): + def setUp(self): + np.random.seed(2021) + self.place = [] + self.dtype = "float32" + if ( + os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() + in ['1', 'true', 'on'] + or not core.is_compiled_with_cuda() + ): + self.place.append(paddle.CPUPlace()) + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place, x_shape, y_shape, np_y_shape): + paddle.enable_static() + with base.program_guard(base.Program(), base.Program()): + paddle_input_x = paddle.static.data( + name="input_x", shape=x_shape, dtype=self.dtype + ) + paddle_input_y = paddle.static.data( + name="input_y", shape=y_shape, dtype=self.dtype + ) + paddle_result = paddle.linalg.solve( + paddle_input_x, paddle_input_y, left=False + ) + + np_input_x = np.random.random(x_shape).astype(self.dtype) + np_input_y = np.random.random(np_y_shape).astype(self.dtype) + + np_result = np.linalg.solve(np_input_x, np_input_y) + + exe = base.Executor(place) + fetches = exe.run( + base.default_main_program(), + feed={"input_x": np_input_x, "input_y": np_input_y}, + fetch_list=[paddle_result], + ) + np.testing.assert_allclose(fetches[0], np_result, rtol=0.0001) + + def test_static(self): + for place in self.place: + self.check_static_result( + place=place, + x_shape=[10, 0, 0], + y_shape=[6, 0, 0], + np_y_shape=[10, 0, 0], + ) + with self.assertRaises(ValueError) as context: + self.check_static_result( + place=place, + x_shape=[10, 0, 0], + y_shape=[10], + np_y_shape=[10], + ) + + def test_dygraph(self): + def run(place, x_shape, y_shape): + with base.dygraph.guard(place): + input_x_np = np.random.random(x_shape).astype(self.dtype) + input_y_np = np.random.random(y_shape).astype(self.dtype) + + tensor_input_x = paddle.to_tensor(input_x_np) + tensor_input_y = paddle.to_tensor(input_y_np) + + numpy_output = np.linalg.solve(input_x_np, input_y_np) + paddle_output = paddle.linalg.solve( + tensor_input_x, tensor_input_y, left=False + ) + np.testing.assert_allclose( + numpy_output, paddle_output.numpy(), rtol=0.0001 + ) + self.assertEqual( + numpy_output.shape, paddle_output.numpy().shape + ) + + for place in self.place: + run(place, x_shape=[10, 0, 0], y_shape=[10, 0, 0]) + with self.assertRaises(ValueError) as context: + run(place, x_shape=[10, 0, 0], y_shape=[10]) + + if __name__ == "__main__": unittest.main() From 418327b8b6f46651cf0248c77ef6d361a680f7a5 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:24:24 +0800 Subject: [PATCH 12/57] [XPU] add data_type_transform_test_xpu (#70638) * [XPU] add data_type_transform_test_xpu * [XPU] add data_type_transform_test_xpu --- paddle/fluid/framework/data_type.h | 13 ++ 
paddle/fluid/framework/data_type_transform.cc | 66 +++--- .../phi/core/framework/data_type_transform.cc | 66 +++--- paddle/phi/core/framework/var_type_helper.h | 13 ++ test/cpp/phi/core/CMakeLists.txt | 4 + .../phi/core/data_type_transform_test_xpu.cc | 219 ++++++++++++++++++ 6 files changed, 319 insertions(+), 62 deletions(-) create mode 100644 test/cpp/phi/core/data_type_transform_test_xpu.cc diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index a40f33e2f3fbfa..16df876079931c 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -112,6 +112,19 @@ struct DataTypeTrait { _ForEachDataTypeHelper_(callback, ::phi::dtype::complex, COMPLEX64); \ _ForEachDataTypeHelper_(callback, ::phi::dtype::complex, COMPLEX128); +// complex and float8 are not supported on XPU. +#define _ForEachDataTypeForXPU_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::phi::dtype::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::phi::dtype::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8); + #define DefineDataTypeTrait(cpp_type, proto_type) \ template <> \ struct DataTypeTrait { \ diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9fba57e10fd0b1..83905084907687 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -64,10 +64,11 @@ static void XPUTransDataType( } \ } while (0) - if (dst_type == proto::VarType::FP32 && dst_type == proto::VarType::FP16 && - dst_type == proto::VarType::BOOL && dst_type == proto::VarType::INT16 && - dst_type == proto::VarType::INT32 && dst_type == proto::VarType::INT64) { - _ForEachDataType_(XPUCastCallback); + if (dst_type == proto::VarType::FP32 || dst_type == proto::VarType::FP16 || + dst_type == proto::VarType::BOOL || dst_type == proto::VarType::INT16 || + dst_type == proto::VarType::INT32 || dst_type == proto::VarType::INT64 || + dst_type == proto::VarType::FP64) { + _ForEachDataTypeForXPU_(XPUCastCallback); } else { PADDLE_THROW(common::errors::Unimplemented( "Data type (%s) is not supported in XPU when casting data type.", @@ -155,33 +156,37 @@ void TransDataType(const phi::DenseTensor& in, auto ctx = pool.Get(in.place()); #if defined(PADDLE_WITH_XPU) - switch (src_type) { - case proto::VarType::FP16: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::FP32: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::BOOL: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT16: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT32: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT64: - XPUTransDataType(in, out, dst_type, ctx); - break; - default: - PADDLE_THROW(common::errors::Unimplemented( - "Data type (%s) is not supported in XPU when casting data type.", - DataTypeToString(src_type))); + if (phi::is_xpu_place(in.place())) { + switch (src_type) { + case proto::VarType::FP16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP32: + 
XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP64: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::BOOL: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT64: + XPUTransDataType(in, out, dst_type, ctx); + break; + default: + PADDLE_THROW(common::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(src_type))); + } + return; } - -#else - +#endif switch (src_type) { case proto::VarType::FP16: framework::VisitDataType(dst_type, @@ -225,7 +230,6 @@ void TransDataType(const phi::DenseTensor& in, "Data type (%s) is not supported when casting data type.", DataTypeToString(src_type))); } -#endif } void TransComplexToReal(const proto::VarType::Type& dst_type, diff --git a/paddle/phi/core/framework/data_type_transform.cc b/paddle/phi/core/framework/data_type_transform.cc index c20da1023b3310..6ed397d85d378e 100644 --- a/paddle/phi/core/framework/data_type_transform.cc +++ b/paddle/phi/core/framework/data_type_transform.cc @@ -66,10 +66,11 @@ static void XPUTransDataType( } \ } while (0) - if (dst_type == proto::VarType::FP32 && dst_type == proto::VarType::FP16 && - dst_type == proto::VarType::BOOL && dst_type == proto::VarType::INT16 && - dst_type == proto::VarType::INT32 && dst_type == proto::VarType::INT64) { - _ForEachDataType_(XPUCastCallback); + if (dst_type == proto::VarType::FP32 || dst_type == proto::VarType::FP16 || + dst_type == proto::VarType::BOOL || dst_type == proto::VarType::INT16 || + dst_type == proto::VarType::INT32 || dst_type == proto::VarType::INT64 || + dst_type == proto::VarType::FP64) { + _ForEachDataTypeForXPU_(XPUCastCallback); } else { PADDLE_THROW(common::errors::Unimplemented( "Data type (%s) is not supported in XPU when casting data type.", @@ -158,33 +159,37 @@ void TransDataType(const phi::DenseTensor& in, auto ctx = pool.Get(in.place()); #if defined(PADDLE_WITH_XPU) - switch (src_type) { - case proto::VarType::FP16: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::FP32: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::BOOL: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT16: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT32: - XPUTransDataType(in, out, dst_type, ctx); - break; - case proto::VarType::INT64: - XPUTransDataType(in, out, dst_type, ctx); - break; - default: - PADDLE_THROW(common::errors::Unimplemented( - "Data type (%s) is not supported in XPU when casting data type.", - VarDataTypeToString(src_type))); + if (phi::is_xpu_place(in.place())) { + switch (src_type) { + case proto::VarType::FP16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP64: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::BOOL: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT64: + XPUTransDataType(in, out, dst_type, ctx); + break; + default: + PADDLE_THROW(common::errors::Unimplemented( + "Data type (%s) is not supported in XPU when 
casting data type.", + VarDataTypeToString(src_type))); + } + return; } - -#else - +#endif switch (src_type) { case proto::VarType::FP16: phi::VisitDataType(dst_type, @@ -228,7 +233,6 @@ void TransDataType(const phi::DenseTensor& in, "Data type (%s) is not supported when casting data type.", VarDataTypeToString(src_type))); } -#endif } } // namespace phi diff --git a/paddle/phi/core/framework/var_type_helper.h b/paddle/phi/core/framework/var_type_helper.h index 81636930019331..9a6306da520a27 100644 --- a/paddle/phi/core/framework/var_type_helper.h +++ b/paddle/phi/core/framework/var_type_helper.h @@ -113,6 +113,19 @@ struct DataTypeTrait { _ForEachDataTypeHelper_(callback, ::phi::dtype::complex, COMPLEX64); \ _ForEachDataTypeHelper_(callback, ::phi::dtype::complex, COMPLEX128); +// complex and float8 are not supported on XPU. +#define _ForEachDataTypeForXPU_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::phi::dtype::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::phi::dtype::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8); + #define DefineDataTypeTrait(cpp_type, proto_type) \ template <> \ struct DataTypeTrait { \ diff --git a/test/cpp/phi/core/CMakeLists.txt b/test/cpp/phi/core/CMakeLists.txt index 5eb78dacd7cd31..30cebae20e1f08 100644 --- a/test/cpp/phi/core/CMakeLists.txt +++ b/test/cpp/phi/core/CMakeLists.txt @@ -86,3 +86,7 @@ endif() if(NOT WIN32) paddle_test(test_c_tcp_store SRCS test_tcp_store.cc DEPS phi common) endif() + +if(WITH_XPU) + paddle_test(data_type_transform_test_xpu SRCS data_type_transform_test_xpu.cc) +endif() diff --git a/test/cpp/phi/core/data_type_transform_test_xpu.cc b/test/cpp/phi/core/data_type_transform_test_xpu.cc new file mode 100644 index 00000000000000..c897f59d8f9aca --- /dev/null +++ b/test/cpp/phi/core/data_type_transform_test_xpu.cc @@ -0,0 +1,219 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/framework/data_type_transform.h" +#include "paddle/phi/core/kernel_factory.h" + +template +void TransformTest(const phi::KernelKey& kernel_type_for_var, + const phi::KernelKey& expected_kernel_type, + const phi::CPUPlace& cpu_place, + const phi::XPUPlace& xpu_place, + const InT* cpu_data, + const int data_number) { + phi::XPUContext context(xpu_place); + phi::DenseTensor in; + phi::DenseTensor in_xpu; + phi::DenseTensor out; + phi::DenseTensor out_xpu; + + // copy from cpu_data to cpu tensor + InT* in_ptr = + in.mutable_data(common::make_ddim({data_number}), cpu_place); + memcpy(in_ptr, cpu_data, sizeof(InT) * data_number); + + // test case 1: on xpu + { + // copy from cpu tensor to xpu tensor + paddle::framework::TensorCopy(in, xpu_place, context, &in_xpu); + context.Wait(); + + // call trans data + phi::TransDataType( + kernel_type_for_var, expected_kernel_type, in_xpu, &out_xpu); + + // copy from xpu tensor to cpu tensor + paddle::framework::TensorCopy(out_xpu, cpu_place, context, &out); + context.Wait(); + + // check result + OutT* out_ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_ptr[i], static_cast(cpu_data[i])); + } + } + + // test case 2: on cpu + { + // call trans data + phi::TransDataType(kernel_type_for_var, expected_kernel_type, in, &out); + + // check result + OutT* out_ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_ptr[i], static_cast(cpu_data[i])); + } + } +} + +TEST(DataTypeTransform, XPUTransform) { + auto cpu_place = phi::CPUPlace(); + auto xpu_place = phi::XPUPlace(0); + phi::XPUContext context(xpu_place); + + auto kernel_fp16 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::FLOAT16); + auto kernel_fp32 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::FLOAT32); + auto kernel_fp64 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::FLOAT64); + auto kernel_int16 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::INT16); + auto kernel_int32 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::INT32); + auto kernel_int64 = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::INT64); + auto kernel_bool = phi::KernelKey( + xpu_place, phi::DataLayout::ALL_LAYOUT, phi::DataType::BOOL); + + { + // float16 -> any + phi::dtype::float16 cpu_data[6] = {phi::dtype::float16(0), + phi::dtype::float16(1), + phi::dtype::float16(2), + phi::dtype::float16(3), + phi::dtype::float16(4), + phi::dtype::float16(5)}; + TransformTest( + kernel_fp16, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp16, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp16, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp16, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp16, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // float -> any + float cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_fp32, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( 
+ kernel_fp32, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp32, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // double -> any + double cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_fp64, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_fp64, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // int16 -> any + int16_t cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_int16, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int16, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // int32 -> any + int32_t cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_int32, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int32, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // int64 -> any + int64_t cpu_data[6] = {0, 1, 2, 3, 4, 5}; + TransformTest( + kernel_int64, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_int64, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } + { + // bool -> any + bool cpu_data[6] = {0, 1, 0, 1, 1, 0}; + TransformTest( + kernel_bool, kernel_fp16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_fp32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_fp64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_int16, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_int32, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_int64, cpu_place, xpu_place, cpu_data, 6); + TransformTest( + kernel_bool, kernel_bool, cpu_place, xpu_place, cpu_data, 6); + } +} From 34e7b8811a4f5f067d32d45f31171ffa586d1758 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:28:48 +0800 Subject: [PATCH 
13/57] [CINN] Use ArithSimplify instead of AutoSimplify -- Part0 (#70594)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* del autosimplify

* fix bug

* empty commit

---
 paddle/cinn/backends/codegen_gpu_dev.cc | 4 +-
 paddle/cinn/backends/llvm/codegen_llvm.cc | 6 +-
 paddle/cinn/common/ir_util.cc | 7 +-
 paddle/cinn/hlir/op/contrib/sort.cc | 8 +-
 paddle/cinn/hlir/pe/elementwise.cc | 2 +-
 paddle/cinn/hlir/pe/nn.cc | 117 +++++++++---------
 paddle/cinn/hlir/pe/transform.cc | 24 ++--
 paddle/cinn/ir/buffer.cc | 2 +-
 .../ir/schedule/impl/loop_transformation.cc | 18 ++-
 paddle/cinn/ir/schedule/ir_schedule_util.cc | 21 ++--
 paddle/cinn/ir/schedule/ir_schedule_util.h | 6 +-
 paddle/cinn/ir/tensor.cc | 4 +-
 .../eliminate_common_factor_of_local_index.cc | 14 +--
 .../eliminate_common_global_memory_read.cc | 9 +-
 paddle/cinn/optim/ir_simplify.cc | 5 +-
 15 files changed, 117 insertions(+), 130 deletions(-)

diff --git a/paddle/cinn/backends/codegen_gpu_dev.cc b/paddle/cinn/backends/codegen_gpu_dev.cc
index a3dbddfdb132e8..9886d7c3a9fc45 100644
--- a/paddle/cinn/backends/codegen_gpu_dev.cc
+++ b/paddle/cinn/backends/codegen_gpu_dev.cc
@@ -115,7 +115,7 @@ std::vector<Expr> FilterDeallocTempBuffers(const std::vector<Expr> &frees) {
     bool has_symbolic_constant = false;
     const ir::_Buffer_ *buffer = op->destination.As<ir::_Buffer_>();
     for (Expr shape : buffer->shape) {
-      shape = common::AutoSimplify(shape);
+      shape = optim::ArithSimplify(shape);
       ir::ir_utils::CollectIRNodes(shape, [&](const Expr *x) {
         if (x->as_var()) {
           PADDLE_ENFORCE_EQ(
@@ -540,7 +540,7 @@ ir::Expr CalculateSharedMemory(const ir::LoweredFunc &func) {
       shm_size = shm_size + CalculateSharedMemory(buffer);
     }
   }
-  return common::AutoSimplify(shm_size);
+  return optim::ArithSimplify(shm_size);
 }
 
 }  // namespace backends
diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc
index 355ae881c6476e..3462325edd9b36 100644
--- a/paddle/cinn/backends/llvm/codegen_llvm.cc
+++ b/paddle/cinn/backends/llvm/codegen_llvm.cc
@@ -928,7 +928,7 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Store *op) {
   // fit the total_lanes in native_lanes(split into multiple native steps)
   for (int offset = 0; offset < total_lanes; offset += total_lanes) {
     int lanes = total_lanes;
-    Expr base = cinn::common::AutoSimplify(ramp->base + offset);
+    Expr base = optim::ArithSimplify(ramp->base + offset);
     optim::VarModSimplify(&base);
     auto *ptr =
         CreateBufferPtr(op->type().ElementOf(), buffer, Visit(&base));
@@ -1242,10 +1242,8 @@ llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) {
 
   for (int i = 0; i < load_lanes; i += load_lanes) {
     int slice_lanes = load_lanes;
-    auto slice_base = cinn::common::AutoSimplify(ramp->base + i);
+    auto slice_base = optim::ArithSimplify(ramp->base + i);
     optim::VarModSimplify(&slice_base);
-    auto slide_stride = Expr(1);
-    auto slide_index = slice_base;
 
 #if LLVM_VERSION_MAJOR >= 11
     const llvm::ElementCount elem_count(slice_lanes, /*scalable*/ false);
diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc
index 0204e8dc1c5d0a..ace5e2e4cd2981 100644
--- a/paddle/cinn/common/ir_util.cc
+++ b/paddle/cinn/common/ir_util.cc
@@ -101,8 +101,8 @@ Expr RampRelatedAdd(ir::Ramp *ramp, ir::Ramp *other) {
                     ::common::errors::InvalidArgument(
                         "Other ramp pointer should not be null."));
   if (ramp->lanes == other->lanes) {
-    Expr base_add = cinn::common::AutoSimplify(ramp->base + other->base);
-    Expr stride_add =
cinn::common::AutoSimplify(ramp->stride + other->stride); + Expr base_add = optim::ArithSimplify(ramp->base + other->base); + Expr stride_add = optim::ArithSimplify(ramp->stride + other->stride); VLOG(2) << base_add; VLOG(2) << stride_add; return ir::Ramp::Make(base_add, stride_add, ramp->lanes); @@ -641,8 +641,7 @@ ir::IndexExpr SimplifySymbolicDivide(const ir::IndexExpr &lhs, bool ProveDivisible(const ir::IndexExpr &lhs, const ir::IndexExpr &rhs) { if (IsZero(lhs % rhs)) return true; - // remove AutoSimplify later. - if (IsZero(AutoSimplify(lhs % rhs))) return true; + if (IsZero(optim::ArithSimplify(lhs % rhs))) return true; return false; } diff --git a/paddle/cinn/hlir/op/contrib/sort.cc b/paddle/cinn/hlir/op/contrib/sort.cc index 897bf288c4f812..ec6403d5c7dd77 100644 --- a/paddle/cinn/hlir/op/contrib/sort.cc +++ b/paddle/cinn/hlir/op/contrib/sort.cc @@ -95,8 +95,8 @@ std::vector ArgSort(const ir::Tensor &A, stride = stride * A->shape[i]; } } - offset = cinn::common::AutoSimplify(offset); - stride = cinn::common::AutoSimplify(stride); + offset = optim::ArithSimplify(offset); + stride = optim::ArithSimplify(stride); auto A_shape_axis = A->shape[pos_axis]; return lang::CallExtern(index_func_name, {A, A_shape_axis, A(indices), offset, stride}); @@ -117,8 +117,8 @@ std::vector ArgSort(const ir::Tensor &A, stride = stride * A->shape[i]; } } - offset = cinn::common::AutoSimplify(offset); - stride = cinn::common::AutoSimplify(stride); + offset = optim::ArithSimplify(offset); + stride = optim::ArithSimplify(stride); auto A_shape_axis = A->shape[pos_axis]; auto idx = lang::CallExtern( diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc index 8e16bd6a8c6d19..ec4b687f88a7fb 100644 --- a/paddle/cinn/hlir/pe/elementwise.cc +++ b/paddle/cinn/hlir/pe/elementwise.cc @@ -222,7 +222,7 @@ Expr ReshapeHandler(const ir::Tensor& A, if (i > A_s) { temp = temp % A->shape[i]; } - A_indice[i] = common::AutoSimplify(temp); + A_indice[i] = optim::ArithSimplify(temp); } }; diff --git a/paddle/cinn/hlir/pe/nn.cc b/paddle/cinn/hlir/pe/nn.cc index 15fc0575cae466..4954cda7976e0f 100644 --- a/paddle/cinn/hlir/pe/nn.cc +++ b/paddle/cinn/hlir/pe/nn.cc @@ -204,12 +204,12 @@ std::vector Conv2d_winograd_NCHW(const ir::Tensor &input, output_shape = { input->shape[0], // B weights->shape[0], // O - cinn::common::AutoSimplify( + optim::ArithSimplify( (input->shape[2] - ((weights_dilation->shape[2] - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1), // H - cinn::common::AutoSimplify( + optim::ArithSimplify( (input->shape[3] - ((weights_dilation->shape[3] - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + @@ -222,8 +222,8 @@ std::vector Conv2d_winograd_NCHW(const ir::Tensor &input, ir::Tensor B = winograd_transform[1]; ir::Tensor G = winograd_transform[2]; - int nH = (cinn::common::AutoSimplify(output_shape[2]).as_int32() + m - 1) / m; - int nW = (cinn::common::AutoSimplify(output_shape[3]).as_int32() + m - 1) / m; + int nH = (optim::ArithSimplify(output_shape[2]).as_int32() + m - 1) / m; + int nW = (optim::ArithSimplify(output_shape[3]).as_int32() + m - 1) / m; int P = input->shape[0].as_int32() * nH * nW; @@ -489,9 +489,9 @@ std::vector Conv2d_NCHW_5D(const ir::Tensor &input, shape_weights.size(), 4U, ::common::errors::InvalidArgument("weight's shape size should be 4")); - Expr c_in = cinn::common::AutoSimplify(shape_input[1]); - Expr c_filter = cinn::common::AutoSimplify(shape_weights[1]); - Expr c_out = cinn::common::AutoSimplify(shape_weights[0]); + Expr c_in = 
optim::ArithSimplify(shape_input[1]); + Expr c_filter = optim::ArithSimplify(shape_weights[1]); + Expr c_out = optim::ArithSimplify(shape_weights[0]); absl::flat_hash_map conv2d_factors; int oc = c_out.as_int32(); int ic = c_in.as_int32(); @@ -559,12 +559,12 @@ std::vector Conv2d_NCHW_5D(const ir::Tensor &input, std::vector output_shape = { batch, // B c_out, // O - cinn::common::AutoSimplify( - (h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + - 1), // H - cinn::common::AutoSimplify( - (w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + - 1) // W + optim::ArithSimplify((h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / + stride_h + + 1), // H + optim::ArithSimplify((w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / + stride_w + + 1) // W }; auto res = Compute( output_shape, @@ -601,33 +601,33 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, ::common::errors::InvalidArgument("weight's shape size should be 6")); Expr batch = shape_input[0]; - Expr c_in_outer = cinn::common::AutoSimplify(shape_input[1]); + Expr c_in_outer = optim::ArithSimplify(shape_input[1]); Expr h_in = shape_input[2]; Expr w_in = shape_input[3]; - Expr c_in_inner = cinn::common::AutoSimplify(shape_input[4]); + Expr c_in_inner = optim::ArithSimplify(shape_input[4]); Expr c_out_outer = shape_weights[0]; - Expr c_filter_outer = cinn::common::AutoSimplify(shape_weights[1]); + Expr c_filter_outer = optim::ArithSimplify(shape_weights[1]); Expr h_f = shape_weights[2]; Expr w_f = shape_weights[3]; - Expr c_filter_inner = cinn::common::AutoSimplify(shape_weights[4]); - Expr c_out_inner = cinn::common::AutoSimplify(shape_weights[5]); + Expr c_filter_inner = optim::ArithSimplify(shape_weights[4]); + Expr c_out_inner = optim::ArithSimplify(shape_weights[5]); - Expr c_filter = cinn::common::AutoSimplify(c_filter_outer * c_filter_inner); - Expr c_out = cinn::common::AutoSimplify(c_out_outer * c_out_inner); - Expr c_in = cinn::common::AutoSimplify(c_in_outer * c_in_inner); + Expr c_filter = optim::ArithSimplify(c_filter_outer * c_filter_inner); + Expr c_out = optim::ArithSimplify(c_out_outer * c_out_inner); + Expr c_in = optim::ArithSimplify(c_in_outer * c_in_inner); Var fc(c_filter, UniqName("fc")); Var fy(h_f, UniqName("fy")); Var fx(w_f, UniqName("fx")); std::vector output_shape = { batch, // B c_out_outer, // O - cinn::common::AutoSimplify( - (h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + - 1), // H - cinn::common::AutoSimplify( - (w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + - 1), // W + optim::ArithSimplify((h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / + stride_h + + 1), // H + optim::ArithSimplify((w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / + stride_w + + 1), // W c_out_inner}; ir::Tensor input_pad; @@ -639,18 +639,18 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, }, UniqName("input_pad")); } else { - auto pad_h_bound = cinn::common::AutoSimplify( - (output_shape[2] - 1) * stride_h + (h_f - 1) * dilation_h + 1); - auto pad_w_bound = cinn::common::AutoSimplify( - (output_shape[3] - 1) * stride_w + (w_f - 1) * dilation_w + 1); + auto pad_h_bound = optim::ArithSimplify((output_shape[2] - 1) * stride_h + + (h_f - 1) * dilation_h + 1); + auto pad_w_bound = optim::ArithSimplify((output_shape[3] - 1) * stride_w + + (w_f - 1) * dilation_w + 1); auto pad_out_h = std::min(pad_h_bound.as_int32(), - cinn::common::AutoSimplify(h_in + 2 * pad_h).as_int32()); + optim::ArithSimplify(h_in + 2 * pad_h).as_int32()); auto pad_out_w = std::min(pad_w_bound.as_int32(), 
- cinn::common::AutoSimplify(w_in + 2 * pad_w).as_int32()); - auto h_in_pad = cinn::common::AutoSimplify(h_in + pad_h); - auto w_in_pad = cinn::common::AutoSimplify(w_in + pad_w); + optim::ArithSimplify(w_in + 2 * pad_w).as_int32()); + auto h_in_pad = optim::ArithSimplify(h_in + pad_h); + auto w_in_pad = optim::ArithSimplify(w_in + pad_w); input_pad = Compute( {batch, c_in_outer, Expr(pad_out_h), Expr(pad_out_w), c_in_inner}, [=](Expr n, Expr icc, Expr yy, Expr xx, Expr icb) { @@ -670,23 +670,20 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, auto packed_out = Compute( output_shape, [=](Expr n, Expr oc_chunk, Expr oh, Expr ow, Expr oc_block) { - Expr c_out_per_group = - cinn::common::AutoSimplify(c_out * c_filter / c_in); + Expr c_out_per_group = optim::ArithSimplify(c_out * c_filter / c_in); Expr ic_outer, ic_inner; if (c_in == c_filter) { - ic_outer = cinn::common::AutoSimplify(fc / c_in_inner); - ic_inner = cinn::common::AutoSimplify(fc % c_in_inner); + ic_outer = optim::ArithSimplify(fc / c_in_inner); + ic_inner = optim::ArithSimplify(fc % c_in_inner); } else { - ic_outer = - cinn::common::AutoSimplify(((oc_chunk * c_out_inner + oc_block) / - c_out_per_group * c_filter + - fc) / - c_in_inner); - ic_inner = - cinn::common::AutoSimplify(((oc_chunk * c_out_inner + oc_block) / - c_out_per_group * c_filter + - fc) % - c_in_inner); + ic_outer = optim::ArithSimplify(((oc_chunk * c_out_inner + oc_block) / + c_out_per_group * c_filter + + fc) / + c_in_inner); + ic_inner = optim::ArithSimplify(((oc_chunk * c_out_inner + oc_block) / + c_out_per_group * c_filter + + fc) % + c_in_inner); } return lang::ReduceSum(input_pad(n, ic_outer, @@ -1264,8 +1261,8 @@ Tensor Pad(const Tensor &tensor, if (i >= pad_before.size()) { output_shape.push_back(tensor->shape[i]); } else { - auto shape = cinn::common::AutoSimplify(tensor->shape[i] + pad_before[i] + - pad_after[i]); + auto shape = + optim::ArithSimplify(tensor->shape[i] + pad_before[i] + pad_after[i]); output_shape.push_back(shape); } } @@ -1291,8 +1288,8 @@ Tensor Pad(const Tensor &tensor, } Expr sel_after; if (!MathEqual(pad_after[i], Expr(0))) { - sel_after = cinn::common::AutoSimplify(ovars[i] < pad_before[i] + - tensor->shape[i]); + sel_after = + optim::ArithSimplify(ovars[i] < pad_before[i] + tensor->shape[i]); sel.push_back(sel_after); } if (pad_mode == "edge") { @@ -1407,7 +1404,7 @@ std::vector PoolImpl(const Tensor &tensor, do_pad = (do_pad) ? 
do_pad : (padding_size[i] || padding_size[i + k_size]); if (ceil_mode) { - pad_tail[i] = cinn::common::AutoSimplify(pad_tail[i] + stride[i] - 1); + pad_tail[i] = optim::ArithSimplify(pad_tail[i] + stride[i] - 1); } daxis.emplace_back(Var(kernel[i], UniqName("kernel_idx"))); @@ -1415,7 +1412,7 @@ std::vector PoolImpl(const Tensor &tensor, pad_before[ii] = pad_head[i]; pad_after[ii] = pad_tail[i]; - auto out_dim = cinn::common::AutoSimplify( + auto out_dim = optim::ArithSimplify( (tensor->shape[ii] - kernel[i] + pad_head[i] + pad_tail[i]) / stride[i] + 1); @@ -1470,13 +1467,13 @@ std::vector PoolImpl(const Tensor &tensor, auto temp_factor = make_const(Int(32), 1); for (int i = 0; i < k_size; i++) { int ii = axis[i]; - start[i] = cinn::common::AutoSimplify(output[ii] * stride[i] - - pad_head[i]); + start[i] = + optim::ArithSimplify(output[ii] * stride[i] - pad_head[i]); end[i] = Min::Make(start[i] + kernel[i], tensor->shape[ii]); start[i] = Max::Make(start[i], make_const(Int(32), 0)); temp_factor = temp_factor * (end[i] - start[i]); } - cinn::common::AutoSimplify(temp_factor); + optim::ArithSimplify(temp_factor); Expr divide_factor = Max::Make(temp_factor, make_const(Int(32), 1)); return lang::ReduceSum( ir::Div::Make(temp(indices), @@ -1487,7 +1484,7 @@ std::vector PoolImpl(const Tensor &tensor, for (int i = 0; i < k_size; i++) { temp_factor = temp_factor * kernel[i]; } - cinn::common::AutoSimplify(temp_factor); + optim::ArithSimplify(temp_factor); return lang::ReduceSum( ir::Div::Make(temp(indices), ir::Cast::Make(temp->type(), temp_factor)), @@ -1553,7 +1550,7 @@ std::vector PoolImpl(const Tensor &tensor, Expr(static_cast(tensor->shape[axis[i]].get_constant()) / kernel_size[i]); } - cinn::common::AutoSimplify(temp_factor); + optim::ArithSimplify(temp_factor); Expr divide_factor = Max::Make(temp_factor, make_const(Int(32), 1)); return lang::ReduceSum( ir::Div::Make(temp(indices), diff --git a/paddle/cinn/hlir/pe/transform.cc b/paddle/cinn/hlir/pe/transform.cc index c4d6d649b2d264..9aed131a42a494 100644 --- a/paddle/cinn/hlir/pe/transform.cc +++ b/paddle/cinn/hlir/pe/transform.cc @@ -420,7 +420,7 @@ std::vector Split( out_shape[i], [=](const std::vector& indice) { auto temp = indice; - temp[axis] = cinn::common::AutoSimplify(temp[axis] + Expr(start[i])); + temp[axis] = optim::ArithSimplify(temp[axis] + Expr(start[i])); return A(temp); }, names[i]); @@ -442,7 +442,7 @@ ir::Tensor Concat(const ir::Tensor& A, std::vector output_shape = A->shape; Expr pivot = A->shape[axis]; output_shape[axis] = - cinn::common::AutoSimplify(output_shape[axis] + B->shape[axis]); + optim::ArithSimplify(output_shape[axis] + B->shape[axis]); auto res = Compute( output_shape, [=](const std::vector& indice) { @@ -481,8 +481,8 @@ ir::Tensor Concat(const std::vector& input_tensors, ::common::errors::InvalidArgument( "Dimensions of inputs tensors in Concat should be equal! 
Please " "check.")); - output_shape[axis] = cinn::common::AutoSimplify( - output_shape[axis] + input_tensors[i]->shape[axis]); + output_shape[axis] = optim::ArithSimplify(output_shape[axis] + + input_tensors[i]->shape[axis]); } auto res = Compute( @@ -491,7 +491,7 @@ ir::Tensor Concat(const std::vector& input_tensors, auto ret = input_tensors[0](indice); Expr accumulate_shape = Expr(0); for (int i = 0; i < input_size - 1; i++) { - accumulate_shape = cinn::common::AutoSimplify( + accumulate_shape = optim::ArithSimplify( accumulate_shape + input_tensors[i]->shape[axis]); std::vector new_indice = indice; new_indice[axis] = @@ -1068,7 +1068,7 @@ std::vector InferShapeLayoutTransform( int dst_prim_index = (*split_index_map)[i][0]; int dst_sub_index = (*split_index_map)[i][1]; int factor = (*split_index_map)[i][2]; - Expr chunk_shape = cinn::common::AutoSimplify(input_shapes[i] / factor); + Expr chunk_shape = optim::ArithSimplify(input_shapes[i] / factor); Expr block_shape = Expr(factor); output_shape[dst_prim_index] = chunk_shape; output_shape[dst_sub_index] = block_shape; @@ -1100,7 +1100,7 @@ std::vector InferShapeLayoutTransform( ::common::errors::InvalidArgument( "input_shapes[src_sub_index] should be equal to factor")); output_shape[i] = - cinn::common::AutoSimplify(input_shapes[src_prim_index] * factor); + optim::ArithSimplify(input_shapes[src_prim_index] * factor); } else if ((*split_index_map)[i].size() == 1) { int src_prim_index = (*split_index_map)[i][0]; output_shape[i] = input_shapes[src_prim_index]; @@ -1164,13 +1164,11 @@ ir::Tensor LayoutTransform(const Tensor& input, int sub_index = split_infos[1]; int factor = split_infos[2]; if (dst_dim > src_dim) { - new_indice[i] = cinn::common::AutoSimplify( - indice[prim_index] * factor + indice[sub_index]); + new_indice[i] = optim::ArithSimplify(indice[prim_index] * factor + + indice[sub_index]); } else { - new_indice[prim_index] = - cinn::common::AutoSimplify(indice[i] / factor); - new_indice[sub_index] = - cinn::common::AutoSimplify(indice[i] % factor); + new_indice[prim_index] = optim::ArithSimplify(indice[i] / factor); + new_indice[sub_index] = optim::ArithSimplify(indice[i] % factor); } } else if (split_infos.size() == 1) { diff --git a/paddle/cinn/ir/buffer.cc b/paddle/cinn/ir/buffer.cc index cec3f91db7e650..9dc6f4e209b1d6 100644 --- a/paddle/cinn/ir/buffer.cc +++ b/paddle/cinn/ir/buffer.cc @@ -160,7 +160,7 @@ ir::Expr _Buffer_::SymbolicNumel() const { for (auto &i : shape) { res = res * i; } - return common::AutoSimplify(res); + return optim::ArithSimplify(res); } void _Buffer_::Verify() const { diff --git a/paddle/cinn/ir/schedule/impl/loop_transformation.cc b/paddle/cinn/ir/schedule/impl/loop_transformation.cc index f54b0fd81a9d81..e0797212ad4d78 100644 --- a/paddle/cinn/ir/schedule/impl/loop_transformation.cc +++ b/paddle/cinn/ir/schedule/impl/loop_transformation.cc @@ -125,7 +125,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, Expr(temp_var) + substitute_value * Expr(processed_factors[i]); new_loop_vars.push_back(temp_var); } - substitute_value = cinn::common::AutoSimplify(substitute_value); + substitute_value = optim::ArithSimplify(substitute_value); Expr new_node = ir::ir_utils::IRCopy(for_node->body); ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value}); std::vector splited_loops; @@ -167,8 +167,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, for (auto factor : factors) prod_size = prod_size * Expr(factor); std::for_each(factors.begin(), factors.end(), [&](int factor) { if (factor == -1) { - 
process_factors.push_back( - cinn::common::AutoSimplify(tot_extent / prod_size)); + process_factors.push_back(optim::ArithSimplify(tot_extent / prod_size)); idx_neg1 = -idx_neg1; } else { process_factors.push_back(Expr(factor)); @@ -180,12 +179,11 @@ std::vector DyScheduleImpl::Split(const Expr& loop, idx_neg1 = (-idx_neg1) - 1; - bool exact_split = - (tot_extent == - cinn::common::AutoSimplify(process_factors[0] * process_factors[1])); + bool exact_split = (tot_extent == optim::ArithSimplify(process_factors[0] * + process_factors[1])); if (!exact_split) { process_factors[idx_neg1] = - cinn::common::AutoSimplify(process_factors[idx_neg1] + Expr(1)); + optim::ArithSimplify(process_factors[idx_neg1] + Expr(1)); } PADDLE_ENFORCE_LE( @@ -218,7 +216,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, substitute_value = Expr(temp_var) + substitute_value * process_factors[i]; new_loop_vars.push_back(temp_var); } - substitute_value = cinn::common::AutoSimplify(substitute_value); + substitute_value = optim::ArithSimplify(substitute_value); Expr new_node = ir::ir_utils::IRCopy(for_node->body); ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value}); std::vector splited_loops; @@ -329,7 +327,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop, substitute_value = Expr(temp_var) + substitute_value * process_factors[i]; new_loop_vars.push_back(temp_var); } - substitute_value = cinn::common::AutoSimplify(substitute_value); + substitute_value = optim::ArithSimplify(substitute_value); Expr new_node = ir::ir_utils::IRCopy(for_node->body); ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value}); std::vector splited_loops; @@ -442,7 +440,7 @@ Expr DyScheduleImpl::Fuse(const std::vector& loops) { for (int i = 0; i < loops_number; ++i) { fused_extent = fused_extent * for_nodes[i]->extent; } - fused_extent = cinn::common::AutoSimplify(fused_extent); + fused_extent = optim::ArithSimplify(fused_extent); if (!fused_body.As()) fused_body = Block::Make({fused_body}); Expr new_stmt = For::Make(fused_var, Expr(0), diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index 316854db08ebed..756b76f271efb7 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -478,8 +478,8 @@ IterRange GetAccessedRange(const Expr& index, ReplaceExpr(&indice_min, iter_vars, var_mins); ReplaceExpr(&indice_max, iter_vars, var_maxs); // simplify expression - indice_min = cinn::common::AutoSimplify(indice_min); - indice_max = cinn::common::AutoSimplify(indice_max); + indice_min = optim::ArithSimplify(indice_min); + indice_max = optim::ArithSimplify(indice_max); Expr indice_extent; Expr mod_extent(0); @@ -487,7 +487,7 @@ IterRange GetAccessedRange(const Expr& index, Expr mod_right_min = indice_min.As()->a(); Expr mod_right_max = indice_max.As()->a(); Expr mod_right_extent = - cinn::common::AutoSimplify(mod_right_max - mod_right_min + 1); + optim::ArithSimplify(mod_right_max - mod_right_min + 1); mod_extent = indice_min.As()->b(); if (mod_right_extent.get_constant() < mod_extent.get_constant()) { mod_extent = mod_right_extent; @@ -502,9 +502,8 @@ IterRange GetAccessedRange(const Expr& index, indice_extent = mod_extent; } } else { - indice_extent = - cinn::common::AutoSimplify(cinn::common::AutoSimplify(indice_max) - - cinn::common::AutoSimplify(indice_min) + 1); + indice_extent = optim::ArithSimplify(optim::ArithSimplify(indice_max) - + optim::ArithSimplify(indice_min) + 1); } if (indice_extent.is_constant() && 
indice_extent.get_constant() < 0) { @@ -650,7 +649,7 @@ Expr MakeCacheBlock(const std::vector& buffer_ranges, cinn::common::UniqName("cache_ax" + std::to_string(loop_vars.size()))); // Var loop_var("ax" + std::to_string(loop_vars.size())); loop_vars.push_back(loop_var); - iter_values.push_back(cinn::common::AutoSimplify(range.min + loop_var)); + iter_values.push_back(optim::ArithSimplify(range.min + loop_var)); } // block variables std::vector block_vars; @@ -681,7 +680,7 @@ Expr MakeCacheBlock(const std::vector& buffer_ranges, for (int i = static_cast(loop_vars.size()) - 1; i >= 0; i--) { new_body = For::Make(loop_vars[i], Expr(0), - cinn::common::AutoSimplify(buffer_ranges[i].extent), + optim::ArithSimplify(buffer_ranges[i].extent), ir::ForType::Serial, device_api, ir::Block::Make({new_body})); @@ -1284,9 +1283,9 @@ void InsertBlock(Expr& for_loop, const Expr& insertion, int index) { // NOLINT } IterRange RangeUnion(const IterRange& range1, const IterRange& range2) { - Expr new_min = cinn::common::AutoSimplify(Min::Make(range1.min, range2.min)); - Expr new_extent = cinn::common::AutoSimplify( - cinn::common::AutoSimplify( + Expr new_min = optim::ArithSimplify(Min::Make(range1.min, range2.min)); + Expr new_extent = optim::ArithSimplify( + optim::ArithSimplify( Max::Make(range1.min + range1.extent, range2.min + range2.extent)) - new_min); return IterRange(new_min, new_extent); diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h index 576a7448147e6e..d0e102b0050751 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.h +++ b/paddle/cinn/ir/schedule/ir_schedule_util.h @@ -555,7 +555,7 @@ struct RfMutator : public ir::IRMutator<> { true, ::common::errors::InvalidArgument( "The rfactor loop's minimum value should be zero.")); - auto extent = cinn::common::AutoSimplify(rf_for->extent); + auto extent = optim::ArithSimplify(rf_for->extent); auto& shape = tensor->shape; auto& domain = tensor->domain; PADDLE_ENFORCE_LE( @@ -673,9 +673,9 @@ struct LoopReconstructor : public ir::IRMutator<> { Var var(var_name, Int(32)); loop_vars.push_back(var); loop_extents.push_back(range.extent); - iter_values.push_back(cinn::common::AutoSimplify(range.min) + var); + iter_values.push_back(optim::ArithSimplify(range.min) + var); } else { - iter_values.push_back(cinn::common::AutoSimplify(range.min)); + iter_values.push_back(optim::ArithSimplify(range.min)); } } auto schedule_block_node = diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc index f05da6ce6dcfeb..8e065541d10407 100644 --- a/paddle/cinn/ir/tensor.cc +++ b/paddle/cinn/ir/tensor.cc @@ -443,8 +443,8 @@ bool _Tensor_::HasSameShapeWith(const Tensor &other) const { if (shape.size() != other->shape.size()) return false; for (int i = 0; i < shape.size(); i++) { - Expr dim0 = cinn::common::AutoSimplify(shape[i]); - Expr dim1 = cinn::common::AutoSimplify(other->shape[i]); + Expr dim0 = optim::ArithSimplify(shape[i]); + Expr dim1 = optim::ArithSimplify(other->shape[i]); if (dim0 != dim1) return false; } diff --git a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc index 4a3c101f3c325f..ea2af9033a5423 100644 --- a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc +++ b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc @@ -136,7 +136,7 @@ CollectLocalVarToIndexes(ir::Expr* expr) { } int ExtractMulNumberFromExpr(const ir::Expr& expr) { - ir::Expr simplied_expr = cinn::common::AutoSimplify(expr); + ir::Expr 
simplied_expr = optim::ArithSimplify(expr); if (simplied_expr.is_constant()) { return static_cast(simplied_expr.get_constant()); } else if (expr.As()) { @@ -151,7 +151,7 @@ int ExtractMulNumberFromExpr(const ir::Expr& expr) { } int ExtractAddNumberFromExpr(const ir::Expr& expr) { - ir::Expr simplied_expr = cinn::common::AutoSimplify(expr); + ir::Expr simplied_expr = optim::ArithSimplify(expr); if (simplied_expr.is_constant()) { return static_cast(simplied_expr.get_constant()); } else if (expr.As()) { @@ -173,7 +173,7 @@ int gcd(int a, int b) { } ir::Expr ExtractSymbolicFromExpr(const ir::Expr& expr) { - ir::Expr simplied_expr = cinn::common::AutoSimplify(expr); + ir::Expr simplied_expr = optim::ArithSimplify(expr); if (simplied_expr.is_constant()) { return ir::Expr(0); } else if (expr.As()) { @@ -210,7 +210,7 @@ struct CommonFactorTrait { static ir::Expr Simplify(const ir::Expr& expr, const ir::Expr& factor) { if (factor != unit) { - return cinn::common::AutoSimplify(ir::Div::Make(expr, factor)); + return optim::ArithSimplify(ir::Div::Make(expr, factor)); } return expr; } @@ -229,7 +229,7 @@ struct CommonFactorTrait { static ir::Expr Simplify(const ir::Expr& expr, const ir::Expr& factor) { if (factor != unit) { - return cinn::common::AutoSimplify(ir::Sub::Make(expr, factor)); + return optim::ArithSimplify(ir::Sub::Make(expr, factor)); } return expr; } @@ -244,7 +244,7 @@ struct CommonFactorTrait { static ir::Expr Calculate(const ir::Expr& expr1, const ir::Expr& expr2) { auto IsSymbolicNotEqual = [&](const ir::Expr& expr1, const ir::Expr& expr2) -> bool { - return cinn::common::AutoSimplify( + return optim::ArithSimplify( ir::Sub::Make(ExtractSymbolicFromExpr(expr1), ExtractSymbolicFromExpr(expr2))) != ir::Expr(0); }; @@ -256,7 +256,7 @@ struct CommonFactorTrait { static ir::Expr Simplify(const ir::Expr& expr, const ir::Expr& factor) { if (factor != unit) { - return cinn::common::AutoSimplify(ir::Sub::Make(expr, factor)); + return optim::ArithSimplify(ir::Sub::Make(expr, factor)); } return expr; } diff --git a/paddle/cinn/optim/eliminate_common_global_memory_read.cc b/paddle/cinn/optim/eliminate_common_global_memory_read.cc index 4af31da3b2ecaa..42bc0805137d3c 100644 --- a/paddle/cinn/optim/eliminate_common_global_memory_read.cc +++ b/paddle/cinn/optim/eliminate_common_global_memory_read.cc @@ -46,7 +46,7 @@ std::unordered_map ConstructForVarReplaceMap( for (const auto& [lhs_var, lhs_extent] : lhs_extents) { for (std::size_t i = 0; i < rhs_extents.size(); ++i) { const auto& [rhs_var, rhs_extent] = rhs_extents[i]; - if (cinn::common::AutoSimplify(ir::Sub::Make(lhs_extent, rhs_extent)) == + if (optim::ArithSimplify(ir::Sub::Make(lhs_extent, rhs_extent)) == ir::Expr(0) && visited_rhs_index.count(i) == 0) { ret[lhs_var] = rhs_var; @@ -88,8 +88,7 @@ struct GlobalTensorInfoCollector : public ir::IRMutator { for (size_t i = 0; i < indice1.size(); ++i) { ir::Expr lhs = IndiceToExprWithForVar(indice1.at(i), for_var_map); ir::Expr rhs = IndiceToExprWithForVar(indice2.at(i), for_var_map); - if (cinn::common::AutoSimplify(ir::Sub::Make(lhs, rhs)) != - ir::Expr(0)) { + if (optim::ArithSimplify(ir::Sub::Make(lhs, rhs)) != ir::Expr(0)) { return false; } } @@ -166,7 +165,7 @@ struct GlobalTensorInfoCollector : public ir::IRMutator { } VLOG(6) << "Iter var name: " << iter_var_name << " with extent: " << iter_var_name_to_extent_.at(iter_var_name); - buffer_size = cinn::common::AutoSimplify(ir::Mul::Make( + buffer_size = optim::ArithSimplify(ir::Mul::Make( buffer_size, 
iter_var_name_to_extent_.at(iter_var_name))); } return buffer_size; @@ -182,7 +181,7 @@ struct GlobalTensorInfoCollector : public ir::IRMutator { CalculateBufferSize(indices_and_extent[0].indices); VLOG(6) << "Global buffer name: " << name << " with size: " << buffer_size; - size = cinn::common::AutoSimplify(ir::Add::Make(size, buffer_size)); + size = optim::ArithSimplify(ir::Add::Make(size, buffer_size)); } if (BufferSizeContainsSymbolic(size)) { VLOG(6) << "Local buffer size contains symbolic: " << size; diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc index 396e4b6e5c0697..7dc54f5b47c1ac 100644 --- a/paddle/cinn/optim/ir_simplify.cc +++ b/paddle/cinn/optim/ir_simplify.cc @@ -132,9 +132,8 @@ struct SimplifyRampMutator : public ir::IRMutator { auto b_ramp = b.As(); if (a_ramp && b_ramp && a_ramp->lanes == b_ramp->lanes) { - Expr base_add = cinn::common::AutoSimplify(a_ramp->base + b_ramp->base); - Expr stride_add = - cinn::common::AutoSimplify(a_ramp->stride + b_ramp->stride); + Expr base_add = optim::ArithSimplify(a_ramp->base + b_ramp->base); + Expr stride_add = optim::ArithSimplify(a_ramp->stride + b_ramp->stride); *expr = ir::Ramp::Make(base_add, stride_add, a_ramp->lanes); } } From 440570ee8aeef4015a2a0ce3ae76f005b0f6dd03 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:29:55 +0800 Subject: [PATCH 14/57] [CINN] Enhance reduce anchor fusion with different flatten axis (#70665) --- .../policy/iters_fusion_policy.cc | 68 ++++++++++++------- .../policy/iters_fusion_policy.h | 2 + test/ir/pir/cinn/test_reduce_fusion.py | 20 ++++++ 3 files changed, 64 insertions(+), 26 deletions(-) diff --git a/paddle/cinn/operator_fusion/policy/iters_fusion_policy.cc b/paddle/cinn/operator_fusion/policy/iters_fusion_policy.cc index c0da3a56ea9a67..f07b57ab596ffd 100644 --- a/paddle/cinn/operator_fusion/policy/iters_fusion_policy.cc +++ b/paddle/cinn/operator_fusion/policy/iters_fusion_policy.cc @@ -154,14 +154,37 @@ std::optional ItersFusionPolicy::GetReuseItersTransform( } } +std::optional ItersFusionPolicy::GetAppendItersTransform( + FusionIters* source_iters, const FusionIters& target_iters) { + const auto target_unique_iters = + GatherFirstNotInSecond(target_iters, *source_iters); + if (!target_unique_iters.empty()) { + if (!transform_strategy_[ItersTransformType::AppendIters] || + !FLAGS_enable_append_iters_in_fusion) { + VLOG(4) << "Can not append iters in fusion, because of AppendIters " + "tranform is disabled."; + return std::nullopt; + } + std::vector append_axis; + std::vector append_symbols; + for (const auto& iter : target_unique_iters) { + const size_t pos = + std::find(target_iters.begin(), target_iters.end(), iter) - + target_iters.begin(); + append_axis.push_back(pos); + append_symbols.push_back(iters_manager_->GetIterSymbol(iter)); + source_iters->insert(source_iters->begin() + pos, iter); + } + return AppendItersTransform(append_axis, append_symbols); + } + return IdentityItersTransform(); +} + std::optional ItersFusionPolicy::SearchTransformRouteFromReduce2Reduce( const FusionItersSignature& source, const FusionItersSignature& target) { VLOG(4) << "Start search transform Route from reduce to reduce."; - if (source.loop_iters.size() == target.loop_iters.size() && - source.reduce_iter_nums == target.reduce_iter_nums) { - // Currently only support fusion with same iter_nums and same reduce axis - // TODO(huangjiyi): Analysis fusion with different non reduce axis + if (source.reduce_iter_nums == 
target.reduce_iter_nums) { auto [source_flatten_iters, source_reduce_iters] = SplitReduceIters(source); auto [target_flatten_iters, target_reduce_iters] = SplitReduceIters(target); @@ -186,6 +209,15 @@ ItersFusionPolicy::SearchTransformRouteFromReduce2Reduce( route.push_back(flatten_reuse_iters_transform.value()); route.push_back(reduce_reuse_iters_transform.value()); + // 2. Apply AppendItersTransform for flatten iters + const auto flatten_append_iters_transform = + GetAppendItersTransform(&source_flatten_iters, target_flatten_iters); + if (flatten_append_iters_transform == std::nullopt) { + return std::nullopt; + } else { + route.push_back(flatten_append_iters_transform.value()); + } + // 2. Apply TransposeItersTransform if (source_flatten_iters == target_flatten_iters && source_reduce_iters == target_reduce_iters) { @@ -317,28 +349,12 @@ std::optional ItersFusionPolicy::SearchItersTransformRoute( // 3. Apply AppendItersTransform // if exist iters in target can not find in source FusionIters appended_source_iters = reused_source_iters; - if (!reused_target_unique_iters.empty()) { - if (!transform_strategy_[ItersTransformType::AppendIters] || - !FLAGS_enable_append_iters_in_fusion) { - VLOG(4) << "Can not append iters in fusion, because of AppendIters " - "tranform is disabled."; - return std::nullopt; - } - std::vector append_axis; - std::vector append_symbols; - for (const auto& iter : reused_target_unique_iters) { - const size_t pos = - std::find(target_iters.begin(), target_iters.end(), iter) - - target_iters.begin(); - append_axis.push_back(pos); - append_symbols.push_back(iters_manager_->GetIterSymbol(iter)); - appended_source_iters.insert(appended_source_iters.begin() + pos, iter); - } - iters_transforms.push_back( - AppendItersTransform(append_axis, append_symbols)); - if (appended_source_iters == target_iters) { - return iters_transforms; - } + const auto append_iters_transform = + GetAppendItersTransform(&appended_source_iters, target_iters); + if (append_iters_transform == std::nullopt) { + return std::nullopt; + } else { + iters_transforms.push_back(append_iters_transform.value()); } VLOG(4) << "source iters after reuse and append: " << PrintFusionIters(appended_source_iters); diff --git a/paddle/cinn/operator_fusion/policy/iters_fusion_policy.h b/paddle/cinn/operator_fusion/policy/iters_fusion_policy.h index dfe4b78030e2b4..636d330e1990b5 100644 --- a/paddle/cinn/operator_fusion/policy/iters_fusion_policy.h +++ b/paddle/cinn/operator_fusion/policy/iters_fusion_policy.h @@ -62,6 +62,8 @@ struct ItersFusionPolicy final : public PolicyBase { private: std::optional GetReuseItersTransform( FusionIters* source_iters, const FusionIters& target_iters); + std::optional GetAppendItersTransform( + FusionIters* source_iters, const FusionIters& target_iters); std::optional SearchTransformRouteFromReduce2Reduce( const FusionItersSignature& source, const FusionItersSignature& target); std::optional SearchItersTransformRoute( diff --git a/test/ir/pir/cinn/test_reduce_fusion.py b/test/ir/pir/cinn/test_reduce_fusion.py index 58a1d9184c42b1..a8f3140672ee04 100644 --- a/test/ir/pir/cinn/test_reduce_fusion.py +++ b/test/ir/pir/cinn/test_reduce_fusion.py @@ -197,6 +197,26 @@ def init(): self.check_accuracy_and_kernel_num(init, func) + def test_reduce_anchor_fusion(self): + # T + # / \ + # R --> T + # / \ + # R --> T + def func(x): + x = x + 1 + a = paddle.max(x, axis=-1, keepdim=True) + b = x + a + c = paddle.max(b, axis=-1, keepdim=True) + d = c + b + return d + + def init(): + x = 
paddle.rand((1, 32, 4, 8), dtype='float32') + return (x,) + + self.check_accuracy_and_kernel_num(init, func, kernel_num=1) + if __name__ == "__main__": unittest.main() From 961393d766fefdb85b322de86369adb6bb8001c9 Mon Sep 17 00:00:00 2001 From: fangfangssj <99968055+fangfangssj@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:31:42 +0800 Subject: [PATCH 15/57] add support complex (#70635) --- .../phi/kernels/cpu/activation_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/activation_kernel.cc | 2 +- paddle/phi/kernels/funcs/activation_functor.h | 34 +++++++++++++++++ .../phi/kernels/gpu/activation_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/activation_kernel.cu | 2 +- python/paddle/tensor/ops.py | 2 + test/legacy_test/test_activation_op.py | 37 +++++++++++++++---- 7 files changed, 69 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index b8ced8d4defe2f..2e95e70a9c5a2a 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -307,7 +307,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(stanh_grad, STanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(reciprocal_grad, ReciprocalGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sqrt_grad, SqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softplus_grad, SoftplusGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 9db1466b4b7ae6..1ac2ed0f1a26b8 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -212,7 +212,7 @@ PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(silu, SiluKernel) PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(stanh, STanhKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(reciprocal, ReciprocalKernel) -PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sqrt, SqrtKernel) PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(softplus, SoftplusKernel) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 9e02d9ae860ba5..57f3e08121c545 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -759,6 +759,24 @@ struct SqrtGradFunctor : public BaseActivationFunctor { } }; +template +struct SqrtGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * (static_cast>(0.5) / out).unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + // rsqrt(x) = x^(-1/2) template struct RsqrtFunctor : public BaseActivationFunctor { @@ -4050,6 +4068,22 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { } }; +template +struct CudaSqrtGradFunctor> + : public BaseActivationFunctor> { + ComplexType one_half = static_cast>(0.5f); + + // dx = dout * 0.5 / out + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType out) const { + return dout * conj(one_half / out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + 
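// Editorial note: the complex sqrt gradient above is expressed through the
// forward output (dx = dout * conj(0.5 / out)), so the backward kernel only
// needs `out`; the kDepOut value returned below declares that dependency.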
return ActBwdOpFwdDeps::kDepOut; + } +}; + template struct CudaRsqrtFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index ecfd46852c1343..602a4b8f2dd617 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -386,7 +386,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softplus_grad, SoftplusGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softplus_double_grad, SoftplusDoubleGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sqrt_grad, SqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_double_grad, SqrtDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_double_grad, RsqrtDoubleGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 0ad0cb9f8c8f6c..3afc392a01497d 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -261,7 +261,7 @@ PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(stanh, StanhKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(reciprocal, ReciprocalKernel) -PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sqrt, SqrtKernel) PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(softplus, SoftplusKernel) diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index 7d48614d176295..28fbfe7c277cf9 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -1114,6 +1114,8 @@ def sqrt(x: Tensor, name: str | None = None) -> Tensor: 'int16', 'int32', 'int64', + 'complex64', + 'complex128', ], 'sqrt', ) diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index ad556f57af1c6b..16515942aaf4f4 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -1667,6 +1667,11 @@ def setUp(self): np.random.seed(1023) x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + if self.dtype == np.complex64 or self.dtype == np.complex128: + x = ( + np.random.uniform(-1, 1, self.shape) + + 1j * np.random.uniform(-1, 1, self.shape) + ).astype(self.dtype) out = np.sqrt(x) self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} @@ -1679,14 +1684,20 @@ def if_enable_cinn(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad( - ['X'], - 'Out', - check_prim=True, - check_pir=True, - check_prim_pir=True, - check_pir_onednn=self.check_pir_onednn, - ) + if self.dtype not in [np.complex64, np.complex128]: + self.check_grad( + ['X'], + 'Out', + check_prim=True, + check_pir=True, + check_prim_pir=True, + check_pir_onednn=self.check_pir_onednn, + ) + else: + self.check_grad( + ['X'], + 'Out', + ) def test_check_output(self): self.check_output( @@ -1746,6 +1757,16 @@ def init_shape(self): self.shape = [] +class TestSqrt_Complex64(TestSqrt): + def init_dtype(self): + self.dtype = np.complex64 + + +class TestSqrt_Complex128(TestSqrt): + def init_dtype(self): + self.dtype = np.complex128 + + @unittest.skipIf( not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), "core is not 
compiled with CUDA", From e40f1da13d0e264283e371dfc60a89f029d314b7 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:10:18 +0800 Subject: [PATCH 16/57] disable pattern match one log (#70669) --- paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc b/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc index 57754f583b0450..cbac44b94a4517 100644 --- a/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc +++ b/paddle/pir/src/pattern_rewrite/pattern_rewrite_driver.cc @@ -231,7 +231,7 @@ std::pair ApplyPatternsGreedily( GreedyPatternRewriteDriver driver(region.ir_context(), patterns, config); auto [converged, num_rewrites] = driver.Simplify(); - if (!converged) { + if (!converged && config.max_iterations != 1) { LOG(WARNING) << "The pattern rewrite did not converge after scanning " << config.max_iterations << " times"; } From 05f3be6a5d5814e6c19609fb05d3adbbc1b37334 Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 8 Jan 2025 13:30:55 +0800 Subject: [PATCH 17/57] [fluid_ops]Modify c_allreduce_sum in collective_allreduce_op_wait.py (#70671) --- test/collective/collective_allreduce_op_wait.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test/collective/collective_allreduce_op_wait.py b/test/collective/collective_allreduce_op_wait.py index 9d677211e71449..5c020fb4e45f3e 100644 --- a/test/collective/collective_allreduce_op_wait.py +++ b/test/collective/collective_allreduce_op_wait.py @@ -70,11 +70,13 @@ def get_model(self, main_prog, startup_program, dtype="float32"): ) main_prog.global_block().append_op( - type="c_allreduce_sum", - inputs={'X': toutdata}, - attrs={'ring_id': ring_id}, - outputs={'Out': toutdata}, - attr={'use_calc_stream': False}, + type="all_reduce", + inputs={'x': toutdata}, + attrs={ + 'ring_id': ring_id, + 'reduce_type': paddle.distributed.ReduceOp.SUM, + }, + outputs={'out': toutdata}, ) main_prog.global_block().append_op( From cdebfcdb9b930721e41de8b0e9c084c926ebddf9 Mon Sep 17 00:00:00 2001 From: AIbin <37361953+chang-wenbin@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:13:43 +0800 Subject: [PATCH 18/57] update return isinstance(args, (list, tuple)) (#70657) [inference] support jit.inference input is tuple(Paddle.Tensor) --- python/paddle/incubate/jit/inference_decorator.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/paddle/incubate/jit/inference_decorator.py b/python/paddle/incubate/jit/inference_decorator.py index a162489a971b81..4aa3b028a04634 100644 --- a/python/paddle/incubate/jit/inference_decorator.py +++ b/python/paddle/incubate/jit/inference_decorator.py @@ -74,11 +74,15 @@ def is_fixed_type(input): return False +def is_list_or_tuple(args): + return isinstance(args, (list, tuple)) + + # get paddle.Tensor for paddle inference use. 
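# Editorial sketch of the helper below (t0, t1 are hypothetical Tensors):
#   get_tensor(t0, "x")         -> [t0]
#   get_tensor([t0, t1], "xs")  -> [t0, t1]
#   get_tensor((t0, t1), "xs")  -> [t0, t1]   # tuples now take the list path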
def get_tensor(run_time_args, arg_name): if isinstance(run_time_args, paddle.Tensor): return [run_time_args] - elif isinstance(run_time_args, list): + elif is_list_or_tuple(run_time_args): this_input_tensor_lists = [] for ele in run_time_args: assert isinstance( @@ -90,7 +94,7 @@ def get_tensor(run_time_args, arg_name): return [run_time_args] else: raise AssertionError( - f'''we only support adding paddle.incubate.jit.inference() in functions whose arguments are paddle.Tensor or list[paddle.Tensor] or None, + f'''we only support adding paddle.incubate.jit.inference() in functions whose arguments are paddle.Tensor or list[paddle.Tensor] & tuple[paddle.Tensor] or None, but here we get {arg_name} in your function is {type(run_time_args)}, please modify your function to meet our requirement.''' ) @@ -99,7 +103,7 @@ def get_tensor(run_time_args, arg_name): def get_d2s_spec(run_time_args, name): if isinstance(run_time_args, paddle.Tensor): return InputSpec.from_tensor(run_time_args, name=name) - elif isinstance(run_time_args, list): + elif is_list_or_tuple(run_time_args): this_input_spec = [] suffix = 0 for ele in run_time_args: @@ -273,7 +277,7 @@ def forward(self, args): input_specs.append(this_input) for i in range(len(input_specs)): - if isinstance(input_specs[i], list): + if is_list_or_tuple(input_specs[i]): for j in range(len(input_specs[i])): input_specs[i][j].stop_gradient = True elif isinstance(input_specs[i], paddle.static.InputSpec): @@ -285,7 +289,7 @@ def forward(self, args): if len(self.d2s_input_names) == 0: self.d2s_input_names.extend([None] * len(input_tensor_lists)) for i in range(len(input_specs)): - if isinstance(input_specs[i], list): + if is_list_or_tuple(input_specs[i]): for j in range(len(input_specs[i])): input_specs[i][j].shape = self.d2s_input_shapes[ d2s_shapes_id From be71e9766bde4d455d8955f99c7cf6ce5a43746c Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 8 Jan 2025 14:50:58 +0800 Subject: [PATCH 19/57] Remove Wait in if_instruction (#70599) --- .../new_executor/instruction/control_flow/if_instruction.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc index 1b1231359fe833..01b97bf9bb12a5 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc @@ -226,7 +226,6 @@ void IfInstruction::Run() { // phi::is_xpu_place(cond.place()) is true #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) - DeviceContext().Wait(); phi::DenseTensor cpu_cond; paddle::framework::TensorCopySync( cond_tensor, phi::CPUPlace(), &cpu_cond); From fe00bd8e47f6ba8721062a5ab724d439baa1325c Mon Sep 17 00:00:00 2001 From: bukejiyu <52310069+bukejiyu@users.noreply.github.com> Date: Wed, 8 Jan 2025 15:21:14 +0800 Subject: [PATCH 20/57] [Infrence]Temporarily disable AVX kernel inlining for GCC12 (#70603) --- cmake/simd.cmake | 10 ++++++++++ paddle/phi/CMakeLists.txt | 5 +++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 500e8c234407ff..119d9e91cacdeb 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -13,6 +13,11 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") set(AVX512F_FLAG "-mavx512f") set(Wno_Maybe_Uninitialized "-Wno-maybe-uninitialized") set(FMA_FLAG "-mfma") + 
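# Editor's note: NO_INLINE, defined in the block below for compiler versions
# >= 12, is later appended to the COMPILE_FLAGS of the AVX fusion kernel
# sources in paddle/phi/CMakeLists.txt, so only those translation units are
# built without inlining rather than the whole target.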
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0) + set(NO_INLINE "-fno-inline") + else() + set(NO_INLINE "") + endif() elseif(MSVC) set(MMX_FLAG "/arch:MMX") set(SSE2_FLAG "/arch:SSE2") @@ -22,6 +27,11 @@ elseif(MSVC) set(AVX512F_FLAG "/arch:AVX512") set(Wno_Maybe_Uninitialized "/wd4701") set(FMA_FLAG "/arch:AVX2") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0) + set(NO_INLINE "/Ob0") + else() + set(NO_INLINE "") + endif() endif() set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 6162de3b58cd8b..6a17e55d9bcb94 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -141,8 +141,9 @@ if(WITH_AVX kernels/fusion/cpu/fused_layer_norm_avx_kernel.cc kernels/fusion/cpu/self_dp_attention_kernel.cc kernels/fusion/cpu/rms_norm_avx_kernel.cc - PROPERTIES COMPILE_FLAGS - "${Wno_Maybe_Uninitialized} ${FMA_FLAG} ${AVX512F_FLAG}") + PROPERTIES + COMPILE_FLAGS + "${Wno_Maybe_Uninitialized} ${FMA_FLAG} ${AVX512F_FLAG} ${NO_INLINE}") endif() if(WITH_GPU) From afcd24b0af4bdafed6f5e946c43e45519334e008 Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Wed, 8 Jan 2025 15:23:39 +0800 Subject: [PATCH 21/57] add local layer api (#70600) * add local layer api * add doc and example codes --------- Co-authored-by: andsonder --- python/paddle/distributed/__init__.py | 2 + .../distributed/auto_parallel/local_layer.py | 95 +++++++++++++++++++ test/auto_parallel/pir/CMakeLists.txt | 2 + test/auto_parallel/pir/local_layer_demo.py | 64 +++++++++++++ test/auto_parallel/pir/test_local_layer.py | 42 ++++++++ 5 files changed, 205 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/local_layer.py create mode 100644 test/auto_parallel/pir/local_layer_demo.py create mode 100644 test/auto_parallel/pir/test_local_layer.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index ac0cf6ba3eac9e..61bd791948bdc6 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -59,6 +59,7 @@ SequenceParallelEnable, SequenceParallelEnd, ) +from .auto_parallel.local_layer import LocalLayer from .auto_parallel.placement_type import ( Partial, Replicate, @@ -190,6 +191,7 @@ "to_static", "Strategy", "DistModel", + "LocalLayer", "unshard_dtensor", "parallelize", "SequenceParallelEnd", diff --git a/python/paddle/distributed/auto_parallel/local_layer.py b/python/paddle/distributed/auto_parallel/local_layer.py new file mode 100644 index 00000000000000..63b114a2c84946 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/local_layer.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
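# Editorial cross-reference: test/auto_parallel/pir/local_layer_demo.py (added
# later in this patch) exercises the class below end to end: a LocalLayer
# subclass wraps nn.MSELoss and declares its output Partial(kRedSum) so that
# per-rank losses are reduce-summed across the mesh.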
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import paddle
+import paddle.distributed as dist
+from paddle.nn import Layer
+
+if TYPE_CHECKING:
+    from paddle.distributed import Placement
+    from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
+
+
+class LocalLayer(Layer):
+    """
+    The `LocalLayer` class is a specialized `Layer` for managing distributed tensors during
+    forward and backward passes in a parallelized training environment. It converts distributed tensors
+    to local tensors for computation and then back to distributed tensors as output, ensuring seamless
+    integration with distributed parallelism frameworks.
+
+    Args:
+        out_dist_attrs (list[tuple[ProcessMesh, list[Placement]]]):
+            A list where each entry is a tuple containing the `ProcessMesh` and the list of `Placement`
+            attributes for the corresponding output tensors. These attributes define the distribution
+            strategy for the outputs.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.distributed as dist
+            from paddle import nn
+
+            class CustomLayer(dist.LocalLayer):
+                def __init__(self, mesh):
+                    super().__init__(
+                        out_dist_attrs=[(mesh, [dist.Partial(dist.ReduceType.kRedSum)])]
+                    )
+                    self.fc = nn.Linear(16, 8)
+
+                def forward(self, x):
+                    return self.fc(x)
+
+            # doctest: +REQUIRES(env:DISTRIBUTED)
+            mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+            custom_layer = CustomLayer(mesh)
+            input_tensor = dist.auto_parallel.api.dtensor_from_local(
+                paddle.randn([4, 16]), mesh, [dist.Replicate()]
+            )
+
+            output_tensor = custom_layer(input_tensor)
+            print(output_tensor)
+    """
+
+    def __init__(
+        self, out_dist_attrs: list[tuple[ProcessMesh, list[Placement]]]
+    ):
+        super().__init__()
+        self.out_dist_attrs = out_dist_attrs
+
+    def __call__(self, *inputs: Any, **kwargs: Any) -> Any:
+        """
+        Overrides the base `Layer`'s `__call__` method. Transforms distributed tensors to local tensors
+        before computation, invokes the parent class's `__call__` method, and then transforms the
+        outputs back to distributed tensors based on the specified distribution attributes.
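+
+        Note (editorial): as implemented below, flattened output ``i`` is
+        re-wrapped with the ``i``-th entry of ``out_dist_attrs``, so a
+        subclass must declare one (ProcessMesh, placements) pair per output
+        tensor.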
+ """ + inputs = list(inputs) + for idx in range(len(inputs)): + if inputs[idx].is_dist(): + inputs[idx] = dist.auto_parallel.api.dtensor_to_local( + inputs[idx] + ) + outputs = Layer.__call__(self, *inputs, **kwargs) + list_outs = paddle.utils.flatten(outputs) + for idx in range(len(list_outs)): + list_outs[idx] = dist.auto_parallel.api.dtensor_from_local( + list_outs[idx], + self.out_dist_attrs[idx][0], + self.out_dist_attrs[idx][1], + ) + return paddle.utils.pack_sequence_as(outputs, list_outs) diff --git a/test/auto_parallel/pir/CMakeLists.txt b/test/auto_parallel/pir/CMakeLists.txt index 06172a555fef90..71a649276240bd 100644 --- a/test/auto_parallel/pir/CMakeLists.txt +++ b/test/auto_parallel/pir/CMakeLists.txt @@ -17,6 +17,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_pir_reshard_s_to_r MODULES test_pir_reshard_s_to_r) set_tests_properties(test_pir_reshard_s_to_r PROPERTIES TIMEOUT 120) py_test_modules(test_mlp MODULES test_mlp ENVS FLAGS_enable_pir_api=1) + py_test_modules(test_local_layer MODULES test_local_layer ENVS + FLAGS_enable_pir_api=1) py_test_modules( test_semi_auto_parallel_dist_to_static_pir MODULES test_semi_auto_parallel_dist_to_static_pir ENVS FLAGS_enable_pir_api=1) diff --git a/test/auto_parallel/pir/local_layer_demo.py b/test/auto_parallel/pir/local_layer_demo.py new file mode 100644 index 00000000000000..be66d50fb8aa48 --- /dev/null +++ b/test/auto_parallel/pir/local_layer_demo.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from test_to_static_pir_program import ( + DemoNet, + create_data_loader, +) + +import paddle +import paddle.distributed as dist +from paddle import nn + +BATCH_SIZE = 4 +BATCH_NUM = 40 +IMAGE_SIZE = 16 +CLASS_NUM = 8 +np.random.seed(2025) +paddle.seed(2025) + + +class LocalLossLayer(dist.LocalLayer): + def __init__(self, mesh): + super().__init__( + out_dist_attrs=[(mesh, [dist.Partial(dist.ReduceType.kRedSum)])] + ) + self.loss = nn.MSELoss() + + def forward(self, input, label): + return self.loss(input, label) + + +class TestMLPTensorParallel(unittest.TestCase): + def test_to_static_program(self): + mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + mp_layer = DemoNet(mesh) + opt = paddle.optimizer.SGD( + learning_rate=0.1, parameters=mp_layer.parameters() + ) + loss_fn = LocalLossLayer(mesh) + loader = create_data_loader() + dist_loader = dist.shard_dataloader(loader, meshes=[mesh]) + dist_model = dist.to_static(mp_layer, dist_loader, loss_fn, opt) + + dist_model.train() + for batch_id, (image, label) in enumerate(dist_loader()): + loss = dist_model(image, label) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/pir/test_local_layer.py b/test/auto_parallel/pir/test_local_layer.py new file mode 100644 index 00000000000000..ddd5afa52e13cc --- /dev/null +++ b/test/auto_parallel/pir/test_local_layer.py @@ -0,0 +1,42 @@ +# Copyright (c) 2025 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import collective.test_communication_api_base as test_base + + +class TestLocalLayer(test_base.CommunicationTestDistBase): + def setUp(self): + super().setUp( + num_of_devices=2, + timeout=300, + ) + self._default_envs = {"dtype": "float32", "seed": "2023"} + self._changeable_envs = {"backend": ["gpu"]} + + def test_local_layer(self): + envs_list = test_base.gen_product_envs_list( + {"dtype": "float32", "seed": "2023"}, {"backend": ["gpu"]} + ) + # self._log_dir.name = "./log" + for envs in envs_list: + self.run_test_case( + "local_layer_demo.py", + user_defined_envs=envs, + ) + + +if __name__ == "__main__": + unittest.main() From 2834e1e0300bbe575e4b0448b01174d57c28f19d Mon Sep 17 00:00:00 2001 From: Zhou Xin Date: Wed, 8 Jan 2025 15:50:10 +0800 Subject: [PATCH 22/57] [CINN][Backend Pass Update No.1] Update EliminateCommonFactorOfLocalIndex pass (#70619) * Add comment for eliminateCommonFactorOfLocalIndex, test=document_fix * Update eliminateCommonFactorOfLocalIndex --- .../eliminate_common_factor_of_local_index.cc | 329 +++++++++++++----- .../eliminate_common_factor_of_local_index.h | 8 +- paddle/cinn/optim/transform_gpu_forloop.cc | 7 +- 3 files changed, 249 insertions(+), 95 deletions(-) diff --git a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc index ea2af9033a5423..3eaa1723f46179 100644 --- a/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc +++ b/paddle/cinn/optim/eliminate_common_factor_of_local_index.cc @@ -27,32 +27,24 @@ namespace cinn { namespace optim { namespace { +using ir::Expr; -class GatherLocalIndexVisitor : public ir::IRMutator<> { +class GatherLocalIndexAndProhibitedLocalVarVisitor + : public ir::IRMutator<>, + public ir::stmt::StmtVisitor<> { public: - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + void operator()(ir::stmt::BlockRef func_body) { VisitBlock(func_body); } const std::unordered_map>>& local_var_to_indexes() const { return local_var_to_indexes_; } - private: - void Visit(const ir::Store* op, Expr* expr) override { - auto store = expr->As(); - - ir::IRMutator<>::Visit(op, expr); - if (!store->tensor.as_tensor_ref()->buffer.defined()) { - return; - } - - if (store->tensor.as_tensor_ref()->buffer->memory_type == - ir::MemoryType::GPULocal) { - local_var_to_indexes_[store->tensor.as_tensor_ref()->buffer->name] - .push_back(store->indices); - } + const std::unordered_set& prohibited_local_vars() const { + return prohibited_local_vars_; } + private: void Visit(const ir::Load* op, Expr* expr) override { auto load = expr->As(); @@ -71,40 +63,81 @@ class GatherLocalIndexVisitor : public ir::IRMutator<> { ir::IRMutator<>::Visit(op, expr); } - std::unordered_map>> - local_var_to_indexes_; -}; - -class GatherProhibitedLocalVarVisitor : public ir::IRMutator<> { - public: - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - const std::unordered_set& 
prohibited_local_vars() const { - return prohibited_local_vars_; + void Visit(const Expr& expr) { + Expr expr_ = expr; + ir::IRMutator<>::Visit(&expr_, &expr_); } - private: - void Visit(const ir::Store* op, Expr* expr) override { - auto store = expr->As(); + void VisitStmt(const ir::stmt::Store& stmt) override { + Visit(stmt->value()); - ir::IRMutator<>::Visit(op, expr); - if (!store->tensor.as_tensor_ref()->buffer.defined()) { + if (!stmt->tensor().as_tensor_ref()->buffer.defined()) { return; } - if (store->tensor.as_tensor_ref()->buffer->memory_type != + + if (stmt->tensor().as_tensor_ref()->buffer->memory_type == ir::MemoryType::GPULocal) { - return; - } - const auto& local_var_name = store->tensor.as_tensor_ref()->buffer->name; - if (store->value.As()) { - const auto& call_name = store->value.As()->name; - if (cinn::utils::GetProhibitScheduleExternalFuncNames().count(call_name) > - 0) { - prohibited_local_vars_.insert(local_var_name); + local_var_to_indexes_[stmt->tensor().as_tensor_ref()->buffer->name] + .push_back(stmt->indices()); + + if (stmt->value().As()) { + const std::string& local_var_name = + stmt->tensor().as_tensor_ref()->buffer->name; + const std::string& call_name = stmt->value().As()->name; + if (cinn::utils::GetProhibitScheduleExternalFuncNames().count( + call_name) > 0) { + prohibited_local_vars_.insert(local_var_name); + } } } } + void VisitStmt(const ir::stmt::IfThenElse& stmt) override { + Visit(stmt->condition()); + VisitBlock(stmt->true_case()); + if (stmt->false_case().defined()) { + VisitBlock(stmt->false_case()); + } + } + + void VisitStmt(const ir::stmt::Schedule& stmt) override { + for (const Expr& value : stmt->iter_values()) { + Visit(value); + } + VisitBlock(stmt->body()); + } + + void VisitStmt(const ir::stmt::For& stmt) override { + Visit(stmt->min()); + Visit(stmt->extent()); + VisitBlock(stmt->body()); + } + + void VisitStmt(const ir::stmt::Alloc& stmt) override { + for (const Expr& extent : stmt->extents()) { + Visit(extent); + } + if (stmt->condition().defined()) { + Visit(stmt->condition()); + } + if (stmt->body().defined()) { + Visit(stmt->body()); + } + } + + void VisitStmt(const ir::stmt::Evaluate& stmt) override { + Visit(stmt->value()); + } + + void VisitStmt(const ir::stmt::Free& stmt) override { + Visit(stmt->destination()); + } + + void VisitStmt(const ir::stmt::Let& stmt) override { Visit(stmt->body()); } + + private: + std::unordered_map>> + local_var_to_indexes_; std::unordered_set prohibited_local_vars_; }; @@ -123,16 +156,12 @@ EraseProhibitedLocalVar( } std::unordered_map>> -CollectLocalVarToIndexes(ir::Expr* expr) { - GatherLocalIndexVisitor gather_local_index_visitor; - gather_local_index_visitor(expr); +CollectLocalVarToIndexes(ir::stmt::BlockRef func_body) { + GatherLocalIndexAndProhibitedLocalVarVisitor gather; + gather(func_body); - GatherProhibitedLocalVarVisitor gather_prohibited_local_var_visitor; - gather_prohibited_local_var_visitor(expr); - - return EraseProhibitedLocalVar( - gather_local_index_visitor.local_var_to_indexes(), - gather_prohibited_local_var_visitor.prohibited_local_vars()); + return EraseProhibitedLocalVar(gather.local_var_to_indexes(), + gather.prohibited_local_vars()); } int ExtractMulNumberFromExpr(const ir::Expr& expr) { @@ -284,11 +313,12 @@ std::vector CalculateIndexCommonFactor( "We should guarantee indexes.size() >= 2, because local variable " "should at least load and store once. 
")); for (std::size_t i = 1; i < indexes.size(); ++i) { - // NOTE(Hongyu Jia): Ideally, we can guarantee the size of indexes are equal - // However, some unit tests (e.g. test_resnet_cinn, test_instance_norm_op - // are still running with the deprecated OpScheduler, and the ir::Expr - // will break this guarantee after IRGpuScheduleBlockReduce function. - // So we have to relax the restriction here. + // NOTE(Hongyu Jia): Ideally, we can guarantee the size of indexes are + // equal However, some unit tests (e.g. test_resnet_cinn, + // test_instance_norm_op are still running with the deprecated + // OpScheduler, and the ir::Expr will break this guarantee after + // IRGpuScheduleBlockReduce function. So we have to relax the restriction + // here. if (indexes[i].size() != indexes[0].size()) { LOG(WARNING) << "Not supported for calculating common factor, local var = " @@ -330,14 +360,15 @@ CalculateLocalVarCommonFactor( } template -class EliminateCommonFactorVisitor : public ir::IRMutator<> { +class EliminateCommonFactorVisitor : public ir::IRMutator<>, + public ir::stmt::StmtMutator<> { public: EliminateCommonFactorVisitor( const std::unordered_map>& local_var_to_common_factor) : local_var_to_common_factor_(local_var_to_common_factor) {} - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + void operator()(ir::stmt::BlockRef func_body) { VisitBlock(func_body); } private: void Visit(const ir::Store* op, Expr* expr) override { @@ -386,27 +417,106 @@ class EliminateCommonFactorVisitor : public ir::IRMutator<> { } ir::IRMutator<>::Visit(op, expr); } + + void Visit(const Expr& expr) { + Expr expr_ = expr; + ir::IRMutator<>::Visit(&expr_, &expr_); + } + + void VisitStmt(ir::stmt::Store stmt) override { + Visit(stmt->value()); + const auto& store_buffer = stmt->tensor().as_tensor_ref()->buffer; + + if (!store_buffer.defined()) { + return; + } + + if (store_buffer->memory_type == ir::MemoryType::GPULocal) { + if (local_var_to_common_factor_.count(store_buffer->name) == 0) { + return; + } + const auto& common_factors = + local_var_to_common_factor_.at(store_buffer->name); + for (std::size_t i = 0; i < stmt->indices().size(); ++i) { + std::vector new_indices = stmt->indices(); + new_indices[i] = + CommonFactorTrait::Simplify(new_indices[i], common_factors[i]); + stmt->set_indices(new_indices); + } + } + } + + void VisitStmt(ir::stmt::IfThenElse stmt) override { + Visit(stmt->condition()); + VisitBlock(stmt->true_case()); + if (stmt->false_case().defined()) { + VisitBlock(stmt->false_case()); + } + } + + void VisitStmt(ir::stmt::Schedule stmt) override { + for (const Expr& value : stmt->iter_values()) { + Visit(value); + } + VisitBlock(stmt->body()); + } + + void VisitStmt(ir::stmt::For stmt) override { + Visit(stmt->min()); + Visit(stmt->extent()); + VisitBlock(stmt->body()); + } + + void VisitStmt(ir::stmt::Alloc stmt) override { + for (const Expr& extent : stmt->extents()) { + Visit(extent); + } + if (stmt->condition().defined()) { + Visit(stmt->condition()); + } + if (stmt->body().defined()) { + Visit(stmt->body()); + } + } + + void VisitStmt(ir::stmt::Evaluate stmt) override { Visit(stmt->value()); } + + void VisitStmt(ir::stmt::Free stmt) override { Visit(stmt->destination()); } + + void VisitStmt(ir::stmt::Let stmt) override { Visit(stmt->body()); } + + private: std::unordered_map> local_var_to_common_factor_; }; } // namespace +// Eliminate common factors from local indices in a function's body. 
+// If applied to various statement blocks, this may incorrectly simplify
+// distinct local buffer indices across different statement blocks to the same
+// value.
 template <typename CommonFactorTrait>
-void EliminateCommonFactorHelper(ir::Expr* expr) {
+void EliminateCommonFactorHelper(ir::stmt::BlockRef func_body) {
   std::unordered_map<std::string, std::vector<std::vector<ir::Expr>>>
-      local_var_to_indexes = CollectLocalVarToIndexes(expr);
+      local_var_to_indexes = CollectLocalVarToIndexes(func_body);
   std::unordered_map<std::string, std::vector<ir::Expr>>
       local_var_to_common_factor =
           CalculateLocalVarCommonFactor(local_var_to_indexes);
   EliminateCommonFactorVisitor<CommonFactorTrait>
       eliminate_common_factor_visitor(local_var_to_common_factor);
-  eliminate_common_factor_visitor(expr);
+  eliminate_common_factor_visitor(func_body);
 }

-class TransformLocalIndicesVisitor : public ir::IRMutator<> {
+class TransformLocalIndicesVisitor : public ir::IRMutator<>,
+                                     public ir::stmt::StmtMutator<> {
  public:
-  void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
+  void operator()(ir::stmt::BlockRef func_body) { VisitBlock(func_body); }

  private:
   template
@@ -463,30 +573,14 @@ class TransformLocalIndicesVisitor : public ir::IRMutator<> {
   };

   std::unordered_map<std::string, ir::Var> name_to_iter;
-  for (const auto& indice : indices) {
-    ExtractIterFromIndice(indice, &name_to_iter);
-    VLOG(6) << "extract iter: " << indice
+  for (const auto& index : indices) {
+    ExtractIterFromIndice(index, &name_to_iter);
+    VLOG(6) << "extract iter: " << index
             << " iter_set size: " << name_to_iter.size();
   }
   return CopyIndiceItersToLocalBuffer(name_to_iter, indices);
 }

-  void Visit(const ir::For* op, ir::Expr* expr) override {
-    auto* for_ir = expr->As<ir::For>();
-    loop_vars_.push_back(for_ir->loop_var);
-    IRMutator<>::Visit(op, expr);
-    loop_vars_.pop_back();
-  }
-
-  void Visit(const ir::Store* op, ir::Expr* expr) override {
-    auto store = expr->As<ir::Store>();
-    if (store->tensor.as_tensor_ref()->buffer->memory_type ==
-        ir::MemoryType::GPULocal) {
-      store->indices = ConvertIndicesToIters(store->indices);
-    }
-    ir::IRMutator<>::Visit(op, expr);
-  }
-
   void Visit(const ir::Load* op, ir::Expr* expr) override {
     auto load = expr->As<ir::Load>();
     if (load->tensor.as_tensor_ref()->buffer->memory_type ==
         ir::MemoryType::GPULocal) {
       load->indices = ConvertIndicesToIters(load->indices);
     }
     ir::IRMutator<>::Visit(op, expr);
   }

+  void Visit(const Expr& expr) {
+    Expr expr_ = expr;
+    ir::IRMutator<>::Visit(&expr_, &expr_);
+  }
+
+  void VisitStmt(ir::stmt::Store stmt) override {
+    if (stmt->tensor().as_tensor_ref()->buffer->memory_type ==
+        ir::MemoryType::GPULocal) {
+      stmt->set_indices(ConvertIndicesToIters(stmt->indices()));
+    }
+    Visit(stmt->value());
+  }
+
+  void VisitStmt(ir::stmt::IfThenElse stmt) override {
+    Visit(stmt->condition());
+    VisitBlock(stmt->true_case());
+    if (stmt->false_case().defined()) {
+      VisitBlock(stmt->false_case());
+    }
+  }
+
+  void VisitStmt(ir::stmt::Schedule stmt) override {
+    for (const Expr& value : stmt->iter_values()) {
+      Visit(value);
+    }
+    VisitBlock(stmt->body());
+  }
+
+  void VisitStmt(ir::stmt::For stmt) override {
+    Visit(stmt->min());
+    Visit(stmt->extent());
+    loop_vars_.push_back(stmt->loop_var());
+    VisitBlock(stmt->body());
+    loop_vars_.pop_back();
+  }
+
+  void VisitStmt(ir::stmt::Alloc stmt) override {
+    for (const Expr& extent : stmt->extents()) {
+      Visit(extent);
+    }
+    if (stmt->condition().defined()) {
+      Visit(stmt->condition());
+    }
+    if
(stmt->body().defined()) { + Visit(stmt->body()); + } + } + + void VisitStmt(ir::stmt::Evaluate stmt) override { Visit(stmt->value()); } + + void VisitStmt(ir::stmt::Free stmt) override { Visit(stmt->destination()); } + + void VisitStmt(ir::stmt::Let stmt) override { Visit(stmt->body()); } + + private: std::vector loop_vars_; }; -void TransformLocalIndicesToIters(ir::Expr* expr) { +void TransformLocalIndicesToIters(ir::stmt::BlockRef func_body) { TransformLocalIndicesVisitor transform_local_indices_visitor; - transform_local_indices_visitor(expr); + transform_local_indices_visitor(func_body); } -void EliminateCommonFactorOfLocalIndex(ir::Expr* expr) { - VLOG(4) << "Before EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; - EliminateCommonFactorHelper(expr); - EliminateCommonFactorHelper(expr); - EliminateCommonFactorHelper(expr); +void EliminateCommonFactorOfLocalIndex(ir::stmt::BlockRef func_body) { + VLOG(4) << "Before EliminateCommonFactorOfLocalIndex, func_body = \n" + << func_body; + EliminateCommonFactorHelper(func_body); + EliminateCommonFactorHelper(func_body); + EliminateCommonFactorHelper(func_body); - TransformLocalIndicesToIters(expr); + TransformLocalIndicesToIters(func_body); - VLOG(4) << "After EliminateCommonFactorOfLocalIndex, Expr = \n" << *expr; + VLOG(4) << "After EliminateCommonFactorOfLocalIndex, func_body = \n" + << func_body; } } // namespace optim diff --git a/paddle/cinn/optim/eliminate_common_factor_of_local_index.h b/paddle/cinn/optim/eliminate_common_factor_of_local_index.h index e85cfae242a2fd..c7bd9c22524413 100644 --- a/paddle/cinn/optim/eliminate_common_factor_of_local_index.h +++ b/paddle/cinn/optim/eliminate_common_factor_of_local_index.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/stmt.h" namespace cinn { namespace optim { @@ -40,9 +40,9 @@ namespace optim { * e.g., ([i+2, i+3], [i+4, i+6]) -> ([i, i], [i+2, i+3]) * c) Symbolic common factor elimination. * e.g., ([C, 2], [3C, 4]) -> ([1, 2], [3, 4]) - * 3. Transform simplified indices into iterator-based forms. + * 3. Update the IR, replacing original indices with simplified versions. + * 4. Transform local buffer indices into iterator-based forms. * e.g., [i, 0, 0] -> [0, 0, i] - * 4. Update the IR, replacing original indices with simplified versions. * * Key benefits: * 1. Reduces computational overhead in index calculations. @@ -93,7 +93,7 @@ namespace optim { * Output: * local_tensor[0, 0, 0] = global_tensor[i, 0, 0]; */ -void EliminateCommonFactorOfLocalIndex(ir::Expr* expr); +void EliminateCommonFactorOfLocalIndex(ir::stmt::BlockRef func_body); } // namespace optim } // namespace cinn diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index 82eac4839c48e1..020cdc4dade8d5 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -489,8 +489,11 @@ void OptimizeExprGPU(Expr *expr) { // Replace variables that are in range [0, 1) to zero. 
ReplaceUnitVarToZero replace_unit_var_to_zero;
   replace_unit_var_to_zero(expr);
-
-  EliminateCommonFactorOfLocalIndex(expr);
+  VLOG(10) << "After ReplaceUnitVarToZero: \n" << *expr;
+  ir::stmt::BlockRef func_body = ir::ConvertExprBlockToStmtBlock(*expr);
+  EliminateCommonFactorOfLocalIndex(func_body);
+  *expr = ir::ConvertStmtBlockToExprBlock(func_body);
+  VLOG(10) << "After EliminateCommonFactorOfLocalIndex: \n" << *expr;

   ResizeBufferToMaxVarRange(expr);

From 7f2a3e45307d6c07006d5e4b63c8c105b9550ee5 Mon Sep 17 00:00:00 2001
From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com>
Date: Wed, 8 Jan 2025 16:10:22 +0800
Subject: [PATCH 23/57] align_diff (#70613)

---
 python/paddle/distributed/auto_parallel/api.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py
index 7606911531ca21..4b86405418ae2d 100644
--- a/python/paddle/distributed/auto_parallel/api.py
+++ b/python/paddle/distributed/auto_parallel/api.py
@@ -1156,6 +1156,10 @@ def _shard_accumulator(self, param):
             accumulator = self._inner_opt._accumulators[key][target_name]
             if accumulator.is_dist() and not isinstance(accumulator, pir.Value):
                 continue
+
+            if paddle.in_dynamic_mode():
+                origin_accumulator_name = accumulator.name
+
             if self._shard_fn is not None:
                 self._inner_opt._accumulators[key][target_name] = (
                     self._shard_fn(key, param, accumulator)
@@ -1179,12 +1183,10 @@ def _shard_accumulator(self, param):
                         placements=placements,
                     )
                 )
-            if not isinstance(
-                self._inner_opt._accumulators[key][target_name], pir.Value
-            ):
-                self._inner_opt._accumulators[key][target_name].name = (
-                    target_name + "_" + key
-                )
+            if paddle.in_dynamic_mode():
+                self._inner_opt._accumulators[key][
+                    target_name
+                ].name = origin_accumulator_name

     def _reset_placements(self, param):
         if param.is_dist() and isinstance(

From 6d96bed484587dc435e6975abed9e69dd4aaf23f Mon Sep 17 00:00:00 2001
From: co63oc
Date: Wed, 8 Jan 2025 16:14:05 +0800
Subject: [PATCH 24/57] [fluid_ops] clean collective operators part (#70588)

---
 .../fluid/operators/collective/alltoall_op.cc |  71 ----
 .../fluid/operators/collective/alltoall_op.h  |  42 --
 .../collective/c_allreduce_avg_op.cc          |  40 --
 .../collective/c_allreduce_max_op.cc          |  44 ---
 .../collective/c_allreduce_prod_op.cc         |  44 ---
 .../operators/collective/c_broadcast_op.cc    |  59 ---
 .../operators/collective/c_broadcast_op.h     |  77 ----
 .../operators/collective/c_reduce_avg_op.cc   |  39 --
 .../operators/collective/c_reduce_max_op.cc   |  41 --
 .../operators/collective/c_reduce_min_op.cc   |  40 --
 .../fluid/operators/collective/c_reduce_op.h  | 371 ------------------
 .../operators/collective/c_reduce_prod_op.cc  |  41 --
 .../operators/collective/c_reduce_sum_op.cc   |  41 --
 .../operators/collective/global_gather_op.h   |  43 --
 .../operators/collective/global_scatter_op.h  |  43 --
 15 files changed, 1036 deletions(-)
 delete mode 100644 paddle/fluid/operators/collective/alltoall_op.cc
 delete mode 100644 paddle/fluid/operators/collective/alltoall_op.h
 delete mode 100644 paddle/fluid/operators/collective/c_allreduce_avg_op.cc
 delete mode 100644 paddle/fluid/operators/collective/c_allreduce_max_op.cc
 delete mode 100644 paddle/fluid/operators/collective/c_allreduce_prod_op.cc
 delete mode 100644 paddle/fluid/operators/collective/c_broadcast_op.cc
 delete mode 100644 paddle/fluid/operators/collective/c_broadcast_op.h
 delete mode 100644 paddle/fluid/operators/collective/c_reduce_avg_op.cc
 delete mode 100644
paddle/fluid/operators/collective/c_reduce_max_op.cc delete mode 100644 paddle/fluid/operators/collective/c_reduce_min_op.cc delete mode 100644 paddle/fluid/operators/collective/c_reduce_op.h delete mode 100644 paddle/fluid/operators/collective/c_reduce_prod_op.cc delete mode 100644 paddle/fluid/operators/collective/c_reduce_sum_op.cc delete mode 100644 paddle/fluid/operators/collective/global_gather_op.h delete mode 100644 paddle/fluid/operators/collective/global_scatter_op.h diff --git a/paddle/fluid/operators/collective/alltoall_op.cc b/paddle/fluid/operators/collective/alltoall_op.cc deleted file mode 100644 index 1cb8a0b1352842..00000000000000 --- a/paddle/fluid/operators/collective/alltoall_op.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/alltoall_op.h" - -namespace paddle { -namespace operators { - -class AllToAllBaseOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllToAll"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AllToAll"); - int ring_id = ctx->Attrs().Get("ring_id"); - PADDLE_ENFORCE_GE( - ring_id, - 0, - common::errors::InvalidArgument( - "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); - phi::DDim dim = ctx->GetInputDim("X"); - if (dim[0] < 0) dim[0] = -1; - ctx->SetOutputDim("Out", dim); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class AllToAllBaseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) tensor send."); - AddOutput("Out", "(Tensor) the result of alltoall."); - AddAttr("ring_id", "(int default 0) nccl communication ring id.") - .SetDefault(0); - AddAttr( - "use_calc_stream", - "(bool default false) eject CUDA operations to calculation stream.") - .SetDefault(false); - AddComment(R"DOC( -AllToAll Operator -Scatter tensors from all participators to all participators. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(alltoall, - ops::AllToAllBaseOp, - ops::AllToAllBaseOpMaker) diff --git a/paddle/fluid/operators/collective/alltoall_op.h b/paddle/fluid/operators/collective/alltoall_op.h deleted file mode 100644 index 464a53668bd8a5..00000000000000 --- a/paddle/fluid/operators/collective/alltoall_op.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -#if defined(PADDLE_WITH_GLOO) -#include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#endif - -namespace paddle { -namespace operators { - -template -class AllToAllOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(common::errors::Unavailable( - "Do not support alltoall for cpu kernel now.")); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc b/paddle/fluid/operators/collective/c_allreduce_avg_op.cc deleted file mode 100644 index 13d07557f1e7c9..00000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_avg_op.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle::framework { -class OpDesc; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CAllReduceAvgOpMaker : public CAllReduceOpMaker { - protected: - std::string GetName() const override { return "Avg"; } -}; - -DECLARE_INPLACE_OP_INFERER(AllreduceAvgInplaceInferer, {"X", "Out"}); - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_avg, - ops::CAllReduceOp, - ops::CAllReduceAvgOpMaker, - ops::AllreduceAvgInplaceInferer) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc deleted file mode 100644 index 3faf360636a769..00000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CAllReduceMaxOpMaker : public CAllReduceOpMaker { - protected: - std::string GetName() const override { return "Max"; } -}; - -DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"}); - -DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceMax, kRedMax) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, - ops::CAllReduceOp, - ops::CAllReduceMaxOpMaker, - ops::AllreduceMaxInplaceInferer) diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc deleted file mode 100644 index 4c2bf9528d854d..00000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CAllReduceProdOpMaker : public CAllReduceOpMaker { - protected: - std::string GetName() const override { return "Prod"; } -}; - -DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"}); - -DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceProd, kRedProd) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, - ops::CAllReduceOp, - ops::CAllReduceProdOpMaker, - ops::AllreduceProdInplaceInferer) diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc deleted file mode 100644 index f1672f6dd04b0d..00000000000000 --- a/paddle/fluid/operators/collective/c_broadcast_op.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_broadcast_op.h" - -namespace paddle::operators { - -class CBroadcastOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) tensor to be broadcasted."); - AddOutput("Out", "(Tensor) the result of broadcast."); - AddAttr("ring_id", "(int default 0) nccl communication ring id.") - .SetDefault(0); - AddAttr("root", "(int default 0) root id for broadcasting.") - .SetDefault(0); - - AddComment(R"DOC( -CBroadcast Operator - -Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#broadcast -)DOC"); - } -}; - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_broadcast, - ops::CBroadcastOp, - ops::CBroadcastOpMaker); diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h deleted file mode 100644 index 79fc593be5da79..00000000000000 --- a/paddle/fluid/operators/collective/c_broadcast_op.h +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/distributed/comm_context_manager.h" - -#if defined(PADDLE_WITH_GLOO) -#include - -#include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#include "paddle/phi/core/distributed/gloo_comm_context.h" -#endif - -namespace paddle { -namespace operators { - -template -class CBroadcastOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_GLOO) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - auto root = ctx.Attr("root"); - - int rid = ctx.Attr("ring_id"); - ctx.device_context().Alloc(out); - - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); - if (comm_context_manager.Has(std::to_string(rid))) { - auto* comm_context = static_cast( - comm_context_manager.Get(std::to_string(rid))); - comm_context->Broadcast(out, *in, root); - } else { - // NOTE: This will be removed after moving this operator to phi. 
- int64_t send_numel = in->numel(); - T* recv_buff = reinterpret_cast(out->data()); - auto gloo = paddle::framework::GlooWrapper::GetInstance(); - PADDLE_ENFORCE_EQ( - gloo->IsInitialized(), - true, - common::errors::PreconditionNotMet( - "You must initialize the gloo environment first to use it.")); - gloo::BroadcastOptions opts(gloo->GetContext()); - opts.setOutput(recv_buff, send_numel); - opts.setRoot(root); - gloo::broadcast(opts); - } -#else - PADDLE_THROW(common::errors::Unavailable( - "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); -#endif - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_avg_op.cc b/paddle/fluid/operators/collective/c_reduce_avg_op.cc deleted file mode 100644 index f8d827a708c004..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_avg_op.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceAvgOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Avg"; } -}; - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_avg, - ops::CReduceOp, - ops::CReduceAvgOpMaker); diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc deleted file mode 100644 index f08b6eda3e18b5..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_max_op.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceMaxOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Max"; } -}; - -DEFINE_C_REDUCE_CPU_KERNEL(CReduceMax, kRedMax) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_max, - ops::CReduceOp, - ops::CReduceMaxOpMaker); diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc deleted file mode 100644 index 87c1197cee6ecf..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_min_op.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceMinOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Min"; } -}; - -DEFINE_C_REDUCE_CPU_KERNEL(CReduceMin, kRedMin) -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_min, - ops::CReduceOp, - ops::CReduceMinOpMaker); diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h deleted file mode 100644 index 49c0f1f52b10e0..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ /dev/null @@ -1,371 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/common/ddim.h" -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/distributed/comm_context_manager.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) -#include "paddle/common/flags.h" -#include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); -#endif - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#include "paddle/phi/core/distributed/nccl_comm_context.h" -#elif defined(PADDLE_WITH_XPU_BKCL) -#include "paddle/fluid/platform/device/xpu/bkcl_helper.h" -#include "paddle/phi/core/distributed/bkcl_comm_context.h" -#endif - -#if defined(PADDLE_WITH_GLOO) -#include - -#include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#endif - -namespace paddle { -namespace operators { - -enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd, kRedAvg }; - -class CReduceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -template -class CReduceOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_GLOO) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - auto root_id = ctx.Attr("root_id"); - - auto place = ctx.GetPlace(); - int64_t send_numel = in->numel(); - const T* send_buff = in->data(); - T* recv_buff = out->mutable_data(in->dims(), place); - auto gloo = paddle::framework::GlooWrapper::GetInstance(); - PADDLE_ENFORCE_EQ( - gloo->IsInitialized(), - true, - common::errors::PreconditionNotMet( - "You must initialize the gloo environment first to use it.")); - gloo::ReduceOptions opts(gloo->GetContext()); - opts.setInput(const_cast(send_buff), send_numel); - opts.setOutput(recv_buff, send_numel); - opts.setRoot(root_id); - switch (red_type) { - case kRedSum: - opts.setReduceFunction( - static_cast( - &gloo::sum)); - break; - case kRedMax: - opts.setReduceFunction( - static_cast( - &gloo::max)); - break; - case kRedMin: - opts.setReduceFunction( - static_cast( - &gloo::min)); - break; - case kRedProd: - opts.setReduceFunction( - static_cast( - &gloo::product)); - break; - default: - PADDLE_ENFORCE_EQ(true, - false, - common::errors::InvalidArgument( - "Invalid reduce type: %d.", red_type)); - } - gloo::reduce(opts); -#else - PADDLE_THROW(common::errors::Unavailable( - "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); -#endif - } -}; - -#define DEFINE_C_REDUCE_CPU_KERNEL(op_name, red_type) \ - template \ - class op_name##CPUKernel : public CReduceOpCPUKernel {}; - -template -class CReduceOpXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_XPU_BKCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - BKCLDataType dtype = 
phi::ToBKCLDataType(in->dtype()); - int64_t numel = in->numel(); - const void* sendbuff = in->data(); - out->Resize(in->dims()); - void* recvbuff = out->mutable_data(place); - - int rid = ctx.Attr("ring_id"); - int root = ctx.Attr("root_id"); - - XPUStream stream = nullptr; - platform::BKCLComm* comm = nullptr; - phi::distributed::BKCLCommContext* comm_ctx = nullptr; - - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "BKCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { // old comm_context - comm = platform::BKCLCommContext::Instance().Get(rid, place); - stream = comm->stream(); - VLOG(3) << "old BKCLCommContext has rid " << rid; - } - if (ctx.Attr("use_calc_stream")) { - auto dev_ctx = phi::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->x_context()->xpu_stream; - } - - BKCLOp bkcl_red_type = BKCL_ADD; - switch (red_type) { - case kRedSum: - bkcl_red_type = BKCL_ADD; - break; - - case kRedMax: - bkcl_red_type = BKCL_MAX; - break; - - case kRedMin: - bkcl_red_type = BKCL_MIN; - break; - - case kRedProd: - bkcl_red_type = BKCL_PRODUCT; - break; - - default: - PADDLE_THROW(common::errors::InvalidArgument("Invalid reduce type: %d", - red_type)); - } - - if (comm_ctx) { - comm_ctx->Reduce(out, *in, bkcl_red_type, root, stream); - } else { - PADDLE_ENFORCE_XPU_SUCCESS(bkcl_reduce(comm->comm(), - sendbuff, - recvbuff, - numel, - dtype, - bkcl_red_type, - root, - stream)); - } -#else - PADDLE_THROW(common::errors::PreconditionNotMet( - "PaddlePaddle should be compiled with XPU.")); -#endif - } -}; - -#define DEFINE_C_REDUCE_XPU_KERNEL(op_name, red_type) \ - template \ - class op_name##XPUKernel : public CReduceOpXPUKernel {}; - -template -class CReduceOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - ncclDataType_t dtype = phi::ToNCCLDataType(in->dtype()); - int64_t numel = in->numel(); - const void* sendbuff = in->data(); - out->Resize(in->dims()); - void* recvbuff = out->mutable_data(place); - - int rid = ctx.Attr("ring_id"); - int root = ctx.Attr("root_id"); - - gpuStream_t stream = nullptr; - platform::NCCLComm* comm = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; - - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. 
" - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "NCCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { // old comm_context - comm = platform::NCCLCommContext::Instance().Get(rid, place); - stream = comm->stream(); - VLOG(3) << "old NCCLCommContext has rid " << rid; - } - if (ctx.Attr("use_calc_stream")) { - // should ExecutionContext for calc stream. - stream = ctx.cuda_device_context().stream(); - } - - ncclRedOp_t nccl_red_type = ncclSum; - switch (red_type) { - case kRedSum: - nccl_red_type = ncclSum; - break; - - case kRedMax: - nccl_red_type = ncclMax; - break; - - case kRedMin: - nccl_red_type = ncclMin; - break; - - case kRedProd: - nccl_red_type = ncclProd; - break; - -#if (NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000) || \ - defined(PADDLE_WITH_HIP) - case kRedAvg: - nccl_red_type = ncclAvg; - break; -#endif - - default: - PADDLE_ENFORCE_EQ( - true, - false, - common::errors::InvalidArgument("red_type must be one of kRedSum, " - "kRedMax, kRedMin, kRedProd.")); - } - - if (comm_ctx) { - comm_ctx->Reduce(out, *in, nccl_red_type, root, stream); - } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduce(sendbuff, - recvbuff, - numel, - dtype, - nccl_red_type, - root, - comm->comm(), - stream)); - } -#else - PADDLE_ENFORCE_EQ( - true, - false, - common::errors::Unavailable("PaddlePaddle should compile with GPU..")); -#endif - } -}; - -#define DEFINE_C_REDUCE_CUDA_KERNEL(op_name, red_type) \ - template \ - class op_name##CUDAKernel : public CReduceOpCUDAKernel {}; - -class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "(Tensor), tensor to be reduced."); - AddOutput("Out", "(Tensor) the reduced result."); - AddAttr("ring_id", "(int default 0) communication ring id.") - .SetDefault(0); - - AddAttr("root_id", "(int default 0) root id.").SetDefault(0); - AddAttr( - "use_calc_stream", - "(bool default false) eject CUDA operations to calculation stream.") - .SetDefault(false); - AddComment(string::Sprintf(R"DOC( -CReduce %s Operator - -Call collective Reduce with reduce type %s. If input and output are -the same variable, in-place reduce will be used. -)DOC", - GetName(), - GetName())); - } - - protected: - virtual std::string GetName() const = 0; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc deleted file mode 100644 index eb2e614405235b..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceProdOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Prod"; } -}; - -DEFINE_C_REDUCE_CPU_KERNEL(CReduceProd, kRedProd) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_prod, - ops::CReduceOp, - ops::CReduceProdOpMaker); diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc deleted file mode 100644 index 3758877d1b993b..00000000000000 --- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle::framework { -class OpDesc; -template -class EmptyGradOpMaker; -} // namespace paddle::framework -namespace paddle::imperative { -class OpBase; -} // namespace paddle::imperative - -namespace paddle::operators { - -class CReduceSumOpMaker : public CReduceOpMaker { - protected: - std::string GetName() const override { return "Sum"; } -}; - -DEFINE_C_REDUCE_CPU_KERNEL(CReduceSum, kRedSum) - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(c_reduce_sum, - ops::CReduceOp, - ops::CReduceSumOpMaker); diff --git a/paddle/fluid/operators/collective/global_gather_op.h b/paddle/fluid/operators/collective/global_gather_op.h deleted file mode 100644 index e6d5c717571df9..00000000000000 --- a/paddle/fluid/operators/collective/global_gather_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class GlobalGatherOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(common::errors::Unavailable( - "Do not support global gather op for cpu kernel now.")); - } -}; - -template -struct GlobalGatherFunctor { - void operator()(const framework::ExecutionContext& ctx); -}; - -template -struct GlobalGatherProcessGroupFunctor { - void operator()(const framework::ExecutionContext& ctx); -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/global_scatter_op.h b/paddle/fluid/operators/collective/global_scatter_op.h deleted file mode 100644 index 70e5d7c2e5d536..00000000000000 --- a/paddle/fluid/operators/collective/global_scatter_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class GlobalScatterOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx UNUSED) const override { - PADDLE_THROW(common::errors::Unavailable( - "Do not support global scatter op for cpu kernel now.")); - } -}; - -template -struct GlobalScatterFunctor { - void operator()(const framework::ExecutionContext& ctx); -}; - -template -struct GlobalScatterProcessGroupFunctor { - void operator()(const framework::ExecutionContext& ctx); -}; - -} // namespace operators -} // namespace paddle From 31e8c012852282ab442bcd2aa59194ec46f5debc Mon Sep 17 00:00:00 2001 From: Shuhao Liang <50269654+lshpku@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:31:23 +0800 Subject: [PATCH 25/57] [CINN] Implement the new AlignIterSpaceTactic (#70649) --- .../dy_shape_group_scheduler.cc | 2 + .../tactic/align_iter_space_tactic.cc | 227 +++++++++++++----- .../tactic/align_iter_space_tactic.h | 4 +- .../tactic/tile_first_general_tactic.cc | 44 ---- 4 files changed, 175 insertions(+), 102 deletions(-) diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc index e533e35c67663b..758464d5d21857 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc @@ -16,6 +16,7 @@ #include "paddle/cinn/common/cas.h" #include "paddle/cinn/hlir/framework/pir/trivial_op_impl.h" #include "paddle/cinn/ir/group_schedule/config/schedule_config_manager.h" +#include "paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h" #include "paddle/cinn/ir/group_schedule/tactic/compute_at_reduction_tactic.h" #include 
"paddle/cinn/ir/group_schedule/tactic/compute_inline_tactic.h" #include "paddle/cinn/ir/group_schedule/tactic/tile_broadcast_tactic.h" @@ -33,6 +34,7 @@ void DynamicShapeGroupScheduler::Init() { VLOG(4) << "original group func body: \n" << ir_sch_->GetModule().GetExprs()[0]; InitBuckets(); + tactics_.emplace_back(CreateAlignIterSpaceTactic()); tactics_.emplace_back(CreateTileBroadcastTactic()); tactics_.emplace_back(CreateTileFirstGeneralTactic()); tactics_.emplace_back(CreateComputeInlineTactic()); diff --git a/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.cc index dcc72e4a217d82..3476755d2460be 100644 --- a/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// Copyright (c) 2025 CINN Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,89 +13,206 @@ // limitations under the License. #include "paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h" -#include "paddle/cinn/common/cas.h" -#include "paddle/cinn/common/integer_set.h" -#include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_analyzer/ir_analyzer.h" -#include "paddle/cinn/ir/op/ir_operators.h" -#include "paddle/cinn/ir/utils/ir_copy.h" namespace cinn { namespace ir { +namespace { +/** + * Reorder the loops according to the memory-consistent order of input or output + * to make memory access as coalesced as possible. + * + * This tactic uses different alignment policies for Reduce and Trivial: + * 1) Reduce: align with the input, because after reduction, the output data is + * significantly smaller than the input data, so it's more critical to make + * input coalesced. + * 2) Trivial: align with the output, because discrete writes incur higher costs + * than discrete reads for the same volume of data due to the hardware design + * of cache. Therefore, we should ensure coalesced writes in priority. + * + * Note: we reorder spatial and reduce loops seperately, because we need to + * maintain the relative order between spatial and reduce loops, so as for later + * tactics to work properly. Thus, we use two lists sp_loop_perm & rd_loop_perm + * to record the permutation of spatial and reduce loops respectively. + * + * + * Examples: + * 1. Reduce + * Input: + * for (i, 0, 8): # S + * for (j, 0, 32): # S + * for (k, 0, 128): # R + * for (a, 0, 256): # R + * var_1[i, j] += var_0[j, a, k, i] + * Analysis: + * We align Reduce to the input `var_0[j, a, k, i]`. In the indices of var_0, + * the mapping from each index to the loop index is: + * indices[0] = j => loops[1] # S + * indices[1] = a => loops[3] # R + * indices[2] = k => loops[2] # R + * indices[3] = i => loops[0] # S + * To make the indices of var_0 consistent with its original memory layout, we + * need to permute the loops in the order {1, 3, 2, 0}. However, as we reorder + * spatial and reduce loop seperately, we split the permutation into sp & rd, + * getting sp_loop_perm = {1, 0} and rd_loop_perm = {3, 2}. + * Output: + * for (j, 0, 32): # S + * for (i, 0, 8): # S + * for (a, 0, 256): # R + * for (k, 0, 128): # R + * var_1[i, j] += var_0[j, a, k, i] + * + * 2. 
Trivial + * Input: + * for (i, 0, 32): + * for (j, 0, 128): + * for (k, 0, 256): + * var_1[k, i, j] = exp(var_0[j, i, k]) + * Analysis: + * We align Trivial to the output `var_1[k, i, j]`. In the indices of var_1, + * the mapping from each index to the loop index is: + * indices[0] = k => loops[2] + * indices[1] = i => loops[0] + * indices[2] = j => loops[1] + * Like example 1, we should permute the loops in the order {2, 0, 1}. As this + * graph doesn't contain reduce loops, all we get is sp_loop_perm = {2, 0, 1}, + * and rd_loop_perm = {}. + * Output: + * for (k, 0, 256): + * for (i, 0, 32): + * for (j, 0, 128): + * var_1[k, i, j] = exp(var_0[j, i, k]) + */ class AlignIterSpaceTactic final : public ScheduleTactic { public: - void Init(ScheduleContext* context) override; + void Init(ScheduleContext* context, ir::IRSchedule* sch) override; void Apply(ir::IRSchedule* sch, const std::string& block_id) override; std::string TacticName() const override { return "AlignIterSpaceTactic"; } + private: + /** + * Get the common memory-consistent order of loops according to the outputs. + * Returns null if not all outputs share the same order. + */ + std::vector GetCommonOutputLoopPerm(ir::IRSchedule* sch); + private: ScheduleContext* context_; + + // The permutation of spatial and reduce loops, in other to achieve the + // memory-consistent alignment. + std::vector sp_loop_perm_; + std::vector rd_loop_perm_; }; -void AlignIterSpaceTactic::Init(ScheduleContext* context) { +void AlignIterSpaceTactic::Init(ScheduleContext* context, ir::IRSchedule* sch) { context_ = context; -} + sp_loop_perm_.clear(); + rd_loop_perm_.clear(); -void AlignIterSpaceTactic::Apply(ir::IRSchedule* sch, - const std::string& block_id) { - ir::Expr block = sch->GetBlock(block_id); + auto& loop_strides = context_->config.base_info->loop_strides; + auto& reduce_axis = context_->config.base_info->reduce_axis; + std::set reduce_axis_set(reduce_axis.begin(), reduce_axis.end()); - std::vector loops = sch->GetLoops(block_id); - ir::Expr src_total_extent{1}; - for (const auto& loop : loops) { - src_total_extent = src_total_extent * loop.As()->extent; - } - ir::Expr target_sp_extent{1}; - for (const auto& iter : context_->iter_space_info.sp_space) { - target_sp_extent = target_sp_extent * std::get<0>(iter); + if (!loop_strides.empty()) { + // If this is a Reduce, calculate the loop_perm by sorting the loops in the + // descending order of their strides according to the input, then split the + // loop_perm into sp_loop_perm & rd_loop_perm. + std::vector loop_perm(loop_strides.size()); + std::iota(loop_perm.begin(), loop_perm.end(), 0); + std::stable_sort(loop_perm.begin(), loop_perm.end(), [&](int a, int b) { + return loop_strides[a] > loop_strides[b]; + }); + + for (int axis : loop_perm) { + if (reduce_axis_set.count(axis) > 0) { + rd_loop_perm_.push_back(axis); + } else if (loop_strides[axis] != 0) { + sp_loop_perm_.push_back(axis); + } + } + } else { + // If this is a Trvial, calculate the sp_loop_perm according to the output. 
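
(Editor's aside, not part of the patch: the Reduce branch above boils down to a stable sort of loop indices by descending stride, followed by a split into spatial and reduce lists. Below is a minimal self-contained C++ sketch of just that step; the function name and the plain-vector inputs are illustrative assumptions standing in for the corresponding fields of the schedule config.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <set>
#include <utility>
#include <vector>

// Sort loop indices by descending stride (stable, so loops with equal
// strides keep their original relative order), then split the permutation
// into spatial and reduce parts. Zero-stride spatial loops are dropped,
// mirroring the tactic above.
std::pair<std::vector<int>, std::vector<int>> SplitLoopPerm(
    const std::vector<int64_t>& loop_strides,
    const std::set<int>& reduce_axis) {
  std::vector<int> perm(loop_strides.size());
  std::iota(perm.begin(), perm.end(), 0);
  std::stable_sort(perm.begin(), perm.end(), [&](int a, int b) {
    return loop_strides[a] > loop_strides[b];
  });
  std::vector<int> sp_perm, rd_perm;
  for (int axis : perm) {
    if (reduce_axis.count(axis) > 0) {
      rd_perm.push_back(axis);
    } else if (loop_strides[axis] != 0) {
      sp_perm.push_back(axis);
    }
  }
  return {sp_perm, rd_perm};
}

For Example 1 in the doc comment, var_0 has shape [32, 256, 128, 8] over indices [j, a, k, i], so the strides of loops {i, j, k, a} are {1, 262144, 8, 1024} and reduce_axis is {2, 3}; SplitLoopPerm then returns sp = {1, 0} and rd = {3, 2}, matching the derivation above.)
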
+ sp_loop_perm_ = GetCommonOutputLoopPerm(sch); } - ir::Expr target_total_extent = ir_utils::IRCopy(target_sp_extent); - for (const auto& iter : context_->iter_space_info.rb_space) { - target_total_extent = target_total_extent * std::get<0>(iter); + + VLOG(4) << "AlignIterSpaceTactic:\n" + << "sp_loop_perm: " << utils::Join(sp_loop_perm_, ", ") << "\n" + << "rd_loop_perm: " << utils::Join(rd_loop_perm_, ", "); +} + +std::unordered_map GetLoopVarToIndex( + const std::vector& loops) { + std::unordered_map loop_var2index; + for (int i = 0; i < loops.size(); ++i) { + auto* node = loops[i].As(); + loop_var2index[node->loop_var] = i; } + return loop_var2index; +} - common::cas_intervals_t var_intervals; - common::SymbolicExprAnalyzer symbolic_expr_analyzer(var_intervals); - std::optional total_extent_eq = - symbolic_expr_analyzer.ProveEQ(src_total_extent, target_total_extent); - bool need_reorder = false; - for (int i = 0; i < context_->iter_space_info.rb_last_order.size(); ++i) { - if (context_->iter_space_info.rb_last_order[i] != i) { - need_reorder = true; - break; - } +/** + * Check whether this is an effective permutation. + * A permutation is ineffective if it's entirely in ascending order. + */ +bool IsPermutationEffective(const std::vector& perm) { + for (int i = 1; i < perm.size(); ++i) { + if (perm[i - 1] > perm[i]) return true; } + return false; +} - if (total_extent_eq.has_value() && total_extent_eq.value()) { - if (need_reorder) { - sch->Reorder(block_id, context_->iter_space_info.rb_last_order); - } - if (context_->iter_space_info.sp_space.size() < loops.size() - 1) { - loops = sch->GetLoops(block_id); - - // Align the loop in the current block that needs to be aligned with the - // reduce loop in iter_space_info - std::vector rb_loops( - loops.end() - context_->iter_space_info.rb_space.size(), loops.end()); - sch->Fuse(rb_loops); +std::vector AlignIterSpaceTactic::GetCommonOutputLoopPerm( + ir::IRSchedule* sch) { + std::vector common_loop_perm; + + for (auto& block : sch->GetAllBlocks()) { + std::string block_id = ir::analyzer::GetBlockName(block); + if (context_->output_names.count(block_id) == 0) continue; + + auto store = ir::analyzer::GetStoreOfSBlock(block); + auto& indices = store.As()->indices; + std::unordered_map iter_var2iter_value = + ir::analyzer::GetIterVarToValueOfSBlock(block); + std::unordered_map loop_var2index = + GetLoopVarToIndex(sch->GetLoops(block)); + + std::vector loop_perm; + for (auto& index : indices) { + if (index.is_constant()) continue; + if (!index.is_var()) return {}; + ir::Expr iter_value = iter_var2iter_value[index.as_var_ref()]; + if (!iter_value.is_var()) return {}; + ir::Expr loop_var = iter_value.as_var_ref(); + loop_perm.push_back(loop_var2index[loop_var]); } - if (context_->iter_space_info.sp_space.size() > 1) { - // Align the loop in the current block that needs to be aligned with the - // spatial loop in iter_space_info - loops = sch->GetLoops(block_id); - std::vector sp_loops( - loops.begin(), - loops.end() - context_->iter_space_info.rb_space.size()); - sch->Fuse(sp_loops); + + if (common_loop_perm.empty()) { + common_loop_perm = std::move(loop_perm); + } else if (common_loop_perm != loop_perm) { + return {}; } - } else { - sch->Fuse(loops); } + + return common_loop_perm; } +void AlignIterSpaceTactic::Apply(ir::IRSchedule* sch, + const std::string& block_id) { + if (ir::IsReduceInitTensorName(block_id)) return; + if (IsPermutationEffective(sp_loop_perm_)) { + sch->Reorder(block_id, sp_loop_perm_); + } + if 
(IsPermutationEffective(rd_loop_perm_)) { + sch->Reorder(block_id, rd_loop_perm_); + } +} + +} // namespace + std::unique_ptr CreateAlignIterSpaceTactic() { return std::make_unique(); } diff --git a/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h b/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h index 2ac65d114c7f51..12891818120712 100644 --- a/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h +++ b/paddle/cinn/ir/group_schedule/tactic/align_iter_space_tactic.h @@ -1,4 +1,4 @@ -// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// Copyright (c) 2025 CINN Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,10 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #pragma once -#include #include "paddle/cinn/ir/group_schedule/tactic/schedule_tactic.h" namespace cinn { diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc index e71e0052a3803c..1022c97420e7cc 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc @@ -48,7 +48,6 @@ class TileFirstGeneralTactic final : public ScheduleTactic { std::string TacticName() const override { return "TileFirstGeneralTactic"; } private: - void AlignToReduceInput(ir::IRSchedule* sch, const std::string& block_id); void MergeFlattenAxis(ir::IRSchedule* sch, const std::string& block_id); void MergeDiscreteFlattenAxis(ir::IRSchedule* sch, const std::string& block_id); @@ -128,11 +127,6 @@ void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch, if (!can_apply_) return; if (ir::IsReduceInitTensorName(block_id)) return; - AlignToReduceInput(sch, block_id); - VLOG(6) << "After AlignToReduceInput on block: [" << block_id - << "], loop nest:\n" - << sch->GetLoops(block_id)[0]; - if (UseContinuousDataTile(context_->config)) { VLOG(4) << "Using ApplyContinuousDataTile"; ApplyContinuousDataTile(sch, block_id); @@ -293,44 +287,6 @@ void TileFirstGeneralTactic::ApplyContinuousDataTile( SetReduceType(sch, block_id); } -void TileFirstGeneralTactic::AlignToReduceInput(ir::IRSchedule* sch, - const std::string& block_id) { - const auto& loop_strides = context_->config.base_info->loop_strides; - if (loop_strides.empty()) { - return; - } - - std::vector loops = sch->GetLoops(block_id); - std::vector loop_perm(loops.size()); - std::iota(loop_perm.begin(), loop_perm.end(), 0); - - const auto IsReduce = [&](int64_t axis) { - auto& reduce_axis = context_->config.base_info->reduce_axis; - return std::find(reduce_axis.begin(), reduce_axis.end(), axis) != - reduce_axis.end(); - }; - - std::sort(loop_perm.begin(), loop_perm.end(), [&](int64_t a, int64_t b) { - if (IsReduce(a) == IsReduce(b)) { - return loop_strides[a] > loop_strides[b]; - } - return IsReduce(b); - }); - VLOG(4) << "loop_perm: " << utils::Join(loop_perm, ", "); - - // Reorder S/R loops seperately, otherwise reduce_init will be de-inlined. 
- std::vector sp_loops, rd_loops; - for (auto i : loop_perm) { - if (IsReduce(i)) { - rd_loops.push_back(loops[i]); - } else if (loop_strides[i] != 0) { - sp_loops.push_back(loops[i]); - } - } - sch->Reorder(sp_loops); - sch->Reorder(rd_loops); -} - void TileFirstGeneralTactic::MergeFlattenAxis(ir::IRSchedule* sch, const std::string& block_id) { if (vec_flatten_axis_.size() >= 2) { From 18af5d8fcc88821c7ba1f026e8b165000c58c631 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:31:54 +0800 Subject: [PATCH 26/57] =?UTF-8?q?=E3=80=90Bug=20Fix=E3=80=91Fix=20ReduceSu?= =?UTF-8?q?m=20inferMeta=20bug=20(#70660)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix infer bug * fix bug * fix undefined bug --- paddle/cinn/hlir/dialect/operator/ir/ops.yaml | 2 +- .../hlir/dialect/operator/transforms/pd_to_cinn_pass.cc | 9 +++++++-- paddle/phi/infermeta/unary.cc | 3 ++- paddle/phi/infermeta/unary.h | 1 + 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml index 9fe7530e94bd26..4bab4807511538 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml +++ b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml @@ -73,7 +73,7 @@ output : Tensor(out) infer_meta : func : ReduceSumInferMeta - param : [x, axis, keepdim] + param : [x, axis, keepdim, dtype] kernel : func : frobenius_norm param : [x, axis, keepdim] diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 7eb0992d69c454..588febb460498c 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -117,11 +117,16 @@ class SumOpPattern : public pir::OpRewritePattern { auto in = op->operand_source(0); auto in_data_type = in.type().dyn_cast().dtype(); - if (in_data_type.isa() || - in_data_type.isa()) { + + if (dtype != phi::DataType::UNDEFINED && + dtype != paddle::dialect::TransToPhiDataType(in_data_type)) { + in = rewriter.Build(in, dtype).result(0); + } else if (in_data_type.isa() || + in_data_type.isa()) { in = rewriter.Build(in, phi::DataType::INT64) .result(0); } + auto cinn_reduce = rewriter.Build(in, axis, keepdim, dtype); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 11d9ab80a48ef3..c744b699950b64 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3707,12 +3707,13 @@ void ReduceInferMetaBase(const MetaTensor& x, void ReduceSumInferMeta(const MetaTensor& x, const std::vector& axis, bool keep_dim, + DataType dtype, MetaTensor* out) { bool reduce_all = false; if (axis.empty()) { reduce_all = true; } - SumRawInferMeta(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); + SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); } void ReduceInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 7ce6a526829f8a..9a07cee72e0412 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -589,6 +589,7 @@ void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); void ReduceSumInferMeta(const MetaTensor& x, const std::vector& axis, bool keep_dim, + DataType dtype, MetaTensor* out); void ReduceInferMeta(const MetaTensor& x, From 2f34ecf1c2f42fd232a8a6dcd901f6740a989054 Mon Sep 17 00:00:00 2001 From: nizne 
<97940276+nizne9@users.noreply.github.com> Date: Wed, 8 Jan 2025 17:17:17 +0800 Subject: [PATCH 27/57] [BUPT][Paddle Tensor Phase II API Robustness Enhancement] `paddle.linalg.vector_norm` API robustness enhancement (#70499) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix bug of paddle.linalg.vector_norm and add test * Add test case based on feedback from the review --- paddle/phi/infermeta/unary.cc | 13 +---- python/paddle/tensor/linalg.py | 17 ++++-- test/legacy_test/test_norm_all.py | 94 ++++++++++++++++++++++++++++++- 3 files changed, 107 insertions(+), 17 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index c744b699950b64..e6ea59be2365ca 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3320,17 +3320,8 @@ void PNormInferMeta(const MetaTensor& x, auto x_dim = x.dims(); auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(axis, - -x_rank, - errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], R is " - "the rank of Input(X). But received axis: %d, R: %d. " - "Current Input(X)'s shape is=[%s].", - axis, - x_rank, - x_dim)); - PADDLE_ENFORCE_LT(axis, - x_rank, + PADDLE_ENFORCE_EQ((axis >= -x_rank && axis < x_rank) || x_rank == 0, + true, errors::InvalidArgument( "Attr(axis) value should be in range [-R, R-1], R is " "the rank of Input(X). But received axis: %d, R: %d. " diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 93d7d279bf5e76..2c6508200ed1ae 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -672,10 +672,15 @@ def vector_norm_axis_int( if isinstance(axis, list) and len(axis) == 1: axis = axis[0] + if paddle.is_complex(x): + abs_x = paddle.abs(x) + else: + abs_x = x + # when len(axis) == 1, use the original op to calculate if isinstance(axis, int): return vector_norm_axis_int( - x, + abs_x, axis=axis, porder=p, keepdim=keepdim, @@ -686,12 +691,16 @@ def vector_norm_axis_int( # when len(axis) >= 1, calculate by combining other Python apis elif isinstance(axis, list): if p == np.inf or p == -np.inf: - return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name) + return inf_norm( + abs_x, porder=p, axis=axis, keepdim=keepdim, name=name + ) elif p == 0: - return zero_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name) + return zero_norm( + abs_x, porder=p, axis=axis, keepdim=keepdim, name=name + ) else: return vector_norm_axis_tuple( - x, porder=p, axis=axis, keepdim=keepdim, name=name + abs_x, porder=p, axis=axis, keepdim=keepdim, name=name ) diff --git a/test/legacy_test/test_norm_all.py b/test/legacy_test/test_norm_all.py index 70dd38a79e8c8f..cdcada06e06d4a 100644 --- a/test/legacy_test/test_norm_all.py +++ b/test/legacy_test/test_norm_all.py @@ -603,7 +603,7 @@ def check_linalg_vector_static( ) place = base.CPUPlace() exe = base.Executor(place) - np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype) + np_input = np.array(np.random.rand(*shape_x) + 1.0).astype(dtype) expected_result = np_linalg_vector_norm( np_input, porder=p, axis=axis, keepdims=keep_dim ).astype(dtype) @@ -616,7 +616,7 @@ def check_linalg_vector_static( def check_linalg_vector_dygraph( self, p, axis, shape_x, dtype, keep_dim, check_dim=False ): - x_numpy = (np.random.random(shape_x) + 1.0).astype(dtype) + x_numpy = np.array(np.random.random(shape_x) 
+ 1.0).astype(dtype) expected_result = np_linalg_vector_norm( x_numpy, porder=p, axis=axis, keepdims=keep_dim ) @@ -909,6 +909,51 @@ def test_basic(self): keep_dim=keep, check_dim=True, ) + check_linalg_vector_static( + self, + p=2, + axis=None, + shape_x=[], + dtype="float64", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_static( + self, + p=np.inf, + axis=None, + shape_x=[], + dtype="complex64", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_static( + self, + p=-np.inf, + axis=[0, 1, 2, 3], + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_static( + self, + p=np.inf, + axis=2, + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_static( + self, + p=0, + axis=[1, 3], + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) check_linalg_matrix_static( self, p=-np.inf, @@ -1237,6 +1282,51 @@ def test_dygraph(self): keep_dim=keep, check_dim=True, ) + check_linalg_vector_dygraph( + self, + p=2, + axis=None, + shape_x=(), + dtype="float64", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_dygraph( + self, + p=np.inf, + axis=None, + shape_x=[], + dtype="complex64", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_dygraph( + self, + p=-np.inf, + axis=[0, 1, 2, 3], + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_dygraph( + self, + p=np.inf, + axis=2, + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) + check_linalg_vector_dygraph( + self, + p=0, + axis=[1, 3], + shape_x=[1, 14, 5, 14], + dtype="complex128", + keep_dim=keep, + check_dim=True, + ) check_linalg_matrix_dygraph( self, p=-np.inf, From 6705ab15edcdede1a4bd53a57af23b64b49e8604 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Wed, 8 Jan 2025 17:17:39 +0800 Subject: [PATCH 28/57] [CodeStyle][Typos][Q-[1-2],R-[1-12]] Fix typos (`qucik`,`quitted`,`runned`,`readed`,`recived`,`recevied`,`recieved`,`reveived`,`recieves`,`recive`,`receving`,`recommand`,`recomplie`,`recored`,`Recusive`,`recusive`,`Recursivly`,`recursivly`,`reduntant`) (#70674) * fix * fix --- _typos.toml | 19 ------------------- .../transforms/check_infer_symbolic_util.cc | 6 +++--- ...e_shape_ops_into_generate_shape_op_pass.cc | 2 +- .../operator/transforms/pd_to_cinn_pass.cc | 2 +- paddle/cinn/hlir/pe/nn.cc | 10 +++++----- paddle/cinn/runtime/cuda/cuda_util.cc | 14 +++++++------- paddle/cinn/runtime/sycl/sycl_util.cc | 10 +++++----- paddle/common/flags.h | 2 +- paddle/common/flags_native.cc | 2 +- .../distributed/common/chunk_allocator.h | 4 ++-- .../distributed/ps/service/brpc_ps_client.cc | 2 +- .../distributed/ps/service/brpc_ps_client.h | 2 +- .../distributed/ps/table/ctr_accessor.cc | 2 +- .../ps/table/ctr_double_accessor.cc | 2 +- .../distributed/ps/table/ctr_dymf_accessor.cc | 2 +- .../distributed/ps/table/sparse_accessor.cc | 2 +- paddle/fluid/framework/channel.h | 2 +- paddle/fluid/framework/dist_multi_trainer.cc | 4 ++-- .../framework/fleet/heter_ps/heter_comm.h | 4 ++-- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 10 +++++----- .../framework/ir/auto_mixed_precision_pass.cc | 2 +- .../ir/xpu/decoder_attention_xpu_fuse_pass.cc | 2 +- .../ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc | 2 +- .../multihead_matmul_roformer_plugin.cu | 2 +- .../plugin/preln_residual_bias_plugin.cu | 2 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 2 +- .../operators/elementwise/elementwise_op.h | 4 
++-- .../generator/get_expected_kernel_func.cc | 2 +- paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/kernels/funcs/blas/blas.cc | 2 +- paddle/phi/kernels/gpu/flash_attn_utils.h | 6 +++--- paddle/scripts/paddle_build.sh | 2 +- python/paddle/decomposition/recompute.py | 8 ++++---- .../auto_parallel/static/helper.py | 4 ++-- .../reshard_funcs/sub_to_global_mesh_func.py | 2 +- .../static/tuner/rule_based_tuner.py | 2 +- .../fleet/utils/tensor_parallel_utils.py | 2 +- python/paddle/tensor/creation.py | 2 +- test/ir/pir/cinn/utils.py | 2 +- 39 files changed, 68 insertions(+), 87 deletions(-) diff --git a/_typos.toml b/_typos.toml index a29bf57b1677b1..81230a2f09629c 100644 --- a/_typos.toml +++ b/_typos.toml @@ -156,25 +156,6 @@ protocal = 'protocal' PROTOCAL = 'PROTOCAL' pyrhon = 'pyrhon' pthon = 'pthon' -qucik = 'qucik' -quitted = 'quitted' -runned = 'runned' -readed = 'readed' -recived = 'recived' -recevied = 'recevied' -reveived = 'reveived' -recieved = 'recieved' -recieves = 'recieves' -recive = 'recive' -receving = 'receving' -recommand = 'recommand' -recomplie = 'recomplie' -recored = 'recored' -Recusive = 'Recusive' -recusive = 'recusive' -recursivly = 'recursivly' -Recursivly = 'Recursivly' -reduntant = 'reduntant' Refered = 'Refered' refered = 'refered' registed = 'registed' diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc index bed943587c1637..1471e041a58493 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc @@ -404,13 +404,13 @@ struct ShapeSignatureGenerator { const DoEachT& DoEach) { if (set_size <= 0) return DoEach(is_subset_flags); - const auto& RecusiveVisit = [&](bool is_subset) { + const auto& RecursiveVisit = [&](bool is_subset) { std::vector current_is_subset_flags(is_subset_flags); current_is_subset_flags.push_back(static_cast(is_subset)); VisitEachSubSet(set_size - 1, current_is_subset_flags, DoEach); }; - RecusiveVisit(true); - RecusiveVisit(false); + RecursiveVisit(true); + RecursiveVisit(false); } std::optional GetConstrainedSymbolNamesList( diff --git a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc index 4917a0dd2aa9d5..345d88301da639 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc @@ -76,7 +76,7 @@ std::vector FindSourceDenseTensorOfDimTensor( [](const symbol::NullShapeOrDataDimExpr& null_shape_or_data) { return false; }}; - // For TensorListShapeOrDataDimExprs case, we should recursivly visit its + // For TensorListShapeOrDataDimExprs case, we should recursively visit its // each dim_expr, which is automatically in next step. 
const auto& NeedTrackUpstream = [&](pir::Value value) -> bool { const auto& sym_shape = ShapeOrDataDimExprs4Value(value); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 588febb460498c..008ef30762ece8 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -1028,7 +1028,7 @@ class SqueezeOpPattern in_shape[i], 1, ::common::errors::PreconditionNotMet( - "sequeze dim MUST be 1, but recive axis [%d] is [%d]", + "squeeze dim MUST be 1, but receive axis [%d] is [%d]", i, in_shape[i])); } diff --git a/paddle/cinn/hlir/pe/nn.cc b/paddle/cinn/hlir/pe/nn.cc index 4954cda7976e0f..61ec4978509dd9 100644 --- a/paddle/cinn/hlir/pe/nn.cc +++ b/paddle/cinn/hlir/pe/nn.cc @@ -104,7 +104,7 @@ Tensor PRelu(const Tensor &A, PADDLE_ENFORCE_EQ(A->shape[axis], slope->shape[0], ::common::errors::InvalidArgument( - "Wrong slope shape: excepted %d but recieved %d.", + "Wrong slope shape: excepted %d but received %d.", A->shape[axis], slope->shape[0])); return Compute( @@ -163,7 +163,7 @@ std::vector Conv2d_winograd_NCHW(const ir::Tensor &input, true, ::common::errors::InvalidArgument( "Filter's output channel size must be divisible by group, but " - "recieved %d as output channel size and %d as group.", + "received %d as output channel size and %d as group.", weights->shape[0] * weights->shape[1], input->shape[1])); @@ -447,7 +447,7 @@ std::vector Conv2d_NCHW(const ir::Tensor &input, true, ::common::errors::InvalidArgument( "Filter's output channel size must be divisible by group, but " - "recieved %d as output channel size and %d as group.", + "received %d as output channel size and %d as group.", weights->shape[0] * weights->shape[1], input->shape[1])); auto res = Compute( @@ -838,7 +838,7 @@ std::vector Conv2d_NHWC(const ir::Tensor &input, true, ::common::errors::InvalidArgument( "Filter's output channel size must be divisible by group, but " - "recieved %d as output channel size and %d as group.", + "received %d as output channel size and %d as group.", weights->shape[0] * weights->shape[1], input->shape[3])); auto res = Compute( @@ -1683,7 +1683,7 @@ std::vector Pool2d(const Tensor &tensor, (tensor->shape.size() == 4U || tensor->shape.size() == 5U), true, ::common::errors::InvalidArgument( - "Pool2d requires tensor's shape_size to be 4 or 5, but recieved %d.", + "Pool2d requires tensor's shape_size to be 4 or 5, but received %d.", tensor->shape.size())); std::vector axis = {height_axis, width_axis}; return PoolImpl(tensor, diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index af0017222231bc..6349a342d93f3f 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -771,7 +771,7 @@ void cinn_call_cudnn_conv2d_forward(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -925,7 +925,7 @@ void cinn_call_cudnn_conv2d_backward_data(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); 
cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1082,7 +1082,7 @@ void cinn_call_cudnn_conv2d_backward_filter(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1236,7 +1236,7 @@ void cinn_call_cudnn_pool2d_forward(void *v_args, num_args, 2, ::common::errors::InvalidArgument( - "Expected number of argruments is 2, but recived %d.", num_args)); + "Expected number of argruments is 2, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1334,7 +1334,7 @@ void cinn_call_cudnn_pool2d_backward(void *v_args, num_args, 4, ::common::errors::InvalidArgument( - "Expected number of argruments is 4, but recived %d.", num_args)); + "Expected number of argruments is 4, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1448,7 +1448,7 @@ void cinn_call_cudnn_softmax_forward(void *v_args, num_args, 2, ::common::errors::InvalidArgument( - "Expected number of argruments is 2, but recived %d.", num_args)); + "Expected number of argruments is 2, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); @@ -1522,7 +1522,7 @@ void cinn_call_cudnn_softmax_backward(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cudnnHandle_t &handle = CudnnHandle::GetInstance().GetCudnnHandle(); CUDNN_CALL(cudnnSetStream(handle, static_cast(stream))); cinn_pod_value_t *args = static_cast(v_args); diff --git a/paddle/cinn/runtime/sycl/sycl_util.cc b/paddle/cinn/runtime/sycl/sycl_util.cc index 5c14c9ddfdeb6e..7eb7f69bf0474a 100644 --- a/paddle/cinn/runtime/sycl/sycl_util.cc +++ b/paddle/cinn/runtime/sycl/sycl_util.cc @@ -657,7 +657,7 @@ void cinn_call_cnnl_conv2d_forward(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); @@ -790,7 +790,7 @@ void cinn_call_cnnl_conv2d_backward_data(void *v_args, num_args, 3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); @@ -918,7 +918,7 @@ void cinn_call_cnnl_conv2d_backward_filter(void *v_args, num_args, 
3, ::common::errors::InvalidArgument( - "Expected number of argruments is 3, but recived %d.", num_args)); + "Expected number of argruments is 3, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); @@ -1042,7 +1042,7 @@ void cinn_call_cnnl_pool2d_forward(void *v_args, num_args, 2, ::common::errors::InvalidArgument( - "Expected number of argruments is 2, but recived %d.", num_args)); + "Expected number of argruments is 2, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); @@ -1178,7 +1178,7 @@ void cinn_call_cnnl_pool2d_backward(void *v_args, num_args, 4, ::common::errors::InvalidArgument( - "Expected number of argruments is 4, but recived %d.", num_args)); + "Expected number of argruments is 4, but received %d.", num_args)); cnnlHandle_t handle = CnnlHandle::GetInstance().GetCnnlHandle(); auto Queue = SYCLBackendAPI::Global()->get_now_queue(); CNdev device = Queue->get_device().get_native<::sycl::backend::cnrt>(); diff --git a/paddle/common/flags.h b/paddle/common/flags.h index 006f2fea5355da..3ea201fa97899c 100644 --- a/paddle/common/flags.h +++ b/paddle/common/flags.h @@ -110,7 +110,7 @@ namespace flags { /** * @brief Parse commandline flags. * - * It recieves commandline arguments passed in argc and argv from main function, + * It receives commandline arguments passed in argc and argv from main function, * argv[0] is the program name, and argv[1:] are the commandline arguments * which matching the format "--name=value" or "--name value". After parsing, * the corresponding flag value will be reset. diff --git a/paddle/common/flags_native.cc b/paddle/common/flags_native.cc index 12af71499dec2b..220401e14efec4 100644 --- a/paddle/common/flags_native.cc +++ b/paddle/common/flags_native.cc @@ -368,7 +368,7 @@ bool GetValueFromEnv(const std::string& name, std::string* value) { /** * @brief Set flags from environment variables. * - * It recieves a list of flags name, and will find the corresponding environment + * It receives a list of flags name, and will find the corresponding environment * variables named "FLAGS_name", if found, it will set the environment variable * values to the flags. If error_fatal is true, the program will exit when the * environment variable is not set or the flag is not defined, that is the same diff --git a/paddle/fluid/distributed/common/chunk_allocator.h b/paddle/fluid/distributed/common/chunk_allocator.h index aa708ffccf9c40..21c2ddddf31bc2 100644 --- a/paddle/fluid/distributed/common/chunk_allocator.h +++ b/paddle/fluid/distributed/common/chunk_allocator.h @@ -29,8 +29,8 @@ class ChunkAllocator { std::max(sizeof(void*), sizeof(T)), common::errors::InvalidArgument( "The size of Node is invalid. 
Expected sizeof(Node) == " - "max(sizeof(void*), sizeif(T)).\nBut recieved sizeof(Node) = %u " - "and max(sizeof(void*), sizeif(T)) = %u.", + "max(sizeof(void*), sizeof(T)).\nBut received sizeof(Node) = %u " + "and max(sizeof(void*), sizeof(T)) = %u.", sizeof(Node), std::max(sizeof(void*), sizeof(T)))); _chunk_size = chunk_size; diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index cb38f07dc68ea7..11998020042a9e 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -1920,7 +1920,7 @@ std::future BrpcPsClient::PushDense(const Region *regions, "Invalid dense size." "Expect the sum of current position and data number " "to be equal to or smaller than the size." - "But recieved current position = %lu, data number = " + "But received current position = %lu, data number = " "%lu, size = %lu.", pos, data_num, diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h index 3ce8ffbadfe604..dd3f3293f506ed 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -73,7 +73,7 @@ class DownpourPsClientService : public PsService { client_id, (_client->_client_id), common::errors::PreconditionNotMet( - "Wrong request client's id. Expect to match self. But recieved " + "Wrong request client's id. Expect to match self. But received " "request client's id = %lu and self = %lu.", client_id, (_client->_client_id))); diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index f3cd0c79f62fb5..ee9926f21c1e8a 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -341,7 +341,7 @@ int CtrCommonAccessor::ParseFromString(const std::string& str, float* value) { ret, 6UL, common::errors::InvalidArgument( - "Invalid return value. Expect more than 6. But recieved %d.", ret)); + "Invalid return value. Expect more than 6. But received %d.", ret)); return ret; } diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index 99e3fd4579feb4..34d563bfc8723d 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -347,7 +347,7 @@ int CtrDoubleAccessor::ParseFromString(const std::string& str, float* value) { str_len, 6UL, common::errors::InvalidArgument( - "Invalid string length. Expect more than 6. But recieved %d.", + "Invalid string length. Expect more than 6. But received %d.", str_len)); int show_index = CtrDoubleFeatureValue::ShowIndex(); int click_index = CtrDoubleFeatureValue::ClickIndex(); diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 0c5ea90895f4c5..46dde28fc9fe8c 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -394,7 +394,7 @@ int CtrDymfAccessor::ParseFromString(const std::string& str, float* value) { ret, 7UL, common::errors::InvalidArgument( - "Invalid return value. Expect more than 7. But recieved %d.", ret)); + "Invalid return value. Expect more than 7. 
But received %d.", ret)); return ret; } diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc index a12523b013b9f6..d5bbf950b7cc58 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc @@ -304,7 +304,7 @@ int SparseAccessor::ParseFromString(const std::string& str, float* value) { ret, 6UL, common::errors::InvalidArgument( - "Invalid return value. Expect more than 6. But recieved %d.", ret)); + "Invalid return value. Expect more than 6. But received %d.", ret)); return ret; } diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 1e307558976adf..7f1955079b57b2 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -340,7 +340,7 @@ Channel MakeChannel(const Channel& other) { // NOTE: ChannelReader is a wrapper for quick read channel with a buffer. It // will read a block data from channel, but user can get data one by one. So it // is important to notice that user must call operator>> until false, or call -// get_buffer_remain until false to make sure the buffered data all readed. +// get_buffer_remain until false to make sure the buffered data all read. template class ChannelReader { public: diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc index afca688c01fbcf..023832c5cb40cd 100644 --- a/paddle/fluid/framework/dist_multi_trainer.cc +++ b/paddle/fluid/framework/dist_multi_trainer.cc @@ -114,7 +114,7 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program, thread_num_, common::errors::InvalidArgument( "static_cast(pool.size()) is invalid, " - "expected %d but recieved %d.", + "expected %d but received %d.", thread_num_, static_cast(pool.size()))); for (int i = 0; i < thread_num_; ++i) { @@ -163,7 +163,7 @@ void DistMultiTrainer::Run() { thread_num_, common::errors::InvalidArgument( "static_cast(pool.size()) is invalid, " - "expected %d but recieved %d.", + "expected %d but received %d.", thread_num_, static_cast(pool.size()))); for (int i = 0; i < thread_num_; ++i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index acb71f0e6a2b57..28e9a248342f91 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -351,7 +351,7 @@ class HeterComm { len, common::errors::InvalidArgument( "Invalid size of all keys memory. Expect to be " - "equal to length %d. But recieved %d.", + "equal to length %d. But received %d.", len, all_keys_mem->size())); PADDLE_ENFORCE_GE( @@ -359,7 +359,7 @@ class HeterComm { len * value_bytes, common::errors::InvalidArgument( "Invalid size of all gradients memory. Expect to be equal to " - "length * value bytes %d. But recieved %d.", + "length * value bytes %d. 
But received %d.", len * value_bytes, all_grads_mem->size())); } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index e1469e1ead2dfe..edab7bd80287aa 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -410,7 +410,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task, ranks_vec[i]->size(), common::errors::InvalidArgument( "keys_vec[i]->size() should be equal to " - "ranks_vec[i]->size(), but recieved " + "ranks_vec[i]->size(), but received " "keys_vec[i]->size() is %d, ranks_vec[i]->size() is %d", keys_vec[i]->size(), ranks_vec[i]->size())); @@ -428,7 +428,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task, 0UL, common::errors::InvalidArgument( "ranks_vec[i]->size() should be equal to 0, " - "but recieved %d.", + "but received %d.", ranks_vec[i]->size())); for (size_t j = 0; j < keys_vec[i]->size(); ++j) { auto& key = (*keys_vec[i])[j]; @@ -469,7 +469,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task, total_keys, common::errors::InvalidArgument( "Total shard keys number should be less than or equal to total " - "keys number, but recieved %d as total shard keys number and %d " + "keys number, but received %d as total shard keys number and %d " "as total keys number.", total_shard_keys, total_keys)); @@ -1638,7 +1638,7 @@ void PSGPUWrapper::divide_to_device(std::shared_ptr gpu_task) { nullptr, common::errors::InvalidArgument( "The value of local dimension pointer should not " - "be nullptr but recieved %d at position %d.", + "be nullptr but received %d at position %d.", h_dim_ptrs[pos], pos)); d_dim_ptr[cur + k] = h_dim_ptrs[pos]; @@ -2398,7 +2398,7 @@ void PSGPUWrapper::PullSparse(const phi::Place& place, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - VLOG(0) << "Warning:: recommand use pull_gpups_sparse op instead. This " + VLOG(0) << "Warning:: recommend use pull_gpups_sparse op instead. 
This " "PullSparse is not used."; } diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 4b5d551eafc100..e3c22df825214e 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -1070,7 +1070,7 @@ void AutoMixedPrecisionPass::InsertCastOp() const { cache_kv_outputs.size(), common::errors::InvalidArgument( "Cache inputs should be the same size with cache outputs, but " - "recieved %d as inputs and %d as outputs.", + "received %d as inputs and %d as outputs.", cache_kv_inputs.size(), cache_kv_outputs.size())); for (size_t i = 0; i < cache_kv_inputs.size(); ++i) { diff --git a/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc index cbff317d4383fd..8e56f712cb27d5 100644 --- a/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/decoder_attention_xpu_fuse_pass.cc @@ -234,7 +234,7 @@ void DecoderAttentionXPUFusePass::ApplyDecoderAttentionXPUFuse( fused_op_desc.SetInput("v", {input_v->Name()}); std::unordered_map> var_quant_scales = GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); - // recored q/k/v max, qk_max, and qkv_max + // recorded q/k/v max, qk_max, and qkv_max std::vector input_max_nodes; if (var_quant_scales.find(input_q->Name()) != var_quant_scales.end() && var_quant_scales.find(input_k->Name()) != var_quant_scales.end() && diff --git a/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc index 5908974d486644..8675d5eedbda13 100644 --- a/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/qk_qkv_attention_xpu_fuse_pass.cc @@ -253,7 +253,7 @@ void QkQkvAttentionXPUFusePass::ApplyQkQkvAttentionXPUFuse( fused_op_desc.SetInput("v", {input->Name()}); std::unordered_map> var_quant_scales = GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); - // recored q/k/v max, qk_max, and qkv_max + // recorded q/k/v max, qk_max, and qkv_max std::vector input_max_nodes; if (var_quant_scales.find(input->Name()) != var_quant_scales.end() && var_quant_scales.find(qk_matmul_out->Name()) != diff --git a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu index 7d7a771a67eb82..7bd1ca9226fcd0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu @@ -360,7 +360,7 @@ int MultiheadMatmulRoformerPlugin::enqueue( PADDLE_THROW(common::errors::Fatal( "The Ernie(Bert) TensorRT Plugin should be " "complied with CUDA version >= 10.0 when running with fp16. 
" - "Please recomplie it or try to use fp32 by set " + "Please recompile it or try to use fp32 by set " "config.SetTRTDynamicShapeInfo(min_input_shape, " "max_input_shape, opt_input_shape, true")); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu index c1c04bdd80f636..d871bab0823a2c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu @@ -537,7 +537,7 @@ int PrelnResidualBiasPluginDynamic::enqueue( PADDLE_THROW(common::errors::Fatal( "The Ernie(Bert) tensorRT plugin should be " "complied with CUDA version >= 10.0 when running with fp16. " - "Please recomplie it or try to use fp32 by set " + "Please recompile it or try to use fp32 by set " "config.SetTRTDynamicShapeInfo(min_input_shape, " "max_input_shape, opt_input_shape, true")); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 6cba98fb7dd725..f614ca12d046c3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -533,7 +533,7 @@ int QkvToContextPluginDynamic::enqueue( PADDLE_THROW(common::errors::Fatal( "The Ernie(Bert) TensorRT Plugin should be " "complied with CUDA version >= 10.0 when running with fp16. " - "Please recomplie it or try to use fp32 by set " + "Please recompile it or try to use fp32 by set " "config.SetTRTDynamicShapeInfo(min_input_shape, " "max_input_shape, opt_input_shape, true")); #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 1df5f2d05eef16..39a5a76f8bcaa0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -59,7 +59,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { common::errors::InvalidArgument( "For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS" "), Y must be scalar, the size of Y should be 1. " - "But reveived the size of Y = %s.", + "But received the size of Y = %s.", ctx->GetInputDim("Y").size())); PADDLE_ENFORCE_EQ( ctx->GetInputDim("Y")[0], @@ -67,7 +67,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { common::errors::InvalidArgument( "For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS" "), Y must be scalar, the first dimension of Y should be 1. " - "But reveived the first dimension of Y = %s.", + "But received the first dimension of Y = %s.", ctx->GetInputDim("Y")[0])); } else if (ctx->GetInputsVarType("X").front() != framework::proto::VarType::DENSE_TENSOR) { diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index eba9c8ca7c2836..97afddf8e10122 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -275,7 +275,7 @@ phi::KernelKey GetStridedSliceExpectedKernelType( true, common::errors::InvalidArgument( "Place of context is %s. Place of input tensor is %s. 
They " - "are should be same, but reveived different place.", + "are should be same, but received different place.", string::to_string(ctx.device_context().GetPlace()), string::to_string(tensor.place()))); } diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 80bc394fa62492..c00a0141a5ba70 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -6055,7 +6055,7 @@ void MaskedMultiheadAttentionInferMeta(const MetaTensor& x, 0, errors::InvalidArgument( "The num_head of query must be divisible by the num_head of key, but " - "recived num_head of query is %d, and the num_head of key is %d", + "received num_head of query is %d, and the num_head of key is %d", num_head, k_num_head)); PADDLE_ENFORCE_EQ( diff --git a/paddle/phi/kernels/funcs/blas/blas.cc b/paddle/phi/kernels/funcs/blas/blas.cc index ef1d2f2f591955..6117a01a3bb9bc 100644 --- a/paddle/phi/kernels/funcs/blas/blas.cc +++ b/paddle/phi/kernels/funcs/blas/blas.cc @@ -23,7 +23,7 @@ MatDescriptor CreateMatrixDescriptor(const DDim &tensor_dim, tensor_dim.size(), 1, common::errors::InvalidArgument("The tensor dim size should be greater " - "than 1, but reveived dim size is %d", + "than 1, but received dim size is %d", tensor_dim.size())); MatDescriptor retv; if (num_flatten_cols > 1) { diff --git a/paddle/phi/kernels/gpu/flash_attn_utils.h b/paddle/phi/kernels/gpu/flash_attn_utils.h index 42cd09c21e2ddf..d03225f4f290c8 100644 --- a/paddle/phi/kernels/gpu/flash_attn_utils.h +++ b/paddle/phi/kernels/gpu/flash_attn_utils.h @@ -94,14 +94,14 @@ static std::vector GetAttnSparseMaskDims( dtype, DataType::INT32, common::errors::InvalidArgument("dtype of startend_row_indices must be " - "int32, but recieved %d", + "int32, but received %d", dtype)); PADDLE_ENFORCE_GE( rank, 4, common::errors::InvalidArgument( "The number of dimensions of startend_row_indices is expected to " - "be greater or equal to 4, but recieved %d. The shape of " + "be greater or equal to 4, but received %d. 
The shape of " "startend_row_indices is [%s]", rank, origin_dims)); @@ -110,7 +110,7 @@ static std::vector GetAttnSparseMaskDims( common::errors::InvalidArgument( "The sparse_mask_dims[%d] of " "attn_mask_start_row_indices is expected to be " - "equal to %d, but recieved %d.", + "equal to %d, but received %d.", rank - 2, max_seqlen_q, origin_dims[2])); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7090df20d6a5e4..fdddf2c040f583 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1476,7 +1476,7 @@ function collect_failed_tests() { done } -# getting qucik disable ut list +# getting quick disable ut list function get_quickly_disable_ut() { python -m pip install httpx if disable_ut_quickly=$(python ${PADDLE_ROOT}/tools/get_quick_disable_lt.py); then diff --git a/python/paddle/decomposition/recompute.py b/python/paddle/decomposition/recompute.py index 1fea2497284754..effd0882000092 100644 --- a/python/paddle/decomposition/recompute.py +++ b/python/paddle/decomposition/recompute.py @@ -226,7 +226,7 @@ def _get_consumer_ops(op): self.result_value_set.add(result) return consumers - def _get_producer_ops_recursivly(root): + def _get_producer_ops_recursively(root): visited = set() queue = deque() queue.append(root) @@ -240,7 +240,7 @@ def _get_producer_ops_recursivly(root): visited.add(new_op) queue.append(new_op) - def _get_consumer_ops_recursivly(root): + def _get_consumer_ops_recursively(root): visited = set() queue = deque() queue.append(root) @@ -256,8 +256,8 @@ def _get_consumer_ops_recursivly(root): for op in self.ops: if op.name() in self.unrecomputable_ops: - _get_producer_ops_recursivly(op) - _get_consumer_ops_recursivly(op) + _get_producer_ops_recursively(op) + _get_consumer_ops_recursively(op) def _has_unfusible_op_on_any_path(self, op1, op2): no_unfusible_op_on_path = ( diff --git a/python/paddle/distributed/auto_parallel/static/helper.py b/python/paddle/distributed/auto_parallel/static/helper.py index f540d5cd319380..46b8d52a9dc798 100644 --- a/python/paddle/distributed/auto_parallel/static/helper.py +++ b/python/paddle/distributed/auto_parallel/static/helper.py @@ -410,7 +410,7 @@ def init_pir(self, main_program, place): if param is None: continue if param.name not in dy_param_name_to_pir_param_name: - # Release the reduntant params + # Release the redundant params param.get_tensor()._clear() continue if not param._is_initialized(): @@ -493,7 +493,7 @@ def init(self, main_program, place, dist_context): if param is None: continue if param.name not in main_program.global_block().vars: - # Release the reduntant params + # Release the redundant params param.get_tensor()._clear() continue if not param._is_initialized(): diff --git a/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py b/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py index cdb87b3b70477c..8fedb9f8a0c287 100644 --- a/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py +++ b/python/paddle/distributed/auto_parallel/static/reshard_funcs/sub_to_global_mesh_func.py @@ -73,7 +73,7 @@ def reshard(self, src_dist_attr, dst_dist_attr, src_value, dst_type): src_mesh, [src_dist_attr], [src_dist_attr], chunk_id ) else: - # create the buffer on other ranks for receving the data + # create the buffer on other ranks for receiving the data tmp_value = paddle.zeros(dst_type.shape, dst_type.dtype) op = tmp_value.get_defining_op() mesh = 
paddle.distributed.ProcessMesh(other_ranks) diff --git a/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py index 22da88364d3691..2bf40ecc7c97c3 100644 --- a/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py +++ b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py @@ -2695,7 +2695,7 @@ def run_or_quit(self): # Quit if just tune if not self._is_run: self._logger.info( - "The process will be quitted when just tune not run." + "The process will be quit when just tune not run." ) sys.exit() diff --git a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py index 662552119a21ca..1fd43cd5b602b0 100644 --- a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py +++ b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py @@ -326,7 +326,7 @@ def add_extra_synchronization( if params_filter_fn(param): params_to_sync.append(param) logger.info( - "The following param are going to be synchronization everytime the optimizer update phase of the program is runned: " + "The following param are going to be synchronization everytime the optimizer update phase of the program is run: " ) logger.info([p.name for p in params_to_sync]) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 2f8567fb0e14d5..ff90fa0b80a90a 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1089,7 +1089,7 @@ def fill_constant( if out.dtype != dtype: raise TypeError( - "Required out.dtype == dtype if specifying out, but recevied f{out.dtype} != f{dtype}" + "Required out.dtype == dtype if specifying out, but received f{out.dtype} != f{dtype}" ) out = _C_ops.full_(out, shape, value, dtype, place) out.stop_gradient = True diff --git a/test/ir/pir/cinn/utils.py b/test/ir/pir/cinn/utils.py index 62642af979522d..ca1ff888e56490 100644 --- a/test/ir/pir/cinn/utils.py +++ b/test/ir/pir/cinn/utils.py @@ -77,7 +77,7 @@ def check_jit_kernel_number(static_fn, expected_number): def get_jit_kernel_structure_helper(block, map_info, if_op_idx='_0'): """ - Recursivly generate JIT_KERNEL map_info for Static/Dynmaic Shape UT. + Recursively generate JIT_KERNEL map_info for Static/Dynamic Shape UT. 
""" if_count = 0 for op in block.ops: From 37e33e5b6f499159d328762c2faf378b059fd216 Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 8 Jan 2025 17:18:52 +0800 Subject: [PATCH 29/57] Fix (#70679) --- .../tensorrt/plugin/c_allreduce_op_plugin.cu | 2 +- .../tensorrt/plugin/custom_generic_plugin.cu | 14 +++++------ .../plugin/deformable_conv_op_plugin.cu | 2 +- .../tensorrt/plugin/elementwise_op_plugin.cu | 2 +- .../elementwiseadd_transpose_op_plugin.cu | 2 +- .../plugin/fused_token_prune_op_plugin.cu | 2 +- .../tensorrt/plugin/gelu_op_plugin.cu | 2 +- .../tensorrt/plugin/group_norm_op_plugin.cu | 2 +- .../tensorrt/plugin/hard_swish_op_plugin.cu | 2 +- .../tensorrt/plugin/layer_norm_op_plugin.cu | 2 +- .../plugin/layernorm_shift_partition_op.cu | 2 +- .../plugin/merge_layernorm_op_plugin.cu | 2 +- .../tensorrt/plugin/mish_op_plugin.cu | 2 +- .../multihead_matmul_roformer_plugin.cu | 2 +- .../tensorrt/plugin/pool3d_op_plugin.cu | 2 +- .../tensorrt/plugin/pool_op_plugin.cu | 2 +- .../plugin/preln_groupnorm_act_op_plugin.cu | 2 +- .../plugin/preln_residual_bias_plugin.cu | 2 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 2 +- .../tensorrt/plugin/reverse_roll_op_plugin.cu | 2 +- .../plugin/skip_groupnorm_act_op_plugin.cu | 2 +- .../plugin/skip_merge_layernorm_op_plugin.cu | 2 +- .../tensorrt/plugin/swish_op_plugin.cu | 2 +- .../plugin/trans_layernorm_op_plugin.cu | 2 +- ...transformer_input_output_convert_plugin.cu | 2 +- .../common_subexpression_elimination_pass.cc | 6 ++--- .../general/transfer_layout_pass.cc | 24 +++++++++---------- 27 files changed, 46 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu index 0cab9341b09495..d0627ecf950dae 100644 --- a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu @@ -94,7 +94,7 @@ bool CAllReducePluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of CAllReduce plugin shoule not be nullptr.")); + "The input of CAllReduce plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu index d6d76c6b9618ea..73a4462bdef519 100644 --- a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu @@ -48,7 +48,7 @@ void validate(const std::string& op_type, PADDLE_ENFORCE_GE(supports_dtypes.count(datatype), 0, common::errors::InvalidArgument( - "custorm op [%s] has unsupported datatype: [%s], " + "custom op [%s] has unsupported datatype: [%s], " "now only support: [float32, float16, int8, int32].", op_type, datatype)); @@ -56,7 +56,7 @@ void validate(const std::string& op_type, supports_tensor_formats.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s] has unsupported tensor format: [%s], " + "custom op [%s] has unsupported tensor format: [%s], " "now only support: [LINEAR, CHW32, CHW2, HWC8, CHW4, DHWC8(TensorRT " "7.2 and after), HWC16(TensorRT 8.0 and after)].", op_type, @@ -68,7 +68,7 @@ void validate(const std::string& op_type, supports_formats_tmp.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s]: float32 only supports [LINEAR, CHW32], " + "custom op [%s]: float32 only supports [LINEAR, CHW32], " "but got tensor format: [%s], 
", op_type, tensor_format)); @@ -85,7 +85,7 @@ void validate(const std::string& op_type, PADDLE_ENFORCE_GE(supports_formats_tmp.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s]: float16 only supports [LINEAR, " + "custom op [%s]: float16 only supports [LINEAR, " "CHW2, HWC8, CHW4, DHWC8(TensorRT 7.2 and after), " "HWC16(TensorRT 8.0 and after)], " "but got tensor format: [%s], ", @@ -99,7 +99,7 @@ void validate(const std::string& op_type, supports_formats_tmp.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s]: int8 only supports [LINEAR, CHW32, CHW4], " + "custom op [%s]: int8 only supports [LINEAR, CHW32, CHW4], " "but got tensor format: [%s], ", op_type, tensor_format)); @@ -109,7 +109,7 @@ void validate(const std::string& op_type, PADDLE_ENFORCE_GE(supports_formats_tmp.count(tensor_format), 0, common::errors::InvalidArgument( - "custorm op [%s]: int32 only supports [LINEAR], " + "custom op [%s]: int32 only supports [LINEAR], " "but got tensor format: [%s], ", op_type, tensor_format)); @@ -320,7 +320,7 @@ bool CustomGenericPlugin::supportsFormatCombination( "supportsFormatCombination config!" "Please use SetTrtSupportsFormatConfig to set.", op_desc_.Type().c_str())); - // generate support format combaination function by config + // generate support format combination function by config size_t input_num = OpMetaInfoHelper::GetInputs(op_info).size(); size_t output_num = OpMetaInfoHelper::GetOutputs(op_info).size(); std::vector>> diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu index df6290fc3ae5f4..1f787c259b0518 100644 --- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu @@ -1172,7 +1172,7 @@ bool DeformableConvPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of groupnorm plugin shoule not be nullptr.")); + "The input of groupnorm plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 3c4c9df2f16f08..82d003cfba293a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -241,7 +241,7 @@ bool ElementwisePluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu index 855c80e18d88f6..aa89ffd4e222d4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu @@ -62,7 +62,7 @@ bool ElementwiseAddTransposePluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of elementwiseadd_transpose " - "plugin shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu 
b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu index 835b222943a9b8..b18a0c2d6d357c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -237,7 +237,7 @@ bool FusedTokenPrunePluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index c1b4aad6d73c06..46628128e3b0a3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -152,7 +152,7 @@ bool GeluPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu index e50be737719945..589ab150ae6fd8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu @@ -475,7 +475,7 @@ bool GroupNormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of groupnorm plugin shoule not be nullptr.")); + "The input of groupnorm plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu index fb328277ab86a4..682929e9d64fb3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu @@ -142,7 +142,7 @@ bool HardSwishPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 2ebce801564457..ebc539e32718fd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -210,7 +210,7 @@ bool LayerNormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of layernorm plugin shoule not be nullptr.")); + "The input of layernorm plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu b/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu index cd5e1ad9032f8e..1190d9d0d08413 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu @@ -554,7 +554,7 @@ bool LayernormShiftPartitionPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of LayernormShiftPartition " - "plugin 
shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/merge_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/merge_layernorm_op_plugin.cu index 2e228ed3d69744..5972f5c05964b9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/merge_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/merge_layernorm_op_plugin.cu @@ -214,7 +214,7 @@ bool MergeLayernormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of MergeLayernorm " - "plugin shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu index 3263880b883b01..a25f218b0feee7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu @@ -180,7 +180,7 @@ bool MishPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of mish plugin shoule not be nullptr.")); + "The input of mish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu index 7bd1ca9226fcd0..8fcf3f520de015 100644 --- a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu @@ -74,7 +74,7 @@ bool MultiheadMatmulRoformerPlugin::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index f80556567431b7..eefc0b2f9e8547 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -329,7 +329,7 @@ bool Pool3DPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index bda2ebcaf853a4..e81114c6f2d7ea 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -260,7 +260,7 @@ bool PoolPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu index ab99587dfec1b7..7da3bdeae03d94 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu @@ -40,7 +40,7 @@ bool 
PrelnGroupnormActPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of prelnGroupnormAct plugin shoule not be nullptr.")); + "The input of prelnGroupnormAct plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu index d871bab0823a2c..6e3334ef5ff3d4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu @@ -291,7 +291,7 @@ bool PrelnResidualBiasPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index f614ca12d046c3..3d443eba031a02 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -236,7 +236,7 @@ bool QkvToContextPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.cu index 6322fa29606864..0fa40fd08e1a99 100644 --- a/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.cu @@ -143,7 +143,7 @@ bool ReverseRollPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of ReverseRoll " - "plugin shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu index 20d13c1c6f8c7d..85ad7d808cccc2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu @@ -40,7 +40,7 @@ bool SkipGroupnormActPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of SkipGroupnormAct plugin shoule not be nullptr.")); + "The input of SkipGroupnormAct plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_merge_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_merge_layernorm_op_plugin.cu index 5171f3ae0475ec..658b9eceb492ea 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_merge_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_merge_layernorm_op_plugin.cu @@ -227,7 +227,7 @@ bool SkipMergeLayernormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument("The input of MergeLayernorm " - "plugin shoule not be nullptr.")); + "plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git 
a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index 9a485b4d1d7c69..e4702b0032c69e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -165,7 +165,7 @@ bool SwishPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of swish plugin shoule not be nullptr.")); + "The input of swish plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu index 30787d118b5414..459998020b62fd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu @@ -206,7 +206,7 @@ bool TransLayerNormPluginDynamic::supportsFormatCombination( PADDLE_ENFORCE_NOT_NULL( in_out, common::errors::InvalidArgument( - "The input of layernorm plugin shoule not be nullptr.")); + "The input of layernorm plugin should not be nullptr.")); PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu index 9e5ff08411cbca..b601b3fd9e3d1d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu @@ -222,7 +222,7 @@ int TransformerInputConvertPlugin::enqueue( B, MaxLength, vector_length / - num_threads); // batches, max sequnce length, input0.dims.d[2]/* + num_threads); // batches, max sequence length, input0.dims.d[2]/* remove_padding_kernel<<>>( input0, output2, output0); // input(no_varlen), pos_id, input(varlen) return cudaGetLastError() != cudaSuccess; diff --git a/paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.cc b/paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.cc index 682109b5784640..52cafa8793a300 100644 --- a/paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.cc +++ b/paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.cc @@ -452,9 +452,9 @@ struct ExpressionEqual { struct ExpressionTable { public: ExpressionTable() = default; - void RegisiterExpression(Expression expr) { + void RegisterExpression(Expression expr) { auto op_info = expr.CalcOpInfo(); - VLOG(7) << "[RegisiterExpression] op " << expr.op()->name() << " [" + VLOG(7) << "[RegisterExpression] op " << expr.op()->name() << " [" << expr.op() << "]" << "\n hash: " << op_info.first << "\n can_be_safe_to_replace: " << std::boolalpha @@ -506,7 +506,7 @@ struct CSEAnalyzer { // Handle the operation auto expr = expression_table->CreateExpression(op); - expression_table->RegisiterExpression(expr); + expression_table->RegisterExpression(expr); auto maybe_same_expression = expression_table->Lookup(expr); if (expr.CanBeSafeToReplace()) { if (!maybe_same_expression.has_value()) { diff --git a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc index 607b48ac4d55ab..780809ca4b410f 100644 --- a/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc +++ b/paddle/fluid/pir/transforms/general/transfer_layout_pass.cc @@ -212,17 +212,17 @@ struct FlowGraph { Node 
op_node(&op); auto layout_transform_iface = op.dyn_cast(); - const auto& relevate_inputs = + const auto& relevant_inputs = layout_transform_iface ? layout_transform_iface.RelevantInputs(&op) : op.operands_source(); - const auto& relevate_outputs = + const auto& relevant_outputs = layout_transform_iface ? layout_transform_iface.RelevantOutputs(&op) : op.results(); - VLOG(10) << "[BuildGraph]" << op_node << " isz:" << relevate_inputs.size() - << " osz:" << relevate_outputs.size(); + VLOG(10) << "[BuildGraph]" << op_node << " isz:" << relevant_inputs.size() + << " osz:" << relevant_outputs.size(); // add in edge - for (auto& operand : relevate_inputs) { + for (auto& operand : relevant_inputs) { Node operand_node(operand); // the capacity should be set as the out_degree of operand node float weight = 1.0f; @@ -235,7 +235,7 @@ struct FlowGraph { AddEdge(operand_node, op_node, weight, 0.0f, true); } - for (const auto& op_result : relevate_outputs) { + for (const auto& op_result : relevant_outputs) { // we have ssa, so the output must not be processed Node op_result_node(op_result); @@ -275,19 +275,19 @@ struct FlowGraph { auto layout_transform_iface = op.dyn_cast(); - const auto& relevate_inputs = + const auto& relevant_inputs = layout_transform_iface ? layout_transform_iface.RelevantInputs(&op) : op.operands_source(); - const auto& relevate_outputs = + const auto& relevant_outputs = layout_transform_iface ? layout_transform_iface.RelevantOutputs(&op) : op.results(); - for (const auto& op_operand : relevate_inputs) { + for (const auto& op_operand : relevant_inputs) { Node operand_node(op_operand); AddEdge(src_node(), operand_node, THRESHOLD); } - for (const auto& op_result : relevate_outputs) { + for (const auto& op_result : relevant_outputs) { Node op_result_node(op_result); AddEdge(src_node(), op_result_node, THRESHOLD); } @@ -328,11 +328,11 @@ struct FlowGraph { for (auto& op : *(program.block())) { auto layout_transform_iface = op.dyn_cast(); - const auto& relevate_outputs = + const auto& relevant_outputs = layout_transform_iface ? 
layout_transform_iface.RelevantOutputs(&op) : op.results(); - for (const auto& op_result : relevate_outputs) { + for (const auto& op_result : relevant_outputs) { Node op_result_node(op_result); for (auto it = op_result.use_begin(); it != op_result.use_end(); ++it) { auto user_op = it->owner(); From ebc5239a61c17a29807b38e33a994926629943c7 Mon Sep 17 00:00:00 2001 From: zhanghonggeng <43205915+zhanghonggeng@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:19:19 +0800 Subject: [PATCH 30/57] Add matmul_add_act_fuse_pass in inference process (#70663) --- paddle/fluid/inference/api/analysis_predictor.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a3b8a881400a4b..610d1019126cc4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -908,11 +908,13 @@ void AnalysisPredictor::OptimizeInferencePirProgram() { }); // Infer symbol shape for all ops before fused pass fused_op_pm.AddPass(pir::CreateShapeOptimizationPass()); - const std::vector FusedOpPasses{// Operator fusion pass - "map_op_to_another_pass", - "conv2d_bn_fuse_pass", - "conv2d_add_act_fuse_pass", - "conv2d_add_fuse_pass"}; + const std::vector FusedOpPasses{ + // Operator fusion pass + "map_op_to_another_pass", + "conv2d_bn_fuse_pass", + "conv2d_add_act_fuse_pass", + "conv2d_add_fuse_pass", + "matmul_add_act_fuse_pass"}; for (const auto &fused_op : FusedOpPasses) { fused_op_pm.AddPass(pir::PassRegistry::Instance().Get(fused_op)); From 52ebe47398b21b1327f2b9fb0bfe8ba53c0848af Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Wed, 8 Jan 2025 19:15:56 +0800 Subject: [PATCH 31/57] del autosimplify (#70691) --- paddle/cinn/ir/group_schedule/config/group_tile_util.cc | 4 ++-- paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc | 4 ++-- paddle/cinn/ir/ir_analyzer/ir_analyzer.cc | 2 +- paddle/cinn/optim/replace_cross_block_reduction.cc | 2 +- paddle/cinn/optim/replace_mod_to_max.cc | 2 +- paddle/cinn/optim/resize_buffer.cc | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/cinn/ir/group_schedule/config/group_tile_util.cc b/paddle/cinn/ir/group_schedule/config/group_tile_util.cc index 933cb3a6477565..30ab52b8bb65a7 100644 --- a/paddle/cinn/ir/group_schedule/config/group_tile_util.cc +++ b/paddle/cinn/ir/group_schedule/config/group_tile_util.cc @@ -58,7 +58,7 @@ std::vector GetVarStrides(ir::Expr load_offset, ir::Expr expr = ir::ir_utils::IRCopy(load_offset); replacer.inspecting_var = var; replacer.IRMutator::Visit(&expr, &expr); - ir::Expr res = common::AutoSimplify(expr); + ir::Expr res = optim::ArithSimplify(expr); if (res.is_constant()) { return res.as_int64(); } @@ -90,7 +90,7 @@ ir::Expr GetLargestLoad(const std::vector& exprs) { for (size_t i = 1; i < tensor->shape.size(); i++) { size = size * tensor->shape[i]; } - return common::AutoSimplify(size); + return optim::ArithSimplify(size); }; ir::Expr res = exprs[0]; diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc index 758464d5d21857..fb0b2cadc6f034 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc @@ -185,8 +185,8 @@ SymbolicPredicate DynamicShapeGroupScheduler::MakeBucketPredicate( } } - sp_extent = common::AutoSimplify(sp_extent); - rd_extent = 
common::AutoSimplify(rd_extent); + sp_extent = optim::ArithSimplify(sp_extent); + rd_extent = optim::ArithSimplify(rd_extent); return {sp_extent, rd_extent}; }(); diff --git a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc index e7e258d6d5a7ca..5b73b0196e7b28 100644 --- a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc +++ b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc @@ -464,7 +464,7 @@ std::vector GetIterValuesOfAccess(ir::Expr load_or_store, for (ir::Expr index : indices) { ir::Expr index_value = ReplaceVarWithExpr( index, s_block->iter_vars, s_block_realize->iter_values); - iter_values.push_back(common::AutoSimplify(index_value)); + iter_values.push_back(optim::ArithSimplify(index_value)); } return iter_values; } diff --git a/paddle/cinn/optim/replace_cross_block_reduction.cc b/paddle/cinn/optim/replace_cross_block_reduction.cc index 452697fd372e3d..5f597e1ef26f1f 100644 --- a/paddle/cinn/optim/replace_cross_block_reduction.cc +++ b/paddle/cinn/optim/replace_cross_block_reduction.cc @@ -30,7 +30,7 @@ namespace { ir::Expr CalcBufferSizeInBytes(const ir::Buffer& buffer) { const ir::Expr numel = buffer->SymbolicNumel(); - return common::AutoSimplify(numel * buffer->dtype.bytes()); + return optim::ArithSimplify(numel * buffer->dtype.bytes()); } std::unordered_set GetReduceVarNames( diff --git a/paddle/cinn/optim/replace_mod_to_max.cc b/paddle/cinn/optim/replace_mod_to_max.cc index 2b723f43638976..f55f8aa68c4e41 100644 --- a/paddle/cinn/optim/replace_mod_to_max.cc +++ b/paddle/cinn/optim/replace_mod_to_max.cc @@ -37,7 +37,7 @@ class ReplaceModToMaxMutator : public ir::IRMutator<> { ir::Mod* node = expr->As(); Expr base = ir::Sub::Make(node->operand(1), Expr(1)); Expr min_expr = ir::Min::Make(node->operand(0), base); - *expr = cinn::common::AutoSimplify(min_expr); + *expr = cinn::optim::ArithSimplify(min_expr); ir::IRMutator<>::Visit(expr, expr); } }; diff --git a/paddle/cinn/optim/resize_buffer.cc b/paddle/cinn/optim/resize_buffer.cc index ab91648f2f96ef..2a09d2f5f841f6 100644 --- a/paddle/cinn/optim/resize_buffer.cc +++ b/paddle/cinn/optim/resize_buffer.cc @@ -184,7 +184,7 @@ class AnalyzeLoopVarRange : public ir::IRMutator<> { 0, ::common::errors::PreconditionNotMet( "Cannot find the extent of var %s", var_name)); - size = common::AutoSimplify(size * var_name_to_extent_.at(var_name)); + size = optim::ArithSimplify(size * var_name_to_extent_.at(var_name)); } return size; @@ -215,7 +215,7 @@ class AnalyzeLoopVarRange : public ir::IRMutator<> { } } ir::Expr tmp = ir::Add::Make(copy, ir::Expr(1)); - ir::Expr simplified = common::AutoSimplify(tmp); + ir::Expr simplified = optim::ArithSimplify(tmp); if (simplified.As()) { ir::Expr lhs = simplified.As()->a(); ir::Expr rhs = simplified.As()->b(); From 9b06852d4a6057ef6fc7f7a4235905e7b8ddafc7 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Wed, 8 Jan 2025 20:48:41 +0800 Subject: [PATCH 32/57] [CodeStyle][Typos][C-71] Fix typos(`creater`,`Creater`) (#70684) * fix * Update _typos.toml --------- Co-authored-by: Nyakku Shigure --- _typos.toml | 6 ++- .../transforms/check_infer_symbolic_util.cc | 4 +- .../transforms/check_infer_symbolic_util.h | 4 +- paddle/cinn/ir/schedule/factorize_reduction.h | 16 +++--- paddle/cinn/ir/schedule/impl/reduction.cc | 18 +++---- paddle/cinn/ir/schedule/ir_schedule_util.h | 4 +- .../fused_multi_transformer_decoder_pass.cc | 12 ++--- .../fused_multi_transformer_encoder_pass.cc | 16 +++--- .../ir/multihead_matmul_fuse_pass.cc | 12 ++--- .../ir/multihead_matmul_roformer_fuse_pass.cc | 4 +- 
.../trt_cross_multihead_matmul_fuse_pass.cc | 4 +- .../trt_flash_multihead_matmul_fuse_pass.cc | 4 +- .../ir/trt_multihead_matmul_fuse_pass.cc | 12 ++--- .../ir/trt_qk_multihead_matmul_fuse_pass.cc | 4 +- .../fluid/inference/api/analysis_predictor.cc | 6 +-- .../convert/flash_multihead_matmul_op.cc | 2 +- .../generic_and_custom_plugin_creater.cc | 13 ++--- .../inference/tensorrt/convert/op_converter.h | 53 ++++++++++--------- paddle/fluid/inference/tensorrt/op_teller.cc | 8 +-- paddle/fluid/inference/tensorrt/op_teller.h | 6 +-- .../tensorrt/plugin/reverse_roll_op_plugin.h | 4 +- .../plugin/test_fused_token_prune_plugin.cc | 2 +- .../tensorrt/plugin/test_split_plugin.cc | 2 +- test/dygraph_to_static/test_cycle_gan.py | 6 +-- 24 files changed, 114 insertions(+), 108 deletions(-) diff --git a/_typos.toml b/_typos.toml index 81230a2f09629c..cfd08daf4c29e2 100644 --- a/_typos.toml +++ b/_typos.toml @@ -7,6 +7,10 @@ extend-exclude = [ # Skip `intermidiate` check in these files "test/cpp/eager/task_tests/CMakeLists.txt", "test/cpp/eager/task_tests/hook_test_intermidiate.cc", + # Skip `creater` check in these files + "paddle/fluid/inference/tensorrt/convert/CMakeLists.txt", + "paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc", + "paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc", ] [default] @@ -41,8 +45,6 @@ pash = 'pash' unpacket = "unpacket" # These words need to be fixed -Creater = 'Creater' -creater = 'creater' fetchs = 'fetchs' Indexs = 'Indexs' indexs = 'indexs' diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc index 1471e041a58493..12ef2ebc4d0fe9 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.cc @@ -54,7 +54,7 @@ std::ostream& operator<<(std::ostream& stream, } DimExprs4ValueT MakeDimExprs4Value( - pir::Program* program, const PassManagerCreater& CreatePassManager) { + pir::Program* program, const PassManagerCreator& CreatePassManager) { std::shared_ptr pass_manager = CreatePassManager(); pass_manager->AddPass(pir::CreateShapeOptimizationPass()); pass_manager->Run(program); @@ -623,7 +623,7 @@ void CheckProgramDimExprConstraints( } // namespace void CheckInferSymbolicIfNeed(pir::Program* program, - const PassManagerCreater& CreatePassManager) { + const PassManagerCreator& CreatePassManager) { if (!FLAGS_prim_all || !FLAGS_check_infer_symbolic) return; const auto& GraphDimExprs4Value = MakeDimExprs4Value(program, CreatePassManager); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h index d61dd2c6d27f38..1ec72bb4180218 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/check_infer_symbolic_util.h @@ -24,10 +24,10 @@ namespace cinn { namespace dialect { namespace ir { -using PassManagerCreater = std::function()>; +using PassManagerCreator = std::function()>; void CheckInferSymbolicIfNeed(pir::Program* program, - const PassManagerCreater& CreatePassManager); + const PassManagerCreator& CreatePassManager); } // namespace ir } // namespace dialect diff --git a/paddle/cinn/ir/schedule/factorize_reduction.h b/paddle/cinn/ir/schedule/factorize_reduction.h index 7c68370d34b818..b330eaf3b1c850 100644 --- 
a/paddle/cinn/ir/schedule/factorize_reduction.h +++ b/paddle/cinn/ir/schedule/factorize_reduction.h @@ -48,9 +48,9 @@ Tensor CreateRFTensor(const Tensor& original_tensor, // Base class to create a new reduce block, // only used for FactorizeReduction schedule primitive. -class ReduceBlockCreater { +class ReduceBlockCreator { public: - ReduceBlockCreater(const Expr& original_block, + ReduceBlockCreator(const Expr& original_block, const std::vector& original_loops, const Expr& rf_loop, const Expr& original_update_stmt, @@ -245,9 +245,9 @@ class LoadReplacer : public ir::IRMutator<> { // Implement class for building Reduction-Factorized block, // only used for FactorizeReduction schedule primitive. -class RFBlockCreater : public ReduceBlockCreater { +class RFBlockCreator : public ReduceBlockCreator { public: - RFBlockCreater(const Expr& original_block, + RFBlockCreator(const Expr& original_block, const std::vector& original_loops, const Expr& rf_loop, const Expr& original_update_stmt, @@ -255,7 +255,7 @@ class RFBlockCreater : public ReduceBlockCreater { const std::map& var2loops, const Expr& bound_check, int rf_axis) - : ReduceBlockCreater(original_block, + : ReduceBlockCreator(original_block, original_loops, rf_loop, original_update_stmt, @@ -391,16 +391,16 @@ class RFBlockCreater : public ReduceBlockCreater { // Implement class for building Writing-Back block, // only used for FactorizeReduction schedule primitive. -class RBBlockCreater : public ReduceBlockCreater { +class RBBlockCreator : public ReduceBlockCreator { public: - RBBlockCreater(const Expr& original_block, + RBBlockCreator(const Expr& original_block, const std::vector& original_loops, const Expr& rf_loop, const Expr& original_update_stmt, const ir::Tensor& rf_tensor, const std::vector& rf_tensor_access_indices, const Var& rf_block_rf_iter_var) - : ReduceBlockCreater(original_block, + : ReduceBlockCreator(original_block, original_loops, rf_loop, original_update_stmt, diff --git a/paddle/cinn/ir/schedule/impl/reduction.cc b/paddle/cinn/ir/schedule/impl/reduction.cc index e9df0c7520fa49..0b517264e9707f 100644 --- a/paddle/cinn/ir/schedule/impl/reduction.cc +++ b/paddle/cinn/ir/schedule/impl/reduction.cc @@ -45,7 +45,7 @@ Expr DyScheduleImpl::Rfactor(const Expr& rf_loop, int rf_axis) { // get root ScheduleBlockRealize Expr root = GetRootBlock(rf_loop); // create all stmts after rfactor transformation - RfCreater rf_create(root, rf_loop, rf_axis); + RfCreator rf_create(root, rf_loop, rf_axis); // return new created rfactor tensor return rf_create.CreateRfAllStmts(); CINN_IR_SCHEDULE_END(this->err_msg_level_); @@ -121,7 +121,7 @@ Expr DyScheduleImpl::FactorizeReduction(const Expr& rf_loop, // Create new blocks and loops. 
Tensor rf_tensor = CreateRFTensor(original_tensor, rf_loop, rf_axis); - RFBlockCreater rf_block_creater(original_block, + RFBlockCreator rf_block_creator(original_block, original_loops, rf_loop, original_update_stmt, @@ -129,18 +129,18 @@ Expr DyScheduleImpl::FactorizeReduction(const Expr& rf_loop, var2loops, Expr(false), rf_axis); - rf_block_creater.CreateBlock(); - RBBlockCreater wb_block_creater(original_block, + rf_block_creator.CreateBlock(); + RBBlockCreator wb_block_creator(original_block, original_loops, rf_loop, original_update_stmt, rf_tensor, - rf_block_creater.rf_tensor_access_indices_, - rf_block_creater.rf_var_); - wb_block_creater.CreateBlock(); + rf_block_creator.rf_tensor_access_indices_, + rf_block_creator.rf_var_); + wb_block_creator.CreateBlock(); - Expr rf_body = rf_block_creater.CreateLoops(); - Expr wb_body = wb_block_creater.CreateLoops( + Expr rf_body = rf_block_creator.CreateLoops(); + Expr wb_body = wb_block_creator.CreateLoops( /* with_init = */ with_write_back_block_init); Expr new_computational_body = Block::Make({rf_body, wb_body}); diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h index d0e102b0050751..6e81ab855e7f8b 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.h +++ b/paddle/cinn/ir/schedule/ir_schedule_util.h @@ -1320,9 +1320,9 @@ struct FindBlockParent : public ir::IRMutator<> { }; // The struct used to create all stmts after rfactor transformation. -struct RfCreater : public ir::IRMutator<> { +struct RfCreator : public ir::IRMutator<> { public: - RfCreater(const Expr& root, const Expr& rf_loop, const int& rf_axis) + RfCreator(const Expr& root, const Expr& rf_loop, const int& rf_axis) : root_(root), rf_loop_(rf_loop), rf_axis_(rf_axis) {} void operator()(Expr* expr) { IRMutator::Visit(expr, expr); } diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc index b0f2b78ca3db0a..2be353d224c6d9 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc @@ -1114,7 +1114,7 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, fused_multi_transformer_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -1548,7 +1548,7 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, @@ -1858,7 +1858,7 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( fused_multi_transformer_fuse_qkv_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -2277,7 +2277,7 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, @@ -2592,7 +2592,7 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( fused_multi_transformer_fuse_qkv_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* 
layer_norm_bias, @@ -3047,7 +3047,7 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc index 3c1dba76fd18c5..bc6a77d0a60e81 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc @@ -1740,7 +1740,7 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, fused_multi_transformer_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -2281,7 +2281,7 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, @@ -2576,7 +2576,7 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( fused_multi_transformer_fuse_qkv_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -3116,7 +3116,7 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( while0, while0, fused_multi_transformer_fuse_qkv_pattern) - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, @@ -3419,7 +3419,7 @@ int MultiDevicesFusedMultiTransformerEncoderPass::BuildFusion( multi_devices_fused_multi_transformer_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* c_identity, Node* layer_norm, Node* layer_norm_scale, @@ -3904,7 +3904,7 @@ int MultiDevicesFusedMultiTransformerEncoderPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, multi_devices_fused_multi_transformer_pattern) - fuse_creater(input0, + fuse_creator(input0, c_identity0, layer_norm, layer_norm_scale, @@ -4211,7 +4211,7 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( fused_multi_transformer_fuse_qkv_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* layer_norm, Node* layer_norm_scale, Node* layer_norm_bias, @@ -4787,7 +4787,7 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH( while0, while0, fused_multi_transformer_fuse_qkv_pattern); - fuse_creater(input0, + fuse_creator(input0, layer_norm, layer_norm_scale, layer_norm_bias, diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 85a3cad5446d10..244f581c6bdac5 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -49,7 +49,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -195,7 +195,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, 
transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, @@ -861,7 +861,7 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -1081,7 +1081,7 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, @@ -1312,7 +1312,7 @@ int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -1528,7 +1528,7 @@ int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, diff --git a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc index 0c9ba92e4ca6fd..1cfc046b85fb08 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc @@ -399,7 +399,7 @@ int MultiHeadMatmulRoformerFusePass::BuildFusion(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* input_cos, Node* input_sin, Node* mul0, @@ -649,7 +649,7 @@ int MultiHeadMatmulRoformerFusePass::BuildFusion(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, input_cos, input_sin, mul0, diff --git a/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc index 2bb30602dcc3de..c267956e55e73a 100644 --- a/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc @@ -270,7 +270,7 @@ int TrtCrossMultiHeadMatmulFusePass::BuildCrossFusion( name_scope); multihead_pattern(); - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* input1, Node* mul0, Node* mul1, @@ -430,7 +430,7 @@ int TrtCrossMultiHeadMatmulFusePass::BuildCrossFusion( GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, input1, mul0, mul1, diff --git a/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc index 44c19de295f22a..1eedd2fadf484c 100644 --- a/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc @@ -277,7 +277,7 @@ int TrtFlashMultiHeadMatmulFusePass::BuildFlashFusion( name_scope); multihead_pattern(); - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -444,7 +444,7 @@ int TrtFlashMultiHeadMatmulFusePass::BuildFlashFusion( GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc index 5652e54ce1fc81..8fd3882c3b3161 100644 --- a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc +++ 
b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc @@ -46,7 +46,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -192,7 +192,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, @@ -736,7 +736,7 @@ int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -1001,7 +1001,7 @@ int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, @@ -1207,7 +1207,7 @@ int TrtMultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, @@ -1423,7 +1423,7 @@ int TrtMultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, if (is_fc_params_shared) { return; } - fuse_creater(input0, + fuse_creator(input0, mul0, mul1, mul2, diff --git a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc index 1e2a17b5a6ad66..f0f83a53cb2560 100644 --- a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc @@ -259,7 +259,7 @@ int TrtQkMultiHeadMatmulFusePass::BuildQkFusion(Graph* graph, patterns::TrtQKMultiHeadMatmulPattern multihead_pattern(pattern, name_scope); multihead_pattern(); - auto fuse_creater = [&](Node* input0, + auto fuse_creator = [&](Node* input0, Node* input1, Node* mul0, Node* mul1, @@ -481,7 +481,7 @@ int TrtQkMultiHeadMatmulFusePass::BuildQkFusion(Graph* graph, GET_IR_NODE_FROM_SUBGRAPH( transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); - fuse_creater(input0, + fuse_creator(input0, input1, mul0, mul1, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 610d1019126cc4..8128c45a527255 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -3459,9 +3459,9 @@ USE_TRT_CONVERTER(preln_layernorm_shift_partition) USE_TRT_CONVERTER(merge_layernorm) USE_TRT_CONVERTER(trans_layernorm) USE_TRT_CONVERTER(skip_merge_layernorm) -USE_TRT_CONVERTER(generic_plugin_creater) -USE_TRT_CONVERTER(custom_plugin_creater) -USE_TRT_CONVERTER(custom_generic_plugin_creater) +USE_TRT_CONVERTER(generic_plugin_creator) +USE_TRT_CONVERTER(custom_plugin_creater) // typos: disable-line +USE_TRT_CONVERTER(custom_generic_plugin_creator) USE_TRT_CONVERTER(fuse_eleadd_transpose) USE_TRT_CONVERTER(tanh_shrink) USE_TRT_CONVERTER(logsigmoid) diff --git a/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc index cf0fe2884c4978..afb22dc3b5dace 100644 --- a/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/flash_multihead_matmul_op.cc @@ -218,7 +218,7 @@ class FlashMultiheadMatMulOpConverter : public OpConverter { 
("shuffle_before_multihead_matmul(Output: " + output_name + ")") .c_str()); auto creator = GetPluginRegistry()->getPluginCreator("fMHA_V2", "1"); - assert("fmha_v2 plugin creater must not be null" && creator != nullptr); + assert("fmha_v2 plugin creator must not be null" && creator != nullptr); std::vector fields{}; std::unique_ptr plugin_collection( new nvinfer1::PluginFieldCollection); diff --git a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc index 8b1c825c991016..05b61a8b46254b 100644 --- a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc +++ b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc @@ -160,7 +160,7 @@ class CustomPluginCreater : public OpConverter { } }; -class GenericPluginCreater : public OpConverter { +class GenericPluginCreator : public OpConverter { public: void operator()(const framework::proto::OpDesc &op, const framework::Scope &scope, @@ -245,7 +245,7 @@ class GenericPluginCreater : public OpConverter { } }; -class CustomGenericPluginCreater : public OpConverter { +class CustomGenericPluginCreator : public OpConverter { public: void operator()(const framework::proto::OpDesc &op, const framework::Scope &scope, @@ -334,7 +334,8 @@ class CustomGenericPluginCreater : public OpConverter { } // namespace paddle::inference::tensorrt -REGISTER_TRT_OP_CONVERTER(custom_plugin_creater, CustomPluginCreater); -REGISTER_TRT_OP_CONVERTER(generic_plugin_creater, GenericPluginCreater); -REGISTER_TRT_OP_CONVERTER(custom_generic_plugin_creater, - CustomGenericPluginCreater); +REGISTER_TRT_OP_CONVERTER(custom_plugin_creater, + CustomPluginCreater); // typos: disable-line +REGISTER_TRT_OP_CONVERTER(generic_plugin_creator, GenericPluginCreator); +REGISTER_TRT_OP_CONVERTER(custom_generic_plugin_creator, + CustomGenericPluginCreator); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index c3b0b5e15f40f9..bae972efce7775 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -145,23 +145,24 @@ class OpConverter { } break; - case OpConverterType::GenericPluginCreater: + case OpConverterType::GenericPluginCreator: LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() - << ", now use generic_plugin_creater!"; - it = Registry::Global().Lookup("generic_plugin_creater"); + << ", now use generic_plugin_creator!"; + it = Registry::Global().Lookup("generic_plugin_creator"); break; - case OpConverterType::CustomPluginCreater: + case OpConverterType::CustomPluginCreater: // typos: disable-line LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() - << ", now use custom_plugin_creater!"; - it = Registry::Global().Lookup("custom_plugin_creater"); + << ", now use custom_plugin_creater!"; // typos: disable-line + it = Registry::Global().Lookup( + "custom_plugin_creater"); // typos: disable-line break; - case OpConverterType::CustomGenericPluginCreater: + case OpConverterType::CustomGenericPluginCreator: LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() - << ", now use custom_generic_plugin_creater!"; + << ", now use custom_generic_plugin_creator!"; it = Registry::Global().Lookup( - "custom_generic_plugin_creater"); + "custom_generic_plugin_creator"); break; default: @@ -174,24 +175,24 @@ class OpConverter { common::errors::Unimplemented("no 
OpConverter for optype [%s]", op_desc.Type())); - std::string all_outpus_name = "(Outputs:"; - std::string all_inpus_name = "(Inputs:"; + std::string all_outputs_name = "(Outputs:"; + std::string all_inputs_name = "(Inputs:"; for (auto it1 : op_desc.OutputNames()) { for (auto it2 : op_desc.Output(it1)) { - all_outpus_name += it2; - all_outpus_name += ","; + all_outputs_name += it2; + all_outputs_name += ","; } } - all_outpus_name += ")"; + all_outputs_name += ")"; for (auto it1 : op_desc.InputNames()) { for (auto it2 : op_desc.Input(it1)) { - all_inpus_name += it2; - all_inpus_name += ","; + all_inputs_name += it2; + all_inputs_name += ","; } } - all_inpus_name += ")"; - VLOG(1) << op_desc.Type() << all_inpus_name << all_outpus_name + all_inputs_name += ")"; + VLOG(1) << op_desc.Type() << all_inputs_name << all_outputs_name << "are to be converted to TensorRT layer"; it->SetEngine(engine); @@ -219,8 +220,8 @@ class OpConverter { op_desc.Type())); } - auto* output_itensor = engine->GetITensor(output_name); - engine->SetTensorDynamicRange(output_itensor, out_scale); + auto* output_tensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_tensor, out_scale); VLOG(1) << "Set out scale = " << out_scale << " for tensor " << output_name << "."; } @@ -231,8 +232,8 @@ class OpConverter { float, op_desc.GetAttr("out_" + std::to_string(i) + "_threshold")); std::string output_name = op_desc.Output(op_desc.OutputNames()[i]).front(); - auto* output_itensor = engine->GetITensor(output_name); - engine->SetTensorDynamicRange(output_itensor, out_scale); + auto* output_tensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_tensor, out_scale); VLOG(1) << "Set out scale = " << out_scale << " for tensor " << output_name << "."; } @@ -246,10 +247,10 @@ class OpConverter { for (size_t i = 0; i < inputs_name.size(); i++) { if (op_desc.HasAttr(inputs_name[i])) { std::string input_tensor_name = op_desc.Input(inputs_name[i])[0]; - auto* input_itensor = engine->GetITensor(input_tensor_name); + auto* input_tensor = engine->GetITensor(input_tensor_name); float input_scale = PADDLE_GET_CONST(float, op_desc.GetAttr(inputs_name[i])); - engine->SetTensorDynamicRange(input_itensor, input_scale); + engine->SetTensorDynamicRange(input_tensor, input_scale); VLOG(1) << "Set input tensor scale = " << input_scale << " for tensor: " << input_tensor_name << "."; } @@ -257,10 +258,10 @@ class OpConverter { for (size_t i = 0; i < outputs_name.size(); i++) { if (op_desc.HasAttr(outputs_name[i])) { std::string output_tensor_name = op_desc.Output(outputs_name[i])[0]; - auto* output_itensor = engine->GetITensor(output_tensor_name); + auto* output_tensor = engine->GetITensor(output_tensor_name); float output_scale = PADDLE_GET_CONST(float, op_desc.GetAttr(outputs_name[i])); - engine->SetTensorDynamicRange(output_itensor, output_scale); + engine->SetTensorDynamicRange(output_tensor, output_scale); VLOG(1) << "Set output tensor scale = " << output_scale << " for tensor: " << output_tensor_name << "."; } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 9a21edd52d838a..0356b17d432300 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -3515,7 +3515,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, with_dynamic_shape, forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { - SetOpConverterType(node->Op(), OpConverterType::GenericPluginCreater); + 
SetOpConverterType(node->Op(), OpConverterType::GenericPluginCreator); return true; } auto& custom_plugin_teller = GetCustomPluginTeller(); @@ -3524,7 +3524,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, with_dynamic_shape, forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { - SetOpConverterType(node->Op(), OpConverterType::CustomPluginCreater); + SetOpConverterType( + node->Op(), + OpConverterType::CustomPluginCreater); // typos: disable-line return true; } auto& custom_generic_plugin_teller = GetCustomGenericPluginTeller(); @@ -3533,7 +3535,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, with_dynamic_shape, forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { - SetOpConverterType(node->Op(), OpConverterType::CustomGenericPluginCreater); + SetOpConverterType(node->Op(), OpConverterType::CustomGenericPluginCreator); return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index f955396b9ac119..63e3614e7cc2e2 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -59,9 +59,9 @@ struct Teller { enum class OpConverterType { Default = 0, - GenericPluginCreater, - CustomPluginCreater, - CustomGenericPluginCreater + GenericPluginCreator, + CustomPluginCreater, // typos: disable-line + CustomGenericPluginCreator }; /* * class OpTeller helps to tell whether a fluid diff --git a/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.h index 328b596594006b..24a9d2fe2cd5f9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/reverse_roll_op_plugin.h @@ -117,7 +117,7 @@ class ReverseRollPluginDynamic : public DynamicPluginTensorRT { bool with_fp16_; }; -class ReverseRollPluginDynamicCreater : public TensorRTPluginCreator { +class ReverseRollPluginDynamicCreator : public TensorRTPluginCreator { public: const char* getPluginName() const TRT_NOEXCEPT override { return "reverse_roll_dynamic"; @@ -130,7 +130,7 @@ class ReverseRollPluginDynamicCreater : public TensorRTPluginCreator { return new ReverseRollPluginDynamic(serial_data, serial_length); } }; -REGISTER_TRT_PLUGIN_V2(ReverseRollPluginDynamicCreater); +REGISTER_TRT_PLUGIN_V2(ReverseRollPluginDynamicCreator); } // namespace plugin } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc index aed689e8fb44cb..0ec6f3370934d8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc @@ -31,7 +31,7 @@ TEST(fused_token_prune_op_plugin, test_plugin) { plugin.serialize(buf.data()); } -TEST(fused_token_prune_op_plugin, test_plugin_creater) { +TEST(fused_token_prune_op_plugin, test_plugin_creator) { FusedTokenPrunePluginDynamicCreator creator; creator.getFieldNames(); creator.createPlugin("test", nullptr); diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc index 64e55023892c40..1c927ef6949075 100644 --- a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc @@ -50,7 +50,7 @@ TEST(split_op_plugin, test_plugin) { sp_plugin.terminate(); } -TEST(split_op_plugin, 
test_plugin_creater) { +TEST(split_op_plugin, test_plugin_creator) { SplitPluginCreator creator; creator.getFieldNames(); creator.createPlugin("test", nullptr); diff --git a/test/dygraph_to_static/test_cycle_gan.py b/test/dygraph_to_static/test_cycle_gan.py index 6272c4d91d5989..36cb7434db021c 100644 --- a/test/dygraph_to_static/test_cycle_gan.py +++ b/test/dygraph_to_static/test_cycle_gan.py @@ -480,7 +480,7 @@ def pool_image(self, image): return image -def reader_creater(): +def reader_creator(): def reader(): while True: fake_image = np.uint8( @@ -551,8 +551,8 @@ def train(args): A_pool = ImagePool() B_pool = ImagePool() - A_reader = paddle.batch(reader_creater(), args.batch_size)() - B_reader = paddle.batch(reader_creater(), args.batch_size)() + A_reader = paddle.batch(reader_creator(), args.batch_size)() + B_reader = paddle.batch(reader_creator(), args.batch_size)() cycle_gan = paddle.jit.to_static( Cycle_Gan(input_channel=data_shape[1], istrain=True) ) From ca41a7af04458c73403aae5edda976a3d0c50c99 Mon Sep 17 00:00:00 2001 From: Shi Kai Date: Wed, 8 Jan 2025 20:51:02 +0800 Subject: [PATCH 33/57] [Docathon][Add API Legend No.16] Add legend and update doc for atleast_2d -part (#70242) * Add legend and docathon in EN for atleast_2d API * Fix codestyle * Update manipulation.py; test=document_fix --- python/paddle/tensor/manipulation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index bc78d510a91ee3..5f794b2e9fd866 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -5295,6 +5295,19 @@ def atleast_2d(*inputs, name=None): """ Convert inputs to tensors and return views with at least 2 dimensions. Two or higher-dimensional inputs are preserved. + The following diagram illustrates the behavior of atleast_2d on inputs of different dimensions, covering the cases below: + + 1. A 0-dim tensor input. + 2. A 0-dim tensor and a 1-dim tensor input. + 3. A 0-dim tensor and a 3-dim tensor input. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/images/api_legend/atleast_2d.png :width: 600 :alt: legend of atleast_2d API :align: center + + In each case, the function returns the tensor (or a list of tensors) as views with at least 2 dimensions. + Args: inputs (Tensor|list(Tensor)): One or more tensors. The data type is ``float16``, ``float32``, ``float64``, ``int16``, ``int32``, ``int64``, ``int8``, ``uint8``, ``complex64``, ``complex128``, ``bfloat16`` or ``bool``. name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
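A minimal usage sketch of the documented behavior (illustrative only; it assumes nothing beyond the public paddle.atleast_2d API and mirrors the three cases listed above):

    import paddle

    x0 = paddle.to_tensor(1.0)          # 0-dim tensor
    x1 = paddle.to_tensor([1.0, 2.0])   # 1-dim tensor
    x3 = paddle.ones([2, 3, 4])         # 3-dim tensor

    print(paddle.atleast_2d(x0).shape)                   # [1, 1]
    print([t.shape for t in paddle.atleast_2d(x0, x1)])  # [[1, 1], [1, 2]]
    print([t.shape for t in paddle.atleast_2d(x0, x3)])  # [[1, 1], [2, 3, 4]]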
From 0f72fabd513cd54233fcbe8c117f436ff3496c03 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Wed, 8 Jan 2025 21:26:23 +0800 Subject: [PATCH 34/57] del autosimplify 3 (#70695) --- paddle/cinn/optim/vectorize_loops.cc | 10 ++++---- paddle/cinn/runtime/cpu/cblas.cc | 12 +++++----- paddle/cinn/runtime/cpu/onednn_math.cc | 24 +++++++++---------- .../pir/cinn/adt/merge_block_utils_test.cc | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index f4d4f005857c96..e6324d0db2e409 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -170,7 +170,7 @@ class TensorVectorizeTeller : public ir::IRMutator { Expr next_idx = ir::ir_utils::IRCopy(indices.back()); cinn::ir::ir_utils::IrReplaceVarBroadcast( &next_idx, Expr(iter_var_), Expr(i)); - auto gap = cinn::common::AutoSimplify(Expr(next_idx - first_idx)); + auto gap = cinn::optim::ArithSimplify(Expr(next_idx - first_idx)); if (!gap.As() || gap.as_int32() != i) { VLOG(5) << "Tensor:" << tensor->name << " is not accessed sequentially, next:" << next_idx @@ -781,7 +781,7 @@ struct VectorizeLoops_ : public IRMutator { true, ::common::errors::InvalidArgument( "The minimum of forloop should be zero, please check.")); - Expr for_extent = cinn::common::AutoSimplify(forloop->extent); + Expr for_extent = cinn::optim::ArithSimplify(forloop->extent); Simplify(&for_extent); node->extent = for_extent; auto *extent_min = for_extent.As(); @@ -918,7 +918,7 @@ struct VectorizeLoops_ : public IRMutator { inner_for, ::common::errors::InvalidArgument( "Inner_for is nullptr in UnrollCmpFor function.")); - Expr inner_for_extent = cinn::common::AutoSimplify(inner_for->extent); + Expr inner_for_extent = cinn::optim::ArithSimplify(inner_for->extent); Simplify(&inner_for_extent); auto *extent_min = inner_for_extent.As(); if (extent_min) { @@ -951,7 +951,7 @@ struct VectorizeLoops_ : public IRMutator { DeviceAPI::UNK, inner_for->body, inner_for->vectorize_info())}); - Expr new_extent_a = cinn::common::AutoSimplify(le_n->b() + 1); + Expr new_extent_a = cinn::optim::ArithSimplify(le_n->b() + 1); Expr out_for_a = For::Make(outer_for->loop_var, outer_for->min, new_extent_a, @@ -1021,7 +1021,7 @@ struct VectorizeLoops_ : public IRMutator { extent_int % factor == 0 ? 
extent_trunc : extent_trunc + 1; times = cinn::common::make_const(forloop->extent->type(), extent_times); } else { - times = cinn::common::AutoSimplify( + times = cinn::optim::ArithSimplify( Div::Make(forloop->extent, make_const(factor))); Simplify(×); } diff --git a/paddle/cinn/runtime/cpu/cblas.cc b/paddle/cinn/runtime/cpu/cblas.cc index 5b9ed4dbaca76b..adf6bf6fb17db0 100644 --- a/paddle/cinn/runtime/cpu/cblas.cc +++ b/paddle/cinn/runtime/cpu/cblas.cc @@ -151,8 +151,8 @@ CINN_REGISTER_HELPER(cinn_cpu_mkl) { 12UL, ::common::errors::InvalidArgument( "Wrong number of arguments passed in.")); - auto M = cinn::common::AutoSimplify(args[1]); - auto N = cinn::common::AutoSimplify(args[2]); + auto M = cinn::optim::ArithSimplify(args[1]); + auto N = cinn::optim::ArithSimplify(args[2]); std::vector shape; shape.push_back(M); shape.push_back(N); @@ -173,16 +173,16 @@ CINN_REGISTER_HELPER(cinn_cpu_mkl) { A_tensor, ::common::errors::InvalidArgument("expected type is tensor.")); - auto batch_size = cinn::common::AutoSimplify(args[1]); + auto batch_size = cinn::optim::ArithSimplify(args[1]); int32_t batch_size_val = batch_size.as_int32(); - auto M = cinn::common::AutoSimplify(args[2]); - auto N = cinn::common::AutoSimplify(args[3]); + auto M = cinn::optim::ArithSimplify(args[2]); + auto N = cinn::optim::ArithSimplify(args[3]); std::vector shape; int total = 1; for (auto& v : A_tensor->shape) { - auto val = cinn::common::AutoSimplify(v); + auto val = cinn::optim::ArithSimplify(v); PADDLE_ENFORCE_EQ( val.is_constant(), true, diff --git a/paddle/cinn/runtime/cpu/onednn_math.cc b/paddle/cinn/runtime/cpu/onednn_math.cc index 668788c1194e63..e41fc6119ee116 100644 --- a/paddle/cinn/runtime/cpu/onednn_math.cc +++ b/paddle/cinn/runtime/cpu/onednn_math.cc @@ -168,18 +168,18 @@ CINN_REGISTER_HELPER(cinn_cpu_onednn) { 16UL, ::common::errors::InvalidArgument( "Wrong number of arguments passed in.")); - auto N = cinn::common::AutoSimplify(args[0]); - int input_h = cinn::common::AutoSimplify(args[2]).as_int32(); - int input_w = cinn::common::AutoSimplify(args[3]).as_int32(); - auto c_out = cinn::common::AutoSimplify(args[4]); - int filter_h = cinn::common::AutoSimplify(args[6]).as_int32(); - int filter_w = cinn::common::AutoSimplify(args[7]).as_int32(); - int pad_h = cinn::common::AutoSimplify(args[8]).as_int32(); - int pad_w = cinn::common::AutoSimplify(args[9]).as_int32(); - int stride_h = cinn::common::AutoSimplify(args[10]).as_int32(); - int stride_w = cinn::common::AutoSimplify(args[11]).as_int32(); - int dilation_h = cinn::common::AutoSimplify(args[12]).as_int32(); - int dilation_w = cinn::common::AutoSimplify(args[13]).as_int32(); + auto N = cinn::optim::ArithSimplify(args[0]); + int input_h = cinn::optim::ArithSimplify(args[2]).as_int32(); + int input_w = cinn::optim::ArithSimplify(args[3]).as_int32(); + auto c_out = cinn::optim::ArithSimplify(args[4]); + int filter_h = cinn::optim::ArithSimplify(args[6]).as_int32(); + int filter_w = cinn::optim::ArithSimplify(args[7]).as_int32(); + int pad_h = cinn::optim::ArithSimplify(args[8]).as_int32(); + int pad_w = cinn::optim::ArithSimplify(args[9]).as_int32(); + int stride_h = cinn::optim::ArithSimplify(args[10]).as_int32(); + int stride_w = cinn::optim::ArithSimplify(args[11]).as_int32(); + int dilation_h = cinn::optim::ArithSimplify(args[12]).as_int32(); + int dilation_w = cinn::optim::ArithSimplify(args[13]).as_int32(); int out_h = (input_h - ((filter_h - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1; diff --git a/test/cpp/pir/cinn/adt/merge_block_utils_test.cc 
b/test/cpp/pir/cinn/adt/merge_block_utils_test.cc index bb5ba4beefe74c..315ab8941b4965 100644 --- a/test/cpp/pir/cinn/adt/merge_block_utils_test.cc +++ b/test/cpp/pir/cinn/adt/merge_block_utils_test.cc @@ -29,7 +29,7 @@ bool IsBlockForAllEqual(const ForTreeNode& first, const ForTreeNode& second) { const ForTreeNode& second) -> bool { const ir::Expr lhs = first.val->extent(); const ir::Expr rhs = second.val->extent(); - if (cinn::common::AutoSimplify(ir::Sub::Make(lhs, rhs)) != ir::Expr(0)) { + if (lhs != rhs) { return false; } return true; From cfb3a7a7b0dc716178c89270092bc6e6d6a87a01 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 9 Jan 2025 09:16:13 +0800 Subject: [PATCH 35/57] fix bug of convert squeeze to reshape in cinn (#70720) --- .../dialect/operator/transforms/pd_to_cinn_pass.cc | 11 ++++++++--- .../pir_graph_analyzing/shardable_axes_base.cc | 8 ++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 008ef30762ece8..537c6239fd19b2 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -1010,18 +1010,23 @@ class SqueezeOpPattern if (IsDefinedBy(op, 1) && !is_dyshape) { const FullIntArrayOp axis_full_op = CastDefinedTo(op, 1); auto axis_vec = cinn::dialect::ir::GetVectorAttr(axis_full_op, "value"); - std::set axis_set(axis_vec.begin(), axis_vec.end()); - auto in_shape = phi::vectorize(op.operand_source(0) .type() .dyn_cast() .dims()); + const std::set axis_set = [&] { + std::set axis_set; + for (int64_t axis : axis_vec) { + axis_set.insert(axis < 0 ? axis + in_shape.size() : axis); + } + return axis_set; + }(); std::vector output_shape; for (size_t i = 0; i < in_shape.size(); ++i) { - if (!axis_set.count(i)) { + if (!axis_set.count(i) || in_shape[i] != 1) { output_shape.push_back(in_shape[i]); } else { PADDLE_ENFORCE_EQ( diff --git a/paddle/cinn/operator_fusion/pir_graph_analyzing/shardable_axes_base.cc b/paddle/cinn/operator_fusion/pir_graph_analyzing/shardable_axes_base.cc index 03aea53b8ddebd..3c58262e65d6f0 100644 --- a/paddle/cinn/operator_fusion/pir_graph_analyzing/shardable_axes_base.cc +++ b/paddle/cinn/operator_fusion/pir_graph_analyzing/shardable_axes_base.cc @@ -160,7 +160,9 @@ ShardableAxesSignature CreateSignatureForElementWise(pir::Operation* op) { GetCompatibleRank(op->operand_source(i)), ::common::errors::PreconditionNotMet( "Required all inputs rank shall be equal output in " - "elementwise op.")); + "elementwise op : %s [id:%d]", + op->name(), + op->id())); result.inputs.emplace_back(same_axes); } for (int i = 0; i < op->num_results(); ++i) { @@ -168,7 +170,9 @@ ShardableAxesSignature CreateSignatureForElementWise(pir::Operation* op) { GetCompatibleRank(op->result(i)), ::common::errors::PreconditionNotMet( "Required all outputs rank shall be equal each other " - "in elementwise op.")); + "in elementwise op : %s [id:%d]", + op->name(), + op->id())); result.outputs.emplace_back(same_axes); } result.loop = result.outputs.back(); From 84ab826e0f5404845a86b3d077099de7a182f744 Mon Sep 17 00:00:00 2001 From: fangfangssj <99968055+fangfangssj@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:26:36 +0800 Subject: [PATCH 36/57] [CodeStyle][Typos][T-[1-5]] Fix typo(targt, Taget, templat,temporaily,temporily,Temperarily,temporaly,Temperary) (#70722) --- _typos.toml | 8 -------- .../dialect/operator/transforms/pd_to_cinn_pass.cc | 12 
++++++------ paddle/cinn/hlir/framework/pir/fusion_info.cc | 2 +- paddle/cinn/hlir/pe/schedule.cc | 2 +- paddle/fluid/ir_adaptor/translator/CMakeLists.txt | 4 ++-- paddle/phi/kernels/funcs/segmented_array.h | 2 +- python/paddle/nn/functional/loss.py | 8 ++++---- test/dygraph_to_static/test_break_continue.py | 2 +- test/sot/test_step_profiler.py | 2 +- 9 files changed, 17 insertions(+), 25 deletions(-) diff --git a/_typos.toml b/_typos.toml index cfd08daf4c29e2..135388b0344429 100644 --- a/_typos.toml +++ b/_typos.toml @@ -271,14 +271,6 @@ suppport = 'suppport' SWTICH = 'SWTICH' Swith = 'Swith' sysyem = 'sysyem' -targt = 'targt' -Taget = 'Taget' -templat = 'templat' -temporaily = 'temporaily' -temporily = 'temporily' -Temperarily = 'Temperarily' -temporaly = 'temporaly' -Temperary = 'Temperary' tenosr = 'tenosr' iterm = 'iterm' termiante = 'termiante' diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 537c6239fd19b2..1baa9197c19fbd 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -41,21 +41,21 @@ using paddle::dialect::FullOp; namespace { -template +template bool IsDefinedBy(const SourceOpT &op, const size_t idx) { const pir::Operation *defined_op = op->operand_source(idx).defining_op(); - return defined_op && defined_op->isa(); + return defined_op && defined_op->isa(); } -template -TagetOpT CastDefinedTo(const SourceOpT &op, const size_t idx) { - PADDLE_ENFORCE_EQ(IsDefinedBy(op, idx), +template +TargetOpT CastDefinedTo(const SourceOpT &op, const size_t idx) { + PADDLE_ENFORCE_EQ(IsDefinedBy(op, idx), true, ::common::errors::PreconditionNotMet( "Required defined op shall not be nullptr and can cast " "to target type.")); pir::Operation *defined_op = op->operand_source(idx).defining_op(); - return defined_op->dyn_cast(); + return defined_op->dyn_cast(); } template diff --git a/paddle/cinn/hlir/framework/pir/fusion_info.cc b/paddle/cinn/hlir/framework/pir/fusion_info.cc index d445910e0909e7..6d1067cfc52b71 100644 --- a/paddle/cinn/hlir/framework/pir/fusion_info.cc +++ b/paddle/cinn/hlir/framework/pir/fusion_info.cc @@ -236,7 +236,7 @@ std::ostream& operator<<(std::ostream& os, const FusionInfo& fusion_info) { std::vector TopologySort( const OpLoweringGroup& group) { - // NOTE(Aurelius84): Use simplest one-by-one order temporaly. + // NOTE(Aurelius84): Use simplest one-by-one order temporarily. auto* block = group.GetParentBlock(); std::vector ops; ops.reserve(block->size()); diff --git a/paddle/cinn/hlir/pe/schedule.cc b/paddle/cinn/hlir/pe/schedule.cc index dc30364bcda379..fada77826134bf 100644 --- a/paddle/cinn/hlir/pe/schedule.cc +++ b/paddle/cinn/hlir/pe/schedule.cc @@ -211,7 +211,7 @@ int GetArrayPackingFactor(int shape, const cinn::common::Target &target) { int split_base = GetBasicFactor(type, target); int split_factor = 1; - // temporily use shape-1 instead of shape for isl wrong for1 elimination + // temporarily use shape-1 instead of shape for isl wrong for1 elimination int i = split_base * split_base < shape ? 
split_base * split_base : shape; for (; i > 1; i--) { if (shape % i == 0) { diff --git a/paddle/fluid/ir_adaptor/translator/CMakeLists.txt b/paddle/fluid/ir_adaptor/translator/CMakeLists.txt index c8b145c449e37b..7cd1c839845e14 100644 --- a/paddle/fluid/ir_adaptor/translator/CMakeLists.txt +++ b/paddle/fluid/ir_adaptor/translator/CMakeLists.txt @@ -8,7 +8,7 @@ set(sparse_op_yaml ${PADDLE_SOURCE_DIR}/paddle/phi/ops/yaml/sparse_ops.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/ops/yaml/sparse_backward.yaml ) set(op_compat_source_file ${PD_PROGRAM_TRANSLATOR_SOURCE_DIR}/op_compat_info.cc) -set(op_compat_templat_file +set(op_compat_template_file ${PD_PROGRAM_TRANSLATOR_SOURCE_DIR}/op_compat_info.cc.j2) add_custom_command( @@ -17,7 +17,7 @@ add_custom_command( ${PYTHON_EXECUTABLE} ${op_gen_file} --op_compat_yaml_file ${op_compat_yaml_file} --sparse_op_yaml ${sparse_op_yaml} --output_source_file ${op_compat_source_file} - DEPENDS ${op_gen_file} ${op_compat_yaml_file} ${op_compat_templat_file} + DEPENDS ${op_gen_file} ${op_compat_yaml_file} ${op_compat_template_file} VERBATIM) file(GLOB PD_PROGRAM_TRANSLATOR_SRCS "*.cc") diff --git a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h index 24046da52aeeeb..71b7e5f4e52739 100644 --- a/paddle/phi/kernels/funcs/segmented_array.h +++ b/paddle/phi/kernels/funcs/segmented_array.h @@ -171,7 +171,7 @@ struct PointerArraySetter : public ArraySetterBase { // need_alloc : tensor data needs extra buffer or not. // use_cuda_graph: tensor data shall be captured by cuda_graph or not. - // pre_alloc_host_buf: tensor data is temporaily stored by pinned memory or + // pre_alloc_host_buf: tensor data is temporarily stored by pinned memory or // not. PointerArraySetter(const Context& ctx, std::vector* t, diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index cdd734c42a07e0..8dd0e2f0a41ede 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -4510,9 +4510,9 @@ def adaptive_log_softmax_with_loss( Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, 1.14779019) """ - targt_dim = label.dim() + target_dim = label.dim() - if targt_dim == 1: + if target_dim == 1: if input.shape[0] != label.shape[0]: raise ValueError( 'Input and label should have the same size ' @@ -4523,7 +4523,7 @@ def adaptive_log_softmax_with_loss( '1D label tensor expects 2D input tensors, ' f'but found inputs with size {input.shape}' ) - elif targt_dim == 0: + elif target_dim == 0: if input.dim() != 1: raise ValueError( '0D label tensor expects 1D input tensors, ' @@ -4534,7 +4534,7 @@ def adaptive_log_softmax_with_loss( '0D or 1D label tensor expected, ' 'multi-label not supported' ) - is_batched = targt_dim > 0 + is_batched = target_dim > 0 input = input if is_batched else input.unsqueeze(0) label = label if is_batched else label.unsqueeze(0) diff --git a/test/dygraph_to_static/test_break_continue.py b/test/dygraph_to_static/test_break_continue.py index 0d15dac2843711..ad6ade66ac74fe 100644 --- a/test/dygraph_to_static/test_break_continue.py +++ b/test/dygraph_to_static/test_break_continue.py @@ -351,7 +351,7 @@ def init_dygraph_func(self): def test_transformed_static_result(self): self.init_dygraph_func() dygraph_res = self.run_dygraph_mode() - # NOTE(SigureMo): Temperary run the test in sequential run mode to avoid dependency + # NOTE(SigureMo): Temporarily run the test in sequential run mode to avoid dependency # on the execution order of the test cases. 
if use_pir_api(): with exe_sequential_run_guard(True): diff --git a/test/sot/test_step_profiler.py b/test/sot/test_step_profiler.py index 82279b3bc09543..b6c895281959a2 100644 --- a/test/sot/test_step_profiler.py +++ b/test/sot/test_step_profiler.py @@ -43,7 +43,7 @@ def forward(self, x): class TestStepProfilerSmokeTest(unittest.TestCase): - # Temperarily disable this test + # Temporarily disable this test # @sot_step_profiler_guard(True) @strict_mode_guard(False) def test_step_profiler_smoke(self): From a881fcd6a9c74df658354e4d833786a597909a9b Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:38:15 +0800 Subject: [PATCH 37/57] del autosimplify 2 (#70694) --- .../optim/trans_buffer_with_dynamic_shape.cc | 6 ++--- paddle/cinn/optim/transform_gpu_forloop.cc | 12 +++++----- paddle/cinn/optim/transform_polyfor_to_for.cc | 2 +- paddle/cinn/optim/update_buffer_axis_pass.cc | 22 ++----------------- paddle/cinn/optim/var_mod_simplify.cc | 4 ++-- 5 files changed, 14 insertions(+), 32 deletions(-) diff --git a/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc b/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc index 22f92e0290d997..ac030ec4bbfa8c 100644 --- a/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc +++ b/paddle/cinn/optim/trans_buffer_with_dynamic_shape.cc @@ -59,8 +59,8 @@ struct Mutator : public ir::IRMutator<>, public ir::stmt::StmtMutator<> { Expr e = expr->as_tensor()->shape[i]; Expr buf_e = buf->shape[i]; if (buf->memory_type == ir::MemoryType::GPULocal) { - e = cinn::common::AutoSimplify(e); - buf_e = cinn::common::AutoSimplify(buf_e); + e = cinn::optim::ArithSimplify(e); + buf_e = cinn::optim::ArithSimplify(buf_e); if (!e.is_constant()) { auto new_shape = ir::ir_utils::IRCopy(e); new_shape = analyzer.UpperBound(new_shape); @@ -86,7 +86,7 @@ struct Mutator : public ir::IRMutator<>, public ir::stmt::StmtMutator<> { auto e = buf->shape.size() > tensor->shape.size() ? 
buf->shape[i] : tensor->shape[i]; if (buf->memory_type == ir::MemoryType::GPULocal) { - e = cinn::common::AutoSimplify(e); + e = cinn::optim::ArithSimplify(e); if (!e.is_constant()) { auto new_shape = ir::ir_utils::IRCopy(e); new_shape = analyzer.UpperBound(new_shape); diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index 020cdc4dade8d5..4012acb2ca10d9 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -317,7 +317,7 @@ class SharedAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -338,7 +338,7 @@ class SharedAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -367,7 +367,7 @@ class LocalAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } } } @@ -388,7 +388,7 @@ class LocalAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -418,7 +418,7 @@ class ReplaceUnitVarToZero : public ir::IRMutator<> { for (auto var_ : loop_var_) { optim::ReplaceVarWithExpr(&indice, ir::Var(var_), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } ir::IRMutator<>::Visit(op, expr); } @@ -434,7 +434,7 @@ class ReplaceUnitVarToZero : public ir::IRMutator<> { for (auto var_ : loop_var_) { optim::ReplaceVarWithExpr(&indice, ir::Var(var_), ir::Expr(0)); } - indice = cinn::common::AutoSimplify(indice); + indice = cinn::optim::ArithSimplify(indice); } ir::IRMutator<>::Visit(op, expr); diff --git a/paddle/cinn/optim/transform_polyfor_to_for.cc b/paddle/cinn/optim/transform_polyfor_to_for.cc index ab811a792e09a4..99a145d924ff35 100644 --- a/paddle/cinn/optim/transform_polyfor_to_for.cc +++ b/paddle/cinn/optim/transform_polyfor_to_for.cc @@ -136,7 +136,7 @@ struct PolyForWithSimpleConditionToForMutator : public ir::IRMutator { Expr lhs = lt_n ? lt_n->a() : le_n->a(); Expr rhs = lt_n ? lt_n->b() : PlusOneWithMinMax(le_n->b()); - rhs = cinn::common::AutoSimplify(rhs); + rhs = cinn::optim::ArithSimplify(rhs); if (op->is_vectorized()) PADDLE_ENFORCE_EQ( diff --git a/paddle/cinn/optim/update_buffer_axis_pass.cc b/paddle/cinn/optim/update_buffer_axis_pass.cc index b43b7fc8349145..12927b1a971b55 100644 --- a/paddle/cinn/optim/update_buffer_axis_pass.cc +++ b/paddle/cinn/optim/update_buffer_axis_pass.cc @@ -28,24 +28,6 @@ namespace cinn { namespace optim { -bool ExprMathEqual(const Expr& expr1, const Expr& expr2) { - ir::Expr cmp_expr = common::AutoSimplify(ir::Sub::Make(expr1, expr2)); - // This is ugly code since AutoSimplify is not powerful enough. 
Modify it - // after we make auto simplify better - ir::Expr simplified = common::AutoSimplify(cmp_expr); - int count = 0; - while (simplified != cmp_expr) { - cmp_expr = simplified; - simplified = common::AutoSimplify(cmp_expr); - ++count; - // Control dead loop - if (count >= 5) { - break; - } - } - return simplified.is_constant() && simplified.get_constant() == 0; -} - void FormalizeSingleIndex(const ir::Tensor& tensor, std::vector* indices) { if (tensor->shape.size() > 1 && indices->size() == 1) { @@ -56,7 +38,7 @@ void FormalizeSingleIndex(const ir::Tensor& tensor, mul = ir::Mul::Make(tensor->shape[i + 1], mul); ir::Expr div_expr = ir::Div::Make(origin_index_expr, mul); ir::Expr index_expr = ir::Mod::Make(div_expr, tensor->shape[i]); - indices->insert(indices->begin(), common::AutoSimplify(index_expr)); + indices->insert(indices->begin(), optim::ArithSimplify(index_expr)); } } } @@ -150,7 +132,7 @@ class AnalyzeBufferAxis : public ir::IRMutator<> { buffer_name_access_same_index_expr[buffer_name]; for (int i = 0; i < indices.size(); ++i) { if (index_expr.count(i)) { - if (!ExprMathEqual(index_expr[i], GetIndexBindExpr(indices[i]))) { + if (index_expr[i] != GetIndexBindExpr(indices[i])) { index_expr.erase(i); } } diff --git a/paddle/cinn/optim/var_mod_simplify.cc b/paddle/cinn/optim/var_mod_simplify.cc index 7306bc7ff2a506..bab7d7f5877722 100644 --- a/paddle/cinn/optim/var_mod_simplify.cc +++ b/paddle/cinn/optim/var_mod_simplify.cc @@ -86,11 +86,11 @@ struct ReplaceVarWithDivMutator : public ir::IRMutator<> { } // namespace void VarModSimplify(Expr* e) { - *e = cinn::common::AutoSimplify(*e); + *e = cinn::optim::ArithSimplify(*e); ReplaceModWithDivMutator()(e); ReplaceDivWithVarMutator mutator; mutator(e); - *e = cinn::common::AutoSimplify(*e); + *e = cinn::optim::ArithSimplify(*e); auto div_var_map = mutator.div_var_map_; ReplaceVarWithDivMutator()(e, mutator.div_var_map_); } From 62dc1bd4f6ae93e6e7ea2022cb19a7c2ea720598 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 9 Jan 2025 10:38:45 +0800 Subject: [PATCH 38/57] update vlog level (#70692) --- paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc b/paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc index 1844a0e7ed661d..e5f50f9c00f642 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/generate_shape_util.cc @@ -666,7 +666,7 @@ bool MakeGenerateShapeOpAttribute( } } if (!has_symbol_binding) { - LOG(WARNING) << "no symbol binding found for dim expr: " << symbol_name; + VLOG(2) << "no symbol binding found for dim expr: " << symbol_name; return false; } } From d59da9fb17156136e1dfe0df31d7d6e436d5ae4d Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:44:35 +0800 Subject: [PATCH 39/57] [CINN] Align initial subgraph order with block.ops (#70719) --- paddle/fluid/pir/transforms/sub_graph_detector.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pir/transforms/sub_graph_detector.cc b/paddle/fluid/pir/transforms/sub_graph_detector.cc index 913617c8e5b30e..57343056ee87aa 100644 --- a/paddle/fluid/pir/transforms/sub_graph_detector.cc +++ b/paddle/fluid/pir/transforms/sub_graph_detector.cc @@ -461,13 +461,14 @@ void SubgraphDetector::MergeSource2Target(const SubGraphPtr& source, SubgraphDetector::SubgraphDetector(pir::Block* block, const 
OpClassifier& classifier) { - // init sort_ops_ in reverse topo order - sort_ops_ = InverselyTopologicalSort(block); - // init op2index_ in topo order + // init sort_ops_ in reverse topo order and op2index_ in topo order int index = 0; for (auto& op : *block) { + sort_ops_.push_back(&op); op2index_[&op] = index++; } + std::reverse(sort_ops_.begin(), sort_ops_.end()); + // construct subgraphs and upstream/downstream relation std::vector subgraph_list; for (const auto& op : sort_ops_) { From f747eefc184b51e85dac650da1bc9ff97253871a Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 10:51:38 +0800 Subject: [PATCH 40/57] [fluid_ops] Replace c_allreduce_sum in python/paddle/nn/clip.py (#70707) --- .../distributed/transpiler/collective.py | 38 +++++++++++-------- python/paddle/nn/clip.py | 14 ++++--- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/python/paddle/distributed/transpiler/collective.py b/python/paddle/distributed/transpiler/collective.py index 2ce07a8db585af..5a408671e34136 100644 --- a/python/paddle/distributed/transpiler/collective.py +++ b/python/paddle/distributed/transpiler/collective.py @@ -352,16 +352,17 @@ def _insert_allreduce_ops(self): ) offset += 1 - # As we search ops reversely, we should insert c_allreduce_sum + # As we search ops reversely, we should insert all_reduce sum # op in the same way to keep the ring_id alternate ring_id = (ring_id + 1) % self.nrings block._insert_op( offset, - type='c_allreduce_sum', - inputs={'X': grad}, - outputs={'Out': grad}, + type='all_reduce', + inputs={'x': grad}, + outputs={'out': grad}, attrs={ 'ring_id': ring_id, + 'reduce_type': paddle.distributed.ReduceOp.SUM, self.op_role_key: OpRole.Backward, }, ) @@ -454,11 +455,12 @@ def _transpile_main_program(self): ring_id = (ring_id + 1) % self.nrings block._insert_op( idx + 3, - type='c_allreduce_sum', - inputs={'X': [param]}, - outputs={'Out': [param]}, + type='all_reduce', + inputs={'x': [param]}, + outputs={'out': [param]}, attrs={ 'ring_id': ring_id, + 'reduce_type': paddle.distributed.ReduceOp.SUM, self.op_role_key: OpRole.Optimize, }, ) @@ -701,10 +703,14 @@ def _insert_fuse_allreduce_ops(self): ring_id = (ring_id + 1) % self.nrings block._insert_op( global_offset, - type='c_allreduce_sum', - inputs={'X': fused_output}, - outputs={'Out': fused_output}, - attrs={'ring_id': ring_id, self.op_role_key: OpRole.Backward}, + type='all_reduce', + inputs={'x': fused_output}, + outputs={'out': fused_output}, + attrs={ + 'ring_id': ring_id, + 'reduce_type': paddle.distributed.ReduceOp.SUM, + self.op_role_key: OpRole.Backward, + }, ) global_offset += 1 @@ -1013,18 +1019,18 @@ def _insert_fuse_allreduce_ops(self): ) break - # insert the allreduce_sum op + # insert the all_reduce sum op for idx, op in enumerate(block.ops): if self._is_optimizer_op(op): for fused_var in fused_vars: block._insert_op( idx, - type='c_allreduce_sum', - inputs={'X': fused_var}, - outputs={'Out': fused_var}, + type='all_reduce', + inputs={'x': fused_var}, + outputs={'out': fused_var}, attrs={ 'ring_id': ring_id, - 'use_calc_stream': False, + 'reduce_type': paddle.distributed.ReduceOp.SUM, self.op_role_key: OpRole.Backward, }, ) diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index 9913063eb946f6..c48d0b358eaac2 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -1018,11 +1018,11 @@ def async_add_n(var_list): ) if self.should_comm_on_shard_dim and self.has_dist_param: - global_norm_dist_var = paddle._C_ops.c_allreduce_sum( - global_norm_dist_var, 
self.sharding_group.id, True, False + global_norm_dist_var = paddle._C_ops.all_reduce( + global_norm_dist_var, self.sharding_group.id, dist.ReduceOp.SUM ) - global_norm_dist_var = paddle._C_ops.c_allreduce_sum( - global_norm_dist_var, self.mp_group.id, True, False + global_norm_dist_var = paddle._C_ops.all_reduce( + global_norm_dist_var, self.mp_group.id, dist.ReduceOp.SUM ) if global_norm_var is None: global_norm_var = global_norm_dist_var @@ -1036,8 +1036,10 @@ def async_add_n(var_list): shape=[1], dtype=sum_dtype, fill_value=0.0 ) if self.should_comm_on_shard_dim and self.has_not_dist_param: - global_norm_not_dist_var = paddle._C_ops.c_allreduce_sum( - global_norm_not_dist_var, self.sharding_group.id, True, False + global_norm_not_dist_var = paddle._C_ops.all_reduce( + global_norm_not_dist_var, + self.sharding_group.id, + dist.ReduceOp.SUM, ) if global_norm_var is None: global_norm_var = global_norm_not_dist_var From aef9f5e1c4be60f9254fbb25904a6f407a9a6387 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 10:53:10 +0800 Subject: [PATCH 41/57] [fluid_ops] collective_global_gather.py remove dynamic_static_unified_comm (#70713) --- test/collective/collective_global_gather.py | 13 ++++--------- test/collective/collective_global_scatter.py | 13 ++++--------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/test/collective/collective_global_gather.py b/test/collective/collective_global_gather.py index 77d5df10c5fdd5..70c1abd6b3e338 100644 --- a/test/collective/collective_global_gather.py +++ b/test/collective/collective_global_gather.py @@ -62,10 +62,8 @@ def run_trainer(self, args): endpoints = args["endpoints"].split(",") rank = args["trainerid"] current_endpoint = args["currentendpoint"] - if args["dynamic_static_unified_comm"]: - paddle.distributed.collective._init_parallel_env(args["backend"]) - else: - paddle.distributed.init_parallel_env() + + paddle.distributed.collective._init_parallel_env(args["backend"]) nranks = 2 if args['backend'] == 'nccl': device_id = int(os.getenv("FLAGS_selected_gpus", "0")) @@ -112,11 +110,8 @@ def run_trainer(self, args): ) if args['static_mode']: - result = ( - self.get_model(train_prog, startup_prog, rank) - if args["dynamic_static_unified_comm"] - else self.get_model(train_prog, startup_prog, rank) - ) + result = self.get_model(train_prog, startup_prog, rank) + fetch_list = [] for elem in result: fetch_list.append(elem.name) diff --git a/test/collective/collective_global_scatter.py b/test/collective/collective_global_scatter.py index 2987c30e34f28d..b63a0e564f09d3 100644 --- a/test/collective/collective_global_scatter.py +++ b/test/collective/collective_global_scatter.py @@ -63,10 +63,8 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["dynamic_static_unified_comm"]: - paddle.distributed.collective._init_parallel_env(args["backend"]) - else: - paddle.distributed.init_parallel_env() + + paddle.distributed.collective._init_parallel_env(args["backend"]) if args['backend'] == 'nccl': device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = base.CUDAPlace( @@ -90,11 +88,8 @@ def run_trainer(self, args): "float32" ) if args['static_mode']: - result = ( - self.get_model(train_prog, startup_prog, rank) - if args["dynamic_static_unified_comm"] - else self.get_model(train_prog, startup_prog, rank) - ) + result = self.get_model(train_prog, startup_prog, rank) + exe = base.Executor(place) exe.run(startup_prog) fetch_list = [] From 
24b0e23a1c891a138773fb51b2f230e85f4ecf6a Mon Sep 17 00:00:00 2001 From: chen2016013 <111894720+chen2016013@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:54:41 +0800 Subject: [PATCH 42/57] open transpose op in auto-recompute (#70711) --- python/paddle/decomposition/recompute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/decomposition/recompute.py b/python/paddle/decomposition/recompute.py index effd0882000092..cca3a5c8a04abe 100644 --- a/python/paddle/decomposition/recompute.py +++ b/python/paddle/decomposition/recompute.py @@ -80,7 +80,7 @@ "pd_op.slice", "pd_op.squeeze", "pd_op.unsqueeze", - # "pd_op.transpose", + "pd_op.transpose", # "pd_op.prod", "pd_op.log", "pd_op.log1p", From 3180ca0123f94b3b15a338553cd10d5289fc8175 Mon Sep 17 00:00:00 2001 From: lizexu123 <39205361+lizexu123@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:59:32 +0800 Subject: [PATCH 43/57] [Paddle TensorRT No.8] pd_op.anchor_generator (#70667) * pd_op.anchor_generator * fix * fix * fix * pd_op.anchor_generator --- .../plugin/anchor_generator_op_plugin.cu | 285 ++++++++++++++++++ .../plugin/anchor_generator_op_plugin.h | 98 ++++++ .../transforms/tensorrt/trt_op_marker_pass.cc | 2 + .../fluid/pybind/manual_static_op_function.h | 17 ++ paddle/fluid/pybind/pybind.cc | 5 + python/paddle/tensorrt/converter.py | 4 + python/paddle/tensorrt/impls/others.py | 60 ++++ test/tensorrt/test_converter_others.py | 54 +++- 8 files changed, 524 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu index f7adaab13d1167..2378e8e11097b7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -667,6 +667,291 @@ nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::deserializePlugin( } #endif +PIRAnchorGeneratorPluginDynamic::PIRAnchorGeneratorPluginDynamic( + const nvinfer1::DataType data_type, + const std::vector& anchor_sizes, + const std::vector& aspect_ratios, + const std::vector& stride, + const std::vector& variances, + const float offset, + const int num_anchors) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + num_anchors_(num_anchors) { + // data_type_ is used to determine the output data type + // data_type_ can only be float32 + // height, width, num_anchors are calculated at configurePlugin + PADDLE_ENFORCE_EQ(data_type_, + nvinfer1::DataType::kFLOAT, + common::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE( + num_anchors_, + 0, + common::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PrepareParamsOnDevice(); +} + +PIRAnchorGeneratorPluginDynamic::~PIRAnchorGeneratorPluginDynamic() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +PIRAnchorGeneratorPluginDynamic::PIRAnchorGeneratorPluginDynamic( + void const* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + 
DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &num_anchors_); + PrepareParamsOnDevice(); +} + +nvinfer1::IPluginV2DynamicExt* PIRAnchorGeneratorPluginDynamic::clone() const + TRT_NOEXCEPT { + auto plugin = new PIRAnchorGeneratorPluginDynamic(data_type_, + anchor_sizes_, + aspect_ratios_, + stride_, + variances_, + offset_, + num_anchors_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs PIRAnchorGeneratorPluginDynamic::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[2]; // feature height + ret.d[1] = inputs[0].d[3]; // feature width + ret.d[2] = exprBuilder.constant(num_anchors_); + ret.d[3] = exprBuilder.constant(4); + return ret; +} + +bool PIRAnchorGeneratorPluginDynamic::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT { + // input can be any, doesn't matter + // anchor generator doesn't read input raw data, only need the shape info + auto type = inOut[pos].type; + auto format = inOut[pos].format; +#if IS_TRT_VERSION_GE(7234) + if (pos == 0) return true; +#else + if (pos == 0) return format == nvinfer1::TensorFormat::kLINEAR; +#endif + return (type == nvinfer1::DataType::kFLOAT && + format == nvinfer1::TensorFormat::kLINEAR); +} + +void PIRAnchorGeneratorPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT {} + +size_t PIRAnchorGeneratorPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT { + return 0; +} + +template +int PIRAnchorGeneratorPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) { + const int height = inputDesc[0].dims.d[2]; + const int width = inputDesc[0].dims.d[3]; + const int box_num = height * width * num_anchors_; + const int block = 512; + const int gen_anchor_grid = (box_num + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + phi::GenAnchors + <<>>(anchors, + aspect_ratios_device, + aspect_ratios_.size(), + anchor_sizes_device, + anchor_sizes_.size(), + stride_device, + stride_.size(), + height, + width, + offset_); + const int var_grid = (box_num * 4 + block - 1) / block; + phi::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num * 4); + return cudaGetLastError() != cudaSuccess; +} + +int PIRAnchorGeneratorPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + assert(outputDesc[0].type == 
nvinfer1::DataType::kFLOAT); + assert(outputDesc[1].type == nvinfer1::DataType::kFLOAT); + return enqueue_impl( + inputDesc, outputDesc, inputs, outputs, workspace, stream); +} + +nvinfer1::DataType PIRAnchorGeneratorPluginDynamic::getOutputDataType( + int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT { + return inputTypes[0]; +} + +const char* PIRAnchorGeneratorPluginDynamic::getPluginType() const + TRT_NOEXCEPT { + return "pir_anchor_generator_plugin_dynamic"; +} + +int PIRAnchorGeneratorPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { + return 2; +} + +int PIRAnchorGeneratorPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } + +void PIRAnchorGeneratorPluginDynamic::terminate() TRT_NOEXCEPT {} + +size_t PIRAnchorGeneratorPluginDynamic::getSerializationSize() const + TRT_NOEXCEPT { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(num_anchors_); + return serialize_size; +} + +void PIRAnchorGeneratorPluginDynamic::serialize(void* buffer) const + TRT_NOEXCEPT { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, num_anchors_); +} + +void PIRAnchorGeneratorPluginDynamic::destroy() TRT_NOEXCEPT {} + +void PIRAnchorGeneratorPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) TRT_NOEXCEPT { + namespace_ = std::string(lib_namespace); +} + +const char* PIRAnchorGeneratorPluginDynamicCreator::getPluginNamespace() const + TRT_NOEXCEPT { + return namespace_.c_str(); +} + +const char* PIRAnchorGeneratorPluginDynamicCreator::getPluginName() const + TRT_NOEXCEPT { + return "pir_anchor_generator_plugin_dynamic"; +} + +const char* PIRAnchorGeneratorPluginDynamicCreator::getPluginVersion() const + TRT_NOEXCEPT { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +PIRAnchorGeneratorPluginDynamicCreator::getFieldNames() TRT_NOEXCEPT { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* PIRAnchorGeneratorPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { + const nvinfer1::PluginField* fields = fc->fields; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int num_anchors = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const nvinfer1::PluginField& f = fc->fields[i]; + const std::string field_name(f.name); + if (field_name.compare("anchor_sizes") == 0) { + const float* data = static_cast(f.data); + anchor_sizes.assign(data, data + f.length); + } else if (field_name.compare("aspect_ratios") == 0) { + const float* data = static_cast(f.data); + aspect_ratios.assign(data, data + f.length); + } else if (field_name.compare("stride") == 0) { + const float* data = static_cast(f.data); + stride.assign(data, data + f.length); + } else if (field_name.compare("variances") == 0) { + const float* data = static_cast(f.data); + variances.assign(data, data + f.length); + } else if (field_name.compare("offset") == 0) { + offset = *static_cast(f.data); + } else if (field_name.compare("num_anchors") == 0) { + num_anchors = *static_cast(f.data); + } else 
{ + assert(false && "unknown plugin field name."); + } + } + return new PIRAnchorGeneratorPluginDynamic(nvinfer1::DataType::kFLOAT, + anchor_sizes, + aspect_ratios, + stride, + variances, + offset, + num_anchors); +} + +nvinfer1::IPluginV2Ext* +PIRAnchorGeneratorPluginDynamicCreator::deserializePlugin( + const char* name, + const void* serial_data, + size_t serial_length) TRT_NOEXCEPT { + auto plugin = new PIRAnchorGeneratorPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h index 72f11c76767ebb..20f145e9095694 100644 --- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h @@ -227,7 +227,105 @@ class AnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { std::string namespace_; nvinfer1::PluginFieldCollection field_collection_; }; + +class PIRAnchorGeneratorPluginDynamic : public DynamicPluginTensorRT { + public: + explicit PIRAnchorGeneratorPluginDynamic( + const nvinfer1::DataType data_type, + const std::vector& anchor_sizes, + const std::vector& aspect_ratios, + const std::vector& stride, + const std::vector& variances, + const float offset, + const int num_anchors); + PIRAnchorGeneratorPluginDynamic(void const* data, size_t length); + ~PIRAnchorGeneratorPluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) // NOLINT + TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const + TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* 
variances_device_; + int num_anchors_; + std::string namespace_; +}; + +class PIRAnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + PIRAnchorGeneratorPluginDynamicCreator() = default; + ~PIRAnchorGeneratorPluginDynamicCreator() override = default; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) + TRT_NOEXCEPT override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginDynamicCreator); +REGISTER_TRT_PLUGIN_V2(PIRAnchorGeneratorPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc index 0ad509a9601882..78eeb58a19133d 100644 --- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc +++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc @@ -94,6 +94,7 @@ DEFINE_GENERAL_PATTERN(Flip, paddle::dialect::FlipOp) DEFINE_GENERAL_PATTERN(Mish, paddle::dialect::MishOp) DEFINE_GENERAL_PATTERN(AssignValue, paddle::dialect::AssignValueOp) DEFINE_GENERAL_PATTERN(AssignValue_, paddle::dialect::AssignValue_Op) +DEFINE_GENERAL_PATTERN(Anchor_Generator, paddle::dialect::AnchorGeneratorOp) DEFINE_GENERAL_PATTERN(Exp, paddle::dialect::ExpOp) DEFINE_GENERAL_PATTERN(Abs, paddle::dialect::AbsOp) DEFINE_GENERAL_PATTERN(Abs_, paddle::dialect::Abs_Op) @@ -2294,6 +2295,7 @@ class TrtOpMarkerPass : public pir::PatternRewritePass { ADD_PATTERN(Mish) ADD_PATTERN(AssignValue) ADD_PATTERN(AssignValue_) + ADD_PATTERN(Anchor_Generator) ADD_PATTERN(Exp) ADD_PATTERN(Abs) ADD_PATTERN(Abs_) diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h index 4363fc6c8630d5..f0cf95ee7f66fb 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -28,6 +28,7 @@ #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/op_callstack_utils.h" #include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/pybind/static_op_function.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/infermeta/spmd_rules/rules.h" @@ -1188,6 +1189,18 @@ static PyObject *fused_gemm_epilogue(PyObject *self, } } +static PyObject *anchor_generator(PyObject *self, + PyObject *args, + PyObject *kwargs) { + if (egr::Controller::Instance().GetCurrentTracer() == nullptr) { + VLOG(6) << "Call static_api_anchor_generator"; + return static_api_anchor_generator(self, args, kwargs); + } else { + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +} + static PyObject *share_var(PyObject *self, PyObject *args, PyObject *kwargs) { try { VLOG(6) << "Add share_var op into program"; @@ -1267,6 +1280,10 @@ static PyMethodDef ManualOpsAPI[] = { (PyCFunction)(void (*)(void))fused_gemm_epilogue, METH_VARARGS | METH_KEYWORDS, "C++ interface function for fused_gemm_epilogue."}, + 
{"anchor_generator", + (PyCFunction)(void (*)(void))anchor_generator, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for anchor_generator."}, {"_run_custom_op", (PyCFunction)(void (*)(void))run_custom_op, METH_VARARGS | METH_KEYWORDS, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 61c0a8e55ecb2f..b59e431a8480d4 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -235,6 +235,7 @@ limitations under the License. */ #include "pybind11/stl.h" #ifdef PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/pir/declare_plugin.h" +#include "paddle/fluid/platform/tensorrt/trt_plugin.h" #endif COMMON_DECLARE_bool(use_mkldnn); @@ -3422,6 +3423,10 @@ All parameter, weight, gradient are variables in Paddle. m.def("clear_shape_info", []() { paddle::framework::CollectShapeManager::Instance().ClearShapeInfo(); }); +#ifdef PADDLE_WITH_TENSORRT + m.def("register_paddle_plugin", + []() { paddle::platform::TrtPluginRegistry::Global()->RegistToTrt(); }); +#endif #if defined(PADDLE_WITH_PSLIB) && !defined(PADDLE_WITH_HETERPS) BindHeterWrapper(&m); diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py index cab46618c4c0ee..3e7b32d400042b 100644 --- a/python/paddle/tensorrt/converter.py +++ b/python/paddle/tensorrt/converter.py @@ -17,6 +17,10 @@ import logging import numpy as np + +import paddle + +paddle.base.core.register_paddle_plugin() import tensorrt as trt import paddle diff --git a/python/paddle/tensorrt/impls/others.py b/python/paddle/tensorrt/impls/others.py index f2f571f6953129..8f9cafbccf758c 100644 --- a/python/paddle/tensorrt/impls/others.py +++ b/python/paddle/tensorrt/impls/others.py @@ -303,6 +303,66 @@ def share_data_converter(network, paddle_op, inputs): return identity_layer.get_output(0) +@converter_registry.register("pd_op.anchor_generator", trt_version="8.x") +def anchor_generator_converter(network, paddle_op, inputs): + inputs = inputs[0] + input_dims = inputs.shape + anchor_sizes = paddle_op.attrs().get("anchor_sizes") + aspect_ratios = paddle_op.attrs().get("aspect_ratios") + stride = paddle_op.attrs().get("stride") + variances = paddle_op.attrs().get("variances") + offset = paddle_op.attrs().get("offset") + num_anchors = len(aspect_ratios) * len(anchor_sizes) + + height = input_dims[1] + width = input_dims[2] + box_num = width * height * num_anchors + data_type = trt.float32 + + plugin_fields = [ + trt.PluginField( + "anchor_sizes", + np.array(anchor_sizes, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "aspect_ratios", + np.array(aspect_ratios, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "stride", + np.array(stride, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "variances", + np.array(variances, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "offset", + np.array(offset, dtype=np.float32), + trt.PluginFieldType.FLOAT32, + ), + trt.PluginField( + "num_anchors", + np.array(num_anchors, dtype=np.int32), + trt.PluginFieldType.INT32, + ), + ] + plugin_field_collection = trt.PluginFieldCollection(plugin_fields) + plugin_name = "pir_anchor_generator_plugin_dynamic" + plugin_version = "1" + plugin = get_trt_plugin( + plugin_name, plugin_field_collection, plugin_version + ) + anchor_generator_layer = network.add_plugin_v2([inputs], plugin) + out0 = anchor_generator_layer.get_output(0) + out1 = anchor_generator_layer.get_output(1) + return (out0, out1) + + 
@converter_registry.register("pd_op.affine_channel", trt_version="8.x") def affine_channel_converter(network, paddle_op, inputs): x, scale_weights, bias_weights = inputs diff --git a/test/tensorrt/test_converter_others.py b/test/tensorrt/test_converter_others.py index 0c88733296f262..8b201467137eec 100644 --- a/test/tensorrt/test_converter_others.py +++ b/test/tensorrt/test_converter_others.py @@ -437,7 +437,7 @@ def test_fp16_trt_result(self): self.check_trt_result(precision_mode="fp16") -class TestAffineChannelCas1TRTPattern(TensorRTBaseTest): +class TestAffineChannelCase1TRTPattern(TensorRTBaseTest): def setUp(self): self.python_api = affine_channel self.api_args = { @@ -458,5 +458,57 @@ def test_fp16_trt_result(self): self.check_trt_result(precision_mode="fp16") +def anchor_generator(x, anchor_sizes, aspect_ratios, variances, stride, offset): + return _C_ops.anchor_generator( + x, anchor_sizes, aspect_ratios, variances, stride, offset + ) + + +class TestAnchorGeneratorTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = anchor_generator + self.api_args = { + "x": np.random.random((2, 3, 3, 100)).astype("float32"), + "anchor_sizes": [64.0, 128.0, 256.0], + "aspect_ratios": [0.5, 1, 2], + "variances": [1.0, 1.0, 1.0, 1.0], + "stride": [16.0, 16.0], + "offset": 0.5, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 3, 3, 100]} + self.opt_shape = {"x": [2, 3, 3, 100]} + self.max_shape = {"x": [3, 3, 3, 100]} + + def test_fp32_trt_result(self): + self.check_trt_result() + + def test_fp16_trt_result(self): + self.check_trt_result(precision_mode="fp16") + + +class TestAnchorGeneratorCase1TRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = anchor_generator + self.api_args = { + "x": np.random.random((2, 3, 64, 64)).astype("float32"), + "anchor_sizes": [64.0, 128.0, 256.0], + "aspect_ratios": [0.4, 1.2, 3], + "variances": [0.5, 1.0, 0.5, 1.0], + "stride": [16.0, 32.0], + "offset": 0.8, + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [2, 3, 64, 64]} + self.opt_shape = {"x": [2, 3, 64, 64]} + self.max_shape = {"x": [3, 3, 64, 64]} + + def test_fp32_trt_result(self): + self.check_trt_result() + + def test_fp16_trt_result(self): + self.check_trt_result(precision_mode="fp16") + + if __name__ == '__main__': unittest.main() From a0075d2fc1bb5c1be3ee13bf7ad56d7a82056ea4 Mon Sep 17 00:00:00 2001 From: Junjie Zhang <1356732652@qq.com> Date: Thu, 9 Jan 2025 10:59:48 +0800 Subject: [PATCH 44/57] Update CMakeLists.txt (#70683) --- test/legacy_test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index a625ad80b7077d..4da0738db3c47d 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -854,7 +854,7 @@ set_tests_properties(test_imperative_transformer_sorted_gradient PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_optimizer PROPERTIES TIMEOUT 250) set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 250) -set_tests_properties(test_activation_op PROPERTIES TIMEOUT 270) +set_tests_properties(test_activation_op PROPERTIES TIMEOUT 600) set_tests_properties(test_normal PROPERTIES TIMEOUT 120) set_tests_properties(test_cyclic_cifar_dataset PROPERTIES TIMEOUT 120) set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 270) From 6ba4c447fc326a7c905ebf6137588485e072ed81 Mon Sep 17 00:00:00 2001 From: zhangbo9674 
<82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:03:02 +0800 Subject: [PATCH 45/57] [CINN] Delete llvm opt for host code (#70685) * delete code * fix * delete compile host model in cuda * fix --- paddle/cinn/backends/compiler.cc | 1 - paddle/cinn/backends/llvm/execution_engine.cc | 14 +++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc index 0658d0507e4775..644b97757999ce 100644 --- a/paddle/cinn/backends/compiler.cc +++ b/paddle/cinn/backends/compiler.cc @@ -430,7 +430,6 @@ void Compiler::CompileCudaModule(const Module& module, device_fn_name_.emplace_back(kernel_fn_name); } engine_->Link(host_module); - #else CINN_NOT_IMPLEMENTED #endif diff --git a/paddle/cinn/backends/llvm/execution_engine.cc b/paddle/cinn/backends/llvm/execution_engine.cc index ed771ef57ad540..91a32c283c77db 100644 --- a/paddle/cinn/backends/llvm/execution_engine.cc +++ b/paddle/cinn/backends/llvm/execution_engine.cc @@ -171,8 +171,10 @@ std::unique_ptr NaiveObjectCache::getObject( template void ExecutionEngine::Link(const ir::Module &module) { + if (module.functions().size() == 0) { + return; + } utils::RecordEvent("ExecutionEngine Link", utils::EventType::kOrdinary); - auto ir_emitter = std::make_unique(m.get(), b.get()); VLOG(3) << "ir_emitter->Compile(module) Begin"; ir_emitter->Compile(module); @@ -211,6 +213,16 @@ void ExecutionEngine::Link(const ir::Module &module) { } } +template <> +void ExecutionEngine::Link(const ir::Module &module) { + if (module.functions().size() == 0) { + return; + } + utils::RecordEvent("ExecutionEngine Link", utils::EventType::kOrdinary); + auto ir_emitter = std::make_unique(m.get(), b.get()); + ir_emitter->Compile(module); +} + bool ExecutionEngine::AddModule(std::unique_ptr module, std::unique_ptr context) { utils::RecordEvent("ExecutionEngine AddModule", utils::EventType::kOrdinary); From b1c7b888a53dc054da4601732149c62246117fa7 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 11:04:36 +0800 Subject: [PATCH 46/57] [fluid_ops] c_comm_init remove FLAGS_dynamic_static_unified_comm (#70718) --- .../operators/collective/c_comm_init_op.cc | 96 +++++-------------- 1 file changed, 26 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 349a2626bb4f4b..875c7fb41b3416 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -29,17 +29,6 @@ limitations under the License. 
*/ #include "paddle/phi/core/platform/collective_helper.h" #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/phi/core/distributed/nccl_comm_context.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); -#elif defined(PADDLE_WITH_XPU_BKCL) -#include "paddle/phi/core/distributed/bkcl_comm_context.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); -#endif -#if defined(PADDLE_WITH_CUSTOM_DEVICE) -COMMON_DECLARE_bool(dynamic_static_unified_comm); -#endif - #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/store/store_utils.h" @@ -75,46 +64,26 @@ class CCommInitOp : public framework::OperatorBase { device_id = Attr("device_id"); } int rank_id = Attr("rank"); - if (FLAGS_dynamic_static_unified_comm) { - VLOG(3) << "#### use new comm lab ####"; - auto store = phi::distributed::CreateOrGetGlobalTCPStore(); - if (!phi::distributed::CommContextManager::GetInstance().Has( - std::to_string(rid))) { - phi::distributed::CommContextManager::CreateXCCLCommContext( - store, - std::to_string(rid), - phi::CustomPlace(place.GetDeviceType(), device_id), - rank_id, - nranks, - "c_comm_init_op"); - } - return; - } - using UniqueId = phi::ccl::CCLRootId; - using CommContext = platform::XCCLCommContext; + VLOG(3) << "#### use new comm lab ####"; + auto store = phi::distributed::CreateOrGetGlobalTCPStore(); + if (!phi::distributed::CommContextManager::GetInstance().Has( + std::to_string(rid))) { + phi::distributed::CommContextManager::CreateXCCLCommContext( + store, + std::to_string(rid), + phi::CustomPlace(place.GetDeviceType(), device_id), + rank_id, + nranks, + "c_comm_init_op"); + } + return; - VLOG(3) << "#### use old comm lab ####"; - UniqueId* comm_id = var->GetMutable(); - CommContext::Instance(place.GetDeviceType()) - .CreateComm(comm_id, nranks, rank_id, device_id, rid); #else PADDLE_THROW(common::errors::PreconditionNotMet( "PaddlePaddle should compile with custom device.")); #endif } else { -// TODO(wangxi): Put this in the unified header file -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - using UniqueId = ncclUniqueId; - using CommContext = platform::NCCLCommContext; -#elif defined(PADDLE_WITH_XPU_BKCL) - using UniqueId = BKCLUniqueId; - using CommContext = platform::BKCLCommContext; -#else - PADDLE_THROW(common::errors::PreconditionNotMet( - "PaddlePaddle should be compiled with GPU or XPU.")); -#endif - PADDLE_ENFORCE_EQ(place.GetType() == phi::AllocationType::GPU || place.GetType() == phi::AllocationType::XPU, true, @@ -137,33 +106,20 @@ class CCommInitOp : public framework::OperatorBase { } int rank_id = Attr("rank"); #endif -#if defined(PADDLE_WITH_NCCL) - if (FLAGS_dynamic_static_unified_comm) { - VLOG(3) << "#### use new comm lab ####"; - auto store = phi::distributed::CreateOrGetGlobalTCPStore(); - phi::distributed::CommContextManager::SetDeviceId(device_id); - std::string endpoints = Attr("endpoints"); - phi::distributed::CommContextManager::CreateNCCLCommContext( - store, std::to_string(rid), rank_id, nranks, endpoints); - return; - } +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + VLOG(3) << "#### use new comm lab ####"; + auto store = phi::distributed::CreateOrGetGlobalTCPStore(); + phi::distributed::CommContextManager::SetDeviceId(device_id); + std::string endpoints = Attr("endpoints"); + phi::distributed::CommContextManager::CreateNCCLCommContext( + store, std::to_string(rid), rank_id, nranks, 
endpoints); #elif defined(PADDLE_WITH_XPU_BKCL) - if (FLAGS_dynamic_static_unified_comm) { - VLOG(3) << "#### use new comm lab ####"; - auto store = phi::distributed::CreateOrGetGlobalTCPStore(); - phi::distributed::CommContextManager::SetDeviceId(device_id); - std::string endpoints = Attr("endpoints"); - phi::distributed::CommContextManager::CreateBKCLCommContext( - store, std::to_string(rid), rank_id, nranks, endpoints); - return; - } -#endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) - VLOG(3) << "#### use old comm lab ####"; - UniqueId* comm_id = var->GetMutable(); - CommContext::Instance().CreateComm( - comm_id, nranks, rank_id, device_id, rid); + VLOG(3) << "#### use new comm lab ####"; + auto store = phi::distributed::CreateOrGetGlobalTCPStore(); + phi::distributed::CommContextManager::SetDeviceId(device_id); + std::string endpoints = Attr("endpoints"); + phi::distributed::CommContextManager::CreateBKCLCommContext( + store, std::to_string(rid), rank_id, nranks, endpoints); #endif } } From 885318b1a7e9025bb8737c4deefe0f083a47df23 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:29:29 +0800 Subject: [PATCH 47/57] fix bce loss decomp bug (#70724) --- .../decomp_rule/decomp_rule/composite.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h index 357b2434c1f676..3899f19c7b23ae 100644 --- a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h +++ b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h @@ -200,9 +200,19 @@ Tensor reciprocal_decomp(const Tensor& x) { template Tensor bce_loss_decomp(const Tensor& x, const Tensor& label) { - auto one = full_scalar(1, x.dtype(), x.place()); - auto ans = full_scalar(-1, x.dtype(), x.place()) * - (label * log(x) + (one - label) * log(one - x)); + auto org_dtype = x.dtype(); + auto x_mt = ConvertToMT(x); + + auto neg_100 = full_scalar(-100, x_mt.dtype(), x.place()); + auto one = full_scalar(1, x_mt.dtype(), x.place()); + + auto log_x = maximum(log(x_mt), neg_100); + auto log_1_x = maximum(log(one - x_mt), neg_100); + + auto ans = full_scalar(-1, x_mt.dtype(), x.place()) * + (label * log_x + (one - label) * log_1_x); + ans = ConvertToOrig(ans, org_dtype); + return ans; } From 74eb10a376dea51a1fb001c6c491d005b8b13e03 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 14:47:58 +0800 Subject: [PATCH 48/57] [clean old comm] remove FLAGS_dynamic_static_unified_comm in python directory (#70727) --- .../auto_parallel/static/process_group.py | 84 ++++++++----------- .../fleet/base/private_helper_function.py | 42 +--------- .../fleet/meta_optimizers/common.py | 7 -- .../meta_optimizers/sharding_optimizer.py | 6 -- 4 files changed, 35 insertions(+), 104 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/process_group.py b/python/paddle/distributed/auto_parallel/static/process_group.py index 49f893368dccd0..4dba7898f3a160 100644 --- a/python/paddle/distributed/auto_parallel/static/process_group.py +++ b/python/paddle/distributed/auto_parallel/static/process_group.py @@ -160,58 +160,42 @@ def instantiate(self): strategy.nrings = 1 if core.is_compiled_with_cuda(): place = core.CUDAPlace(genv.device_id) - use_new_comm = paddle.get_flags( - "FLAGS_dynamic_static_unified_comm" - )["FLAGS_dynamic_static_unified_comm"] - if use_new_comm: - store = 
core.create_or_get_global_tcp_store() - endpoints_str = "" - for endpoint in strategy.trainer_endpoints: - endpoints_str += endpoint - endpoints_str += f"ring_id:{ring_id}" - endpoints_str_hash = hashlib.md5( - endpoints_str.encode(encoding='UTF-8') - ).hexdigest() - - core.CommContextManager.set_device_id(genv.device_id) - core.CommContextManager.create_nccl_comm_context( - store, - str(ring_id), - strategy.local_rank, - strategy.nranks, - endpoints_str_hash, - ) - else: - core.NCCLParallelContext(strategy, place).init_with_ring_id( - ring_id - ) + store = core.create_or_get_global_tcp_store() + endpoints_str = "" + for endpoint in strategy.trainer_endpoints: + endpoints_str += endpoint + endpoints_str += f"ring_id:{ring_id}" + endpoints_str_hash = hashlib.md5( + endpoints_str.encode(encoding='UTF-8') + ).hexdigest() + + core.CommContextManager.set_device_id(genv.device_id) + core.CommContextManager.create_nccl_comm_context( + store, + str(ring_id), + strategy.local_rank, + strategy.nranks, + endpoints_str_hash, + ) elif core.is_compiled_with_xpu(): place = core.XPUPlace(genv.device_id) - use_new_comm = paddle.get_flags( - "FLAGS_dynamic_static_unified_comm" - )["FLAGS_dynamic_static_unified_comm"] - if use_new_comm: - store = core.create_or_get_global_tcp_store() - endpoints_str = "" - for endpoint in strategy.trainer_endpoints: - endpoints_str += endpoint - endpoints_str += f"ring_id:{ring_id}" - endpoints_str_hash = hashlib.md5( - endpoints_str.encode(encoding='UTF-8') - ).hexdigest() - - core.CommContextManager.set_device_id(genv.device_id) - core.CommContextManager.create_bkcl_comm_context( - store, - str(ring_id), - strategy.local_rank, - strategy.nranks, - endpoints_str_hash, - ) - else: - core.BKCLParallelContext(strategy, place).init_with_ring_id( - ring_id - ) + store = core.create_or_get_global_tcp_store() + endpoints_str = "" + for endpoint in strategy.trainer_endpoints: + endpoints_str += endpoint + endpoints_str += f"ring_id:{ring_id}" + endpoints_str_hash = hashlib.md5( + endpoints_str.encode(encoding='UTF-8') + ).hexdigest() + + core.CommContextManager.set_device_id(genv.device_id) + core.CommContextManager.create_bkcl_comm_context( + store, + str(ring_id), + strategy.local_rank, + strategy.nranks, + endpoints_str_hash, + ) elif genv.device_type in core.get_all_custom_device_type(): place = core.CustomPlace(genv.device_type, genv.device_id) core.XCCLParallelContext(strategy, place).init_with_ring_id( diff --git a/python/paddle/distributed/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py index 0da733c0f24c65..34eb192c106b17 100644 --- a/python/paddle/distributed/fleet/base/private_helper_function.py +++ b/python/paddle/distributed/fleet/base/private_helper_function.py @@ -11,12 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
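The endpoint hashing in process_group.py above gives every rank the same deterministic key when creating a comm context: all ranks hold an identical trainer endpoint list, so hashing it together with the ring id yields one shared identifier for the TCP-store rendezvous. Condensed into a standalone sketch (the function name is illustrative, not part of the patch):

    import hashlib

    def comm_endpoints_hash(trainer_endpoints, ring_id):
        # Concatenate the shared endpoint list plus the ring id, exactly as
        # instantiate() does, and hash it into a fixed-size store key.
        endpoints_str = "".join(trainer_endpoints) + f"ring_id:{ring_id}"
        return hashlib.md5(endpoints_str.encode(encoding="UTF-8")).hexdigest()
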
-import socket -import sys -import time -from contextlib import closing -import paddle __all__ = [] @@ -35,39 +30,4 @@ def wait_server_ready(endpoints): >>> wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) """ - try: - use_new_comm = paddle.get_flags("FLAGS_dynamic_static_unified_comm")[ - "FLAGS_dynamic_static_unified_comm" - ] - except: - use_new_comm = False - - if use_new_comm: - return - assert not isinstance(endpoints, str) - while True: - all_ok = True - not_ready_endpoints = [] - for ep in endpoints: - ip_port = ep.split(":") - with closing( - socket.socket(socket.AF_INET, socket.SOCK_STREAM) - ) as sock: - sock.settimeout(2) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - if hasattr(socket, 'SO_REUSEPORT'): - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) - - result = sock.connect_ex((ip_port[0], int(ip_port[1]))) - if result != 0: - all_ok = False - not_ready_endpoints.append(ep) - if not all_ok: - sys.stderr.write("server not ready, wait 3 sec to retry...\n") - sys.stderr.write( - "not ready endpoints:" + str(not_ready_endpoints) + "\n" - ) - sys.stderr.flush() - time.sleep(3) - else: - break + return diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index 75be5f621d4124..8147a957796e0f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -97,13 +97,6 @@ def _init_communicator( other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) - if rank == 0 and wait_port: - use_new_comm = paddle.get_flags( - "FLAGS_dynamic_static_unified_comm" - )["FLAGS_dynamic_static_unified_comm"] - if not use_new_comm: - wait_server_ready(other_endpoints) - def _add_sync_by_allreduce(block): sync_var = block.create_var( name=unique_name.generate('sync_var'), diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 045befd1f7bd28..07de62d3039f89 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -14,7 +14,6 @@ import os -import paddle from paddle.base import core from paddle.incubate.optimizer import PipelineOptimizer from paddle.static import ( @@ -705,11 +704,6 @@ def minimize_impl( self._recreate_not_persist_param_as_var() self._dump_program_for_debug() - use_new_comm = paddle.get_flags("FLAGS_dynamic_static_unified_comm")[ - "FLAGS_dynamic_static_unified_comm" - ] - if not use_new_comm: - self._wait() return optimize_ops, params_grads def _init_pair_comm(self, pair, ring_id): From dbf9de203ae3588b98edc1337c6563fb7ee5e99f Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 14:48:15 +0800 Subject: [PATCH 49/57] [fluid_ops] c_scatter remove FLAGS_dynamic_static_unified_comm (#70717) --- paddle/phi/kernels/gpu/c_scatter_kernel.cu | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/paddle/phi/kernels/gpu/c_scatter_kernel.cu b/paddle/phi/kernels/gpu/c_scatter_kernel.cu index 4ea62f468e58e9..8598b787d524d7 100644 --- a/paddle/phi/kernels/gpu/c_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/c_scatter_kernel.cu @@ -51,20 +51,8 @@ void CScatterOpCUDAKernel(const Context& dev_ctx, common::errors::InvalidArgument( "The ring_id (%d) for c_scatter_op must be non-negative.", ring_id)); - const auto& comm_context_manager = - 
phi::distributed::CommContextManager::GetInstance(); - - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(ring_id))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(ring_id))); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( From 7040bb18b8a0b9067030f4c7086638b3d819a593 Mon Sep 17 00:00:00 2001 From: bukejiyu <52310069+bukejiyu@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:19:14 +0800 Subject: [PATCH 50/57] [infrence]fix openvino.cmake (#70701) --- cmake/external/openvino.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/openvino.cmake b/cmake/external/openvino.cmake index f08f987cff1b6d..dea1fd4625d0d3 100644 --- a/cmake/external/openvino.cmake +++ b/cmake/external/openvino.cmake @@ -103,7 +103,7 @@ file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/openvino/convert.patch native_convert) set(OPENVINO_PATCH_COMMAND - git checkout -- . && git fetch --depth=1 origin && git + git checkout -- . && git fetch --depth=1 origin ${OPENVINO_COMMIT} && git checkout ${OPENVINO_COMMIT} && patch -Np1 -d ${SOURCE_DIR} < ${native_convert} || true) From 4077efe2b1d0504b7e0bb1b8837b8f8c3f6bb970 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 9 Jan 2025 16:25:01 +0800 Subject: [PATCH 51/57] [Inference]Support some pass use in converter (#70529) * support pass in converter * fix bugs * fix unittest * resolve conflict * resolve unittest * fix unittest * fix unittest * reduce file * perfect comment --- .../fluid/pir/dialect/operator/utils/utils.cc | 8 + .../general/constant_folding_pass.cc | 4 +- paddle/fluid/pybind/pir.cc | 6 + python/paddle/tensorrt/converter.py | 196 +++++++++++------- python/paddle/tensorrt/converter_utils.py | 21 +- python/paddle/tensorrt/export.py | 23 +- python/paddle/tensorrt/impls/common.py | 8 +- python/paddle/tensorrt/impls/conv.py | 3 + python/paddle/tensorrt/impls/creation.py | 12 +- python/paddle/tensorrt/impls/input.py | 5 + python/paddle/tensorrt/impls/manipulation.py | 78 +++---- python/paddle/tensorrt/impls/math.py | 24 +-- python/paddle/tensorrt/impls/others.py | 43 +--- python/paddle/tensorrt/impls/pooling.py | 9 +- python/paddle/tensorrt/impls/search.py | 24 +-- python/paddle/tensorrt/util.py | 25 ++- test/tensorrt/tensorrt_test_base.py | 21 +- test/tensorrt/test_converter_conv.py | 22 ++ test/tensorrt/test_converter_model_bert.py | 1 + 19 files changed, 311 insertions(+), 222 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc index 01e754b6889585..05c3337d4c2c31 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.cc +++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc @@ -66,6 +66,8 @@ enum class AttrType { STRING, + TENSOR_NAME, + NUM_ATTR_TYPES, }; @@ -90,6 +92,8 @@ static inline AttrType GetAttributeType(const pir::Attribute& attr) { return AttrType::DATA_TYPE; } else if (attr.isa()) { return AttrType::PLACE; + } else if (attr.isa()) { + return AttrType::TENSOR_NAME; } else { PADDLE_THROW(common::errors::Unimplemented( "Unsupported ir Attribute type when casting it into " @@ -141,6 +145,10 @@ static std::function GetAttrCast( [](const pir::Attribute& attr) { return 
T{attr.dyn_cast().data()}; }}, + {AttrType::TENSOR_NAME, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast().data()}; + }}, {AttrType::ARRAY, [](const pir::Attribute& attr) { auto attr_vec = attr.dyn_cast().AsVector(); diff --git a/paddle/fluid/pir/transforms/general/constant_folding_pass.cc b/paddle/fluid/pir/transforms/general/constant_folding_pass.cc index 083c3cb9f63317..66669e276ee2ae 100644 --- a/paddle/fluid/pir/transforms/general/constant_folding_pass.cc +++ b/paddle/fluid/pir/transforms/general/constant_folding_pass.cc @@ -47,6 +47,7 @@ #include "paddle/pir/include/core/region.h" #include "paddle/pir/include/core/value.h" #include "paddle/pir/include/pass/pass.h" +#include "paddle/pir/include/pass/pass_registry.h" #include "paddle/pir/include/pattern_rewrite/frozen_rewrite_pattern_set.h" #include "paddle/pir/include/pattern_rewrite/pattern_match.h" #include "paddle/pir/include/pattern_rewrite/pattern_rewrite_driver.h" @@ -300,7 +301,6 @@ class ConstantFoldingPattern : public pir::RewritePattern { } paddle::framework::InterpreterCore core( place_, {}, kernel_program->block(), scope_, *exe_config_); - core.Run({}); return output_var_names; } @@ -557,3 +557,5 @@ std::unique_ptr CreateConstantFoldingPass() { } } // namespace pir + +REGISTER_IR_PASS(constant_folding_pass, ConstantFoldingPass); diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index d0e2b99ddb991c..335fb3ebc7ff2c 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -2676,6 +2676,12 @@ void BindPassManager(pybind11::module *m) { pass->Set(attr.first, new int(attr.second.cast())); } else if (py::isinstance(attr.second)) { pass->Set(attr.first, new float(attr.second.cast())); + } else if (py::isinstance(attr.second)) { + pass->SetNotOwned(attr.first, + attr.second.cast()); + } else if (py::isinstance(attr.second)) { + pass->Set(attr.first, + new phi::Place(attr.second.cast())); } else { PADDLE_THROW(common::errors::InvalidArgument( "The pass attr is not supported this type.")); diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py index 3e7b32d400042b..2017886fac5218 100644 --- a/python/paddle/tensorrt/converter.py +++ b/python/paddle/tensorrt/converter.py @@ -27,6 +27,7 @@ from paddle import pir from paddle.base.core import clear_shape_info, get_value_shape_range_info from paddle.base.log_helper import get_logger +from paddle.pir.core import _PADDLE_PIR_DTYPE_2_NUMPY_DTYPE from .impls.activation import * # noqa: F403 from .impls.attribute import * # noqa: F403 @@ -75,8 +76,10 @@ def __init__(self, paddle_program, scope, trt_config=None): # save parameters for v in params: name = v.get_defining_op().attrs()["parameter_name"] - weight_array = np.array(self.scope.var(name).get_tensor()) - # weights = trt.Weights(weight_array) + if self.scope.find_var(name) is None: + weight_array = None + else: + weight_array = np.array(self.scope.var(name).get_tensor()) param_dict.update({name: weight_array}) self.param_dict = param_dict @@ -150,6 +153,7 @@ def convert_subgraph_to_trt(self, program, group_op): opt_value_map = {} max_value_map = {} input_names = [] + new_input_values = [] # Because one of the inputs to pd_op.concat is builtin.combine, # during the conversion process using the converter, @@ -172,7 +176,24 @@ def convert_subgraph_to_trt(self, program, group_op): param_name = defining_op.attrs()["parameter_name"] weight = trt.Weights(self.param_dict[param_name]) value_to_trt_tensor[value.id] = weight - input_names.append("") + elif 
defining_op.name() == "builtin.constant": + constant_value_name = defining_op.attrs()["value"] + constant_tensor = self.scope.var( + constant_value_name + ).get_tensor() + out_dtype = np.dtype( + _PADDLE_PIR_DTYPE_2_NUMPY_DTYPE[value.dtype] + ) + if out_dtype == np.dtype("float64"): + out_dtype = np.dtype("float32") + if out_dtype == np.dtype("int64"): + out_dtype = np.dtype("int32") + constant_data = np.array(constant_tensor, dtype=out_dtype) + if len(constant_data) == 0: + value_to_trt_tensor[value.id] = None + else: + constant_tensor = trt.Weights(constant_data) + value_to_trt_tensor[value.id] = constant_tensor else: shape = value.shape dtype = map_dtype(value.dtype.name) @@ -184,6 +205,7 @@ def convert_subgraph_to_trt(self, program, group_op): name=input_name, dtype=dtype, shape=shape ) input_names.append(input_name) + new_input_values.append(value) value_to_trt_tensor[value.id] = input_tensor for op in operations: @@ -196,6 +218,9 @@ def convert_subgraph_to_trt(self, program, group_op): if not source.initialized(): operands.append(None) continue + vec_type = source.type().as_vec_type() + if vec_type is not None and len(vec_type.as_list()) == 0: + continue define_op_name = source.get_defining_op().name() if define_op_name == "builtin.combine": operand_list = [] @@ -242,6 +267,10 @@ def convert_subgraph_to_trt(self, program, group_op): for idx, result in enumerate(op.results()): if result.is_combine(): + # empty vec value condition + if len(result.type().as_vec_type().as_list()) == 0: + results.append(result) + continue used_ops = result.all_used_ops() for use_op in used_ops: if use_op.name() == "builtin.split": @@ -249,6 +278,7 @@ def convert_subgraph_to_trt(self, program, group_op): results.extend(split_outputs) else: results.append(result) + for idx, result in enumerate(results): if idx < len(trt_outs): value_to_trt_tensor[result.id] = trt_outs[idx] @@ -258,83 +288,86 @@ def convert_subgraph_to_trt(self, program, group_op): # Set TRT min/opt/max input shape and the value of shape tensor for i, value in enumerate(origin_input_value): trt_input = value_to_trt_tensor[value.id] - if isinstance(trt_input, trt.Weights): + defining_op_name = value.get_defining_op().name() + if ( + defining_op_name == "builtin.parameter" + or defining_op_name == "builtin.constant" + ): + # constant/parameter condition, needn't get min/opt/max shape continue input_name = trt_input.name - if input_name != "": - _logger.info( - f"set shape of {value}, op is: {value.get_defining_op()}" + _logger.info( + f"set shape of {value}, op is: {value.get_defining_op()}" + ) + min_shape = [] + opt_shape = [] + max_shape = [] + min_value = [] + opt_value = [] + max_value = [] + + value_define_op = value.get_defining_op() + # if the input value is generated by the other trt_engine_op, so the shape is searched by origin value + if ( + value_define_op.name() == "builtin.split" + and value_define_op.operand_source(0).get_defining_op().name() + == "pd_op.tensorrt_engine" + ): + min_shape = self.input_info[value.id]["min_shape"] + opt_shape = self.input_info[value.id]["opt_shape"] + max_shape = self.input_info[value.id]["max_shape"] + if trt_input.is_shape_tensor: + min_value = self.input_info[value.id]["min_value"] + opt_value = self.input_info[value.id]["opt_value"] + max_value = self.input_info[value.id]["max_value"] + else: + min_shape = get_value_shape_range_info( + value, False, paddle.base.core.ShapeMode.kMIN ) - min_shape = [] - opt_shape = [] - max_shape = [] - min_value = [] - opt_value = [] - max_value = [] - - 
value_define_op = value.get_defining_op() - # if the input value is generated by the other trt_engine_op, so the shape is searched by origin value - if ( - value_define_op.name() == "builtin.split" - and value_define_op.operand_source(0) - .get_defining_op() - .name() - == "pd_op.tensorrt_engine" - ): - min_shape = self.input_info[value.id]["min_shape"] - opt_shape = self.input_info[value.id]["opt_shape"] - max_shape = self.input_info[value.id]["max_shape"] - if trt_input.is_shape_tensor: - min_value = self.input_info[value.id]["min_value"] - opt_value = self.input_info[value.id]["opt_value"] - max_value = self.input_info[value.id]["max_value"] - else: - min_shape = get_value_shape_range_info( - value, False, paddle.base.core.ShapeMode.kMIN - ) - opt_shape = get_value_shape_range_info( - value, False, paddle.base.core.ShapeMode.kOPT - ) - max_shape = get_value_shape_range_info( - value, False, paddle.base.core.ShapeMode.kMAX - ) - if trt_input.is_shape_tensor: - min_value = get_value_shape_range_info( - value, True, paddle.base.core.ShapeMode.kMIN - ) - opt_value = get_value_shape_range_info( - value, True, paddle.base.core.ShapeMode.kOPT - ) - max_value = get_value_shape_range_info( - value, True, paddle.base.core.ShapeMode.kMAX - ) - if not trt_input.is_shape_tensor: - _logger.info(f"set min_shape of {value} as {min_shape}") - _logger.info(f"set opt_shape of {value} as {opt_shape}") - _logger.info(f"set max_shape of {value} as {max_shape}") - profile.set_shape( - input_name, min=min_shape, opt=opt_shape, max=max_shape - ) - else: - _logger.info( - f"set min_value of shape input: {value} as {min_value}" - ) - _logger.info( - f"set max_value of shape input: {value} as {opt_value}" + opt_shape = get_value_shape_range_info( + value, False, paddle.base.core.ShapeMode.kOPT + ) + max_shape = get_value_shape_range_info( + value, False, paddle.base.core.ShapeMode.kMAX + ) + + if trt_input.is_shape_tensor: + min_value = get_value_shape_range_info( + value, True, paddle.base.core.ShapeMode.kMIN ) - _logger.info( - f"set opt_value of shape input: {value} as {max_value}" + opt_value = get_value_shape_range_info( + value, True, paddle.base.core.ShapeMode.kOPT ) - profile.set_shape_input( - input_name, min=min_value, opt=opt_value, max=max_value + max_value = get_value_shape_range_info( + value, True, paddle.base.core.ShapeMode.kMAX ) + if not trt_input.is_shape_tensor: + _logger.info(f"set min_shape of {value} as {min_shape}") + _logger.info(f"set opt_shape of {value} as {opt_shape}") + _logger.info(f"set max_shape of {value} as {max_shape}") + profile.set_shape( + input_name, min=min_shape, opt=opt_shape, max=max_shape + ) + else: + _logger.info( + f"set min_value of shape input: {value} as {min_value}" + ) + _logger.info( + f"set max_value of shape input: {value} as {opt_value}" + ) + _logger.info( + f"set opt_value of shape input: {value} as {max_value}" + ) + profile.set_shape_input( + input_name, min=min_value, opt=opt_value, max=max_value + ) - min_shape_map[input_name] = min_shape - opt_shape_map[input_name] = opt_shape - max_shape_map[input_name] = max_shape - min_value_map[input_name] = min_value - opt_value_map[input_name] = opt_value - max_value_map[input_name] = max_value + min_shape_map[input_name] = min_shape + opt_shape_map[input_name] = opt_shape + max_shape_map[input_name] = max_shape + min_value_map[input_name] = min_value + opt_value_map[input_name] = opt_value + max_value_map[input_name] = max_value out_shapes = [] out_names = [] @@ -473,7 +506,7 @@ def 
convert_subgraph_to_trt(self, program, group_op): with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(program): pir.set_insertion_point(group_op) out = paddle._C_ops.tensorrt_engine( - origin_input_value, + new_input_values, trt_params, input_names, out_names, @@ -533,5 +566,20 @@ def convert_program_to_trt(self): orin_out_values[o_i].replace_all_uses_with(new_out[o_i]) self.program.global_block().remove_op(op) + + save_one_parameter = ( + False # We need to keep at least one parameter for save + ) + for op in self.program.global_block().ops: + if op.name() == "builtin.parameter": + if not save_one_parameter: + save_one_parameter = True + continue + if op.results()[0].use_empty(): + self.program.global_block().remove_op(op) + if op.name() == "builtin.constant": + if op.results()[0].use_empty(): + self.program.global_block().remove_op(op) + # Call clear_shape_info to clear the previous shape information clear_shape_info() diff --git a/python/paddle/tensorrt/converter_utils.py b/python/paddle/tensorrt/converter_utils.py index 76ccec354b0c5e..dfb38f13563241 100644 --- a/python/paddle/tensorrt/converter_utils.py +++ b/python/paddle/tensorrt/converter_utils.py @@ -453,6 +453,7 @@ def trt_reduce_to_scalar(network, tensor, dtype=trt.int32): def convert_conv2d(network, paddle_op, inputs): from paddle.tensorrt.util import support_fp32_mix_precision + bias = None if ( paddle_op.name() == "pd_op.conv2d" or paddle_op.name() == "pd_op.depthwise_conv2d" @@ -469,7 +470,8 @@ def convert_conv2d(network, paddle_op, inputs): output_size = None else: raise ValueError("Invalid number of inputs for conv2d_transpose") - + if paddle_op.name() == "pd_op.fused_conv2d_add_act": + input_tensor, filter, bias, _ = inputs input_shape = paddle_op.operands()[0].source().shape filter_shape = paddle_op.operands()[1].source().shape @@ -521,13 +523,14 @@ def convert_conv2d(network, paddle_op, inputs): if ( paddle_op.name() == "pd_op.conv2d" or paddle_op.name() == "pd_op.depthwise_conv2d" + or paddle_op.name() == "pd_op.fused_conv2d_add_act" ): layer = network.add_convolution_nd( input=input_tensor, num_output_maps=n_output, kernel_shape=nv_ksize, kernel=filter, - bias=None, + bias=bias, ) elif ( paddle_op.name() == "pd_op.conv2d_transpose" @@ -564,9 +567,21 @@ def convert_conv2d(network, paddle_op, inputs): return layer.get_output(0) +def get_input_constant_value(paddle_op, inputs, input_index): + input_op = paddle_op.operands()[input_index].source().get_defining_op() + if input_op.name() == "builtin.constant": + return inputs[input_index].numpy().tolist() + elif input_op.name() == "pd_op.full_int_array": + return input_op.attrs()["value"] + elif input_op.name() == "pd_op.full": + return [input_op.attrs()["value"]] + else: + return None + + def add_reduce_layer(network, paddle_op, inputs, op_type): input_tensor = inputs[0] - axis = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axis = get_input_constant_value(paddle_op, inputs, 1) input_shape = paddle_op.operands()[0].source().shape keepdim = paddle_op.attrs()["keepdim"] if network.has_implicit_batch_dimension: diff --git a/python/paddle/tensorrt/export.py b/python/paddle/tensorrt/export.py index 044f58f0041908..126f3086b1d514 100644 --- a/python/paddle/tensorrt/export.py +++ b/python/paddle/tensorrt/export.py @@ -172,6 +172,7 @@ def __init__( precision_mode: PrecisionMode = PrecisionMode.FP32, ops_run_float: str | list | None = None, optimization_level: int | None = 3, + disable_passes: list = [], ) -> None: """ A class for configuring 
TensorRT optimizations. @@ -196,6 +197,8 @@ def __init__( The directory where the optimized model will be saved (default is None). optimization_level (int, optional): Set TensorRT optimization level (default is 3). Only supported in TensorRT versions greater than 8.6. + disable_passes : (str|list, optional): + A list of string representing the names of pass that should not be used for origin program (default is []). Returns: None @@ -226,6 +229,7 @@ def __init__( self.precision_mode = precision_mode self.ops_run_float = ops_run_float self.disable_ops = disable_ops + self.disable_passes = disable_passes self.optimization_level = optimization_level paddle.framework.set_flags( {'FLAGS_trt_min_group_size': min_subgraph_size} @@ -257,18 +261,23 @@ def convert_to_trt(program, trt_config, scope): opt_shape_feed[feed_name[i]] = opt_data max_shape_feed[feed_name[i]] = max_data - # run warmup for collecting shape - program = warmup_shape_infer( + # run pir pass (including trt_op_marker_pass) + program_with_pir = run_pir_pass( program, + partition_mode=False, + disable_passes=trt_config.disable_passes, + scope=scope, + ) + + # run warmup for collecting shape + program = warmup_shape_infer( + program_with_pir, min_shape_feed=min_shape_feed, opt_shape_feed=opt_shape_feed, max_shape_feed=max_shape_feed, scope=scope, ) - # run pir pass (including trt_op_marker_pass) - program_with_pir = run_pir_pass(program, partition_mode=False) - # specify certain operators to be excluded from entering TensorRT if trt_config.disable_ops: forbid_op_lower_trt(program, trt_config.disable_ops) @@ -277,7 +286,9 @@ def convert_to_trt(program, trt_config, scope): mark_builtin_op(program) # run pir pass (including trt_sub_graph_extract_pass) - program_with_pir = run_pir_pass(program, partition_mode=True) + program_with_pir = run_pir_pass( + program, partition_mode=True, scope=scope + ) # Step4: run TRTConverter (would lower group_op into tensorrt_engine_op) converter = PaddleToTensorRTConverter( diff --git a/python/paddle/tensorrt/impls/common.py b/python/paddle/tensorrt/impls/common.py index 77db6ba90ab840..fef1eef69b8328 100644 --- a/python/paddle/tensorrt/impls/common.py +++ b/python/paddle/tensorrt/impls/common.py @@ -17,7 +17,10 @@ import tensorrt as trt from paddle import pir -from paddle.tensorrt.converter_utils import get_shape_tensor_element +from paddle.tensorrt.converter_utils import ( + get_input_constant_value, + get_shape_tensor_element, +) from paddle.tensorrt.register import converter_registry from paddle.tensorrt.util import get_trt_version_list @@ -25,8 +28,7 @@ @converter_registry.register("pd_op.dropout", trt_version="8.x") def dropout_converter(network, paddle_op, inputs): input_x = inputs[0] - p_defining_op = paddle_op.operands()[2].source().get_defining_op() - dropout_prob = p_defining_op.attrs()["value"] + dropout_prob = get_input_constant_value(paddle_op, inputs, 2)[0] downgrade_in_infer = paddle_op.attrs().get("mode") if downgrade_in_infer == "upscale_in_train": diff --git a/python/paddle/tensorrt/impls/conv.py b/python/paddle/tensorrt/impls/conv.py index 55db36b9aa7db1..48b3dee19b58f0 100644 --- a/python/paddle/tensorrt/impls/conv.py +++ b/python/paddle/tensorrt/impls/conv.py @@ -19,6 +19,9 @@ @converter_registry.register("pd_op.depthwise_conv2d", trt_version="8.x") @converter_registry.register("pd_op.conv2d", trt_version="trt_version_ge=8.0") +@converter_registry.register( + "pd_op.fused_conv2d_add_act", trt_version="trt_version_ge=8.0" +) @converter_registry.register("pd_op.conv2d_transpose", 
trt_version="8.x") @converter_registry.register( "pd_op.depthwise_conv2d_transpose", trt_version="8.x" diff --git a/python/paddle/tensorrt/impls/creation.py b/python/paddle/tensorrt/impls/creation.py index 59cdaa4ad025ad..d45f2a15886909 100644 --- a/python/paddle/tensorrt/impls/creation.py +++ b/python/paddle/tensorrt/impls/creation.py @@ -20,6 +20,7 @@ from paddle.tensorrt.converter_utils import ( add_1D_constant_layer, cast_tensor, + get_input_constant_value, resize_to_1d, trt_cast, trt_floor_div, @@ -148,9 +149,8 @@ def full_like_converter(network, paddle_op, inputs): f"cast converter currently doesn't support dtype: {out_dtype}" ) - value_op = paddle_op.operands()[1].source().get_defining_op() - if value_op.name() == "pd_op.full": - fill_value = value_op.attrs()["value"] + fill_value = get_input_constant_value(paddle_op, inputs, 1) + if fill_value is not None: value = network.add_constant( (1,), np.array( @@ -206,9 +206,9 @@ def full_with_tensor_converter(network, paddle_op, inputs): else: shape_tensor_list = [shape_tensor] - shape_op = paddle_op.operands()[1].source().get_defining_op() - if shape_op.name() == "pd_op.full_int_array": - shape_tensor = shape_op.attrs()["value"] + shape_val = get_input_constant_value(paddle_op, inputs, 1) + if shape_val is not None: + shape_tensor = shape_val is_static_shape = True else: shape_tensor = inputs[1] diff --git a/python/paddle/tensorrt/impls/input.py b/python/paddle/tensorrt/impls/input.py index 8098a9d1264612..385958910c8ad4 100644 --- a/python/paddle/tensorrt/impls/input.py +++ b/python/paddle/tensorrt/impls/input.py @@ -47,6 +47,11 @@ def one_hot_converter(network, paddle_op, inputs): values_tensor = add_1D_constant_layer(network, values_data, dtype=np_dtype) + if isinstance(num_classes_tensor, trt.Weights): + num_classes_tensor = network.add_constant( + paddle_op.operands()[1].source().shape, num_classes_tensor + ).get_output(0) + reshape_layer = network.add_shuffle(num_classes_tensor) reshape_layer.reshape_dims = () depth_tensor = reshape_layer.get_output(0) diff --git a/python/paddle/tensorrt/impls/manipulation.py b/python/paddle/tensorrt/impls/manipulation.py index 8f005518d618c7..3f2084ed926bc4 100644 --- a/python/paddle/tensorrt/impls/manipulation.py +++ b/python/paddle/tensorrt/impls/manipulation.py @@ -22,6 +22,7 @@ cast_tensor, fix_negative_indices, get_axes_for_reduce_op, + get_input_constant_value, get_shape_tensor_element, has_dynamic_shape, resize_to_1d, @@ -47,9 +48,8 @@ def reshape_converter(network, paddle_op, inputs): x = inputs[0] is_constant_shape = False - shape_defining_op = paddle_op.operands()[1].source().get_defining_op() - if shape_defining_op.name() == "pd_op.full_int_array": - shape = shape_defining_op.attrs()["value"] + shape = get_input_constant_value(paddle_op, inputs, 1) + if shape is not None: reshape_dim = shape is_constant_shape = True elif isinstance(inputs[1], list): @@ -177,7 +177,7 @@ def concat_converter(network, paddle_op, inputs): axis_tensor = inputs[1] concat_layer = network.add_concatenation(inputs=input_tensors) - axis = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axis = get_input_constant_value(paddle_op, inputs, 1)[0] axis = int(axis) if axis < 0: axis = len(input_tensors[0].shape) + axis @@ -195,7 +195,7 @@ def concat_converter(network, paddle_op, inputs): def unsqueeze_converter(network, paddle_op, inputs): x = inputs[0] input_dims = x.shape - axes = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axes = get_input_constant_value(paddle_op, 
inputs, 1) assert ( len(axes) > 0 ), f"axes size should be > 0 in when convert unsqueeze op in TensorRT, but received len(axes) = {len(axes)}." @@ -250,15 +250,8 @@ def squeeze_converter(network, paddle_op, inputs): input_val = network.add_constant(input_shape, input_val).get_output(0) # Get axis - axis = ( - paddle_op.operands()[1] - .source() - .get_defining_op() - .attrs() - .get("value", []) - ) - - if not axis: + axis = get_input_constant_value(paddle_op, inputs, 1) + if len(axis) == 0: for i in range(input_shape_size): if input_shape[i] == -1: raise RuntimeError( @@ -307,9 +300,8 @@ def expand_converter(network, paddle_op, inputs): rank = len(input_dims) paddle_shape_tensor = paddle_op.operands()[1].source() - shape_tensor_source_op = paddle_shape_tensor.get_defining_op() - if shape_tensor_source_op.name() == "pd_op.full_int_array": - shape = shape_tensor_source_op.attrs()["value"] + shape = get_input_constant_value(paddle_op, inputs, 1) + if shape is not None: shape_tensor = add_1D_constant_layer(network, shape) shape_rank = len(shape) elif paddle_shape_tensor.type().as_vec_type(): @@ -376,8 +368,6 @@ def slice_converter(network, paddle_op, inputs): axes = paddle_op.attrs()["axes"] decrease_axis = paddle_op.attrs().get("decrease_axis") - starts_op = paddle_op.operands()[1].source().get_defining_op() - ends_op = paddle_op.operands()[2].source().get_defining_op() input_shape_tensor = trt_shape(network, input_tensor) input_rank = len(input_tensor.shape) @@ -389,8 +379,8 @@ def slice_converter(network, paddle_op, inputs): get_shape_tensor_element(network, input_shape_tensor, i) ) - if starts_op.name() == "pd_op.full_int_array": - starts = starts_op.attrs()["value"] + starts = get_input_constant_value(paddle_op, inputs, 1) + if starts is not None: assert len(starts) == len( axes ), f"The size of this starts: {len(starts)} must be equal to the axes: {len(axes)}." @@ -422,8 +412,8 @@ def slice_converter(network, paddle_op, inputs): network, starts, idx ) - if ends_op.name() == "pd_op.full_int_array": - ends = ends_op.attrs()["value"] + ends = get_input_constant_value(paddle_op, inputs, 2) + if ends is not None: assert len(ends) == len( axes ), f"The size of this ends: {len(ends)} must be equal to the axes: {len(axes)}." 
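The converters in these hunks all funnel operand handling through the get_input_constant_value helper added to converter_utils.py above, which folds builtin.constant, pd_op.full_int_array, and pd_op.full operands into plain Python lists and returns None for values only known at runtime. The shared dispatch idiom, sketched with a hypothetical helper:

    from paddle.tensorrt.converter_utils import get_input_constant_value

    def read_operand(paddle_op, inputs, index):
        # Constant path: fold the value straight into layer attributes.
        value = get_input_constant_value(paddle_op, inputs, index)
        if value is not None:
            return value, True
        # Dynamic path: keep the operand as a live ITensor in the TRT graph.
        return inputs[index], False
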
@@ -500,9 +490,8 @@ def split_with_num_converter(network, paddle_op, inputs): input_shape_size = len(input_tensor.shape) # Handle the case where axis is of type pir::Value - axis_op = paddle_op.operands()[1].source().get_defining_op() - if axis_op.name() == "pd_op.full": - axis_value = axis_op.attrs()["value"] + axis_value = get_input_constant_value(paddle_op, inputs, 1) + if axis_value is not None: axis_tensor = add_1D_constant_layer(network, axis_value) else: axis_tensor = inputs[1] @@ -576,18 +565,16 @@ def split_converter(network, paddle_op, inputs): input_shape = input_tensor.shape input_shape_size = len(input_shape) - axis_op = paddle_op.operands()[2].source().get_defining_op() - if axis_op.name() == "pd_op.full": - axis_value = axis_op.attrs()["value"] + axis_value = get_input_constant_value(paddle_op, inputs, 2) + if axis_value is not None: axis_tensor = add_1D_constant_layer(network, axis_value) else: axis_tensor = inputs[2] axis_tensor = cast_tensor(network, axis_tensor, trt.int32) # Retrieve and process sections - sections_op = paddle_op.operands()[1].source().get_defining_op() - if sections_op.name() == "pd_op.full_int_array": - sections_value = sections_op.attrs()["value"] + sections_value = get_input_constant_value(paddle_op, inputs, 1) + if sections_value is not None: section_list = [int(s) for s in sections_value] dynamic_sections = False else: @@ -756,9 +743,8 @@ def tile_converter(network, paddle_op, inputs): input_shape_tensor = network.add_shape(input).get_output(0) rank = len(input_shape) - repeat_times_op = paddle_op.operands()[1].source().get_defining_op() - if repeat_times_op.name() == "pd_op.full_int_array": - repeat_times = repeat_times_op.attrs()["value"] + repeat_times = get_input_constant_value(paddle_op, inputs, 1) + if repeat_times is not None: repeat_tensor = add_1D_constant_layer(network, repeat_times) repeat_rank = len(repeat_times) else: @@ -809,19 +795,9 @@ def tile_converter(network, paddle_op, inputs): def strided_slice_converter(network, paddle_op, inputs): input_tensor = inputs[0] axes = paddle_op.attrs()["axes"] - - starts_op = paddle_op.operands()[1].source().get_defining_op() - ends_op = paddle_op.operands()[2].source().get_defining_op() - strides_op = paddle_op.operands()[3].source().get_defining_op() - - if starts_op.name() == "pd_op.full_int_array": - starts = starts_op.attrs()["value"] - - if ends_op.name() == "pd_op.full_int_array": - ends = ends_op.attrs()["value"] - - if strides_op.name() == "pd_op.full_int_array": - strides = strides_op.attrs()["value"] + starts = get_input_constant_value(paddle_op, inputs, 1) + ends = get_input_constant_value(paddle_op, inputs, 2) + strides = get_input_constant_value(paddle_op, inputs, 3) input_shape = input_tensor.shape nchw_input_dims = len(input_shape) @@ -886,10 +862,8 @@ def roll_converter(network, paddle_op, inputs): input_tensor = inputs[0] axis = paddle_op.attrs()["axis"] - shifts_op = paddle_op.operands()[1].source().get_defining_op() - if shifts_op.name() == "pd_op.full_int_array": - shifts = shifts_op.attrs()["value"] - else: + shifts = get_input_constant_value(paddle_op, inputs, 1) + if shifts is None: shifts = inputs[1] axis_size = len(axis) diff --git a/python/paddle/tensorrt/impls/math.py b/python/paddle/tensorrt/impls/math.py index 4247a02fbdfe5f..e260b27281ed2b 100644 --- a/python/paddle/tensorrt/impls/math.py +++ b/python/paddle/tensorrt/impls/math.py @@ -25,6 +25,7 @@ fill_constant_layer, get_axes_for_reduce_op, get_axis_length, + get_input_constant_value, get_shape_tensor_element, 
trt_cast, trt_concat, @@ -63,9 +64,8 @@ def scale_converter(network, paddle_op, inputs): reshape_layer_bias = network.add_shuffle(bias_tensor) reshape_layer_bias.set_input(1, bias_shapes_tensor) - scale_op = paddle_op.operands()[1].source().get_defining_op() - if scale_op.name() == "pd_op.full": - scale = scale_op.attrs()["value"] + scale = get_input_constant_value(paddle_op, inputs, 1) + if scale is not None: has_scale_tensor = False if is_int: scale_tensor = add_1D_constant_layer( @@ -125,7 +125,7 @@ def scale_converter(network, paddle_op, inputs): @converter_registry.register("pd_op.max", trt_version="trt_version_ge=8.0") def max_converter(network, paddle_op, inputs): input_tensor = inputs[0] - axis = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axis = get_input_constant_value(paddle_op, inputs, 1) input_shape = input_tensor.shape keepdim = paddle_op.attrs()["keepdim"] if network.has_implicit_batch_dimension: @@ -171,10 +171,10 @@ def multiply_converter(network, paddle_op, inputs): @converter_registry.register("pd_op.clip", trt_version="8.x") def clip_converter(network, paddle_op, inputs): def _get_constant_or_expand_tensor( - op, constant_inputs, input_shape_tensor, rank + value, constant_inputs, input_shape_tensor, rank ): - if op.name() == "pd_op.full": - value = op.attrs()["value"] + + if value is not None: return fill_constant_layer( network, input_shape_tensor, rank, value, input_tensor.dtype ) @@ -194,15 +194,15 @@ def _get_constant_or_expand_tensor( input_shape_tensor = network.add_shape(input_tensor).get_output(0) # handle min operation - min_op = paddle_op.operands()[1].source().get_defining_op() + min_value = get_input_constant_value(paddle_op, inputs, 1) alpha_t = _get_constant_or_expand_tensor( - min_op, inputs[1], input_shape_tensor, rank + min_value, inputs[1], input_shape_tensor, rank ) # handle max operation - max_op = paddle_op.operands()[2].source().get_defining_op() + max_value = get_input_constant_value(paddle_op, inputs, 2) beta_t = _get_constant_or_expand_tensor( - max_op, inputs[2], input_shape_tensor, rank + max_value, inputs[2], input_shape_tensor, rank ) # run the clip operation @@ -294,7 +294,7 @@ def all_converter(network, paddle_op, inputs): def cumsum_converter(network, paddle_op, inputs): input_tensor = inputs[0] dtype = input_tensor.dtype - axis = paddle_op.operands()[1].source().get_defining_op().attrs()["value"] + axis = get_input_constant_value(paddle_op, inputs, 1)[0] input_shape = input_tensor.shape rank = len(input_shape) diff --git a/python/paddle/tensorrt/impls/others.py b/python/paddle/tensorrt/impls/others.py index 8f9cafbccf758c..3aff438e0417bc 100644 --- a/python/paddle/tensorrt/impls/others.py +++ b/python/paddle/tensorrt/impls/others.py @@ -21,6 +21,7 @@ from paddle.tensorrt.converter_utils import ( add_1D_constant_layer, fill_constant_layer, + get_input_constant_value, get_shape_tensor_element, get_trt_plugin, trt_concat, @@ -164,43 +165,13 @@ def set_value_converter(network, paddle_op, inputs): paddle_op.name() == "pd_op.set_value" or paddle_op.name() == "pd_op.set_value_" ): - starts = ( - paddle_op.operands()[1] - .source() - .get_defining_op() - .attrs()["value"][0] - ) - ends = ( - paddle_op.operands()[2] - .source() - .get_defining_op() - .attrs()["value"][0] - ) - steps = ( - paddle_op.operands()[3] - .source() - .get_defining_op() - .attrs()["value"][0] - ) + starts = get_input_constant_value(paddle_op, inputs, 1)[0] + ends = get_input_constant_value(paddle_op, inputs, 2)[0] + steps = 
get_input_constant_value(paddle_op, inputs, 3)[0] else: - starts = ( - paddle_op.operands()[2] - .source() - .get_defining_op() - .attrs()["value"][0] - ) - ends = ( - paddle_op.operands()[3] - .source() - .get_defining_op() - .attrs()["value"][0] - ) - steps = ( - paddle_op.operands()[4] - .source() - .get_defining_op() - .attrs()["value"][0] - ) + starts = get_input_constant_value(paddle_op, inputs, 2)[0] + ends = get_input_constant_value(paddle_op, inputs, 3)[0] + steps = get_input_constant_value(paddle_op, inputs, 4)[0] axes = paddle_op.attrs()["axes"][0] input_dims = x.shape diff --git a/python/paddle/tensorrt/impls/pooling.py b/python/paddle/tensorrt/impls/pooling.py index a49c8a8e9026d6..372fd0a1af065d 100644 --- a/python/paddle/tensorrt/impls/pooling.py +++ b/python/paddle/tensorrt/impls/pooling.py @@ -16,6 +16,7 @@ import numpy as np import tensorrt as trt +from paddle.tensorrt.converter_utils import get_input_constant_value from paddle.tensorrt.register import converter_registry @@ -36,12 +37,10 @@ def pool2d_converter(network, paddle_op, inputs): padding_algorithm = paddle_op.attrs().get("padding_algorithm", "EXPLICIT") if not paddle_op.attrs().get("kernel_size") and len(inputs) == 2: - full_int_op = paddle_op.operands()[1].source().get_defining_op() - if full_int_op.name() == "pd_op.full_int_array": - kernel_size = full_int_op.attrs().get("value", [1, 1]) - else: + kernel_size = get_input_constant_value(paddle_op, inputs, 1) + if kernel_size is None: raise Exception( - "The defining op of kernel size must be pd_op.full_int_array" + "The defining op of kernel size must be builtin.constant/pd_op.full_int_array" ) else: kernel_size = paddle_op.attrs().get("kernel_size", [1, 1]) diff --git a/python/paddle/tensorrt/impls/search.py b/python/paddle/tensorrt/impls/search.py index de9100297b1681..100514e88ef3df 100644 --- a/python/paddle/tensorrt/impls/search.py +++ b/python/paddle/tensorrt/impls/search.py @@ -16,6 +16,7 @@ import tensorrt as trt from paddle.tensorrt.converter_utils import ( + get_input_constant_value, get_shape_tensor_element, squeeze_trt, trt_cast, @@ -41,13 +42,7 @@ def argmax_converter(network, paddle_op, inputs): x = inputs[0] input_dims = x.shape rank = len(input_dims) - axis = int( - paddle_op.operands()[1] - .source() - .get_defining_op() - .attrs() - .get("value", -1) - ) + axis = int(get_input_constant_value(paddle_op, inputs, 1)[0]) keepdims = paddle_op.attrs()["keepdims"] if axis < 0: @@ -84,13 +79,7 @@ def argmin_converter(network, paddle_op, inputs): x = inputs[0] input_dims = x.shape rank = len(input_dims) - axis = int( - paddle_op.operands()[1] - .source() - .get_defining_op() - .attrs() - .get("value", -1) - ) + axis = int(get_input_constant_value(paddle_op, inputs, 1)[0]) keepdims = paddle_op.attrs()["keepdims"] if axis < 0: @@ -171,11 +160,10 @@ def topk_converter(network, paddle_op, inputs): largest = paddle_op.attrs().get("largest", True) flag = trt.TopKOperation.MAX if largest else trt.TopKOperation.MIN - k_op = paddle_op.operands()[1].source().get_defining_op() - if k_op.name() == "pd_op.full": - k = k_op.attrs()["value"] - else: + k_list = get_input_constant_value(paddle_op, inputs, 1) + if k_list is None: raise NotImplementedError("Dynamic k is not supported in TensorRT.") + k = k_list[0] input_rank = len(input_shape) expand_to_2d = input_rank == 1 diff --git a/python/paddle/tensorrt/util.py b/python/paddle/tensorrt/util.py index fbabef8c6178d5..de286d9bfa9ac0 100644 --- a/python/paddle/tensorrt/util.py +++ b/python/paddle/tensorrt/util.py @@ 
-49,20 +49,40 @@ def map_dtype(pd_dtype): raise TypeError(f"Unsupported dtype: {pd_dtype}") -def run_pir_pass(program, partition_mode=False): +def run_pir_pass(program, partition_mode=False, disable_passes=[], scope=None): pm = pir.PassManager(opt_level=4) pm.enable_print_statistics() paddle.base.libpaddle.pir.infer_symbolic_shape_pass(pm, program) + if scope is None: + scope = paddle.static.global_scope() + place = paddle.CUDAPlace(0) passes = [ {'trt_op_marker_pass': {}}, + { + 'constant_folding_pass': { + "__place__": place, + "__param_scope__": scope, + } + }, + {'conv2d_add_fuse_pass': {}}, + {'trt_op_marker_pass': {}}, # for fusion op ] if partition_mode: passes = [{'trt_sub_graph_extract_pass': {}}] for pass_item in passes: for pass_name, pass_attr in pass_item.items(): + if pass_name in disable_passes: + continue pm.add_pass(pass_name, pass_attr) pm.run(program) + + # delete unused op + for op in program.global_block().ops: + if op.name() == "builtin.constant" or op.name() == "builtin.parameter": + if op.results()[0].use_empty(): + program.global_block().remove_op(op) + return program @@ -198,10 +218,13 @@ def weight_to_tensor(network, paddle_value, trt_tensor, use_op_name): "pd_op.batch_norm_", "pd_op.layer_norm", "pd_op.depthwise_conv2d_transpose", + "pd_op.fused_conv2d_add_act", "pd_op.affine_channel", ] if use_op_name in forbid_cast_op: return trt_tensor + if paddle_value.get_defining_op().name() == "builtin.constant": + return trt_tensor input_shape = paddle_value.shape if type(trt_tensor) == trt.Weights: return network.add_constant(input_shape, trt_tensor).get_output(0) diff --git a/test/tensorrt/tensorrt_test_base.py b/test/tensorrt/tensorrt_test_base.py index a8fc090d00bb00..48a8673b2dd6ef 100755 --- a/test/tensorrt/tensorrt_test_base.py +++ b/test/tensorrt/tensorrt_test_base.py @@ -43,6 +43,9 @@ def __init__(self, methodName='runTest'): self.max_shape = None self.target_marker_op = "" self.dynamic_shape_data = {} + self.disable_passes = [ + "constant_folding_pass", + ] def create_fake_program(self): if self.python_api is None: @@ -257,6 +260,14 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): max_shape_data[feed_name] = np.random.randn( *self.max_shape[feed_name] ).astype(self.api_args[feed_name].dtype) + + # run pir pass(including some constant fold pass, dead code elimination pass, fusion pass and trt_op_marker_pass) + main_program = run_pir_pass( + main_program, + partition_mode=False, + disable_passes=self.disable_passes, + ) + scope = paddle.static.global_scope() main_program = warmup_shape_infer( main_program, @@ -265,15 +276,11 @@ def check_trt_result(self, rtol=1e-4, atol=1e-4, precision_mode="fp32"): max_shape_feed=max_shape_data, scope=scope, ) - for op in main_program.global_block().ops[::-1]: # Remove all invalid fetch op if op.name() == "pd_op.fetch": main_program.global_block().remove_op(op) - # run pir pass(including some fusion pass and trt_op_marker_pass) - main_program = run_pir_pass(main_program, partition_mode=False) - # Adding marker labels to builtin ops facilitates convert processing, but they ultimately do not enter the TensorRT subgraph. 
mark_builtin_op(main_program) @@ -331,7 +338,11 @@ def check_marker(self, expected_result): main_program, startup_program, fetch_list = ( self.create_fake_program() ) - main_program = run_pir_pass(main_program, partition_mode=False) + main_program = run_pir_pass( + main_program, + partition_mode=False, + disable_passes=self.disable_passes, + ) marker_result = False for op in main_program.global_block().ops: if op.name() == self.target_marker_op: diff --git a/test/tensorrt/test_converter_conv.py b/test/tensorrt/test_converter_conv.py index 4c6d5c0d212341..e723cad045a66b 100644 --- a/test/tensorrt/test_converter_conv.py +++ b/test/tensorrt/test_converter_conv.py @@ -41,6 +41,7 @@ def setUp(self): self.min_shape = {"x": [1, 3, 8, 8]} self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} + self.disable_passes = ['constant_folding_pass', 'conv2d_add_fuse_pass'] def test_trt_result_fp16(self): self.check_trt_result(precision_mode="fp16") @@ -61,6 +62,7 @@ def setUp(self): self.min_shape = {"x": [1, 3, 8, 8]} self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} + self.disable_passes = ['constant_folding_pass', 'conv2d_add_fuse_pass'] def test_trt_result(self): self.check_trt_result() @@ -79,6 +81,7 @@ def setUp(self): self.min_shape = {"x": [1, 3, 8, 8]} self.opt_shape = {"x": [2, 3, 8, 8]} self.max_shape = {"x": [10, 3, 8, 8]} + self.disable_passes = ['constant_folding_pass', 'conv2d_add_fuse_pass'] def test_trt_result(self): self.check_trt_result() @@ -313,5 +316,24 @@ def test_trt_result(self): self.check_trt_result() +class TestFusedConv2dAddActTRTPattern(TensorRTBaseTest): + def setUp(self): + self.python_api = conv2d_wrapper + self.api_args = { + "x": np.random.random([2, 3, 8, 8]).astype("float32"), + } + self.program_config = {"feed_list": ["x"]} + self.min_shape = {"x": [1, 3, 8, 8]} + self.opt_shape = {"x": [2, 3, 8, 8]} + self.max_shape = {"x": [10, 3, 8, 8]} + self.disable_passes = [] + + def test_trt_result_fp16(self): + self.check_trt_result(precision_mode="fp16") + + def test_trt_result_fp32(self): + self.check_trt_result() + + if __name__ == '__main__': unittest.main() diff --git a/test/tensorrt/test_converter_model_bert.py b/test/tensorrt/test_converter_model_bert.py index d2f163757935bb..1435a396668fd6 100644 --- a/test/tensorrt/test_converter_model_bert.py +++ b/test/tensorrt/test_converter_model_bert.py @@ -46,6 +46,7 @@ def test_paddle_to_tensorrt_conversion_bert(self): # Create a TensorRTConfig with inputs as a required field. 
trt_config = TensorRTConfig(inputs=[input_config]) trt_config.disable_ops = "pd_op.dropout" + trt_config.disable_passes = ['constant_folding_pass'] # Step1.1: get original results(for tests only) output_var = program.global_block().ops[-1].result(0) From 93cae2bcb0bb99c819fb6ce1ae6f991b08049a68 Mon Sep 17 00:00:00 2001 From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com> Date: Thu, 9 Jan 2025 17:42:40 +0800 Subject: [PATCH 52/57] fix pir pass of moe global mesh tensor (#70715) --- .../distributed/auto_parallel/static/pir_pass.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py index 958c0c112109b0..65a2aed0f50a5d 100644 --- a/python/paddle/distributed/auto_parallel/static/pir_pass.py +++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py @@ -440,6 +440,20 @@ def prune_op(block): elif op.name() == "cf.yield": continue elif op.name() == "pd_op.pylayer": + # if the pylayer op is not on the current rank, we should delete it + is_cur_rank = False + for pylayer_block in list(op.blocks())[::-1]: + for sub_block_op in pylayer_block.ops: + if ( + sub_block_op.dist_attr + and cur_rank + in sub_block_op.dist_attr.process_mesh.process_ids + ): + is_cur_rank = True + break + if not is_cur_rank: + op.erase() + continue for pylayer_block in list(op.blocks())[::-1]: prune_op(pylayer_block) # update pylayer op's inputs From 8d120573b105a5fc3ca0a3e7214ab96ebd62ebc5 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 9 Jan 2025 18:17:50 +0800 Subject: [PATCH 53/57] [dygraph] refine dygraph backward error info (#70709) * refine dygraph backward error info --- .../eager/accumulation/accumulation_node.h | 6 + .../generator/eager_gen.py | 3 +- paddle/fluid/eager/backward.cc | 342 ++++++++++-------- paddle/fluid/framework/op_call_stack.cc | 34 ++ paddle/fluid/framework/op_call_stack.h | 5 + paddle/fluid/pybind/eager.cc | 1 + paddle/fluid/pybind/eager_math_op_patch.cc | 43 +++ paddle/fluid/pybind/eager_method.cc | 3 + paddle/fluid/pybind/eager_py_layer.cc | 1 + paddle/fluid/pybind/eager_utils.cc | 14 + 10 files changed, 296 insertions(+), 156 deletions(-) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index e58df3eee65555..114e65048c5371 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -19,6 +19,8 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/utils/test_macros.h" +COMMON_DECLARE_int32(call_stack_level); + namespace egr { class TEST_API GradNodeAccumulation : public GradNodeBase { @@ -30,6 +32,10 @@ class TEST_API GradNodeAccumulation : public GradNodeBase { weak_grad_ = meta->WeakGrad(); } + if (FLAGS_call_stack_level == 3) { + this->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); + } + SetDefaultGradInOutMeta(); } diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index cd86ee75562363..31523fb1ae8d02 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -485,7 +485,7 @@ class {} : public egr::GradNodeBase {{ // Node Construction {} // Set for forward trace - if (FLAGS_check_nan_inf) {{ + if (FLAGS_check_nan_inf || FLAGS_call_stack_level == 3) {{ 
grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); }} // SetAttributes if needed @@ -590,6 +590,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/imperative/amp_utils.h" COMMON_DECLARE_bool(check_nan_inf); +COMMON_DECLARE_int32(call_stack_level); COMMON_DECLARE_string(tensor_operants_mode); COMMON_DECLARE_bool(use_stride_kernel); {} diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 16c6aa07a9543a..ddc1a43ba73f50 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -18,6 +18,7 @@ #include "paddle/phi/core/memory/stats.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" +COMMON_DECLARE_int32(call_stack_level); namespace egr { std::unordered_map getInDegreeMap( @@ -254,177 +255,208 @@ std::vector RunBackward( while (!queue.empty()) { GradNodeBase* node = queue.front(); VLOG(3) << "Preparing GradNode:" << node->name() << " addr:" << node; - - if (queue.size() > 1 && node_in_degree_map[node] != 0) { + try { + if (queue.size() > 1 && node_in_degree_map[node] != 0) { + queue.pop_front(); + continue; + } queue.pop_front(); - continue; - } - queue.pop_front(); - // Run node: This is where Hook happens - auto node_input_buffer_iter = node_input_buffers_dict.find(node); - PADDLE_ENFORCE_NE( - node_input_buffer_iter, - node_input_buffers_dict.end(), - common::errors::Fatal( - "Unable to find next node in the GradTensorHolder \n" - "Trying to run Node without configuring its GradTensorHolder.")); - - std::unique_ptr node_input_buffer = - std::move(node_input_buffer_iter->second); - - // Check input - EnforceGradNodeHasInput(node); - - VLOG(7) << "Run Backward Kernel with GradTensorHolder."; - - // This 'Global_XXXGradNode' record event is different with - // 'Local_XXXGradNode' event. - // * 'Global_XXXGradNode' will not only cover execution time of this - // function, but also include gradient - // accumulation when the output(s) of corresponding forward OP are shared - // by other OP(s), which may have extra overhead of accumulation than - // 'Local_XXXGradNode'. - // * 'Local_XXXGradNode' will only cover execution time of GradNode - // function. - phi::RecordEvent grad_node_record_event( - "Global_" + std::string((*node).name()), - phi::TracerEventType::Operator, - 1); - - // Run Pre Backward Node and get outputs - paddle::small_vector, kSlotSmallVectorSize> - grad_output_tensors = (*node)( - node_input_buffer->Buffers(), create_graph, is_general_grad); - - if (!inputs.empty() && is_general_grad) { - GeneralGrad::Instance().SetResultForEndingNodes(grad_output_tensors, - node); - } - - // retain_grad or not - if (!retain_graph) { - VLOG(3) - << "retain_graph is false, need to clear the TensorWrapper of nodes."; - node->ClearTensorWrappers(); - } + // Run node: This is where Hook happens + auto node_input_buffer_iter = node_input_buffers_dict.find(node); + PADDLE_ENFORCE_NE( + node_input_buffer_iter, + node_input_buffers_dict.end(), + common::errors::Fatal( + "Unable to find next node in the GradTensorHolder \n" + "Trying to run Node without configuring its GradTensorHolder.")); + + std::unique_ptr node_input_buffer = + std::move(node_input_buffer_iter->second); + + // Check input + EnforceGradNodeHasInput(node); + + VLOG(7) << "Run Backward Kernel with GradTensorHolder."; + + // This 'Global_XXXGradNode' record event is different with + // 'Local_XXXGradNode' event. 
+ // * 'Global_XXXGradNode' will not only cover execution time of this + // function, but also include gradient + // accumulation when the output(s) of corresponding forward OP are + // shared by other OP(s), which may have extra overhead of accumulation + // than 'Local_XXXGradNode'. + // * 'Local_XXXGradNode' will only cover execution time of GradNode + // function. + phi::RecordEvent grad_node_record_event( + "Global_" + std::string((*node).name()), + phi::TracerEventType::Operator, + 1); + + // Run Pre Backward Node and get outputs + paddle::small_vector, kSlotSmallVectorSize> + grad_output_tensors = (*node)( + node_input_buffer->Buffers(), create_graph, is_general_grad); + + if (!inputs.empty() && is_general_grad) { + GeneralGrad::Instance().SetResultForEndingNodes(grad_output_tensors, + node); + } - // TODO(jiabin): Should we erase it or find a more efficient way. - node_input_buffers_dict.erase(node_input_buffer_iter); + // retain_grad or not + if (!retain_graph) { + VLOG(3) << "retain_graph is false, need to clear the TensorWrapper of " + "nodes."; + node->ClearTensorWrappers(); + } - // Prepare GradTensorHolder for next node - const paddle::small_vector, kSlotSmallVectorSize>& - metas = node->OutputMeta(); - PADDLE_ENFORCE(metas.size() == grad_output_tensors.size() || metas.empty(), - common::errors::Fatal( - "Number of edges should be either empty ( for leaf node " - ") or the same as number of output grad tensors, but we " - "got edges size is: %d, grad_output size is: %d", - metas.size(), - grad_output_tensors.size())); - - for (size_t i = 0; i < metas.size(); i++) { - for (size_t j = 0; j < metas[i].size(); j++) { - const Edge& edge = metas[i][j].GetEdge(); - if (!edge.IsInitialized()) { - continue; - } - auto edge_rank = edge.GetEdgeRankInfo(); - // Since we make edge has as same rank as bwd outputs, we indexing them - // with the same rank(i, j) - auto next_node_shared = edge.GetMutableGradNode(); - VLOG(3) << "Node: " << node->name() << " addr:" << node - << ", Found pending node: " << next_node_shared->name() - << " addr: " << next_node_shared.get(); - // Next node could be nullptr if it is leaf tensor with no - // AccumulationNode attached - // Or it could also originated from dispensable inputs - if (!next_node_shared || !next_node_shared.get() || - grad_output_tensors[i].empty()) { - continue; - } + // TODO(jiabin): Should we erase it or find a more efficient way. + node_input_buffers_dict.erase(node_input_buffer_iter); - PADDLE_ENFORCE_LT( - j, - grad_output_tensors[i].size(), - common::errors::Fatal( - "Rank of grad_output_tensors should be less than " - "grad_output_tensors[i].size(), which is: %d. This error may " - "indicate autoprune or autograd api error. 
", - grad_output_tensors.size())); - paddle::Tensor& grad_output_tensor = grad_output_tensors[i][j]; - - if ((!grad_output_tensor.defined() || - !grad_output_tensor.has_allocation())) { - VLOG(7) << "We get grad_output_tensor with slot: " << i - << ", rank: " << j - << " as undefined tensor or without allocation."; - } + // Prepare GradTensorHolder for next node + const paddle::small_vector, + kSlotSmallVectorSize>& metas = + node->OutputMeta(); + PADDLE_ENFORCE( + metas.size() == grad_output_tensors.size() || metas.empty(), + common::errors::Fatal( + "Number of edges should be either empty ( for leaf node " + ") or the same as number of output grad tensors, but we " + "got edges size is: %d, grad_output size is: %d", + metas.size(), + grad_output_tensors.size())); + + for (size_t i = 0; i < metas.size(); i++) { + for (size_t j = 0; j < metas[i].size(); j++) { + const Edge& edge = metas[i][j].GetEdge(); + if (!edge.IsInitialized()) { + continue; + } + auto edge_rank = edge.GetEdgeRankInfo(); + // Since we make edge has as same rank as bwd outputs, we indexing + // them with the same rank(i, j) + auto next_node_shared = edge.GetMutableGradNode(); + VLOG(3) << "Node: " << node->name() << " addr:" << node + << ", Found pending node: " << next_node_shared->name() + << " addr: " << next_node_shared.get(); + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached + // Or it could also originated from dispensable inputs + if (!next_node_shared || !next_node_shared.get() || + grad_output_tensors[i].empty()) { + continue; + } - VLOG(7) << "Get Edge and grad_output_tensor with slot: " << i - << ", rank: " << j - << " 's name is: " << grad_output_tensor.name(); - - auto* next_node = next_node_shared.get(); - if (!node_input_buffers_dict.count(next_node)) { - const auto& input_meta = next_node->InputMeta(); - auto grad_tensor_holder = - std::make_unique(input_meta); - VLOG(7) << "Construct GradTensorHolder for grad node: " - << next_node->name(); - node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); - } + PADDLE_ENFORCE_LT( + j, + grad_output_tensors[i].size(), + common::errors::Fatal( + "Rank of grad_output_tensors should be less than " + "grad_output_tensors[i].size(), which is: %d. This error may " + "indicate autoprune or autograd api error. ", + grad_output_tensors.size())); + paddle::Tensor& grad_output_tensor = grad_output_tensors[i][j]; + + if ((!grad_output_tensor.defined() || + !grad_output_tensor.has_allocation())) { + VLOG(7) << "We get grad_output_tensor with slot: " << i + << ", rank: " << j + << " as undefined tensor or without allocation."; + } - VLOG(3) << "Sum or Move grad inputs for edge slot: " << edge_rank.first - << ", rank: " << edge_rank.second; - - node_input_buffers_dict[next_node]->add(edge_rank.first, - edge_rank.second, - grad_output_tensor, - create_graph); - - // Update queue - node_in_degree_map[next_node]--; - VLOG(7) << next_node->name() - << " ref_cnt is: " << node_in_degree_map[next_node]; - - PADDLE_ENFORCE( - node_in_degree_map[next_node] >= 0, - common::errors::Fatal( - "Detected in-degree value smaller than zero. 
For Node: %s" - "Node's in-degree cannot be negative.", - next_node->name())); - - auto add_next_node_func = [&queue](GradNodeBase* next_node) { - if (dynamic_cast(next_node)) { - queue.push_front(next_node); - } else { - queue.push_back(next_node); + VLOG(7) << "Get Edge and grad_output_tensor with slot: " << i + << ", rank: " << j + << " 's name is: " << grad_output_tensor.name(); + + auto* next_node = next_node_shared.get(); + if (!node_input_buffers_dict.count(next_node)) { + const auto& input_meta = next_node->InputMeta(); + auto grad_tensor_holder = + std::make_unique(input_meta); + VLOG(7) << "Construct GradTensorHolder for grad node: " + << next_node->name(); + node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } - }; - if (node_in_degree_map[next_node] == 0) { - if (force_sequential_nodes_set.count(next_node)) { - if (force_sequential_nodes_queue.front() == next_node) { - force_sequential_nodes_queue.pop_front(); - add_next_node_func(next_node); - while (ready_force_sequential_nodes.count( - force_sequential_nodes_queue.front())) { - ready_force_sequential_nodes.erase( - force_sequential_nodes_queue.front()); - add_next_node_func(force_sequential_nodes_queue.front()); + + VLOG(3) << "Sum or Move grad inputs for edge slot: " + << edge_rank.first << ", rank: " << edge_rank.second; + + node_input_buffers_dict[next_node]->add(edge_rank.first, + edge_rank.second, + grad_output_tensor, + create_graph); + + // Update queue + node_in_degree_map[next_node]--; + VLOG(7) << next_node->name() + << " ref_cnt is: " << node_in_degree_map[next_node]; + + PADDLE_ENFORCE( + node_in_degree_map[next_node] >= 0, + common::errors::Fatal( + "Detected in-degree value smaller than zero. For Node: %s" + "Node's in-degree cannot be negative.", + next_node->name())); + + auto add_next_node_func = [&queue](GradNodeBase* next_node) { + if (dynamic_cast(next_node)) { + queue.push_front(next_node); + } else { + queue.push_back(next_node); + } + }; + if (node_in_degree_map[next_node] == 0) { + if (force_sequential_nodes_set.count(next_node)) { + if (force_sequential_nodes_queue.front() == next_node) { force_sequential_nodes_queue.pop_front(); + add_next_node_func(next_node); + while (ready_force_sequential_nodes.count( + force_sequential_nodes_queue.front())) { + ready_force_sequential_nodes.erase( + force_sequential_nodes_queue.front()); + add_next_node_func(force_sequential_nodes_queue.front()); + force_sequential_nodes_queue.pop_front(); + } + } else { + ready_force_sequential_nodes.insert(next_node); + continue; } } else { - ready_force_sequential_nodes.insert(next_node); - continue; + add_next_node_func(next_node); } - } else { - add_next_node_func(next_node); } } } + paddle::memory::LogDeviceMemoryStats(place, std::string((*node).name())); + } catch (::common::enforce::EnforceNotMet& ex) { + if (FLAGS_call_stack_level == 3) { + paddle::framework::InsertCallStackInfoDygraph( + node->name(), {node->GetForwardTrace()}, &ex); + } + + LOG(WARNING) << "While running Node (" << node->name() + << ") raises an EnforceNotMet exception"; + throw ex; + } catch (std::exception& ex) { + LOG(WARNING) << "While running Node (" << node->name() + << ") raises a std::exception: " + << common::demangle(typeid(ex).name()); + if (FLAGS_call_stack_level == 3) { + LOG(WARNING) << "Node (" << node->name() + << ")'s forward call stack is :" << node->GetForwardTrace() + << std::endl; + } + std::rethrow_exception(std::current_exception()); + } catch (...) 
{ + LOG(WARNING) << "While running Node (" << node->name() + << ") raises an unknown exception"; + if (FLAGS_call_stack_level == 3) { + LOG(WARNING) << "Node (" << node->name() + << ")'s forward call stack is :" << node->GetForwardTrace() + << std::endl; + } + std::rethrow_exception(std::current_exception()); } - paddle::memory::LogDeviceMemoryStats(place, std::string((*node).name())); } VLOG(7) << "Run Backward Final hook size: " diff --git a/paddle/fluid/framework/op_call_stack.cc b/paddle/fluid/framework/op_call_stack.cc index f7b60af104747d..8d765d0c1becbf 100644 --- a/paddle/fluid/framework/op_call_stack.cc +++ b/paddle/fluid/framework/op_call_stack.cc @@ -74,6 +74,40 @@ void InsertCallStackInfo(const std::string &type, exception->set_error_str(sout.str()); } +void InsertCallStackInfoDygraph( + const std::string &node_name, + const std::vector &forward_callstack_str, + platform::EnforceNotMet *exception) { + const std::vector *callstack = &forward_callstack_str; + std::ostringstream sout; + // Step 1. Construct python call stack string + if (callstack) { + if (FLAGS_call_stack_level > 1) { + sout << "\n\n Forward Traceback (most recent call last):"; + } else { + sout << "In user code:\n"; + } + for (auto &line : *callstack) { + sout << "\n " << line; + } + } + VLOG(1) << exception->error_str(); + // Step 2. Construct final call stack & append error op name + if (FLAGS_call_stack_level > 1) { + sout << exception->what(); + } else { + // If callstack exists, use err_str_ instead sub_err_str_ + if (callstack) { + sout << "\n\n"; + sout << InsertIndentationIntoEachLine(exception->error_str()); + } else { + sout << exception->simple_error_str(); + } + } + sout << " [GradNode < " << node_name << " > error]"; + exception->set_error_str(sout.str()); +} + void InsertCallStackInfo(const std::string &type, const std::vector &callstack_attr_str, platform::EnforceNotMet *exception) { diff --git a/paddle/fluid/framework/op_call_stack.h b/paddle/fluid/framework/op_call_stack.h index 9f9ecd14ef8be7..3be29cb4585967 100644 --- a/paddle/fluid/framework/op_call_stack.h +++ b/paddle/fluid/framework/op_call_stack.h @@ -31,6 +31,11 @@ void InsertCallStackInfo(const std::string &type, const std::vector &callstack_attr_str, platform::EnforceNotMet *exception); +void InsertCallStackInfoDygraph( + const std::string &type, + const std::vector &callstack_attr_str, + platform::EnforceNotMet *exception); + // only append error op for exception message void AppendErrorOpHint(const std::string &type, platform::EnforceNotMet *exception); diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 74585c3131cc91..dba28787c1acf6 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -827,6 +827,7 @@ Tensor is the basic data structure in PaddlePaddle. There are some ways to creat * **/ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); // set a flag to record use kwargs or not bool flag_kwargs = false; if (kwargs) flag_kwargs = true; diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 2abecf91708ce9..cb1f600bed32b0 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -180,6 +180,7 @@ paddle::Tensor CallScalarFunction(const paddle::Tensor& self_tensor, double other, std::string op_type) { paddle::Tensor ret; + SetPythonStack(); // scale_ad_func need sclar and bias with float type. 
if (op_type == "add" || op_type == "radd") { ret = scale_ad_func(self_tensor, phi::Scalar(1.0), other, true); @@ -223,6 +224,7 @@ void TypePromotionForZeroDimTensor(std::string func, promote_type = self_tensor.dtype(); } } + SetPythonStack(); if (self_tensor.dtype() != promote_type) { eager_gil_scoped_release guard; self_tensor = cast_ad_func(self_tensor, promote_type); @@ -243,6 +245,9 @@ static PyObject* tensor__add__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__add__method"; + + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -338,6 +343,8 @@ static PyObject* tensor__sub__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__sub__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -432,6 +439,8 @@ static PyObject* tensor__rsub__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__rsub__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -525,6 +534,8 @@ static PyObject* tensor__mul__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__mul__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -628,6 +639,8 @@ static PyObject* tensor__div__method(TensorObject* self, VLOG(6) << "Running Eager tensor__div__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -720,6 +733,8 @@ static PyObject* tensor__rdiv__method(TensorObject* self, VLOG(6) << "Running Eager tensor__rdiv__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -806,6 +821,8 @@ static PyObject* tensor__gt__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__gt__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -897,6 +914,8 @@ static PyObject* tensor__ge__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__ge__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -988,6 +1007,8 @@ static PyObject* tensor__mod__method(TensorObject* self, VLOG(6) << "Running Eager tensor__mod__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1079,6 +1100,8 @@ static PyObject* tensor__rmod__method(TensorObject* self, VLOG(6) << "Running Eager tensor__rmod__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1170,6 +1193,8 @@ static PyObject* tensor__matmul__method(TensorObject* self, VLOG(6) << "Running Eager tensor__matmul__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1295,6 +1320,8 @@ static PyObject* tensor__rmatmul__method(TensorObject* self, VLOG(6) << "Running Eager tensor__rmatmul__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1420,6 +1447,8 @@ static PyObject* tensor__lt__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__lt__method"; + SetPythonStack(); + // Set Device ID auto 
place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1511,6 +1540,8 @@ static PyObject* tensor__le__method(TensorObject* self, EAGER_TRY VLOG(4) << "Running Eager tensor__le__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1601,6 +1632,8 @@ static PyObject* tensor__floordiv__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__floordiv__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1691,6 +1724,8 @@ static PyObject* tensor__rfloordiv__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__rfloordiv__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1782,6 +1817,8 @@ static PyObject* tensor__pow__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__pow__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1877,6 +1914,8 @@ static PyObject* tensor__rpow__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__rpow__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -1970,6 +2009,8 @@ static PyObject* tensor__ne__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__ne__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); @@ -2061,6 +2102,8 @@ static PyObject* tensor__eq__method(TensorObject* self, EAGER_TRY VLOG(6) << "Running Eager tensor__eq__method"; + SetPythonStack(); + // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); SetDevice(place); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 6cffcf68330af7..c121df6b78a31a 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1421,6 +1421,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); PyObject* _index = PyTuple_GET_ITEM(args, 0); VLOG(4) << "Call new indexing strategy _getitem_dygraph"; @@ -1691,6 +1692,7 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); VLOG(4) << "Call new indexing strategy _setitem_dygraph"; PyObject* _index = PyTuple_GET_ITEM(args, 0); @@ -2006,6 +2008,7 @@ static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); int64_t hook_id = 0; if (egr::EagerUtils::IsLeafTensor(self->tensor)) { VLOG(6) << "Register hook for leaf tensor: " << self->tensor.name(); diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index c695c5357e0bdc..269af549b4d132 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -134,6 +134,7 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, PyObject* kwargs) { EAGER_TRY + SetPythonStack(); VLOG(6) << "Begin run PyLayer apply..."; PyObject* backward_function = PyObject_GetAttrString(cls, "_backward_function"); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index f4f1500189c94c..beb67068106d50 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ 
b/paddle/fluid/pybind/eager_utils.cc @@ -51,6 +51,7 @@ limitations under the License. */ COMMON_DECLARE_bool(check_nan_inf); COMMON_DECLARE_int32(check_nan_inf_level); +COMMON_DECLARE_int32(call_stack_level); using egr::ConvertToDistTensor; @@ -269,6 +270,19 @@ void SetPythonStack() { std::string last = str + egr::Controller::Instance().GetPythonStack(); egr::Controller::Instance().SetPythonStack(last); } + + if (FLAGS_call_stack_level == 3) { + VLOG(4) << "this is SetPythonStack"; + pybind11::gil_scoped_acquire gil; + PyObject* mod = PyImport_ImportModule("traceback"); + PyObject* traceback_list = PyObject_CallMethod(mod, "format_stack", ""); + std::string str = ""; + for (Py_ssize_t i = 0; i < PyList_Size(traceback_list); i++) { + PyObject* line = PyList_GetItem(traceback_list, i); + str += py::str(PyUnicode_AsUTF8(line)); + } + egr::Controller::Instance().SetPythonStack(str); + } } std::shared_ptr CastPyArg2JitFunction(PyObject* obj, From 4e3f03112da476baee72c4a5009cfc8bee0ccb3b Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 18:42:08 +0800 Subject: [PATCH 54/57] [clean old comm] remove dynamic_static_unified_comm in test directory (#70729) --- test/collective/collective_allgather_api.py | 7 +++---- test/collective/test_collective_alltoall_api.py | 3 +-- test/legacy_test/test_collective_api_base.py | 7 +------ test/legacy_test/test_collective_base.py | 11 ++--------- test/xpu/collective_allgather_api.py | 7 +++---- test/xpu/test_collective_api_base.py | 7 +------ test/xpu/test_collective_base_xpu.py | 12 +++--------- 7 files changed, 14 insertions(+), 40 deletions(-) diff --git a/test/collective/collective_allgather_api.py b/test/collective/collective_allgather_api.py index a502e5a6dad50d..e6d8aaa6c0084c 100644 --- a/test/collective/collective_allgather_api.py +++ b/test/collective/collective_allgather_api.py @@ -98,10 +98,9 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["use_comm_context"] or args["dynamic_static_unified_comm"]: - paddle.distributed.collective._init_parallel_env(args["backend"]) - else: - paddle.distributed.init_parallel_env() + + paddle.distributed.collective._init_parallel_env(args["backend"]) + if args['backend'] == 'nccl': device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = base.CUDAPlace( diff --git a/test/collective/test_collective_alltoall_api.py b/test/collective/test_collective_alltoall_api.py index 5c3bb4c056006b..464014ad5cc8c0 100644 --- a/test/collective/test_collective_alltoall_api.py +++ b/test/collective/test_collective_alltoall_api.py @@ -39,7 +39,7 @@ def test_alltoall_nccl_with_new_comm(self): "alltoall", "nccl", dtype=dtype, - need_envs={"FLAGS_dynamic_static_unified_comm": "true"}, + need_envs={}, ) def test_alltoall_nccl_with_new_comm_pir(self): @@ -57,7 +57,6 @@ def test_alltoall_nccl_with_new_comm_pir(self): "nccl", dtype=dtype, need_envs={ - "FLAGS_dynamic_static_unified_comm": "true", "FLAGS_enable_pir_in_executor": "1", }, ) diff --git a/test/legacy_test/test_collective_api_base.py b/test/legacy_test/test_collective_api_base.py index 5f4b1e71540b65..6ebf194da385aa 100644 --- a/test/legacy_test/test_collective_api_base.py +++ b/test/legacy_test/test_collective_api_base.py @@ -125,9 +125,7 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args['static_mode'] and ( - args["use_comm_context"] or args["dynamic_static_unified_comm"] - ): + if args['static_mode']: 
paddle.distributed.collective._init_parallel_env(args["backend"]) else: paddle.distributed.init_parallel_env() @@ -188,9 +186,6 @@ def runtime_main(test_class, col_type): args["dtype"] = os.getenv("DTYPE") args["reduce_type"] = os.getenv("REDUCE_TYPE") args["use_comm_context"] = bool(int(os.getenv("USE_COMM_CONTEXT", "0"))) - args["dynamic_static_unified_comm"] = bool( - os.getenv("FLAGS_dynamic_static_unified_comm", "true").lower() == "true" - ) model.run_trainer(args) diff --git a/test/legacy_test/test_collective_base.py b/test/legacy_test/test_collective_base.py index 9e570ec31ba961..bbdfd402dd5919 100644 --- a/test/legacy_test/test_collective_base.py +++ b/test/legacy_test/test_collective_base.py @@ -111,12 +111,8 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["dynamic_static_unified_comm"]: - _init_parallel_env("nccl") - else: - self.initCommunicator( - startup_prog, rank, nranks, True, current_endpoint, endpoints - ) + + _init_parallel_env("nccl") self.rank = rank result = self.get_model(train_prog, startup_prog) @@ -146,9 +142,6 @@ def runtime_main(test_class, col_type, sub_type): args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT") args["col_type"] = col_type args["dtype"] = os.getenv("DTYPE") - args["dynamic_static_unified_comm"] = bool( - int(os.getenv("FLAGS_dynamic_static_unified_comm", "1")) - ) model.run_trainer(args) diff --git a/test/xpu/collective_allgather_api.py b/test/xpu/collective_allgather_api.py index b4995ee1d08e0f..7f3c397bffa256 100644 --- a/test/xpu/collective_allgather_api.py +++ b/test/xpu/collective_allgather_api.py @@ -100,10 +100,9 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["use_comm_context"] or args["dynamic_static_unified_comm"]: - paddle.distributed.collective._init_parallel_env(args["backend"]) - else: - paddle.distributed.init_parallel_env() + + paddle.distributed.collective._init_parallel_env(args["backend"]) + if args['backend'] == 'nccl': device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = base.CUDAPlace( diff --git a/test/xpu/test_collective_api_base.py b/test/xpu/test_collective_api_base.py index a43a3e5b6df202..098f8c4c7dafb5 100644 --- a/test/xpu/test_collective_api_base.py +++ b/test/xpu/test_collective_api_base.py @@ -125,9 +125,7 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args['static_mode'] and ( - args["use_comm_context"] or args["dynamic_static_unified_comm"] - ): + if args['static_mode']: paddle.distributed.collective._init_parallel_env(args["backend"]) else: paddle.distributed.init_parallel_env() @@ -187,9 +185,6 @@ def runtime_main(test_class, col_type): args["dtype"] = os.getenv("DTYPE") args["reduce_type"] = os.getenv("REDUCE_TYPE") args["use_comm_context"] = bool(int(os.getenv("USE_COMM_CONTEXT", "0"))) - args["dynamic_static_unified_comm"] = bool( - os.getenv("FLAGS_dynamic_static_unified_comm", "true").lower() == "true" - ) model.run_trainer(args) diff --git a/test/xpu/test_collective_base_xpu.py b/test/xpu/test_collective_base_xpu.py index 3fed84ecbb6f14..1cd2e0e44f394a 100644 --- a/test/xpu/test_collective_base_xpu.py +++ b/test/xpu/test_collective_base_xpu.py @@ -140,12 +140,9 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - if args["dynamic_static_unified_comm"]: - _init_parallel_env("bkcl") - else: - 
self.initCommunicator( - startup_prog, rank, nranks, True, current_endpoint, endpoints - ) + + _init_parallel_env("bkcl") + self.rank = rank np_dtype = DataTypeCast(args["dtype"]) result = self.get_model(train_prog, startup_prog, np_dtype) @@ -174,9 +171,6 @@ def runtime_main(test_class, col_type, sub_type): args["col_type"] = col_type args["dtype"] = os.getenv("DTYPE") args["batch_size"] = os.getenv("BATCH_SIZE") - args["dynamic_static_unified_comm"] = bool( - int(os.getenv("FLAGS_dynamic_static_unified_comm", "1")) - ) model.run_trainer(args) From f70042ab551078635fa5b28738aca23a6f298cd9 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 9 Jan 2025 18:43:12 +0800 Subject: [PATCH 55/57] [clean old comm][fluid_ops]c_allreduce_op.h (#70732) --- .../operators/collective/c_allreduce_op.h | 85 ++++++++----------- .../operators/collective/c_gen_bkcl_id_op.cc | 18 ---- .../operators/collective/c_gen_nccl_id_op.cc | 18 ---- .../operators/collective/c_wait_comm_op.cc | 40 +++------ .../operators/collective/c_wait_compute_op.cc | 40 +++------ .../operators/collective/recv_v2_op.cu.cc | 49 ++++------- .../operators/collective/send_v2_op.cu.cc | 49 ++++------- 7 files changed, 98 insertions(+), 201 deletions(-) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index b6db792077a362..57c4a7061df834 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -29,7 +29,6 @@ limitations under the License. */ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/common/flags.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -180,30 +179,24 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "BKCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { - comm = platform::BKCLCommContext::Instance().Get(rid, place); - stream = comm->stream(); - VLOG(3) << "old BKCLCommContext has rid " << rid; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + common::errors::Unavailable( + "BKCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has rid " << rid; + if (ctx.Attr("use_calc_stream")) { auto dev_ctx = phi::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->x_context()->xpu_stream; @@ -325,30 +318,24 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "NCCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { - comm = platform::NCCLCommContext::Instance().Get(rid, place); - stream = comm->stream(); - VLOG(3) << "old NCCLCommContext has rid " << rid; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + common::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has rid " << rid; + if (ctx.Attr("use_calc_stream")) { // should not use global ctx for calc stream. // auto dev_ctx = phi::DeviceContextPool::Instance().Get(place); diff --git a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc index 324cdde5175c4e..3479562f93ae55 100644 --- a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/gen_comm_id_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { @@ -63,30 +62,13 @@ class CGenBKCLIdOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const phi::Place& dev_place) const override { - int rank = Attr("rank"); - int ring_id = Attr("ring_id"); - std::function func = [&](size_t i) -> std::string { return Output("Out"); }; - std::string endpoint = Attr("endpoint"); - std::vector bkcl_ids; bkcl_ids.resize(1); - if (!FLAGS_dynamic_static_unified_comm) { - int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); - if (rank == 0) { - GenBKCLID(&bkcl_ids); - std::vector endpoint_list = - Attr>("other_endpoints"); - platform::SendBroadCastCommID(endpoint_list, &bkcl_ids, ring_id); - } else { - platform::RecvBroadCastCommID(server_fd, endpoint, &bkcl_ids, ring_id); - } - } - CopyBKCLIDToVar(bkcl_ids, func, scope); } }; diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 5004439695097f..beda7cf0c1377b 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/phi/core/platform/device_context.h" #include "paddle/phi/core/platform/gen_comm_id_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); namespace paddle::operators { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -58,30 +57,13 @@ class CGenNCCLIdOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const phi::Place& dev_place) const override { - int rank = Attr("rank"); - int ring_id = Attr("ring_id"); - std::function func = [&](size_t i) -> std::string { return Output("Out"); }; - std::string endpoint = Attr("endpoint"); - std::vector nccl_ids; nccl_ids.resize(1); - if (!FLAGS_dynamic_static_unified_comm) { - int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); - if (rank == 0) { - GenNCCLID(&nccl_ids); - std::vector endpoint_list = - Attr>("other_endpoints"); - platform::SendBroadCastCommID(endpoint_list, &nccl_ids, ring_id); - } else { - platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids, ring_id); - } - } - CopyNCCLIDToVar(nccl_ids, func, scope); } }; diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index ce9387d5aea183..8226f6d1d495e2 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -22,7 +22,6 @@ class Scope; #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif namespace paddle::operators { @@ -56,31 +55,20 @@ class CWaitCommOp : public framework::OperatorBase { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. 
" - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(ring_id))); - phi::distributed::NCCLCommContext* comm_ctx = - static_cast( - comm_context_manager.Get(std::to_string(ring_id))); - comm_stream = comm_ctx->GetStream(); - event = comm_ctx->GetComputeEvent(); - VLOG(3) << "new comm_context_manager has rid " << ring_id; - } else { - comm_stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); - - event = platform::NCCLCommContext::Instance() - .Get(ring_id, place) - ->comm_event(); - VLOG(3) << "old NCCLCommContext has rid " << ring_id; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + phi::distributed::NCCLCommContext* comm_ctx = + static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + comm_stream = comm_ctx->GetStream(); + event = comm_ctx->GetComputeEvent(); + VLOG(3) << "new comm_context_manager has rid " << ring_id; // comm_stream-->event-->compute_stream #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 4d8a5f158c679b..234832a6c46059 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -22,7 +22,6 @@ class Scope; #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif namespace paddle::operators { @@ -56,31 +55,20 @@ class CWaitComputeOp : public framework::OperatorBase { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(ring_id))); - phi::distributed::NCCLCommContext* comm_ctx = - static_cast( - comm_context_manager.Get(std::to_string(ring_id))); - comm_stream = comm_ctx->GetStream(); - event = comm_ctx->GetComputeEvent(); - VLOG(3) << "new comm_context_manager has rid " << ring_id; - } else { - comm_stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); - - event = platform::NCCLCommContext::Instance() - .Get(ring_id, place) - ->compute_event(); - VLOG(3) << "old NCCLCommContext has rid " << ring_id; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. 
" + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + phi::distributed::NCCLCommContext* comm_ctx = + static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + comm_stream = comm_ctx->GetStream(); + event = comm_ctx->GetComputeEvent(); + VLOG(3) << "new comm_context_manager has rid " << ring_id; // compute_stream-->event-->comm_stream #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index d6fbfdf6f4eee9..1888e2204a66ca 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/fluid/distributed/collective/process_group.h" @@ -175,37 +174,23 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { const auto &comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "NCCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { - comm = platform::NCCLCommContext::Instance().Get(rid, place); - PADDLE_ENFORCE_LT( - peer, - comm->nranks(), - common::errors::InvalidArgument("The value of peer (%d) you set must " - "be less than comm->nranks (%d).", - peer, - comm->nranks())); - stream = comm->stream(); - VLOG(3) << "old NCCLCommContext has rid " << rid; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + common::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has rid " << rid; if (ctx.Attr("use_calc_stream")) { // should ExecutionContext for calc stream. diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 10ff7108cab23b..d0c0c48cfd75a7 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/phi/api/include/tensor.h" @@ -167,37 +166,23 @@ class SendOpV2CUDAKernel : public framework::OpKernel { const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "NCCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - stream = comm_ctx->GetStream(); - VLOG(3) << "new comm_context_manager has rid " << rid; - } else { - comm = platform::NCCLCommContext::Instance().Get(rid, place); - PADDLE_ENFORCE_LT( - peer, - comm->nranks(), - common::errors::InvalidArgument("The value of peer (%d) you set must " - "be less than comm->nranks (%d).", - peer, - comm->nranks())); - stream = comm->stream(); - VLOG(3) << "old NCCLCommContext has rid " << rid; - } + + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + common::errors::InvalidArgument( + "You choose to use new communication library. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + common::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + stream = comm_ctx->GetStream(); + VLOG(3) << "new comm_context_manager has rid " << rid; if (ctx.Attr("use_calc_stream")) { // should ExecutionContext for calc stream. 
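The two commits above ([PATCH 54/57] and [PATCH 55/57]) remove the FLAGS_dynamic_static_unified_comm switch entirely: the collective kernels now always resolve their communicator from CommContextManager by ring_id, and the tests initialize that context up front instead of branching on the flag. A minimal sketch of the initialization pattern the updated tests rely on, assuming a multi-rank job started with paddle.distributed.launch (the helper is the same private one the tests call; outside a distributed launch it would fail):

import paddle
from paddle.distributed import collective

def init_trainer(backend="nccl"):
    # Registers a communication context that the C++ kernels above now
    # look up unconditionally via CommContextManager and the ring_id attr.
    collective._init_parallel_env(backend)
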
From 2f47c31ea51b64034bdf269063f74ff78f86e259 Mon Sep 17 00:00:00 2001
From: co63oc <co63oc@users.noreply.github.com>
Date: Thu, 9 Jan 2025 18:43:42 +0800
Subject: [PATCH 56/57] [fluid_ops] sharding_optimizer.py replace c_broadcast
 (#70705)

---
 .../fleet/meta_optimizers/sharding_optimizer.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index 07de62d3039f89..50a8d35a4526c7 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -1707,7 +1707,7 @@ def _initialization_broadcast(self):
         # offload and optimize_cast will insert broadcast op
         broadcast_params = set()
         for op in startup_block.ops:
-            if op.type == 'c_broadcast':
+            if op.type == 'broadcast':
                 broadcast_params.add(op.desc.output_arg_names()[0])
 
         for param in params_name:
@@ -1723,13 +1723,12 @@ def _initialization_broadcast(self):
 
         for ring in rings:
             startup_block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
+                type='broadcast',
+                inputs={'x': param},
+                outputs={'out': param},
                 attrs={
                     'ring_id': ring,
                     'root': 0,
-                    'use_calc_stream': True,
                     OP_ROLE_KEY: OpRole.Forward,
                 },
             )

From 266e3cd2f1c73e092b4bb037f9106b73e76e8ebc Mon Sep 17 00:00:00 2001
From: zty-king <129518799+zty-king@users.noreply.github.com>
Date: Thu, 9 Jan 2025 18:55:29 +0800
Subject: Support a flexible model-layer allocation strategy for unbalanced
 VPP scheduling (#70230)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
* Support a flexible model-layer allocation strategy for unbalanced VPP scheduling
---
 .../auto_parallel/static/pir_pass.py          | 39 +++++++++++++------
 .../pir/vpp_pass_unittest_pir.py              | 19 ++++++++-
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py
index 65a2aed0f50a5d..7f6238a223acb7 100644
--- a/python/paddle/distributed/auto_parallel/static/pir_pass.py
+++ b/python/paddle/distributed/auto_parallel/static/pir_pass.py
@@ -1066,12 +1066,6 @@ def complete_chunk_id(dist_program, startup_program, pipeline_strategy):
         dist_program.global_block().ops, seg_method
     )
     ops = dist_program.global_block().ops
-
-    assert (len(seg_struct_names) % num_chunks == 0) or (
-        (len(seg_struct_names) + 1) % num_chunks == 0
-        and (len(seg_struct_names) + 1) // num_chunks != 1
-    ), f"The number of layers[{seg_method}] ({len(seg_struct_names)}) should be divisible by part number ({num_chunks}),or ({len(seg_struct_names)} + 1) should be divisible by {num_chunks} and not equal to {num_chunks}."
-
     # Step2: analysis whether the pp_stage is non-decreasing among segments
     # 1. if non_use_custom_mesh is True, the ops' process_mesh will be changed by vpp strategy
    # 2. if non_use_custom_mesh is False, the ops's process_mesh will not be changed.
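The assert deleted in the hunk above is what forced a (nearly) balanced layout: the number of segmented layers had to divide evenly into num_chunks, or do so after adding one virtual layer. A small standalone sketch of that old rule, with illustrative layer counts:

# Sketch of the balance rule the removed assert encoded; num_chunks is
# pp_degree * vpp_degree as in complete_chunk_id, values are illustrative.
def old_layout_ok(num_layers, num_chunks):
    return (num_layers % num_chunks == 0) or (
        (num_layers + 1) % num_chunks == 0
        and (num_layers + 1) // num_chunks != 1
    )

assert old_layout_ok(8, 4)      # evenly divisible
assert old_layout_ok(7, 4)      # (7 + 1) divisible by 4
assert not old_layout_ok(3, 4)  # (3 + 1) // 4 == 1 is rejected
assert not old_layout_ok(5, 4)  # unbalanced layouts were rejected

The hunk below replaces this global rule with per-stage layer counts derived from the process_mesh the user actually assigned to each layer.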
@@ -1080,20 +1074,41 @@ def complete_chunk_id(dist_program, startup_program, pipeline_strategy):
     # Step3: Get op index boundary, pp_stage, chunk_id, struct_names of each segment
     seg_pp_stages = [i % pp_degree for i in range(num_chunks)]
     seg_chunk_ids = [i // pp_degree for i in range(num_chunks)]
-    seg_layer_num = [0] * num_chunks
-    for j in range(0, len(seg_struct_names)):
-        i = j % num_chunks
-        seg_layer_num[i] = seg_layer_num[i] + 1
     seg_parts = [0]
-
+    last_struct_name = None
+    stage_ids = (
+        []
+    )  # stage_ids[i] represents the stage number assigned to the i-th layer.
     for idx, op in enumerate(ops):
         if len(seg_parts) == len(seg_struct_names):
             break
         struct_name = _extract_seg_method(op, seg_method)
+        if op.dist_attr is not None and last_struct_name != struct_name:
+            pp_stage = get_pp_stage_by_process_mesh(
+                op.dist_attr.process_mesh, pp_degree
+            )
+            if pp_stage is not None:
+                stage_ids.append(pp_stage)
+        last_struct_name = struct_name
         if struct_name == seg_struct_names[len(seg_parts)]:
             seg_parts.append(idx)
     seg_parts.append(len(ops))
-
+    pp_stage_layer_nums = [0] * pp_degree
+    for i in stage_ids:
+        pp_stage_layer_nums[i] = pp_stage_layer_nums[i] + 1
+    assert all(
+        value >= vpp_degree for value in pp_stage_layer_nums
+    ), "The number of layers on each pp_stage must not be less than the vpp_degree in the pp_stage to ensure that each chunk contains at least one layer."
+    seg_layer_num = [0] * num_chunks
+    for pp_stage in range(
+        0, pp_degree
+    ):  # Each pp_stage is assigned a number of layers based on user intent.
+        pp_stage_layer_num = pp_stage_layer_nums[pp_stage]
+        for i in range(0, pp_stage_layer_num):
+            # The pp_stage uses a Round robin scheduling algorithm to allocate layers one by one.
+            virtual_chunk_id = i % vpp_degree
+            real_chunk_id = (virtual_chunk_id) * pp_degree + pp_stage
+            seg_layer_num[real_chunk_id] = seg_layer_num[real_chunk_id] + 1
     # Step4: Set the process_mesh of each op
     seg_id = 0
     reshard_ops = []
diff --git a/test/auto_parallel/pir/vpp_pass_unittest_pir.py b/test/auto_parallel/pir/vpp_pass_unittest_pir.py
index cae24c9453cff1..97a4f703769265 100644
--- a/test/auto_parallel/pir/vpp_pass_unittest_pir.py
+++ b/test/auto_parallel/pir/vpp_pass_unittest_pir.py
@@ -103,6 +103,7 @@ def __init__(
         initializer_range=0.02,
         manual=True,
         hidden_layer=4,
+        random_shard=False,
     ):
         super().__init__()
 
@@ -116,6 +117,10 @@ def __init__(
         self.layer_to_mesh = [PP_MESH_0] * (
             hidden_layer - hidden_layer // 2
         ) + [PP_MESH_1] * (hidden_layer // 2)
+        if random_shard:
+            self.layer_to_mesh = [PP_MESH_0] * (4) + [PP_MESH_1] * (
+                hidden_layer - 4
+            )
 
         self.layers = nn.LayerList(
             [
@@ -221,11 +226,14 @@ def run_pipeline(
         enable_send_recv_overlap=False,
         batch_size=BATCH_SIZE,
         hidden_layer=4,
+        random_shard=False,
     ):
         self.init()
         strategy = apply_pass(schedule_mode, acc_step, enable_send_recv_overlap)
 
-        model = MLPLayer(manual=manual, hidden_layer=hidden_layer)
+        model = MLPLayer(
+            manual=manual, hidden_layer=hidden_layer, random_shard=random_shard
+        )
         opt = paddle.optimizer.AdamW(
             learning_rate=0.00001, parameters=model.parameters()
         )
@@ -276,6 +284,15 @@ def test_pp_pass(self):
             schedule_mode="VPP", acc_step=4, manual=False, hidden_layer=7
         )
         self.check_result(Tail_removed_loss_vpp, loss_vpp)
+        # random-shard-vpp
+        Random_shards_vpp = self.run_pipeline(
+            schedule_mode="VPP",
+            acc_step=4,
+            manual=False,
+            hidden_layer=7,
+            random_shard=True,
+        )
+        self.check_result(Random_shards_vpp, loss_vpp)
 
     def check_result(self, loss1, loss2):
         return np.array_equal(loss1, loss2)
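The new loop in complete_chunk_id assigns each stage's layers to its virtual-pipeline chunks round-robin, so only the per-stage counts must reach vpp_degree; the global layer count no longer has to be balanced. A standalone sketch of that mapping, using the shapes from the random_shard test above (pp_degree=2, vpp_degree=2, 7 layers split 4/3 across the two meshes; the split values are illustrative):

# Mirrors the round-robin assignment added in complete_chunk_id.
pp_degree, vpp_degree = 2, 2
num_chunks = pp_degree * vpp_degree
pp_stage_layer_nums = [4, 3]  # layers per stage, as counted from stage_ids

seg_layer_num = [0] * num_chunks
for pp_stage in range(pp_degree):
    for i in range(pp_stage_layer_nums[pp_stage]):
        virtual_chunk_id = i % vpp_degree
        real_chunk_id = virtual_chunk_id * pp_degree + pp_stage
        seg_layer_num[real_chunk_id] += 1

print(seg_layer_num)  # [2, 2, 2, 1]

Chunk ids are stage-major within each virtual round (real_chunk_id = virtual_chunk_id * pp_degree + pp_stage), which agrees with the seg_pp_stages and seg_chunk_ids enumerations earlier in the function.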