From e2299f976e569ff7be381dcd30885d36717f9714 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Thu, 5 Dec 2024 06:36:32 +0000 Subject: [PATCH 01/12] added reset peak value initialization modified: paddle/fluid/pybind/pybind.cc modified: paddle/phi/core/memory/stats.cc modified: paddle/phi/core/memory/stats.h modified: python/paddle/device/cuda/__init__.py --- paddle/fluid/pybind/pybind.cc | 2 ++ paddle/phi/core/memory/stats.cc | 8 ++++++++ paddle/phi/core/memory/stats.h | 16 ++++++++++++++++ python/paddle/device/cuda/__init__.py | 12 ++++++++++++ 4 files changed, 38 insertions(+) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 43f39912c8f9f..e75cd51f68749 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2424,6 +2424,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("device_memory_stat_current_value", memory::DeviceMemoryStatCurrentValue); m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); + m.def("device_memory_stat_reset_peak_value", memory::DeviceMemoryStatResetPeakValue); + m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); m.def( diff --git a/paddle/phi/core/memory/stats.cc b/paddle/phi/core/memory/stats.cc index c3b93d41d604d..0b59ffade6f87 100644 --- a/paddle/phi/core/memory/stats.cc +++ b/paddle/phi/core/memory/stats.cc @@ -58,6 +58,10 @@ class StatRegistry { GetStat(stat_type, dev_id)->Update(increment); } + void ResetPeakValue(const std::string& stat_type, int dev_id) { + GetStat(stat_type, dev_id)->ResetPeakValue(); + } + void Register(const std::string& stat_type, int dev_id, StatBase* stat) { std::lock_guard lock_guard(stat_map_lock_); stat_map_[GetStatKey(stat_type, dev_id)] = stat; @@ -93,6 +97,10 @@ void DeviceMemoryStatUpdate(const std::string& stat_type, StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment); } +void DeviceMemoryStatResetPeakValue(const std::string& stat_type,int dev_id) { + StatRegistry::GetInstance()->ResetPeakValue("Device" + stat_type, dev_id); +} + int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type, dev_id); diff --git a/paddle/phi/core/memory/stats.h b/paddle/phi/core/memory/stats.h index acacd21a97f96..9ede49bd699a5 100644 --- a/paddle/phi/core/memory/stats.h +++ b/paddle/phi/core/memory/stats.h @@ -57,6 +57,7 @@ class StatBase { virtual int64_t GetCurrentValue() = 0; virtual int64_t GetPeakValue() = 0; virtual void Update(int64_t) = 0; + virtual void ResetPeakValue() = 0; private: DISABLE_COPY_AND_ASSIGN(StatBase); @@ -112,6 +113,20 @@ class Stat : public StatBase { } } + void ResetPeakValue() override { + int64_t current_value = GetCurrentValue(); + peak_value_.store(current_value, std::memory_order_relaxed); + + std::unordered_map> thread_local_stats = + ThreadDataRegistry::GetInstance().GetAllThreadDataByRef(); + + for (auto pair : thread_local_stats) { + pair.second.get().peak = pair.second.get().current; + } + + VLOG(8) << "Reset peak_value to current_value = " << current_value; + } + private: Stat() {} ~Stat() {} @@ -128,6 +143,7 @@ int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id); void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, int64_t increment); +void DeviceMemoryStatResetPeakValue(const std::string& stat_type,int dev_id); int64_t HostMemoryStatCurrentValue(const std::string& 
stat_type, int dev_id); int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 5f31c1df1fe58..8146612f66d66 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -48,6 +48,7 @@ 'get_device_properties', 'get_device_name', 'get_device_capability', + 'reset_peak_memory_stats', ] @@ -297,6 +298,17 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int: device_id = extract_cuda_device_id(device, op_name=name) return core.device_memory_stat_peak_value("Reserved", device_id) +def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: + + name = "paddle.device.cuda.reset_peak_memory_stats" + if not core.is_compiled_with_cuda(): + raise ValueError( + f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." + ) + device_id = extract_cuda_device_id(device, op_name=name) + core.device_memory_stat_reset_peak_value("Allocated", device_id) + core.device_memory_stat_reset_peak_value("Reserved", device_id) + # return core.device_memory_stat_peak_value("Reserved", device_id) def memory_allocated(device: _CudaPlaceLike | None = None) -> int: ''' From dc30348189e19311f680165d8fb53722a2c819f1 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sat, 7 Dec 2024 17:58:25 +0800 Subject: [PATCH 02/12] added comments modified: paddle/fluid/pybind/pybind.cc modified: python/paddle/device/cuda/__init__.py --- paddle/fluid/pybind/pybind.cc | 9 +++ python/paddle/device/cuda/__init__.py | 106 +++++++++++++++++++++++++- 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e75cd51f68749..2f06a0a1a1420 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2426,6 +2426,15 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); m.def("device_memory_stat_reset_peak_value", memory::DeviceMemoryStatResetPeakValue); + m.def("device_memory_stats", [](int dev_id) { + py::dict dict; + dict["memory.allocated.current"] = memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); + dict["memory.allocated.peak"] = memory::DeviceMemoryStatPeakValue("Allocated", dev_id); + dict["memory.reserved.current"] = memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); + dict["memory.reserved.peak"] = memory::DeviceMemoryStatPeakValue("Reserved", dev_id); + return dict; + }); + m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); m.def( diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 8146612f66d66..abddf53720f06 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -24,6 +24,8 @@ from .streams import Event, Stream +import logging + if TYPE_CHECKING: from paddle import CUDAPlace from paddle.base.libpaddle import _gpuDeviceProperties @@ -49,6 +51,8 @@ 'get_device_name', 'get_device_capability', 'reset_peak_memory_stats', + 'reset_max_memory_allocated', + 'memory_stats', ] @@ -299,7 +303,29 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int: return core.device_memory_stat_peak_value("Reserved", device_id) def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: + ''' + Reset the peak values of GPU memory allocated and reserved to the current values. + + This function resets the "peak" stats tracked by the CUDA memory allocator for both + memory allocated and memory reserved. After calling this function, the peak values + will be set to the current memory usage values. + Args: + device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> paddle.device.cuda.reset_peak_memory_stats(paddle.CUDAPlace(0)) + >>> paddle.device.cuda.reset_peak_memory_stats(0) + >>> paddle.device.cuda.reset_peak_memory_stats("gpu:0") + ''' name = "paddle.device.cuda.reset_peak_memory_stats" if not core.is_compiled_with_cuda(): raise ValueError( @@ -308,7 +334,85 @@ def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: device_id = extract_cuda_device_id(device, op_name=name) core.device_memory_stat_reset_peak_value("Allocated", device_id) core.device_memory_stat_reset_peak_value("Reserved", device_id) - # return core.device_memory_stat_peak_value("Reserved", device_id) + + +def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: + ''' + Reset the peak values of GPU memory allocated to the current values. (Reset the GPU memory reserved + as well) + + Warning: + This function calls `paddle.device.cuda.reset_peak_memory_stats`, which resets both allocated + and reserved peak memory stats. + + Args: + device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Examples: + .. 
code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> paddle.device.cuda.reset_max_memory_allocated(paddle.CUDAPlace(0)) + >>> paddle.device.cuda.reset_max_memory_allocated(0) + >>> paddle.device.cuda.reset_max_memory_allocated("gpu:0") + ''' + + name = "paddle.device.cuda.reset_max_memory_allocated" + if not core.is_compiled_with_cuda(): + raise ValueError( + f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." + ) + logging.warning( + "paddle.device.cuda.reset_max_memory_allocated calls paddle.device.cuda.reset_peak_memory_stats, " + "which resets both allocated and reserved peak memory stats." + ) + reset_peak_memory_stats(device) + + +def memory_stats(device: _CudaPlaceLike | None = None) -> dict: + ''' + Return the memory stats of the given device. + + Args: + device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Return: + dict: The current memory stats of the given device, including the current size of GPU memory that is allocated to tensor, + the peak size of GPU memory that is allocated to tensor, the current size of GPU memory that is held by the allocator and + the peak size of GPU memory that is held by the allocator, in bytes. + + memory_stats["memory.allocated.current"] + memory_stats["memory.allocated.peak"] + memory_stats["memory.reserved.current"] + memory_stats["memory.reserved.peak"] + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> memory_stats = paddle.device.cuda.memory_stats(paddle.CUDAPlace(0)) + >>> memory_stats = paddle.device.cuda.memory_stats(0) + >>> memory_stats = paddle.device.cuda.memory_stats("gpu:0") + ''' + + name = "paddle.device.cuda.memory_stats" + if not core.is_compiled_with_cuda(): + raise ValueError( + f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." + ) + device_id = extract_cuda_device_id(device, op_name=name) + return core.device_memory_stats(device_id) + def memory_allocated(device: _CudaPlaceLike | None = None) -> int: ''' From d081dde72cd48b67b3caa87563e5735b272d9840 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sat, 7 Dec 2024 22:01:31 +0800 Subject: [PATCH 03/12] added cpp tests modified: paddle/fluid/pybind/pybind.cc modified: paddle/phi/core/memory/stats.cc modified: paddle/phi/core/memory/stats.h modified: test/cpp/fluid/memory/stats_test.cc --- paddle/fluid/pybind/pybind.cc | 1 + paddle/phi/core/memory/stats.cc | 4 ++++ paddle/phi/core/memory/stats.h | 5 +++++ test/cpp/fluid/memory/stats_test.cc | 28 +++++++++++++++++++++++++--- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2f06a0a1a1420..3be2c5f10c430 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2437,6 +2437,7 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); + m.def("host_memory_stat_reset_peak_value", memory::HostMemoryStatResetPeakValue); m.def( "run_cmd", [](const std::string &cmd, diff --git a/paddle/phi/core/memory/stats.cc b/paddle/phi/core/memory/stats.cc index 0b59ffade6f87..7b75ea9021997 100644 --- a/paddle/phi/core/memory/stats.cc +++ b/paddle/phi/core/memory/stats.cc @@ -116,6 +116,10 @@ void HostMemoryStatUpdate(const std::string& stat_type, StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment); } +void HostMemoryStatResetPeakValue(const std::string& stat_type,int dev_id) { + StatRegistry::GetInstance()->ResetPeakValue("Host" + stat_type, dev_id); +} + void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name) { if (FLAGS_log_memory_stats && phi::is_gpu_place(place)) { VLOG(0) << "After launching op_name: " << op_name << ", " diff --git a/paddle/phi/core/memory/stats.h b/paddle/phi/core/memory/stats.h index 9ede49bd699a5..f59cc0336a802 100644 --- a/paddle/phi/core/memory/stats.h +++ b/paddle/phi/core/memory/stats.h @@ -150,6 +150,7 @@ int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, int64_t increment); +void HostMemoryStatResetPeakValue(const std::string& stat_type,int dev_id); void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name); @@ -195,6 +196,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name); DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue) #define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \ DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment) +#define DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, ResetPeakValue) #define HOST_MEMORY_STAT_FUNC(item, id, func, ...) 
\ [&] { \ @@ -215,6 +218,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name); HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue) #define HOST_MEMORY_STAT_UPDATE(item, id, increment) \ HOST_MEMORY_STAT_FUNC(item, id, Update, increment) +#define HOST_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, ResetPeakValue) #define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \ struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {} diff --git a/test/cpp/fluid/memory/stats_test.cc b/test/cpp/fluid/memory/stats_test.cc index 6aefe5b77b1ff..52b40780d9b7b 100644 --- a/test/cpp/fluid/memory/stats_test.cc +++ b/test/cpp/fluid/memory/stats_test.cc @@ -32,15 +32,18 @@ class StatsTest : public ::testing::Test { void SetFunc( std::function update_func, std::function current_value_func, - std::function peak_value_func) { + std::function peak_value_func, + std::function reset_peak_value_func) { update_func_ = update_func; current_value_func_ = current_value_func; peak_value_func_ = peak_value_func; + reset_peak_value_func_ = reset_peak_value_func; } void RunTests() { MultiThreadReadWriteTest(); PeakValueTest(); + ResetPeakValueTest(); } private: @@ -94,6 +97,16 @@ class StatsTest : public ::testing::Test { EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value); } + void ResetPeakValueTest() { + for(long unsigned int i = 0; i < datas_.size(); ++i) { + update_func_(stat_type_, 0, datas_[i]); + + EXPECT_GE(peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0)); + reset_peak_value_func_(stat_type_, 0); + EXPECT_EQ(peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0)); + } + } + std::string stat_type_; std::vector datas_{ 543149808935355, 634698327471328, 706215795436611, 577939367795333, @@ -125,13 +138,15 @@ class StatsTest : public ::testing::Test { std::function update_func_; std::function current_value_func_; std::function peak_value_func_; + std::function reset_peak_value_func_; }; TEST_F(StatsTest, DeviceAllocatedTest) { SetStatType("Allocated"); SetFunc(DeviceMemoryStatUpdate, DeviceMemoryStatCurrentValue, - DeviceMemoryStatPeakValue); + DeviceMemoryStatPeakValue, + DeviceMemoryStatResetPeakValue); RunTests(); } @@ -146,6 +161,9 @@ TEST_F(StatsTest, DeviceReservedMacroTest) { }, [](const std::string stat_type, int id) { return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(Reserved, id); }); RunTests(); } @@ -161,6 +179,9 @@ TEST_F(StatsTest, HostAllocatedMacroTest) { }, [](const std::string stat_type, int id) { return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_RESET_PEAK_VALUE(Allocated, id); }); RunTests(); } @@ -169,7 +190,8 @@ TEST_F(StatsTest, HostReservedTest) { SetStatType("Reserved"); SetFunc(HostMemoryStatUpdate, HostMemoryStatCurrentValue, - HostMemoryStatPeakValue); + HostMemoryStatPeakValue, + HostMemoryStatResetPeakValue); RunTests(); } From 35a12c83edf5ec31813c545c3f5f8c2cd6813936 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sat, 7 Dec 2024 22:15:38 +0800 Subject: [PATCH 04/12] added python tests new file: test/legacy_test/test_cuda_memory_stats.py new file: test/legacy_test/test_cuda_reset_peak_memory_stats.py --- test/legacy_test/test_cuda_memory_stats.py | 83 +++++++++++++++++ .../test_cuda_reset_peak_memory_stats.py | 88 +++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 test/legacy_test/test_cuda_memory_stats.py 
create mode 100644 test/legacy_test/test_cuda_reset_peak_memory_stats.py diff --git a/test/legacy_test/test_cuda_memory_stats.py b/test/legacy_test/test_cuda_memory_stats.py new file mode 100644 index 0000000000000..b87cc3a38e24a --- /dev/null +++ b/test/legacy_test/test_cuda_memory_stats.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +from paddle.base import core +from paddle.device.cuda import ( + device_count, + max_memory_allocated, + memory_allocated, + max_memory_reserved, + memory_reserved, + memory_stats, +) + + +class TestMemoryStats(unittest.TestCase): + def func_test_memory_stats(self, device=None): + if core.is_compiled_with_cuda(): + alloc_time = 100 + max_alloc_size = 10000 + for i in range(alloc_time): + # first alloc + shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + tensor = paddle.zeros(shape) + del shape + del tensor + + # second alloc + shape = paddle.randint(low=0, high=max_alloc_size) + tensor = paddle.zeros(shape) + + memory_stats = paddle.device.cuda.memory_stats(device) + + self.assertEqual(memory_stats["memory.allocated.current"], memory_allocated(device)) + self.assertEqual(memory_stats["memory.allocated.peak"], max_memory_allocated(device)) + + self.assertEqual(memory_stats["memory.reserved.current"], memory_reserved(device)) + self.assertEqual(memory_stats["memory.reserved.peak"], max_memory_reserved(device)) + + del shape + del tensor + + def test_memory_stats_for_all_places(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + paddle.device.set_device("gpu:" + str(i)) + self.func_test_memory_stats(core.CUDAPlace(i)) + self.func_test_memory_stats(i) + self.func_test_memory_stats("gpu:" + str(i)) + + def test_memory_stats_exception(self): + if core.is_compiled_with_cuda(): + wrong_device = [ + core.CPUPlace(), + device_count() + 1, + -2, + 0.5, + "gpu1", + ] + for device in wrong_device: + with self.assertRaises(BaseException): # noqa: B017 + memory_stats(device) + else: + with self.assertRaises(ValueError): + memory_stats() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_cuda_reset_peak_memory_stats.py b/test/legacy_test/test_cuda_reset_peak_memory_stats.py new file mode 100644 index 0000000000000..b917e8c01062f --- /dev/null +++ b/test/legacy_test/test_cuda_reset_peak_memory_stats.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +from paddle.base import core +from paddle.device.cuda import ( + device_count, + max_memory_allocated, + memory_allocated, + max_memory_reserved, + memory_reserved, + reset_peak_memory_stats, +) + + +class TestResetPeakMemoryStats(unittest.TestCase): + def func_test_reset_peak_memory_stats(self, device=None): + if core.is_compiled_with_cuda(): + alloc_time = 100 + max_alloc_size = 10000 + for i in range(alloc_time): + # first alloc + shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + tensor = paddle.zeros(shape) + peak_memory_allocated_size_first = max_memory_allocated(device) + peak_memory_reserved_size_first = max_memory_reserved(device) + del shape + del tensor + + # second alloc + shape = paddle.randint(low=0, high=max_alloc_size) + tensor = paddle.zeros(shape) + + # reset peak memory stats + reset_peak_memory_stats(device) + + peak_memory_allocated_size_second = max_memory_allocated(device) + self.assertEqual(peak_memory_allocated_size_second, memory_allocated(device)) + self.assertLess(peak_memory_allocated_size_second, peak_memory_allocated_size_first) + + peak_memory_reserved_size_second = max_memory_reserved(device) + self.assertEqual(peak_memory_reserved_size_second, memory_reserved(device)) + self.assertLessEqual(peak_memory_reserved_size_second, peak_memory_reserved_size_first) + + del shape + del tensor + + def test_reset_peak_memory_stats_for_all_places(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + paddle.device.set_device("gpu:" + str(i)) + self.func_test_reset_peak_memory_stats(core.CUDAPlace(i)) + self.func_test_reset_peak_memory_stats(i) + self.func_test_reset_peak_memory_stats("gpu:" + str(i)) + + def test_reset_peak_memory_stats_exception(self): + if core.is_compiled_with_cuda(): + wrong_device = [ + core.CPUPlace(), + device_count() + 1, + -2, + 0.5, + "gpu1", + ] + for device in wrong_device: + with self.assertRaises(BaseException): # noqa: B017 + reset_peak_memory_stats(device) + else: + with self.assertRaises(ValueError): + reset_peak_memory_stats() + + +if __name__ == "__main__": + unittest.main() From cb5036c584519a1901b3b2ddb9c130dade1f430e Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sun, 8 Dec 2024 10:35:09 +0800 Subject: [PATCH 05/12] added a python test for reset_max_memory_allocated new file: test/legacy_test/test_cuda_reset_max_memory_allocated.py --- .../test_cuda_reset_max_memory_allocated.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 test/legacy_test/test_cuda_reset_max_memory_allocated.py diff --git a/test/legacy_test/test_cuda_reset_max_memory_allocated.py b/test/legacy_test/test_cuda_reset_max_memory_allocated.py new file mode 100644 index 0000000000000..fb444497cc569 --- /dev/null +++ b/test/legacy_test/test_cuda_reset_max_memory_allocated.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +from paddle.base import core +from paddle.device.cuda import ( + device_count, + max_memory_allocated, + memory_allocated, + max_memory_reserved, + memory_reserved, + reset_max_memory_allocated, +) + + +class TestResetMaxMemoryAllocated(unittest.TestCase): + def func_test_reset_max_memory_allocated(self, device=None): + if core.is_compiled_with_cuda(): + alloc_time = 100 + max_alloc_size = 10000 + for i in range(alloc_time): + # first alloc + shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + tensor = paddle.zeros(shape) + peak_memory_allocated_size_first = max_memory_allocated(device) + peak_memory_reserved_size_first = max_memory_reserved(device) + del shape + del tensor + + # second alloc + shape = paddle.randint(low=0, high=max_alloc_size) + tensor = paddle.zeros(shape) + + # reset peak memory stats + reset_max_memory_allocated(device) + + peak_memory_allocated_size_second = max_memory_allocated(device) + self.assertEqual(peak_memory_allocated_size_second, memory_allocated(device)) + self.assertLess(peak_memory_allocated_size_second, peak_memory_allocated_size_first) + + peak_memory_reserved_size_second = max_memory_reserved(device) + self.assertEqual(peak_memory_reserved_size_second, memory_reserved(device)) + self.assertLessEqual(peak_memory_reserved_size_second, peak_memory_reserved_size_first) + + del shape + del tensor + + def test_reset_max_memory_allocated_for_all_places(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + paddle.device.set_device("gpu:" + str(i)) + self.func_test_reset_max_memory_allocated(core.CUDAPlace(i)) + self.func_test_reset_max_memory_allocated(i) + self.func_test_reset_max_memory_allocated("gpu:" + str(i)) + + def test_reset_max_memory_allocated_exception(self): + if core.is_compiled_with_cuda(): + wrong_device = [ + core.CPUPlace(), + device_count() + 1, + -2, + 0.5, + "gpu1", + ] + for device in wrong_device: + with self.assertRaises(BaseException): # noqa: B017 + reset_max_memory_allocated(device) + else: + with self.assertRaises(ValueError): + reset_max_memory_allocated() + + +if __name__ == "__main__": + unittest.main() From 5d28856e3a0f3a8d83ad6ba2c7473af046d872e0 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Mon, 9 Dec 2024 21:48:01 +0800 Subject: [PATCH 06/12] formatted by pre-commit modified: python/paddle/device/cuda/__init__.py modified: test/legacy_test/test_cuda_memory_stats.py modified: test/legacy_test/test_cuda_reset_max_memory_allocated.py modified: test/legacy_test/test_cuda_reset_peak_memory_stats.py --- python/paddle/device/cuda/__init__.py | 10 +++---- test/legacy_test/test_cuda_memory_stats.py | 26 ++++++++++++++----- .../test_cuda_reset_max_memory_allocated.py | 26 ++++++++++++++----- .../test_cuda_reset_peak_memory_stats.py | 26 ++++++++++++++----- 4 files changed, 63 insertions(+), 25 deletions(-) diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index abddf53720f06..b3fdb470fe4d1 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. 
from __future__ import annotations +import logging from typing import TYPE_CHECKING, NoReturn, Union from typing_extensions import TypeAlias @@ -24,8 +25,6 @@ from .streams import Event, Stream -import logging - if TYPE_CHECKING: from paddle import CUDAPlace from paddle.base.libpaddle import _gpuDeviceProperties @@ -302,6 +301,7 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int: device_id = extract_cuda_device_id(device, op_name=name) return core.device_memory_stat_peak_value("Reserved", device_id) + def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: ''' Reset the peak values of GPU memory allocated and reserved to the current values. @@ -338,9 +338,9 @@ def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: ''' - Reset the peak values of GPU memory allocated to the current values. (Reset the GPU memory reserved + Reset the peak values of GPU memory allocated to the current values. (Reset the GPU memory reserved as well) - + Warning: This function calls `paddle.device.cuda.reset_peak_memory_stats`, which resets both allocated and reserved peak memory stats. @@ -361,7 +361,7 @@ def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: >>> paddle.device.cuda.reset_max_memory_allocated(0) >>> paddle.device.cuda.reset_max_memory_allocated("gpu:0") ''' - + name = "paddle.device.cuda.reset_max_memory_allocated" if not core.is_compiled_with_cuda(): raise ValueError( diff --git a/test/legacy_test/test_cuda_memory_stats.py b/test/legacy_test/test_cuda_memory_stats.py index b87cc3a38e24a..f209b62d0d5d7 100644 --- a/test/legacy_test/test_cuda_memory_stats.py +++ b/test/legacy_test/test_cuda_memory_stats.py @@ -19,8 +19,8 @@ from paddle.device.cuda import ( device_count, max_memory_allocated, - memory_allocated, max_memory_reserved, + memory_allocated, memory_reserved, memory_stats, ) @@ -33,7 +33,9 @@ def func_test_memory_stats(self, device=None): max_alloc_size = 10000 for i in range(alloc_time): # first alloc - shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + shape = paddle.randint( + low=max_alloc_size, high=max_alloc_size * 2 + ) tensor = paddle.zeros(shape) del shape del tensor @@ -44,11 +46,23 @@ def func_test_memory_stats(self, device=None): memory_stats = paddle.device.cuda.memory_stats(device) - self.assertEqual(memory_stats["memory.allocated.current"], memory_allocated(device)) - self.assertEqual(memory_stats["memory.allocated.peak"], max_memory_allocated(device)) + self.assertEqual( + memory_stats["memory.allocated.current"], + memory_allocated(device), + ) + self.assertEqual( + memory_stats["memory.allocated.peak"], + max_memory_allocated(device), + ) - self.assertEqual(memory_stats["memory.reserved.current"], memory_reserved(device)) - self.assertEqual(memory_stats["memory.reserved.peak"], max_memory_reserved(device)) + self.assertEqual( + memory_stats["memory.reserved.current"], + memory_reserved(device), + ) + self.assertEqual( + memory_stats["memory.reserved.peak"], + max_memory_reserved(device), + ) del shape del tensor diff --git a/test/legacy_test/test_cuda_reset_max_memory_allocated.py b/test/legacy_test/test_cuda_reset_max_memory_allocated.py index fb444497cc569..ee326e2728fe5 100644 --- a/test/legacy_test/test_cuda_reset_max_memory_allocated.py +++ b/test/legacy_test/test_cuda_reset_max_memory_allocated.py @@ -19,8 +19,8 @@ from paddle.device.cuda import ( device_count, max_memory_allocated, - 
memory_allocated, max_memory_reserved, + memory_allocated, memory_reserved, reset_max_memory_allocated, ) @@ -33,7 +33,9 @@ def func_test_reset_max_memory_allocated(self, device=None): max_alloc_size = 10000 for i in range(alloc_time): # first alloc - shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + shape = paddle.randint( + low=max_alloc_size, high=max_alloc_size * 2 + ) tensor = paddle.zeros(shape) peak_memory_allocated_size_first = max_memory_allocated(device) peak_memory_reserved_size_first = max_memory_reserved(device) @@ -43,17 +45,27 @@ def func_test_reset_max_memory_allocated(self, device=None): # second alloc shape = paddle.randint(low=0, high=max_alloc_size) tensor = paddle.zeros(shape) - + # reset peak memory stats reset_max_memory_allocated(device) peak_memory_allocated_size_second = max_memory_allocated(device) - self.assertEqual(peak_memory_allocated_size_second, memory_allocated(device)) - self.assertLess(peak_memory_allocated_size_second, peak_memory_allocated_size_first) + self.assertEqual( + peak_memory_allocated_size_second, memory_allocated(device) + ) + self.assertLess( + peak_memory_allocated_size_second, + peak_memory_allocated_size_first, + ) peak_memory_reserved_size_second = max_memory_reserved(device) - self.assertEqual(peak_memory_reserved_size_second, memory_reserved(device)) - self.assertLessEqual(peak_memory_reserved_size_second, peak_memory_reserved_size_first) + self.assertEqual( + peak_memory_reserved_size_second, memory_reserved(device) + ) + self.assertLessEqual( + peak_memory_reserved_size_second, + peak_memory_reserved_size_first, + ) del shape del tensor diff --git a/test/legacy_test/test_cuda_reset_peak_memory_stats.py b/test/legacy_test/test_cuda_reset_peak_memory_stats.py index b917e8c01062f..be66ef3e6bf23 100644 --- a/test/legacy_test/test_cuda_reset_peak_memory_stats.py +++ b/test/legacy_test/test_cuda_reset_peak_memory_stats.py @@ -19,8 +19,8 @@ from paddle.device.cuda import ( device_count, max_memory_allocated, - memory_allocated, max_memory_reserved, + memory_allocated, memory_reserved, reset_peak_memory_stats, ) @@ -33,7 +33,9 @@ def func_test_reset_peak_memory_stats(self, device=None): max_alloc_size = 10000 for i in range(alloc_time): # first alloc - shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + shape = paddle.randint( + low=max_alloc_size, high=max_alloc_size * 2 + ) tensor = paddle.zeros(shape) peak_memory_allocated_size_first = max_memory_allocated(device) peak_memory_reserved_size_first = max_memory_reserved(device) @@ -43,17 +45,27 @@ def func_test_reset_peak_memory_stats(self, device=None): # second alloc shape = paddle.randint(low=0, high=max_alloc_size) tensor = paddle.zeros(shape) - + # reset peak memory stats reset_peak_memory_stats(device) peak_memory_allocated_size_second = max_memory_allocated(device) - self.assertEqual(peak_memory_allocated_size_second, memory_allocated(device)) - self.assertLess(peak_memory_allocated_size_second, peak_memory_allocated_size_first) + self.assertEqual( + peak_memory_allocated_size_second, memory_allocated(device) + ) + self.assertLess( + peak_memory_allocated_size_second, + peak_memory_allocated_size_first, + ) peak_memory_reserved_size_second = max_memory_reserved(device) - self.assertEqual(peak_memory_reserved_size_second, memory_reserved(device)) - self.assertLessEqual(peak_memory_reserved_size_second, peak_memory_reserved_size_first) + self.assertEqual( + peak_memory_reserved_size_second, memory_reserved(device) + ) + self.assertLessEqual( + 
peak_memory_reserved_size_second, + peak_memory_reserved_size_first, + ) del shape del tensor From f6b3d84673f78e4b1f666ecd81684dea80e8d23e Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Mon, 9 Dec 2024 22:00:05 +0800 Subject: [PATCH 07/12] formatted by pre-commit (clang-format) modified: paddle/fluid/pybind/pybind.cc modified: paddle/phi/core/memory/stats.cc modified: paddle/phi/core/memory/stats.h modified: test/cpp/fluid/memory/stats_test.cc --- paddle/fluid/pybind/pybind.cc | 18 ++++++++++++------ paddle/phi/core/memory/stats.cc | 4 ++-- paddle/phi/core/memory/stats.h | 10 ++++++---- test/cpp/fluid/memory/stats_test.cc | 10 ++++++---- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3be2c5f10c430..cea69960b21d3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2424,20 +2424,26 @@ All parameter, weight, gradient are variables in Paddle. m.def("device_memory_stat_current_value", memory::DeviceMemoryStatCurrentValue); m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); - m.def("device_memory_stat_reset_peak_value", memory::DeviceMemoryStatResetPeakValue); + m.def("device_memory_stat_reset_peak_value", + memory::DeviceMemoryStatResetPeakValue); m.def("device_memory_stats", [](int dev_id) { py::dict dict; - dict["memory.allocated.current"] = memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); - dict["memory.allocated.peak"] = memory::DeviceMemoryStatPeakValue("Allocated", dev_id); - dict["memory.reserved.current"] = memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); - dict["memory.reserved.peak"] = memory::DeviceMemoryStatPeakValue("Reserved", dev_id); + dict["memory.allocated.current"] = + memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); + dict["memory.allocated.peak"] = + memory::DeviceMemoryStatPeakValue("Allocated", dev_id); + dict["memory.reserved.current"] = + memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); + dict["memory.reserved.peak"] = + memory::DeviceMemoryStatPeakValue("Reserved", dev_id); return dict; }); m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); - m.def("host_memory_stat_reset_peak_value", memory::HostMemoryStatResetPeakValue); + m.def("host_memory_stat_reset_peak_value", + memory::HostMemoryStatResetPeakValue); m.def( "run_cmd", [](const std::string &cmd, diff --git a/paddle/phi/core/memory/stats.cc b/paddle/phi/core/memory/stats.cc index 7b75ea9021997..a6cbf37bd4c9a 100644 --- a/paddle/phi/core/memory/stats.cc +++ b/paddle/phi/core/memory/stats.cc @@ -97,7 +97,7 @@ void DeviceMemoryStatUpdate(const std::string& stat_type, StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment); } -void DeviceMemoryStatResetPeakValue(const std::string& stat_type,int dev_id) { +void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) { StatRegistry::GetInstance()->ResetPeakValue("Device" + stat_type, dev_id); } @@ -116,7 +116,7 @@ void HostMemoryStatUpdate(const std::string& stat_type, StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment); } -void HostMemoryStatResetPeakValue(const std::string& stat_type,int dev_id) { +void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) { StatRegistry::GetInstance()->ResetPeakValue("Host" + stat_type, dev_id); } diff --git a/paddle/phi/core/memory/stats.h b/paddle/phi/core/memory/stats.h index 
f59cc0336a802..e5b4f9d8ad771 100644 --- a/paddle/phi/core/memory/stats.h +++ b/paddle/phi/core/memory/stats.h @@ -117,8 +117,10 @@ class Stat : public StatBase { int64_t current_value = GetCurrentValue(); peak_value_.store(current_value, std::memory_order_relaxed); - std::unordered_map> thread_local_stats = - ThreadDataRegistry::GetInstance().GetAllThreadDataByRef(); + std::unordered_map> + thread_local_stats = + ThreadDataRegistry::GetInstance() + .GetAllThreadDataByRef(); for (auto pair : thread_local_stats) { pair.second.get().peak = pair.second.get().current; @@ -143,14 +145,14 @@ int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id); void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, int64_t increment); -void DeviceMemoryStatResetPeakValue(const std::string& stat_type,int dev_id); +void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id); int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id); int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, int64_t increment); -void HostMemoryStatResetPeakValue(const std::string& stat_type,int dev_id); +void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id); void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name); diff --git a/test/cpp/fluid/memory/stats_test.cc b/test/cpp/fluid/memory/stats_test.cc index 52b40780d9b7b..51e6bd13636a6 100644 --- a/test/cpp/fluid/memory/stats_test.cc +++ b/test/cpp/fluid/memory/stats_test.cc @@ -98,12 +98,14 @@ class StatsTest : public ::testing::Test { } void ResetPeakValueTest() { - for(long unsigned int i = 0; i < datas_.size(); ++i) { - update_func_(stat_type_, 0, datas_[i]); + for (int64_t data : datas_) { + update_func_(stat_type_, 0, data); - EXPECT_GE(peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0)); + EXPECT_GE(peak_value_func_(stat_type_, 0), + current_value_func_(stat_type_, 0)); reset_peak_value_func_(stat_type_, 0); - EXPECT_EQ(peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0)); + EXPECT_EQ(peak_value_func_(stat_type_, 0), + current_value_func_(stat_type_, 0)); } } From b6ea9be25ded38424f8f1810b494bcdb43d19f60 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Fri, 13 Dec 2024 21:24:44 +0800 Subject: [PATCH 08/12] added reset max memory reserved function modified: python/paddle/device/cuda/__init__.py modified: test/legacy_test/test_cuda_reset_max_memory_allocated.py new file: test/legacy_test/test_cuda_reset_max_memory_reserved.py --- python/paddle/device/cuda/__init__.py | 48 +++++++--- .../test_cuda_reset_max_memory_allocated.py | 13 +-- .../test_cuda_reset_max_memory_reserved.py | 89 +++++++++++++++++++ 3 files changed, 126 insertions(+), 24 deletions(-) create mode 100644 test/legacy_test/test_cuda_reset_max_memory_reserved.py diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index b3fdb470fe4d1..c2cfc11a495b0 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. 
from __future__ import annotations -import logging from typing import TYPE_CHECKING, NoReturn, Union from typing_extensions import TypeAlias @@ -51,6 +50,7 @@ 'get_device_capability', 'reset_peak_memory_stats', 'reset_max_memory_allocated', + 'reset_max_memory_reserved', 'memory_stats', ] @@ -338,12 +338,8 @@ def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: ''' - Reset the peak values of GPU memory allocated to the current values. (Reset the GPU memory reserved - as well) - - Warning: - This function calls `paddle.device.cuda.reset_peak_memory_stats`, which resets both allocated - and reserved peak memory stats. + Reset the peak values of GPU memory allocated to the current values. + Allocated memory refers to the GPU memory that is currently allocated to tensors. Args: device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or @@ -367,11 +363,39 @@ def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: raise ValueError( f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) - logging.warning( - "paddle.device.cuda.reset_max_memory_allocated calls paddle.device.cuda.reset_peak_memory_stats, " - "which resets both allocated and reserved peak memory stats." - ) - reset_peak_memory_stats(device) + device_id = extract_cuda_device_id(device, op_name=name) + core.device_memory_stat_reset_peak_value("Allocated", device_id) + + +def reset_max_memory_reserved(device: _CudaPlaceLike | None = None) -> None: + ''' + Reset the peak values of GPU memory reserved to the current values. + Reserved memory refers to the GPU memory that is held by the allocator of the given device. + + Args: + device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> paddle.device.cuda.reset_max_memory_reserved(paddle.CUDAPlace(0)) + >>> paddle.device.cuda.reset_max_memory_reserved(0) + >>> paddle.device.cuda.reset_max_memory_reserved("gpu:0") + ''' + + name = "paddle.device.cuda.reset_max_memory_reserved" + if not core.is_compiled_with_cuda(): + raise ValueError( + f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." 
+ ) + device_id = extract_cuda_device_id(device, op_name=name) + core.device_memory_stat_reset_peak_value("Reserved", device_id) def memory_stats(device: _CudaPlaceLike | None = None) -> dict: diff --git a/test/legacy_test/test_cuda_reset_max_memory_allocated.py b/test/legacy_test/test_cuda_reset_max_memory_allocated.py index ee326e2728fe5..ae99b6056dd70 100644 --- a/test/legacy_test/test_cuda_reset_max_memory_allocated.py +++ b/test/legacy_test/test_cuda_reset_max_memory_allocated.py @@ -19,9 +19,7 @@ from paddle.device.cuda import ( device_count, max_memory_allocated, - max_memory_reserved, memory_allocated, - memory_reserved, reset_max_memory_allocated, ) @@ -38,7 +36,7 @@ def func_test_reset_max_memory_allocated(self, device=None): ) tensor = paddle.zeros(shape) peak_memory_allocated_size_first = max_memory_allocated(device) - peak_memory_reserved_size_first = max_memory_reserved(device) + del shape del tensor @@ -58,15 +56,6 @@ def func_test_reset_max_memory_allocated(self, device=None): peak_memory_allocated_size_first, ) - peak_memory_reserved_size_second = max_memory_reserved(device) - self.assertEqual( - peak_memory_reserved_size_second, memory_reserved(device) - ) - self.assertLessEqual( - peak_memory_reserved_size_second, - peak_memory_reserved_size_first, - ) - del shape del tensor diff --git a/test/legacy_test/test_cuda_reset_max_memory_reserved.py b/test/legacy_test/test_cuda_reset_max_memory_reserved.py new file mode 100644 index 0000000000000..51d9470599c34 --- /dev/null +++ b/test/legacy_test/test_cuda_reset_max_memory_reserved.py @@ -0,0 +1,89 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle +from paddle.base import core +from paddle.device.cuda import ( + device_count, + max_memory_reserved, + memory_reserved, + reset_max_memory_reserved, +) + + +class TestResetMaxMemoryReserved(unittest.TestCase): + def func_test_reset_max_memory_reserved(self, device=None): + if core.is_compiled_with_cuda(): + alloc_time = 100 + max_alloc_size = 10000 + for i in range(alloc_time): + # first alloc + shape = paddle.randint( + low=max_alloc_size, high=max_alloc_size * 2 + ) + tensor = paddle.zeros(shape) + peak_memory_reserved_size_first = max_memory_reserved(device) + + del shape + del tensor + + # second alloc + shape = paddle.randint(low=0, high=max_alloc_size) + tensor = paddle.zeros(shape) + + # reset peak memory stats + reset_max_memory_reserved(device) + + peak_memory_reserved_size_second = max_memory_reserved(device) + self.assertEqual( + peak_memory_reserved_size_second, memory_reserved(device) + ) + self.assertLessEqual( + peak_memory_reserved_size_second, + peak_memory_reserved_size_first, + ) + + del shape + del tensor + + def test_reset_max_memory_reserved_for_all_places(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + paddle.device.set_device("gpu:" + str(i)) + self.func_test_reset_max_memory_reserved(core.CUDAPlace(i)) + self.func_test_reset_max_memory_reserved(i) + self.func_test_reset_max_memory_reserved("gpu:" + str(i)) + + def test_reset_max_memory_reserved_exception(self): + if core.is_compiled_with_cuda(): + wrong_device = [ + core.CPUPlace(), + device_count() + 1, + -2, + 0.5, + "gpu1", + ] + for device in wrong_device: + with self.assertRaises(BaseException): # noqa: B017 + reset_max_memory_reserved(device) + else: + with self.assertRaises(ValueError): + reset_max_memory_reserved() + + +if __name__ == "__main__": + unittest.main() From caad36840c27e0ab96e39f3776ff9c5af1be6c0e Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sun, 15 Dec 2024 15:07:33 +0800 Subject: [PATCH 09/12] deleted memory stats and reset peak memory stats modified: paddle/fluid/pybind/pybind.cc modified: python/paddle/device/cuda/__init__.py deleted: test/legacy_test/test_cuda_memory_stats.py deleted: test/legacy_test/test_cuda_reset_peak_memory_stats.py --- paddle/fluid/pybind/pybind.cc | 13 --- python/paddle/device/cuda/__init__.py | 82 +------------- test/legacy_test/test_cuda_memory_stats.py | 97 ----------------- .../test_cuda_reset_peak_memory_stats.py | 100 ------------------ 4 files changed, 2 insertions(+), 290 deletions(-) delete mode 100644 test/legacy_test/test_cuda_memory_stats.py delete mode 100644 test/legacy_test/test_cuda_reset_peak_memory_stats.py diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cea69960b21d3..1aa83b8fd26e9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2427,19 +2427,6 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("device_memory_stat_reset_peak_value", memory::DeviceMemoryStatResetPeakValue); - m.def("device_memory_stats", [](int dev_id) { - py::dict dict; - dict["memory.allocated.current"] = - memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); - dict["memory.allocated.peak"] = - memory::DeviceMemoryStatPeakValue("Allocated", dev_id); - dict["memory.reserved.current"] = - memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); - dict["memory.reserved.peak"] = - memory::DeviceMemoryStatPeakValue("Reserved", dev_id); - return dict; - }); - m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); m.def("host_memory_stat_reset_peak_value", diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index c2cfc11a495b0..538c52c7837a5 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -48,10 +48,8 @@ 'get_device_properties', 'get_device_name', 'get_device_capability', - 'reset_peak_memory_stats', 'reset_max_memory_allocated', 'reset_max_memory_reserved', - 'memory_stats', ] @@ -302,44 +300,9 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int: return core.device_memory_stat_peak_value("Reserved", device_id) -def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: - ''' - Reset the peak values of GPU memory allocated and reserved to the current values. - - This function resets the "peak" stats tracked by the CUDA memory allocator for both - memory allocated and memory reserved. After calling this function, the peak values - will be set to the current memory usage values. - - Args: - device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. - Default: None. - - Examples: - .. code-block:: python - - >>> # doctest: +REQUIRES(env:GPU) - >>> import paddle - >>> paddle.device.set_device('gpu') - - >>> paddle.device.cuda.reset_peak_memory_stats(paddle.CUDAPlace(0)) - >>> paddle.device.cuda.reset_peak_memory_stats(0) - >>> paddle.device.cuda.reset_peak_memory_stats("gpu:0") - ''' - name = "paddle.device.cuda.reset_peak_memory_stats" - if not core.is_compiled_with_cuda(): - raise ValueError( - f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." - ) - device_id = extract_cuda_device_id(device, op_name=name) - core.device_memory_stat_reset_peak_value("Allocated", device_id) - core.device_memory_stat_reset_peak_value("Reserved", device_id) - - def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: ''' - Reset the peak values of GPU memory allocated to the current values. - Allocated memory refers to the GPU memory that is currently allocated to tensors. + Reset the peak size of GPU memory that is allocated to tensor of the given device. Args: device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or @@ -369,8 +332,7 @@ def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: def reset_max_memory_reserved(device: _CudaPlaceLike | None = None) -> None: ''' - Reset the peak values of GPU memory reserved to the current values. - Reserved memory refers to the GPU memory that is held by the allocator of the given device. + Reset the peak size of GPU memory that is held by the allocator of the given device. 
Args: device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or @@ -398,46 +360,6 @@ def reset_max_memory_reserved(device: _CudaPlaceLike | None = None) -> None: core.device_memory_stat_reset_peak_value("Reserved", device_id) -def memory_stats(device: _CudaPlaceLike | None = None) -> dict: - ''' - Return the memory stats of the given device. - - Args: - device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. - Default: None. - - Return: - dict: The current memory stats of the given device, including the current size of GPU memory that is allocated to tensor, - the peak size of GPU memory that is allocated to tensor, the current size of GPU memory that is held by the allocator and - the peak size of GPU memory that is held by the allocator, in bytes. - - memory_stats["memory.allocated.current"] - memory_stats["memory.allocated.peak"] - memory_stats["memory.reserved.current"] - memory_stats["memory.reserved.peak"] - - Examples: - .. code-block:: python - - >>> # doctest: +REQUIRES(env:GPU) - >>> import paddle - >>> paddle.device.set_device('gpu') - - >>> memory_stats = paddle.device.cuda.memory_stats(paddle.CUDAPlace(0)) - >>> memory_stats = paddle.device.cuda.memory_stats(0) - >>> memory_stats = paddle.device.cuda.memory_stats("gpu:0") - ''' - - name = "paddle.device.cuda.memory_stats" - if not core.is_compiled_with_cuda(): - raise ValueError( - f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." - ) - device_id = extract_cuda_device_id(device, op_name=name) - return core.device_memory_stats(device_id) - - def memory_allocated(device: _CudaPlaceLike | None = None) -> int: ''' Return the current size of gpu memory that is allocated to tensor of the given device. diff --git a/test/legacy_test/test_cuda_memory_stats.py b/test/legacy_test/test_cuda_memory_stats.py deleted file mode 100644 index f209b62d0d5d7..0000000000000 --- a/test/legacy_test/test_cuda_memory_stats.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import paddle -from paddle.base import core -from paddle.device.cuda import ( - device_count, - max_memory_allocated, - max_memory_reserved, - memory_allocated, - memory_reserved, - memory_stats, -) - - -class TestMemoryStats(unittest.TestCase): - def func_test_memory_stats(self, device=None): - if core.is_compiled_with_cuda(): - alloc_time = 100 - max_alloc_size = 10000 - for i in range(alloc_time): - # first alloc - shape = paddle.randint( - low=max_alloc_size, high=max_alloc_size * 2 - ) - tensor = paddle.zeros(shape) - del shape - del tensor - - # second alloc - shape = paddle.randint(low=0, high=max_alloc_size) - tensor = paddle.zeros(shape) - - memory_stats = paddle.device.cuda.memory_stats(device) - - self.assertEqual( - memory_stats["memory.allocated.current"], - memory_allocated(device), - ) - self.assertEqual( - memory_stats["memory.allocated.peak"], - max_memory_allocated(device), - ) - - self.assertEqual( - memory_stats["memory.reserved.current"], - memory_reserved(device), - ) - self.assertEqual( - memory_stats["memory.reserved.peak"], - max_memory_reserved(device), - ) - - del shape - del tensor - - def test_memory_stats_for_all_places(self): - if core.is_compiled_with_cuda(): - gpu_num = device_count() - for i in range(gpu_num): - paddle.device.set_device("gpu:" + str(i)) - self.func_test_memory_stats(core.CUDAPlace(i)) - self.func_test_memory_stats(i) - self.func_test_memory_stats("gpu:" + str(i)) - - def test_memory_stats_exception(self): - if core.is_compiled_with_cuda(): - wrong_device = [ - core.CPUPlace(), - device_count() + 1, - -2, - 0.5, - "gpu1", - ] - for device in wrong_device: - with self.assertRaises(BaseException): # noqa: B017 - memory_stats(device) - else: - with self.assertRaises(ValueError): - memory_stats() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_cuda_reset_peak_memory_stats.py b/test/legacy_test/test_cuda_reset_peak_memory_stats.py deleted file mode 100644 index be66ef3e6bf23..0000000000000 --- a/test/legacy_test/test_cuda_reset_peak_memory_stats.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import paddle -from paddle.base import core -from paddle.device.cuda import ( - device_count, - max_memory_allocated, - max_memory_reserved, - memory_allocated, - memory_reserved, - reset_peak_memory_stats, -) - - -class TestResetPeakMemoryStats(unittest.TestCase): - def func_test_reset_peak_memory_stats(self, device=None): - if core.is_compiled_with_cuda(): - alloc_time = 100 - max_alloc_size = 10000 - for i in range(alloc_time): - # first alloc - shape = paddle.randint( - low=max_alloc_size, high=max_alloc_size * 2 - ) - tensor = paddle.zeros(shape) - peak_memory_allocated_size_first = max_memory_allocated(device) - peak_memory_reserved_size_first = max_memory_reserved(device) - del shape - del tensor - - # second alloc - shape = paddle.randint(low=0, high=max_alloc_size) - tensor = paddle.zeros(shape) - - # reset peak memory stats - reset_peak_memory_stats(device) - - peak_memory_allocated_size_second = max_memory_allocated(device) - self.assertEqual( - peak_memory_allocated_size_second, memory_allocated(device) - ) - self.assertLess( - peak_memory_allocated_size_second, - peak_memory_allocated_size_first, - ) - - peak_memory_reserved_size_second = max_memory_reserved(device) - self.assertEqual( - peak_memory_reserved_size_second, memory_reserved(device) - ) - self.assertLessEqual( - peak_memory_reserved_size_second, - peak_memory_reserved_size_first, - ) - - del shape - del tensor - - def test_reset_peak_memory_stats_for_all_places(self): - if core.is_compiled_with_cuda(): - gpu_num = device_count() - for i in range(gpu_num): - paddle.device.set_device("gpu:" + str(i)) - self.func_test_reset_peak_memory_stats(core.CUDAPlace(i)) - self.func_test_reset_peak_memory_stats(i) - self.func_test_reset_peak_memory_stats("gpu:" + str(i)) - - def test_reset_peak_memory_stats_exception(self): - if core.is_compiled_with_cuda(): - wrong_device = [ - core.CPUPlace(), - device_count() + 1, - -2, - 0.5, - "gpu1", - ] - for device in wrong_device: - with self.assertRaises(BaseException): # noqa: B017 - reset_peak_memory_stats(device) - else: - with self.assertRaises(ValueError): - reset_peak_memory_stats() - - -if __name__ == "__main__": - unittest.main() From 323e7b234fc5eb7ae7c505b6d1a7820fb031f4c2 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sun, 29 Dec 2024 21:04:21 +0800 Subject: [PATCH 10/12] formatted baddbmm version modified: paddle/phi/infermeta/ternary.cc modified: paddle/phi/infermeta/ternary.h new file: paddle/phi/kernels/baddbmm_grad_kernel.h new file: paddle/phi/kernels/baddbmm_kernel.h new file: paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc new file: paddle/phi/kernels/cpu/baddbmm_kernel.cc modified: paddle/phi/kernels/funcs/blas/blas.h modified: paddle/phi/kernels/funcs/blas/blas_impl.cu.h modified: paddle/phi/kernels/funcs/blas/blas_impl.h new file: paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu new file: paddle/phi/kernels/gpu/baddbmm_kernel.cu new file: paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h new file: paddle/phi/kernels/impl/baddbmm_kernel_impl.h modified: paddle/phi/ops/yaml/backward.yaml modified: paddle/phi/ops/yaml/ops.yaml modified: python/paddle/__init__.py modified: python/paddle/tensor/__init__.py modified: python/paddle/tensor/math.py --- paddle/phi/infermeta/ternary.cc | 90 ++++++ paddle/phi/infermeta/ternary.h | 7 + paddle/phi/kernels/baddbmm_grad_kernel.h | 33 +++ paddle/phi/kernels/baddbmm_kernel.h | 30 ++ paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc | 22 ++ paddle/phi/kernels/cpu/baddbmm_kernel.cc | 22 ++ 
paddle/phi/kernels/funcs/blas/blas.h | 27 ++ paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 265 ++++++++++++++++++ paddle/phi/kernels/funcs/blas/blas_impl.h | 91 ++++++ paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu | 28 ++ paddle/phi/kernels/gpu/baddbmm_kernel.cu | 28 ++ .../kernels/impl/baddbmm_grad_kernel_impl.h | 248 ++++++++++++++++ paddle/phi/kernels/impl/baddbmm_kernel_impl.h | 179 ++++++++++++ paddle/phi/ops/yaml/backward.yaml | 10 + paddle/phi/ops/yaml/ops.yaml | 12 + python/paddle/__init__.py | 4 + python/paddle/tensor/__init__.py | 4 + python/paddle/tensor/math.py | 157 +++++++++++ 18 files changed, 1257 insertions(+) create mode 100644 paddle/phi/kernels/baddbmm_grad_kernel.h create mode 100644 paddle/phi/kernels/baddbmm_kernel.h create mode 100644 paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/baddbmm_kernel.cc create mode 100644 paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/baddbmm_kernel.cu create mode 100644 paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/baddbmm_kernel_impl.h diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index e2566301a45b2..d04cfcd59bd57 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -150,6 +150,96 @@ void AddmmInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void BaddbmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float beta, + float alpha, + MetaTensor* out) { + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto ndim_input = input_dims.size(); + auto ndim_x = x_dims.size(); + auto ndim_y = y_dims.size(); + + VLOG(3) << "baddbmm operator input.shape=" << input_dims + << " x.shape=" << x_dims << " y.shape=" << y_dims << " beta=" << beta + << " alpha=" << alpha << " ndim_input=" << ndim_input + << " ndim_x=" << ndim_x << " ndim_y=" << ndim_y; + + PADDLE_ENFORCE_NE( + product(input_dims), + 0, + errors::PreconditionNotMet("The Input variable 'input' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(x_dims), + 0, + errors::PreconditionNotMet("The Input variable 'x' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(y_dims), + 0, + errors::PreconditionNotMet("The Input variable 'y' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + // dim check + PADDLE_ENFORCE_EQ( + ndim_input, + 3, + errors::InvalidArgument("The input tensor input's dimension must be 3. " + "But received input's dimension = [%d].", + ndim_input)); + PADDLE_ENFORCE_EQ( + ndim_x, + 3, + errors::InvalidArgument("The input tensor x's dimension must be 3. " + "But received x's dimension = [%d].", + ndim_x)); + PADDLE_ENFORCE_EQ( + ndim_y, + 3, + errors::InvalidArgument("The input tensor y's dimension must be 3. " + "But received y's dimension = [%d].", + ndim_y)); + + PADDLE_ENFORCE_EQ( + input_dims[0], + x_dims[0], + errors::InvalidArgument( + "The batch size of input and x must be the same. 
" + "But received input batch size = [%d], x batch size = [%d].", + input_dims[0], + x_dims[0])); + PADDLE_ENFORCE_EQ( + x_dims[2], + y_dims[1], + errors::InvalidArgument("The second dimension of x must be equal to the " + "first dimension of y. " + "But received x's second dimension = [%d], y's " + "first dimension = [%d].", + x_dims[2], + y_dims[1])); + + std::vector output_dims; + output_dims.push_back(x_dims[0]); + output_dims.push_back(x_dims[1]); + output_dims.push_back(y_dims[2]); + + out->set_dims(common::make_ddim(output_dims)); + out->share_lod(input); + out->set_dtype(input.dtype()); +} + void AffineChannelInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index b05e64b426212..ee7f484c5d203 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -48,6 +48,13 @@ void AddmmInferMeta(const MetaTensor& input, float alpha, MetaTensor* out); +void BaddbmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float beta, + float alpha, + MetaTensor* out); + void AffineChannelInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/kernels/baddbmm_grad_kernel.h b/paddle/phi/kernels/baddbmm_grad_kernel.h new file mode 100644 index 0000000000000..34d237e379cb6 --- /dev/null +++ b/paddle/phi/kernels/baddbmm_grad_kernel.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BaddbmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/baddbmm_kernel.h b/paddle/phi/kernels/baddbmm_kernel.h new file mode 100644 index 0000000000000..a10a89d4beb44 --- /dev/null +++ b/paddle/phi/kernels/baddbmm_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BaddbmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float beta, + float alpha, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc b/paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc new file mode 100644 index 0000000000000..cfd36c73c9cb8 --- /dev/null +++ b/paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/baddbmm_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + baddbmm_grad, CPU, ALL_LAYOUT, phi::BaddbmmGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/baddbmm_kernel.cc b/paddle/phi/kernels/cpu/baddbmm_kernel.cc new file mode 100644 index 0000000000000..7b616c924bf95 --- /dev/null +++ b/paddle/phi/kernels/cpu/baddbmm_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/baddbmm_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/baddbmm_kernel_impl.h" + +PD_REGISTER_KERNEL( + baddbmm, CPU, ALL_LAYOUT, phi::BaddbmmKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 2f27682247bdc..5e42107312138 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -96,6 +96,18 @@ class Blas { T beta, T* C) const; + template + void GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T* A, + const T* B, + U beta, + T* C) const; + template void GEMM(bool transA, bool transB, @@ -292,6 +304,21 @@ class Blas { int64_t strideA, int64_t strideB) const; + template + void BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T* A, + const T* B, + U beta, + T* C, + int batchCount, + int64_t strideA, + int64_t strideB) const; + template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 5fcc3f12f2b35..63cef454a492e 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -1183,6 +1183,152 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T *A, + const T *B, + U beta, + T *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + T t_alpha = static_cast(alpha); + T t_beta = static_cast(beta); + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, + cuTransB, + cuTransA, + N, + M, + K, + &t_alpha, + B, + CUDA_R_32F, + ldb, + A, + CUDA_R_32F, + lda, + &t_beta, + C, + CUDA_R_32F, + N); + } else { +#endif // CUDA_VERSION >= 8000 + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &t_alpha, + B, + ldb, + A, + lda, + &t_beta, + C, + N); + }); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const phi::dtype::float16 *A, + const phi::dtype::float16 *B, + float beta, + phi::dtype::float16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 53, + common::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = alpha; + float h_beta = beta; + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16F, + ldb, + A, + CUDA_R_16F, + lda, + &h_beta, + C, + CUDA_R_16F, + N, + CUDA_R_32F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + h_B, + ldb, + h_A, + lda, + &h_beta, + h_C, + N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1781,6 +1927,125 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 9010 } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T *A, + const T *B, + U beta, + T *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + +#if CUDA_VERSION >= 9010 + if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || + std::is_same::value) { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + VLOG(4) << "use_half_precision_compute_type: " + << FLAGS_gemm_use_half_precision_compute_type; + + auto fp = std::is_same::value ? 
CUDA_R_32F : CUDA_R_16F; +#if CUDA_VERSION >= 11000 + auto compute_type = CUBLAS_COMPUTE_32F; +#else + auto compute_type = CUDA_R_32F; +#endif + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + void *a = static_cast(&h_alpha); + void *b = static_cast(&h_beta); + // set ComputeType as CUDA_R_32F for fp16, for better accuracy + if (FLAGS_gemm_use_half_precision_compute_type == true && + std::is_same::value) { + a = static_cast(&alpha); + b = static_cast(&beta); +#if CUDA_VERSION >= 11000 + compute_type = CUBLAS_COMPUTE_16F; +#else + compute_type = CUDA_R_16F; +#endif + } + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + a, + B, + fp, + ldb, + strideB, + A, + fp, + lda, + strideA, + b, + C, + fp, + ldc, + strideC, + batchCount, + compute_type, + algo)); + }); + } else { +#endif // CUDA_VERSION >= 9010 + + T h_alpha = static_cast(alpha); + T h_beta = static_cast(beta); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + ldb, + strideB, + A, + lda, + strideA, + &h_beta, + C, + ldc, + strideC, + batchCount); + }); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 +} + template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 098e37105a45d..4c6bcca8d63d7 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1080,6 +1080,37 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, ldc); } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T *A, + const T *B, + U beta, + T *C) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + CBlas::GEMM(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + template <> template void Blas::GEMM(bool transA, @@ -1410,6 +1441,66 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T *A, + const T *B, + U beta, + T *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + PADDLE_ENFORCE_NOT_NULL( + A, common::errors::InvalidArgument("Pointer A should not be null.")); + PADDLE_ENFORCE_NOT_NULL( + B, common::errors::InvalidArgument("Pointer B should not be null.")); + PADDLE_ENFORCE_NOT_NULL( + C, common::errors::InvalidArgument("Pointer C should not be null.")); +#ifdef PADDLE_WITH_MKLML + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + + CBlas::GEMM_BATCH(CblasRowMajor, + &transA, + &transB, + &M, + &N, + &K, + &alpha, + a_array.data(), + &lda, + b_array.data(), + &ldb, + &beta, + c_array.data(), + &ldc, + 1 /* group_count */, + &batchCount); +#else + for (int k = 0; k < batchCount; ++k) { + auto *Ak = &A[k * strideA]; + auto *Bk = &B[k * strideB]; + auto *Ck = &C[k * M * N]; + this->template GEMM(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck); + } +#endif +} + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, diff --git a/paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu b/paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu new file mode 100644 index 0000000000000..5dcf03c7458ad --- /dev/null +++ b/paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/baddbmm_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(baddbmm_grad, + GPU, + ALL_LAYOUT, + phi::BaddbmmGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/baddbmm_kernel.cu b/paddle/phi/kernels/gpu/baddbmm_kernel.cu new file mode 100644 index 0000000000000..0e41074119eee --- /dev/null +++ b/paddle/phi/kernels/gpu/baddbmm_kernel.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/baddbmm_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/baddbmm_kernel_impl.h" + +PD_REGISTER_KERNEL(baddbmm, + GPU, + ALL_LAYOUT, + phi::BaddbmmKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h new file mode 100644 index 0000000000000..0f40f8172e3f7 --- /dev/null +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -0,0 +1,248 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include "glog/logging.h" + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/baddbmm_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { + +template +struct BCopyOrScaleFunctor { + BCopyOrScaleFunctor(const float scale, const T* x, T* output, int64_t numel) + : scale_(scale), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + using MPType = typename phi::dtype::MPTypeTrait::Type; + const MPType mp_scale = static_cast(scale_); + const MPType mp_x = static_cast(x_[idx]); + output_[idx] = static_cast(mp_scale * mp_x); + } + + private: + const float scale_; + const T* x_; + T* output_; + int64_t numel_; +}; + +template +using PhiEigenTensor = EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; +using Array3 = Eigen::DSizes; + +template +void BaddbmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + bool is_float16_or_bfloat16 = false; + if (std::is_same::value || + std::is_same::value) { + is_float16_or_bfloat16 = true; + } + + auto in_dims = input.dims(); + int total_elems = 0; + + VLOG(3) << "alpha: " << alpha << " beta: " << beta; + + if (input_grad != nullptr) { + input_grad->set_lod(out_grad.lod()); + } + if (x_grad != nullptr) { + x_grad->set_lod(x.lod()); + } + if (y_grad != nullptr) { + y_grad->set_lod(y.lod()); + } + + auto blas = funcs::GetBlas(dev_ctx); + auto mt_blas = funcs::GetBlas(dev_ctx); + if (input_grad) { + dev_ctx.template Alloc(input_grad); + total_elems = in_dims[0] * in_dims[1] * in_dims[2]; + auto& place = *dev_ctx.eigen_device(); + auto eigen_dout = PhiEigenTensor::From(out_grad); + auto eigen_dinput = PhiEigenTensor::From(*input_grad); + + bool batch_compress = in_dims[0] != out_grad.dims()[0]; + bool row_compress = in_dims[1] != out_grad.dims()[1]; + bool col_compress = in_dims[2] != out_grad.dims()[2]; + auto eigen_dinput_shape = Array3( + input_grad->dims()[0], input_grad->dims()[1], input_grad->dims()[2]); + + if (batch_compress && row_compress && col_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum().eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum() + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (batch_compress && row_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array2(0, 1)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + 
.sum(Array2(0, 1)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (batch_compress && col_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array2(0, 2)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array2(0, 2)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (row_compress && col_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array2(1, 2)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array2(1, 2)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (batch_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(0)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (row_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(1)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (col_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(2)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(2)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else { + // The VCOPY does not support the float16, bfloat16 + if (!is_float16_or_bfloat16) { + mt_blas.VCOPY( + total_elems, out_grad.data(), input_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + BCopyOrScaleFunctor functor( + 1, out_grad.data(), input_grad->data(), total_elems); + for_range(functor); + } + } + + // The SCAL does not support the float16, bfloat16 + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, beta, input_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + BCopyOrScaleFunctor functor( + beta, input_grad->data(), input_grad->data(), total_elems); + for_range(functor); + } + } + if (x_grad) { + dev_ctx.template Alloc(x_grad); + total_elems = x.dims()[0] * x.dims()[1] * x.dims()[2]; + // x_grad = out_grad * y'. x_grad: B x M x K, out_grad : B x M x N, y : B x + // K x N + for (int i = 0; i < x.dims()[0]; ++i) { + auto out_grad_slice = out_grad.Slice(i, i + 1); + auto y_slice = y.Slice(i, i + 1); + auto x_grad_slice = x_grad->Slice(i, i + 1); + blas.MatMul(out_grad_slice, false, y_slice, true, &x_grad_slice); + } + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, alpha, x_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + BCopyOrScaleFunctor functor( + alpha, x_grad->data(), x_grad->data(), total_elems); + for_range(functor); + } + } + if (y_grad) { + dev_ctx.template Alloc(y_grad); + total_elems = y.dims()[0] * y.dims()[1] * y.dims()[2]; + // y_grad = x' * out_grad. 
y_grad: B x K x N, out_grad : B x M x N, x : B x + // M x K + for (int i = 0; i < x.dims()[0]; ++i) { + auto out_grad_slice = out_grad.Slice(i, i + 1); + auto x_slice = x.Slice(i, i + 1); + auto y_grad_slice = y_grad->Slice(i, i + 1); + blas.MatMul(x_slice, true, out_grad_slice, false, &y_grad_slice); + } + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, alpha, y_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + BCopyOrScaleFunctor functor( + alpha, y_grad->data(), y_grad->data(), total_elems); + for_range(functor); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h new file mode 100644 index 0000000000000..f788be0d10b1f --- /dev/null +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -0,0 +1,179 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "glog/logging.h" + +#include "paddle/phi/kernels/baddbmm_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +namespace phi { + +template +using PhiEigenTensor = EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; +using Array3 = Eigen::DSizes; + +template +void BaddbmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float beta, + float alpha, + DenseTensor* out) { + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + DenseTensor input_3d(input); + if (input.dims().size() == 2) { + input_dims = {1, input.dims()[0], input.dims()[1]}; + input_3d.Resize(input_dims); + } + + // broadcast mode check + if (x_dims[0] != input_dims[0]) { + PADDLE_ENFORCE_EQ(input_dims[0], + 1, + errors::InvalidArgument( + "When x_dims[0] is not equal with input_dims[0], " + "input_dims[0] must be 1 but got %s", + input_dims[0])); + PADDLE_ENFORCE_EQ(y_dims[2] == input_dims[2] || input_dims[2] == 1, + true, + errors::InvalidArgument( + "The input tensor shape mismatch, input shape=[%s], " + "x shape=[%s], y shape=[%s]", + input_dims, + x_dims, + y_dims)); + } + if (y_dims[2] != input_dims[2]) { + PADDLE_ENFORCE_EQ(input_dims[2], + 1, + errors::InvalidArgument( + "When y_dims[2] is not equal with input_dims[2], " + "input_dims[2] must be 1 but got %s", + input_dims[2])); + PADDLE_ENFORCE_EQ(x_dims[0] == input_dims[0] || input_dims[0] == 1, + true, + errors::InvalidArgument( + "The input tensor shape mismatch, input shape=[%s], " + "x shape=[%s], y shape=[%s]", + input_dims, + x_dims, + y_dims)); + } + PADDLE_ENFORCE_EQ( + x_dims[2], + y_dims[1], + errors::InvalidArgument( + "The input tensor X's width must be equal with matrix Y' height. 
" + "But received X's shape = [%s], Y's shape = [%s].", + x_dims[2], + y_dims[1])); + + dev_ctx.template Alloc(out); + auto blas = funcs::GetBlas(dev_ctx); + + // calc broadcast dim + Array3 bcast_dims; + bcast_dims[0] = x_dims[0] / input_dims[0]; + bcast_dims[1] = x_dims[1] / input_dims[1]; + bcast_dims[2] = y_dims[2] / input_dims[2]; + VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "," + << bcast_dims[2] << "]"; + + // broadcast using eigen + const DenseTensor& const_ref_input = input_3d; + auto eigen_input = PhiEigenTensor::From(const_ref_input); + auto eigen_out = PhiEigenTensor::From(*out); + auto& place = *dev_ctx.eigen_device(); + funcs::EigenBroadcast, T, 3>::Eval( + place, eigen_out, eigen_input, bcast_dims); + + // special case for float16 + if constexpr (std::is_same_v) { + float t_alpha = alpha; + float t_beta = beta; + if (x_dims[0] == 1) { + blas.GEMM(CblasNoTrans, + CblasNoTrans, + x_dims[1], + y_dims[2], + x_dims[2], + t_alpha, + x.data(), + y.data(), + t_beta, + out->data()); + } else { + blas.BatchedGEMM(CblasNoTrans, + CblasNoTrans, + x_dims[1], + y_dims[2], + x_dims[2], + t_alpha, + x.data(), + y.data(), + t_beta, + out->data(), + x_dims[0], + x_dims[1] * x_dims[2], + x_dims[2] * y_dims[2]); + } + } else { + T t_alpha = static_cast(alpha); + T t_beta = static_cast(beta); + if (x_dims[0] == 1) { + blas.GEMM(CblasNoTrans, + CblasNoTrans, + x_dims[1], + y_dims[2], + x_dims[2], + t_alpha, + x.data(), + y.data(), + t_beta, + out->data()); + } else { + blas.BatchedGEMM(CblasNoTrans, + CblasNoTrans, + x_dims[1], + y_dims[2], + x_dims[2], + t_alpha, + x.data(), + y.data(), + t_beta, + out->data(), + x_dims[0], + x_dims[1] * x_dims[2], + x_dims[2] * y_dims[2]); + // x_dims[2] == y_dims[1] + } + } +} + +} // namespace phi diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml index dae96ff02fffe..aa09c21f77fb9 100644 --- a/paddle/phi/ops/yaml/backward.yaml +++ b/paddle/phi/ops/yaml/backward.yaml @@ -223,6 +223,16 @@ func : atanh_grad inplace : (out_grad -> x_grad) +- backward_op : baddbmm_grad + forward : baddbmm (Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0) -> Tensor(out) + args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha, float beta) + output : Tensor(input_grad), Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [input, x, y] + kernel : + func : baddbmm_grad + - backward_op : batch_fc_grad forward : batch_fc (Tensor input, Tensor w, Tensor bias) -> Tensor(out) args : (Tensor input, Tensor w, Tensor bias, Tensor out_grad) diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 2818ec6d89343..6ac560334b5a2 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -490,6 +490,18 @@ inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) traits : paddle::dialect::ForwardOnlyTrait +- op : baddbmm + args : (Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0) + output : Tensor(out) + infer_meta : + func : BaddbmmInferMeta + kernel : + func : baddbmm + data_type : x + inplace: (input -> out) + backward : baddbmm_grad + # interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : barrier args : (Tensor x, int ring_id=0) output : Tensor(out) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 
7cac26bda3790..b6659a5b583cb 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -386,6 +386,8 @@ atan_, atanh, atanh_, + baddbmm, + baddbmm_, bitwise_left_shift, bitwise_left_shift_, bitwise_right_shift, @@ -805,6 +807,8 @@ 'raw', 'addmm', 'addmm_', + 'baddbmm', + 'baddbmm_', 'allclose', 'isclose', 't', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index ff8eed1aa1715..6ff77cad6fcac 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -257,6 +257,8 @@ atan_, atanh, atanh_, + baddbmm, + baddbmm_, bitwise_left_shift, bitwise_left_shift_, bitwise_right_shift, @@ -612,6 +614,8 @@ 'erf', 'addmm', 'addmm_', + 'baddbmm', + 'baddbmm_', 'clip', 'clip_', 'trace', diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 426d7c979bc91..5e75c10eaff55 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2543,6 +2543,163 @@ def addmm_( return _C_ops.addmm_(input, x, y, beta, alpha) +def baddbmm( + input: Tensor, + x: Tensor, + y: Tensor, + beta: float = 1.0, + alpha: float = 1.0, + name: str | None = None, +) -> Tensor: + """ + **baddbmm** + + Perform batch matrix multiplication for input $x$ and $y$. + $input$ is added to the final result. + The equation is: + + .. math:: + Out = alpha * x * y + beta * input + + $Input$, $x$ and $y$ can carry the LoD (Level of Details) information, or not. But the output only shares the LoD information with input $input$. + + Args: + input (Tensor): The input Tensor to be added to the final result. + x (Tensor): The first input Tensor for batch matrix multiplication. + y (Tensor): The second input Tensor for batch matrix multiplication. + beta (float, optional): Coefficient of $input$, default is 1. + alpha (float, optional): Coefficient of $x*y$, default is 1. + name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The output Tensor of baddbmm. + + Examples: + .. code-block:: python + + >>> import paddle + + >>> x = paddle.ones([2, 2, 2]) + >>> y = paddle.ones([2, 2, 2]) + >>> input = paddle.ones([2, 2, 2]) + + >>> out = paddle.baddbmm(input=input, x=x, y=y, beta=0.5, alpha=5.0) + + >>> print(out) + Tensor(shape=[2, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[10.50000000, 10.50000000], + [10.50000000, 10.50000000]], + [[10.50000000, 10.50000000], + [10.50000000, 10.50000000]]]) + """ + input_shape = input.shape + x_shape = x.shape + y_shape = y.shape + if not len(x_shape) == len(y_shape) == 3: + raise ValueError( + f"The dimension of x, y should be 3 but receive x's shape: {x_shape}, y's shape: {y_shape}" + ) + if x_shape[2] != y_shape[1]: + raise ValueError( + f"The input Variable x's width must be equal with Variable y's height. But received x's shape = {x_shape}, y's shape = {y_shape}." + ) + + if len(input_shape) == 3: + if input_shape[0] != x_shape[0]: + raise ValueError( + f"The batch size of input must be equal to the batch size of x. 
But received input's batch size = {input_shape[0]}, x's batch size = {x_shape[0]}" + ) + if input_shape[1] != x_shape[1]: + if input_shape[1] != 1: + raise ValueError( + f"When x's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {input_shape[1]}" + ) + if input_shape[2] != y_shape[2]: + if input_shape[2] != 1: + raise ValueError( + f"When y's dimension[2] is not equal with input's dimension[2], input's dimension[2] must be 1 but got {input_shape[2]}" + ) + else: + raise ValueError( + f"The dimension of input should be 3 but received input's shape: {input_shape}" + ) + + if in_dynamic_or_pir_mode(): + return _C_ops.baddbmm(input, x, y, beta, alpha) + else: + inputs = {'Input': input, "X": x, "Y": y} + attrs = {'Alpha': alpha, 'Beta': beta} + + helper = LayerHelper("baddbmm", **locals()) + check_variable_and_dtype( + input, + 'Input', + ['float16', 'float32', 'float64', 'uint16'], + 'baddbmm', + ) + check_variable_and_dtype( + x, 'X', ['float16', 'float32', 'float64', 'uint16'], 'baddbmm' + ) + check_variable_and_dtype( + y, 'Y', ['float16', 'float32', 'float64', 'uint16'], 'baddbmm' + ) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="baddbmm", inputs=inputs, attrs=attrs, outputs={"Out": out} + ) + return out + + +@inplace_apis_in_dygraph_only +def baddbmm_( + input: Tensor, + x: Tensor, + y: Tensor, + beta: float = 1.0, + alpha: float = 1.0, + name: str | None = None, +) -> Tensor: + """ + Inplace version of ``baddbmm`` API, the output Tensor will be inplaced with input ``x``. + Please refer to :ref:`api_paddle_baddbmm`. + """ + input_shape = input.shape + x_shape = x.shape + y_shape = y.shape + if not len(x_shape) == len(y_shape) == 3: + raise ValueError( + f"The dimension of x, y should be 3 but receive x's shape: {x_shape}, y's shape: {y_shape}" + ) + if x_shape[2] != y_shape[1]: + raise ValueError( + f"The input Variable x's width must be equal with Variable y's height. But received x's shape = {x_shape}, y's shape = {y_shape}." + ) + + if len(input_shape) == 3: + if input_shape[0] != x_shape[0]: + raise ValueError( + f"The batch size of input must be equal to the batch size of x. 
But received input's batch size = {input_shape[0]}, x's batch size = {x_shape[0]}" + ) + if input_shape[1] != x_shape[1]: + if input_shape[1] != 1: + raise ValueError( + f"When x's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {input_shape[1]}" + ) + if input_shape[2] != y_shape[2]: + if input_shape[2] != 1: + raise ValueError( + f"When y's dimension[2] is not equal with input's dimension[2], input's dimension[2] must be 1 but got {input_shape[2]}" + ) + else: + raise ValueError( + f"The dimension of input should be 3 but received input's shape: {input_shape}" + ) + + if in_dynamic_mode(): + return _C_ops.baddbmm_(input, x, y, beta, alpha) + + def renorm(x: Tensor, p: float, axis: int, max_norm: float) -> Tensor: """ **renorm** From a0ac3ee60a0f789f480a6bbde2b5ef262a50286e Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Thu, 9 Jan 2025 21:17:04 +0800 Subject: [PATCH 11/12] added file for test modified: paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py modified: paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc modified: paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h modified: paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h modified: paddle/phi/api/ext/tensor_compat.h modified: paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h modified: paddle/phi/ops/yaml/op_compat.yaml new file: test/legacy_test/test_baddbmm_op.py --- .../decomp_interface_gen_op_list.py | 1 + .../multiary_infer_sym.cc | 61 ++++++++++++ .../infer_symbolic_shape/multiary_infer_sym.h | 2 + .../decomp_rule/decomp_rule/composite.h | 22 +++++ paddle/phi/api/ext/tensor_compat.h | 1 + .../kernels/impl/baddbmm_grad_kernel_impl.h | 10 ++ paddle/phi/ops/yaml/op_compat.yaml | 11 +++ test/legacy_test/test_baddbmm_op.py | 93 +++++++++++++++++++ 8 files changed, 201 insertions(+) create mode 100644 test/legacy_test/test_baddbmm_op.py diff --git a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py index 6b9d3c67d3a7d..6ce7cda1e5576 100644 --- a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py +++ b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py @@ -32,6 +32,7 @@ "add_n", "addmm", "any", + "baddbmm", "bce_loss", "bmm", "diag", diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 3f2c8397a6141..5c5a36ae186c3 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -163,6 +163,67 @@ bool Addmm_OpInferSymbolicShape(pir::Operation *op, return AddmmOpInferSymbolicShape(op, infer_context); } +bool BaddbmmOpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + const auto &input_shape = + infer_context->GetShapeOrDataForValue(op->operand_source(0)); + const auto &x_shape = + infer_context->GetShapeOrDataForValue(op->operand_source(1)); + const auto &y_shape = + infer_context->GetShapeOrDataForValue(op->operand_source(2)); + + auto ndim_input = input_shape.shape().size(); + auto ndim_x = x_shape.shape().size(); + auto ndim_y = y_shape.shape().size(); + + PADDLE_ENFORCE_EQ(ndim_input, + 3, + common::errors::InvalidArgument( + "The input tensor input's 
dimension must be 3. " + "But received input's dimension = [%d].", + ndim_input)); + PADDLE_ENFORCE_EQ(ndim_x, + 3, + common::errors::InvalidArgument( + "The input tensor x's dimension must be 3. " + "But received x's dimension = [%d].", + ndim_x)); + PADDLE_ENFORCE_EQ(ndim_y, + 3, + common::errors::InvalidArgument( + "The input tensor y's dimension must be 3. " + "But received y's dimension = [%d].", + ndim_y)); + + std::vector output_shape; + output_shape.push_back(x_shape.shape()[0]); // batch size + output_shape.push_back(x_shape.shape()[1]); + output_shape.push_back(y_shape.shape()[2]); + + infer_context->SetShapeOrDataForValue( + op->result(0), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(output_shape)}); + + infer_context->AddEqualCstr(x_shape.shape()[0], + y_shape.shape()[0]); // batch size + infer_context->AddEqualCstr(x_shape.shape()[2], y_shape.shape()[1]); + + infer_context->AddBroadcastableCstr(input_shape.shape()[0], + x_shape.shape()[0]); // batch size + infer_context->AddBroadcastableCstr(input_shape.shape()[1], + x_shape.shape()[1]); + infer_context->AddBroadcastableCstr(input_shape.shape()[2], + y_shape.shape()[2]); + + return true; +} + +bool Baddbmm_OpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + return BaddbmmOpInferSymbolicShape(op, infer_context); +} + bool AucOpInferSymbolicShape(pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { const auto &predict_shape = diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h index c8dd6a2b048ce..d647580d0242c 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h @@ -21,6 +21,8 @@ namespace paddle::dialect { OP_DECLARE_INFER_SYMBOLIC_SHAPE(Accuracy) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Addmm) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Addmm_) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Baddbmm) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Baddbmm_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(AddN) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Auc) OP_DECLARE_INFER_SYMBOLIC_SHAPE(AssignPos) diff --git a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h index 357b2434c1f67..020f4741ef63a 100644 --- a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h +++ b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h @@ -1409,6 +1409,28 @@ Tensor addmm_decomp(const Tensor& input, full_scalar(beta, input.dtype()) * input; } +template +Tensor baddbmm_decomp(const Tensor& input, + const Tensor& x, + const Tensor& y, + const float beta, + const float alpha) { + int batch_size = x.shape()[0]; + std::vector batch_results; + + for (int i = 0; i < batch_size; ++i) { + Tensor x_batch = get_slice(x, i); + Tensor y_batch = get_slice(y, i); + Tensor result = matmul(x_batch, y_batch); + batch_results.push_back(result); + } + + Tensor x_y_mat = concat(batch_results); + + return full_scalar(alpha, x_y_mat.dtype()) * x_y_mat + + full_scalar(beta, input.dtype()) * input; +} + template Tensor eye_decomp(const paddle::Scalar& num_rows, const paddle::Scalar& num_columns, diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h index b1a140da46a89..6c09a4f7451c1 100644 --- a/paddle/phi/api/ext/tensor_compat.h +++ b/paddle/phi/api/ext/tensor_compat.h @@ -35,6 +35,7 @@ 
using experimental::asinh; using experimental::atan; using experimental::atan2; using experimental::atanh; +using experimental::baddbmm; using experimental::bernoulli; using experimental::ceil; using experimental::cholesky; diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h index 0f40f8172e3f7..238f50c555194 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -212,6 +212,12 @@ void BaddbmmGradKernel(const Context& dev_ctx, auto out_grad_slice = out_grad.Slice(i, i + 1); auto y_slice = y.Slice(i, i + 1); auto x_grad_slice = x_grad->Slice(i, i + 1); + auto x_grad_dims = x_grad_slice.dims(); + + x_grad_slice.Resize({x_grad_dims[1], x_grad_dims[2]}); + y_slice.Resize({y_slice.dims()[1], y_slice.dims()[2]}); + out_grad_slice.Resize( + {out_grad_slice.dims()[1], out_grad_slice.dims()[2]}); blas.MatMul(out_grad_slice, false, y_slice, true, &x_grad_slice); } if (!is_float16_or_bfloat16) { @@ -232,6 +238,10 @@ void BaddbmmGradKernel(const Context& dev_ctx, auto out_grad_slice = out_grad.Slice(i, i + 1); auto x_slice = x.Slice(i, i + 1); auto y_grad_slice = y_grad->Slice(i, i + 1); + out_grad_slice.Resize( + {out_grad_slice.dims()[1], out_grad_slice.dims()[2]}); + x_slice.Resize({x_slice.dims()[1], x_slice.dims()[2]}); + y_grad_slice.Resize({y_grad_slice.dims()[1], y_grad_slice.dims()[2]}); blas.MatMul(x_slice, true, out_grad_slice, false, &y_grad_slice); } if (!is_float16_or_bfloat16) { diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index 89a91aa264893..d3707eef9a361 100755 --- a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -372,6 +372,17 @@ outputs : {auc : AUC, stat_pos_out : StatPosOut, stat_neg_out : StatNegOut} +- op : baddbmm + backward : baddbmm_grad + inputs : + {input : Input, x : X, y : Y} + outputs : + out : Out + attrs : + {alpha : Alpha, beta : Beta} + extra : + attrs : [bool use_mkldnn = false] + - op : barrier inputs : {x : X} diff --git a/test/legacy_test/test_baddbmm_op.py b/test/legacy_test/test_baddbmm_op.py new file mode 100644 index 0000000000000..0be1d00141c09 --- /dev/null +++ b/test/legacy_test/test_baddbmm_op.py @@ -0,0 +1,93 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language go verning permissions and +# limitations under the License. 
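As a reading aid for the test below, a reference sketch of the arithmetic the operator is compared against; baddbmm_ref is a hypothetical helper, the forward rule follows the docstring (Out = beta * input + alpha * bmm(x, y)), and the gradient comments restate what baddbmm_grad_kernel_impl.h computes per batch:

    import numpy as np

    def baddbmm_ref(inp, x, y, beta=1.0, alpha=1.0):
        # out = beta * input + alpha * batched matmul(x, y); np.matmul on 3-D
        # arrays multiplies batch by batch, matching the kernel's contract.
        return beta * inp + alpha * np.matmul(x, y)

    # Backward, per batch (restating the grad kernel, including the alpha/beta scaling):
    #   d_input = beta  * d_out                        (summed over any broadcast axis of input)
    #   d_x     = alpha * d_out @ y.transpose(0, 2, 1)
    #   d_y     = alpha * x.transpose(0, 2, 1) @ d_out

The low-precision kernels registered above are touched by the FP16 subclass at the end of this file.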
+ +import unittest + +import numpy as np +from op_test import OpTest + +import paddle + + +class TestBaddBmmOp(OpTest): + # test basic + def setUp(self): + self.op_type = "baddbmm" + self.prim_op_type = "comp" + self.python_api = paddle.baddbmm + self.public_python_api = paddle.baddbmm + self.init_dtype_type() + self.inputs = { + 'Input': np.random.random((10, 20, 15)).astype(self.dtype), + 'X': np.random.random((10, 20, 10)).astype(self.dtype), + 'Y': np.random.random((10, 10, 15)).astype(self.dtype), + } + self.outputs = { + 'Out': self.inputs['Input'] + + np.matmul(self.inputs['X'], self.inputs['Y']) + } + + def init_dtype_type(self): + self.dtype = np.float64 + + def test_check_output(self): + self.check_output(check_pir=True, check_prim_pir=True) + + def test_check_grad_normal(self): + self.check_grad( + ['Input', 'X', 'Y'], + 'Out', + check_pir=True, + check_prim_pir=True, + ) + + def test_check_grad_x(self): + self.check_grad( + ['X'], + 'Out', + no_grad_set=None, + check_pir=True, + check_prim_pir=True, + ) + + def test_check_grad_y(self): + self.check_grad( + ['Y'], + 'Out', + no_grad_set=None, + check_pir=True, + check_prim_pir=True, + ) + + def test_check_grad_input(self): + self.check_grad( + ['Input'], + 'Out', + no_grad_set=None, + check_pir=True, + check_prim_pir=True, + ) + + +class TestBaddBmmFP16Op(TestBaddBmmOp): + def init_dtype_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-2) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From fdf6322c54eb7fada52895cbfe7f8be0a4d5169f Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Fri, 10 Jan 2025 14:35:32 +0800 Subject: [PATCH 12/12] added bloat16 case modified: ../paddle/phi/kernels/funcs/blas/blas_impl.cu.h modified: ../paddle/phi/kernels/impl/baddbmm_kernel_impl.h --- paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 140 ++++++++++++++++++ paddle/phi/kernels/impl/baddbmm_kernel_impl.h | 10 +- 2 files changed, 148 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 63cef454a492e..096ab5bd857ed 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -1398,6 +1398,75 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 11000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + float beta, + phi::dtype::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + common::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = alpha; + float h_beta = beta; + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + N, + CUDA_R_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(common::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template <> inline void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -2117,6 +2186,77 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 11000 } +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + float beta, + phi::dtype::bfloat16 *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + + float h_alpha = alpha; + float h_beta = beta; + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + strideB, + A, + CUDA_R_16BF, + lda, + strideA, + &h_beta, + C, + CUDA_R_16BF, + ldc, + strideC, + batchCount, + CUBLAS_COMPUTE_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(common::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, diff --git a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h index f788be0d10b1f..83fedc4bcbc62 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -18,10 +18,12 @@ limitations under the License. */ #include "glog/logging.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + namespace phi { template , T, 3>::Eval( place, eigen_out, eigen_input, bcast_dims); - // special case for float16 - if constexpr (std::is_same_v) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // special case for MPType + if constexpr (std::is_same_v) { + VLOG(4) << "Function: baddbmm, Type of T: " << typeid(T).name(); + VLOG(4) << "Function: baddbmm, Type of MPType: " << typeid(MPType).name(); float t_alpha = alpha; float t_beta = beta; if (x_dims[0] == 1) {