From e2299f976e569ff7be381dcd30885d36717f9714 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Thu, 5 Dec 2024 06:36:32 +0000 Subject: [PATCH 01/12] added reset peak value initialization modified: paddle/fluid/pybind/pybind.cc modified: paddle/phi/core/memory/stats.cc modified: paddle/phi/core/memory/stats.h modified: python/paddle/device/cuda/__init__.py --- paddle/fluid/pybind/pybind.cc | 2 ++ paddle/phi/core/memory/stats.cc | 8 ++++++++ paddle/phi/core/memory/stats.h | 16 ++++++++++++++++ python/paddle/device/cuda/__init__.py | 12 ++++++++++++ 4 files changed, 38 insertions(+) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 43f39912c8f9f..e75cd51f68749 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2424,6 +2424,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("device_memory_stat_current_value", memory::DeviceMemoryStatCurrentValue); m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); + m.def("device_memory_stat_reset_peak_value", memory::DeviceMemoryStatResetPeakValue); + m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); m.def( diff --git a/paddle/phi/core/memory/stats.cc b/paddle/phi/core/memory/stats.cc index c3b93d41d604d..0b59ffade6f87 100644 --- a/paddle/phi/core/memory/stats.cc +++ b/paddle/phi/core/memory/stats.cc @@ -58,6 +58,10 @@ class StatRegistry { GetStat(stat_type, dev_id)->Update(increment); } + void ResetPeakValue(const std::string& stat_type, int dev_id) { + GetStat(stat_type, dev_id)->ResetPeakValue(); + } + void Register(const std::string& stat_type, int dev_id, StatBase* stat) { std::lock_guard lock_guard(stat_map_lock_); stat_map_[GetStatKey(stat_type, dev_id)] = stat; @@ -93,6 +97,10 @@ void DeviceMemoryStatUpdate(const std::string& stat_type, StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment); } +void DeviceMemoryStatResetPeakValue(const std::string& stat_type,int dev_id) { + StatRegistry::GetInstance()->ResetPeakValue("Device" + stat_type, dev_id); +} + int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type, dev_id); diff --git a/paddle/phi/core/memory/stats.h b/paddle/phi/core/memory/stats.h index acacd21a97f96..9ede49bd699a5 100644 --- a/paddle/phi/core/memory/stats.h +++ b/paddle/phi/core/memory/stats.h @@ -57,6 +57,7 @@ class StatBase { virtual int64_t GetCurrentValue() = 0; virtual int64_t GetPeakValue() = 0; virtual void Update(int64_t) = 0; + virtual void ResetPeakValue() = 0; private: DISABLE_COPY_AND_ASSIGN(StatBase); @@ -112,6 +113,20 @@ class Stat : public StatBase { } } + void ResetPeakValue() override { + int64_t current_value = GetCurrentValue(); + peak_value_.store(current_value, std::memory_order_relaxed); + + std::unordered_map> thread_local_stats = + ThreadDataRegistry::GetInstance().GetAllThreadDataByRef(); + + for (auto pair : thread_local_stats) { + pair.second.get().peak = pair.second.get().current; + } + + VLOG(8) << "Reset peak_value to current_value = " << current_value; + } + private: Stat() {} ~Stat() {} @@ -128,6 +143,7 @@ int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id); void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, int64_t increment); +void DeviceMemoryStatResetPeakValue(const std::string& stat_type,int dev_id); int64_t HostMemoryStatCurrentValue(const std::string& 
stat_type, int dev_id); int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 5f31c1df1fe58..8146612f66d66 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -48,6 +48,7 @@ 'get_device_properties', 'get_device_name', 'get_device_capability', + 'reset_peak_memory_stats', ] @@ -297,6 +298,17 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int: device_id = extract_cuda_device_id(device, op_name=name) return core.device_memory_stat_peak_value("Reserved", device_id) +def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: + + name = "paddle.device.cuda.reset_peak_memory_stats" + if not core.is_compiled_with_cuda(): + raise ValueError( + f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." + ) + device_id = extract_cuda_device_id(device, op_name=name) + core.device_memory_stat_reset_peak_value("Allocated", device_id) + core.device_memory_stat_reset_peak_value("Reserved", device_id) + # return core.device_memory_stat_peak_value("Reserved", device_id) def memory_allocated(device: _CudaPlaceLike | None = None) -> int: ''' From dc30348189e19311f680165d8fb53722a2c819f1 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sat, 7 Dec 2024 17:58:25 +0800 Subject: [PATCH 02/12] added comments modified: paddle/fluid/pybind/pybind.cc modified: python/paddle/device/cuda/__init__.py --- paddle/fluid/pybind/pybind.cc | 9 +++ python/paddle/device/cuda/__init__.py | 106 +++++++++++++++++++++++++- 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e75cd51f68749..2f06a0a1a1420 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2426,6 +2426,15 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); m.def("device_memory_stat_reset_peak_value", memory::DeviceMemoryStatResetPeakValue); + m.def("device_memory_stats", [](int dev_id) { + py::dict dict; + dict["memory.allocated.current"] = memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); + dict["memory.allocated.peak"] = memory::DeviceMemoryStatPeakValue("Allocated", dev_id); + dict["memory.reserved.current"] = memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); + dict["memory.reserved.peak"] = memory::DeviceMemoryStatPeakValue("Reserved", dev_id); + return dict; + }); + m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); m.def( diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 8146612f66d66..abddf53720f06 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -24,6 +24,8 @@ from .streams import Event, Stream +import logging + if TYPE_CHECKING: from paddle import CUDAPlace from paddle.base.libpaddle import _gpuDeviceProperties @@ -49,6 +51,8 @@ 'get_device_name', 'get_device_capability', 'reset_peak_memory_stats', + 'reset_max_memory_allocated', + 'memory_stats', ] @@ -299,7 +303,29 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int: return core.device_memory_stat_peak_value("Reserved", device_id) def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: + ''' + Reset the peak values of GPU memory allocated and reserved to the current values. + + This function resets the "peak" stats tracked by the CUDA memory allocator for both + memory allocated and memory reserved. After calling this function, the peak values + will be set to the current memory usage values. + Args: + device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> paddle.device.cuda.reset_peak_memory_stats(paddle.CUDAPlace(0)) + >>> paddle.device.cuda.reset_peak_memory_stats(0) + >>> paddle.device.cuda.reset_peak_memory_stats("gpu:0") + ''' name = "paddle.device.cuda.reset_peak_memory_stats" if not core.is_compiled_with_cuda(): raise ValueError( @@ -308,7 +334,85 @@ def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: device_id = extract_cuda_device_id(device, op_name=name) core.device_memory_stat_reset_peak_value("Allocated", device_id) core.device_memory_stat_reset_peak_value("Reserved", device_id) - # return core.device_memory_stat_peak_value("Reserved", device_id) + + +def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: + ''' + Reset the peak values of GPU memory allocated to the current values. (Reset the GPU memory reserved + as well) + + Warning: + This function calls `paddle.device.cuda.reset_peak_memory_stats`, which resets both allocated + and reserved peak memory stats. + + Args: + device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Examples: + .. 
code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> paddle.device.cuda.reset_max_memory_allocated(paddle.CUDAPlace(0)) + >>> paddle.device.cuda.reset_max_memory_allocated(0) + >>> paddle.device.cuda.reset_max_memory_allocated("gpu:0") + ''' + + name = "paddle.device.cuda.reset_max_memory_allocated" + if not core.is_compiled_with_cuda(): + raise ValueError( + f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." + ) + logging.warning( + "paddle.device.cuda.reset_max_memory_allocated calls paddle.device.cuda.reset_peak_memory_stats, " + "which resets both allocated and reserved peak memory stats." + ) + reset_peak_memory_stats(device) + + +def memory_stats(device: _CudaPlaceLike | None = None) -> dict: + ''' + Return the memory stats of the given device. + + Args: + device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Return: + dict: The current memory stats of the given device, including the current size of GPU memory that is allocated to tensor, + the peak size of GPU memory that is allocated to tensor, the current size of GPU memory that is held by the allocator and + the peak size of GPU memory that is held by the allocator, in bytes. + + memory_stats["memory.allocated.current"] + memory_stats["memory.allocated.peak"] + memory_stats["memory.reserved.current"] + memory_stats["memory.reserved.peak"] + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> memory_stats = paddle.device.cuda.memory_stats(paddle.CUDAPlace(0)) + >>> memory_stats = paddle.device.cuda.memory_stats(0) + >>> memory_stats = paddle.device.cuda.memory_stats("gpu:0") + ''' + + name = "paddle.device.cuda.memory_stats" + if not core.is_compiled_with_cuda(): + raise ValueError( + f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." + ) + device_id = extract_cuda_device_id(device, op_name=name) + return core.device_memory_stats(device_id) + def memory_allocated(device: _CudaPlaceLike | None = None) -> int: ''' From d081dde72cd48b67b3caa87563e5735b272d9840 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sat, 7 Dec 2024 22:01:31 +0800 Subject: [PATCH 03/12] added cpp tests modified: paddle/fluid/pybind/pybind.cc modified: paddle/phi/core/memory/stats.cc modified: paddle/phi/core/memory/stats.h modified: test/cpp/fluid/memory/stats_test.cc --- paddle/fluid/pybind/pybind.cc | 1 + paddle/phi/core/memory/stats.cc | 4 ++++ paddle/phi/core/memory/stats.h | 5 +++++ test/cpp/fluid/memory/stats_test.cc | 28 +++++++++++++++++++++++++--- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2f06a0a1a1420..3be2c5f10c430 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2437,6 +2437,7 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); + m.def("host_memory_stat_reset_peak_value", memory::HostMemoryStatResetPeakValue); m.def( "run_cmd", [](const std::string &cmd, diff --git a/paddle/phi/core/memory/stats.cc b/paddle/phi/core/memory/stats.cc index 0b59ffade6f87..7b75ea9021997 100644 --- a/paddle/phi/core/memory/stats.cc +++ b/paddle/phi/core/memory/stats.cc @@ -116,6 +116,10 @@ void HostMemoryStatUpdate(const std::string& stat_type, StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment); } +void HostMemoryStatResetPeakValue(const std::string& stat_type,int dev_id) { + StatRegistry::GetInstance()->ResetPeakValue("Host" + stat_type, dev_id); +} + void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name) { if (FLAGS_log_memory_stats && phi::is_gpu_place(place)) { VLOG(0) << "After launching op_name: " << op_name << ", " diff --git a/paddle/phi/core/memory/stats.h b/paddle/phi/core/memory/stats.h index 9ede49bd699a5..f59cc0336a802 100644 --- a/paddle/phi/core/memory/stats.h +++ b/paddle/phi/core/memory/stats.h @@ -150,6 +150,7 @@ int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, int64_t increment); +void HostMemoryStatResetPeakValue(const std::string& stat_type,int dev_id); void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name); @@ -195,6 +196,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name); DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue) #define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \ DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment) +#define DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, ResetPeakValue) #define HOST_MEMORY_STAT_FUNC(item, id, func, ...) 
\ [&] { \ @@ -215,6 +218,8 @@ void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name); HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue) #define HOST_MEMORY_STAT_UPDATE(item, id, increment) \ HOST_MEMORY_STAT_FUNC(item, id, Update, increment) +#define HOST_MEMORY_STAT_RESET_PEAK_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, ResetPeakValue) #define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \ struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {} diff --git a/test/cpp/fluid/memory/stats_test.cc b/test/cpp/fluid/memory/stats_test.cc index 6aefe5b77b1ff..52b40780d9b7b 100644 --- a/test/cpp/fluid/memory/stats_test.cc +++ b/test/cpp/fluid/memory/stats_test.cc @@ -32,15 +32,18 @@ class StatsTest : public ::testing::Test { void SetFunc( std::function update_func, std::function current_value_func, - std::function peak_value_func) { + std::function peak_value_func, + std::function reset_peak_value_func) { update_func_ = update_func; current_value_func_ = current_value_func; peak_value_func_ = peak_value_func; + reset_peak_value_func_ = reset_peak_value_func; } void RunTests() { MultiThreadReadWriteTest(); PeakValueTest(); + ResetPeakValueTest(); } private: @@ -94,6 +97,16 @@ class StatsTest : public ::testing::Test { EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value); } + void ResetPeakValueTest() { + for(long unsigned int i = 0; i < datas_.size(); ++i) { + update_func_(stat_type_, 0, datas_[i]); + + EXPECT_GE(peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0)); + reset_peak_value_func_(stat_type_, 0); + EXPECT_EQ(peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0)); + } + } + std::string stat_type_; std::vector datas_{ 543149808935355, 634698327471328, 706215795436611, 577939367795333, @@ -125,13 +138,15 @@ class StatsTest : public ::testing::Test { std::function update_func_; std::function current_value_func_; std::function peak_value_func_; + std::function reset_peak_value_func_; }; TEST_F(StatsTest, DeviceAllocatedTest) { SetStatType("Allocated"); SetFunc(DeviceMemoryStatUpdate, DeviceMemoryStatCurrentValue, - DeviceMemoryStatPeakValue); + DeviceMemoryStatPeakValue, + DeviceMemoryStatResetPeakValue); RunTests(); } @@ -146,6 +161,9 @@ TEST_F(StatsTest, DeviceReservedMacroTest) { }, [](const std::string stat_type, int id) { return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_RESET_PEAK_VALUE(Reserved, id); }); RunTests(); } @@ -161,6 +179,9 @@ TEST_F(StatsTest, HostAllocatedMacroTest) { }, [](const std::string stat_type, int id) { return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_RESET_PEAK_VALUE(Allocated, id); }); RunTests(); } @@ -169,7 +190,8 @@ TEST_F(StatsTest, HostReservedTest) { SetStatType("Reserved"); SetFunc(HostMemoryStatUpdate, HostMemoryStatCurrentValue, - HostMemoryStatPeakValue); + HostMemoryStatPeakValue, + HostMemoryStatResetPeakValue); RunTests(); } From 35a12c83edf5ec31813c545c3f5f8c2cd6813936 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sat, 7 Dec 2024 22:15:38 +0800 Subject: [PATCH 04/12] added python tests new file: test/legacy_test/test_cuda_memory_stats.py new file: test/legacy_test/test_cuda_reset_peak_memory_stats.py --- test/legacy_test/test_cuda_memory_stats.py | 83 +++++++++++++++++ .../test_cuda_reset_peak_memory_stats.py | 88 +++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 test/legacy_test/test_cuda_memory_stats.py 
create mode 100644 test/legacy_test/test_cuda_reset_peak_memory_stats.py diff --git a/test/legacy_test/test_cuda_memory_stats.py b/test/legacy_test/test_cuda_memory_stats.py new file mode 100644 index 0000000000000..b87cc3a38e24a --- /dev/null +++ b/test/legacy_test/test_cuda_memory_stats.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +from paddle.base import core +from paddle.device.cuda import ( + device_count, + max_memory_allocated, + memory_allocated, + max_memory_reserved, + memory_reserved, + memory_stats, +) + + +class TestMemoryStats(unittest.TestCase): + def func_test_memory_stats(self, device=None): + if core.is_compiled_with_cuda(): + alloc_time = 100 + max_alloc_size = 10000 + for i in range(alloc_time): + # first alloc + shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + tensor = paddle.zeros(shape) + del shape + del tensor + + # second alloc + shape = paddle.randint(low=0, high=max_alloc_size) + tensor = paddle.zeros(shape) + + memory_stats = paddle.device.cuda.memory_stats(device) + + self.assertEqual(memory_stats["memory.allocated.current"], memory_allocated(device)) + self.assertEqual(memory_stats["memory.allocated.peak"], max_memory_allocated(device)) + + self.assertEqual(memory_stats["memory.reserved.current"], memory_reserved(device)) + self.assertEqual(memory_stats["memory.reserved.peak"], max_memory_reserved(device)) + + del shape + del tensor + + def test_memory_stats_for_all_places(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + paddle.device.set_device("gpu:" + str(i)) + self.func_test_memory_stats(core.CUDAPlace(i)) + self.func_test_memory_stats(i) + self.func_test_memory_stats("gpu:" + str(i)) + + def test_memory_stats_exception(self): + if core.is_compiled_with_cuda(): + wrong_device = [ + core.CPUPlace(), + device_count() + 1, + -2, + 0.5, + "gpu1", + ] + for device in wrong_device: + with self.assertRaises(BaseException): # noqa: B017 + memory_stats(device) + else: + with self.assertRaises(ValueError): + memory_stats() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_cuda_reset_peak_memory_stats.py b/test/legacy_test/test_cuda_reset_peak_memory_stats.py new file mode 100644 index 0000000000000..b917e8c01062f --- /dev/null +++ b/test/legacy_test/test_cuda_reset_peak_memory_stats.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +from paddle.base import core +from paddle.device.cuda import ( + device_count, + max_memory_allocated, + memory_allocated, + max_memory_reserved, + memory_reserved, + reset_peak_memory_stats, +) + + +class TestResetPeakMemoryStats(unittest.TestCase): + def func_test_reset_peak_memory_stats(self, device=None): + if core.is_compiled_with_cuda(): + alloc_time = 100 + max_alloc_size = 10000 + for i in range(alloc_time): + # first alloc + shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + tensor = paddle.zeros(shape) + peak_memory_allocated_size_first = max_memory_allocated(device) + peak_memory_reserved_size_first = max_memory_reserved(device) + del shape + del tensor + + # second alloc + shape = paddle.randint(low=0, high=max_alloc_size) + tensor = paddle.zeros(shape) + + # reset peak memory stats + reset_peak_memory_stats(device) + + peak_memory_allocated_size_second = max_memory_allocated(device) + self.assertEqual(peak_memory_allocated_size_second, memory_allocated(device)) + self.assertLess(peak_memory_allocated_size_second, peak_memory_allocated_size_first) + + peak_memory_reserved_size_second = max_memory_reserved(device) + self.assertEqual(peak_memory_reserved_size_second, memory_reserved(device)) + self.assertLessEqual(peak_memory_reserved_size_second, peak_memory_reserved_size_first) + + del shape + del tensor + + def test_reset_peak_memory_stats_for_all_places(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + paddle.device.set_device("gpu:" + str(i)) + self.func_test_reset_peak_memory_stats(core.CUDAPlace(i)) + self.func_test_reset_peak_memory_stats(i) + self.func_test_reset_peak_memory_stats("gpu:" + str(i)) + + def test_reset_peak_memory_stats_exception(self): + if core.is_compiled_with_cuda(): + wrong_device = [ + core.CPUPlace(), + device_count() + 1, + -2, + 0.5, + "gpu1", + ] + for device in wrong_device: + with self.assertRaises(BaseException): # noqa: B017 + reset_peak_memory_stats(device) + else: + with self.assertRaises(ValueError): + reset_peak_memory_stats() + + +if __name__ == "__main__": + unittest.main() From cb5036c584519a1901b3b2ddb9c130dade1f430e Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sun, 8 Dec 2024 10:35:09 +0800 Subject: [PATCH 05/12] added a python test for reset_max_memory_allocated new file: test/legacy_test/test_cuda_reset_max_memory_allocated.py --- .../test_cuda_reset_max_memory_allocated.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 test/legacy_test/test_cuda_reset_max_memory_allocated.py diff --git a/test/legacy_test/test_cuda_reset_max_memory_allocated.py b/test/legacy_test/test_cuda_reset_max_memory_allocated.py new file mode 100644 index 0000000000000..fb444497cc569 --- /dev/null +++ b/test/legacy_test/test_cuda_reset_max_memory_allocated.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +from paddle.base import core +from paddle.device.cuda import ( + device_count, + max_memory_allocated, + memory_allocated, + max_memory_reserved, + memory_reserved, + reset_max_memory_allocated, +) + + +class TestResetMaxMemoryAllocated(unittest.TestCase): + def func_test_reset_max_memory_allocated(self, device=None): + if core.is_compiled_with_cuda(): + alloc_time = 100 + max_alloc_size = 10000 + for i in range(alloc_time): + # first alloc + shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + tensor = paddle.zeros(shape) + peak_memory_allocated_size_first = max_memory_allocated(device) + peak_memory_reserved_size_first = max_memory_reserved(device) + del shape + del tensor + + # second alloc + shape = paddle.randint(low=0, high=max_alloc_size) + tensor = paddle.zeros(shape) + + # reset peak memory stats + reset_max_memory_allocated(device) + + peak_memory_allocated_size_second = max_memory_allocated(device) + self.assertEqual(peak_memory_allocated_size_second, memory_allocated(device)) + self.assertLess(peak_memory_allocated_size_second, peak_memory_allocated_size_first) + + peak_memory_reserved_size_second = max_memory_reserved(device) + self.assertEqual(peak_memory_reserved_size_second, memory_reserved(device)) + self.assertLessEqual(peak_memory_reserved_size_second, peak_memory_reserved_size_first) + + del shape + del tensor + + def test_reset_max_memory_allocated_for_all_places(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + paddle.device.set_device("gpu:" + str(i)) + self.func_test_reset_max_memory_allocated(core.CUDAPlace(i)) + self.func_test_reset_max_memory_allocated(i) + self.func_test_reset_max_memory_allocated("gpu:" + str(i)) + + def test_reset_max_memory_allocated_exception(self): + if core.is_compiled_with_cuda(): + wrong_device = [ + core.CPUPlace(), + device_count() + 1, + -2, + 0.5, + "gpu1", + ] + for device in wrong_device: + with self.assertRaises(BaseException): # noqa: B017 + reset_max_memory_allocated(device) + else: + with self.assertRaises(ValueError): + reset_max_memory_allocated() + + +if __name__ == "__main__": + unittest.main() From 5d28856e3a0f3a8d83ad6ba2c7473af046d872e0 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Mon, 9 Dec 2024 21:48:01 +0800 Subject: [PATCH 06/12] formatted by pre-commit modified: python/paddle/device/cuda/__init__.py modified: test/legacy_test/test_cuda_memory_stats.py modified: test/legacy_test/test_cuda_reset_max_memory_allocated.py modified: test/legacy_test/test_cuda_reset_peak_memory_stats.py --- python/paddle/device/cuda/__init__.py | 10 +++---- test/legacy_test/test_cuda_memory_stats.py | 26 ++++++++++++++----- .../test_cuda_reset_max_memory_allocated.py | 26 ++++++++++++++----- .../test_cuda_reset_peak_memory_stats.py | 26 ++++++++++++++----- 4 files changed, 63 insertions(+), 25 deletions(-) diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index abddf53720f06..b3fdb470fe4d1 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. 
from __future__ import annotations +import logging from typing import TYPE_CHECKING, NoReturn, Union from typing_extensions import TypeAlias @@ -24,8 +25,6 @@ from .streams import Event, Stream -import logging - if TYPE_CHECKING: from paddle import CUDAPlace from paddle.base.libpaddle import _gpuDeviceProperties @@ -302,6 +301,7 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int: device_id = extract_cuda_device_id(device, op_name=name) return core.device_memory_stat_peak_value("Reserved", device_id) + def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: ''' Reset the peak values of GPU memory allocated and reserved to the current values. @@ -338,9 +338,9 @@ def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: ''' - Reset the peak values of GPU memory allocated to the current values. (Reset the GPU memory reserved + Reset the peak values of GPU memory allocated to the current values. (Reset the GPU memory reserved as well) - + Warning: This function calls `paddle.device.cuda.reset_peak_memory_stats`, which resets both allocated and reserved peak memory stats. @@ -361,7 +361,7 @@ def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: >>> paddle.device.cuda.reset_max_memory_allocated(0) >>> paddle.device.cuda.reset_max_memory_allocated("gpu:0") ''' - + name = "paddle.device.cuda.reset_max_memory_allocated" if not core.is_compiled_with_cuda(): raise ValueError( diff --git a/test/legacy_test/test_cuda_memory_stats.py b/test/legacy_test/test_cuda_memory_stats.py index b87cc3a38e24a..f209b62d0d5d7 100644 --- a/test/legacy_test/test_cuda_memory_stats.py +++ b/test/legacy_test/test_cuda_memory_stats.py @@ -19,8 +19,8 @@ from paddle.device.cuda import ( device_count, max_memory_allocated, - memory_allocated, max_memory_reserved, + memory_allocated, memory_reserved, memory_stats, ) @@ -33,7 +33,9 @@ def func_test_memory_stats(self, device=None): max_alloc_size = 10000 for i in range(alloc_time): # first alloc - shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + shape = paddle.randint( + low=max_alloc_size, high=max_alloc_size * 2 + ) tensor = paddle.zeros(shape) del shape del tensor @@ -44,11 +46,23 @@ def func_test_memory_stats(self, device=None): memory_stats = paddle.device.cuda.memory_stats(device) - self.assertEqual(memory_stats["memory.allocated.current"], memory_allocated(device)) - self.assertEqual(memory_stats["memory.allocated.peak"], max_memory_allocated(device)) + self.assertEqual( + memory_stats["memory.allocated.current"], + memory_allocated(device), + ) + self.assertEqual( + memory_stats["memory.allocated.peak"], + max_memory_allocated(device), + ) - self.assertEqual(memory_stats["memory.reserved.current"], memory_reserved(device)) - self.assertEqual(memory_stats["memory.reserved.peak"], max_memory_reserved(device)) + self.assertEqual( + memory_stats["memory.reserved.current"], + memory_reserved(device), + ) + self.assertEqual( + memory_stats["memory.reserved.peak"], + max_memory_reserved(device), + ) del shape del tensor diff --git a/test/legacy_test/test_cuda_reset_max_memory_allocated.py b/test/legacy_test/test_cuda_reset_max_memory_allocated.py index fb444497cc569..ee326e2728fe5 100644 --- a/test/legacy_test/test_cuda_reset_max_memory_allocated.py +++ b/test/legacy_test/test_cuda_reset_max_memory_allocated.py @@ -19,8 +19,8 @@ from paddle.device.cuda import ( device_count, max_memory_allocated, - 
memory_allocated, max_memory_reserved, + memory_allocated, memory_reserved, reset_max_memory_allocated, ) @@ -33,7 +33,9 @@ def func_test_reset_max_memory_allocated(self, device=None): max_alloc_size = 10000 for i in range(alloc_time): # first alloc - shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + shape = paddle.randint( + low=max_alloc_size, high=max_alloc_size * 2 + ) tensor = paddle.zeros(shape) peak_memory_allocated_size_first = max_memory_allocated(device) peak_memory_reserved_size_first = max_memory_reserved(device) @@ -43,17 +45,27 @@ def func_test_reset_max_memory_allocated(self, device=None): # second alloc shape = paddle.randint(low=0, high=max_alloc_size) tensor = paddle.zeros(shape) - + # reset peak memory stats reset_max_memory_allocated(device) peak_memory_allocated_size_second = max_memory_allocated(device) - self.assertEqual(peak_memory_allocated_size_second, memory_allocated(device)) - self.assertLess(peak_memory_allocated_size_second, peak_memory_allocated_size_first) + self.assertEqual( + peak_memory_allocated_size_second, memory_allocated(device) + ) + self.assertLess( + peak_memory_allocated_size_second, + peak_memory_allocated_size_first, + ) peak_memory_reserved_size_second = max_memory_reserved(device) - self.assertEqual(peak_memory_reserved_size_second, memory_reserved(device)) - self.assertLessEqual(peak_memory_reserved_size_second, peak_memory_reserved_size_first) + self.assertEqual( + peak_memory_reserved_size_second, memory_reserved(device) + ) + self.assertLessEqual( + peak_memory_reserved_size_second, + peak_memory_reserved_size_first, + ) del shape del tensor diff --git a/test/legacy_test/test_cuda_reset_peak_memory_stats.py b/test/legacy_test/test_cuda_reset_peak_memory_stats.py index b917e8c01062f..be66ef3e6bf23 100644 --- a/test/legacy_test/test_cuda_reset_peak_memory_stats.py +++ b/test/legacy_test/test_cuda_reset_peak_memory_stats.py @@ -19,8 +19,8 @@ from paddle.device.cuda import ( device_count, max_memory_allocated, - memory_allocated, max_memory_reserved, + memory_allocated, memory_reserved, reset_peak_memory_stats, ) @@ -33,7 +33,9 @@ def func_test_reset_peak_memory_stats(self, device=None): max_alloc_size = 10000 for i in range(alloc_time): # first alloc - shape = paddle.randint(low=max_alloc_size, high=max_alloc_size*2) + shape = paddle.randint( + low=max_alloc_size, high=max_alloc_size * 2 + ) tensor = paddle.zeros(shape) peak_memory_allocated_size_first = max_memory_allocated(device) peak_memory_reserved_size_first = max_memory_reserved(device) @@ -43,17 +45,27 @@ def func_test_reset_peak_memory_stats(self, device=None): # second alloc shape = paddle.randint(low=0, high=max_alloc_size) tensor = paddle.zeros(shape) - + # reset peak memory stats reset_peak_memory_stats(device) peak_memory_allocated_size_second = max_memory_allocated(device) - self.assertEqual(peak_memory_allocated_size_second, memory_allocated(device)) - self.assertLess(peak_memory_allocated_size_second, peak_memory_allocated_size_first) + self.assertEqual( + peak_memory_allocated_size_second, memory_allocated(device) + ) + self.assertLess( + peak_memory_allocated_size_second, + peak_memory_allocated_size_first, + ) peak_memory_reserved_size_second = max_memory_reserved(device) - self.assertEqual(peak_memory_reserved_size_second, memory_reserved(device)) - self.assertLessEqual(peak_memory_reserved_size_second, peak_memory_reserved_size_first) + self.assertEqual( + peak_memory_reserved_size_second, memory_reserved(device) + ) + self.assertLessEqual( + 
peak_memory_reserved_size_second, + peak_memory_reserved_size_first, + ) del shape del tensor From f6b3d84673f78e4b1f666ecd81684dea80e8d23e Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Mon, 9 Dec 2024 22:00:05 +0800 Subject: [PATCH 07/12] formatted by pre-commit (clang-format) modified: paddle/fluid/pybind/pybind.cc modified: paddle/phi/core/memory/stats.cc modified: paddle/phi/core/memory/stats.h modified: test/cpp/fluid/memory/stats_test.cc --- paddle/fluid/pybind/pybind.cc | 18 ++++++++++++------ paddle/phi/core/memory/stats.cc | 4 ++-- paddle/phi/core/memory/stats.h | 10 ++++++---- test/cpp/fluid/memory/stats_test.cc | 10 ++++++---- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3be2c5f10c430..cea69960b21d3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2424,20 +2424,26 @@ All parameter, weight, gradient are variables in Paddle. m.def("device_memory_stat_current_value", memory::DeviceMemoryStatCurrentValue); m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); - m.def("device_memory_stat_reset_peak_value", memory::DeviceMemoryStatResetPeakValue); + m.def("device_memory_stat_reset_peak_value", + memory::DeviceMemoryStatResetPeakValue); m.def("device_memory_stats", [](int dev_id) { py::dict dict; - dict["memory.allocated.current"] = memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); - dict["memory.allocated.peak"] = memory::DeviceMemoryStatPeakValue("Allocated", dev_id); - dict["memory.reserved.current"] = memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); - dict["memory.reserved.peak"] = memory::DeviceMemoryStatPeakValue("Reserved", dev_id); + dict["memory.allocated.current"] = + memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); + dict["memory.allocated.peak"] = + memory::DeviceMemoryStatPeakValue("Allocated", dev_id); + dict["memory.reserved.current"] = + memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); + dict["memory.reserved.peak"] = + memory::DeviceMemoryStatPeakValue("Reserved", dev_id); return dict; }); m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); - m.def("host_memory_stat_reset_peak_value", memory::HostMemoryStatResetPeakValue); + m.def("host_memory_stat_reset_peak_value", + memory::HostMemoryStatResetPeakValue); m.def( "run_cmd", [](const std::string &cmd, diff --git a/paddle/phi/core/memory/stats.cc b/paddle/phi/core/memory/stats.cc index 7b75ea9021997..a6cbf37bd4c9a 100644 --- a/paddle/phi/core/memory/stats.cc +++ b/paddle/phi/core/memory/stats.cc @@ -97,7 +97,7 @@ void DeviceMemoryStatUpdate(const std::string& stat_type, StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment); } -void DeviceMemoryStatResetPeakValue(const std::string& stat_type,int dev_id) { +void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) { StatRegistry::GetInstance()->ResetPeakValue("Device" + stat_type, dev_id); } @@ -116,7 +116,7 @@ void HostMemoryStatUpdate(const std::string& stat_type, StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment); } -void HostMemoryStatResetPeakValue(const std::string& stat_type,int dev_id) { +void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id) { StatRegistry::GetInstance()->ResetPeakValue("Host" + stat_type, dev_id); } diff --git a/paddle/phi/core/memory/stats.h b/paddle/phi/core/memory/stats.h index 
f59cc0336a802..e5b4f9d8ad771 100644 --- a/paddle/phi/core/memory/stats.h +++ b/paddle/phi/core/memory/stats.h @@ -117,8 +117,10 @@ class Stat : public StatBase { int64_t current_value = GetCurrentValue(); peak_value_.store(current_value, std::memory_order_relaxed); - std::unordered_map> thread_local_stats = - ThreadDataRegistry::GetInstance().GetAllThreadDataByRef(); + std::unordered_map> + thread_local_stats = + ThreadDataRegistry::GetInstance() + .GetAllThreadDataByRef(); for (auto pair : thread_local_stats) { pair.second.get().peak = pair.second.get().current; @@ -143,14 +145,14 @@ int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id); void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, int64_t increment); -void DeviceMemoryStatResetPeakValue(const std::string& stat_type,int dev_id); +void DeviceMemoryStatResetPeakValue(const std::string& stat_type, int dev_id); int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id); int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, int64_t increment); -void HostMemoryStatResetPeakValue(const std::string& stat_type,int dev_id); +void HostMemoryStatResetPeakValue(const std::string& stat_type, int dev_id); void LogDeviceMemoryStats(const phi::Place& place, const std::string& op_name); diff --git a/test/cpp/fluid/memory/stats_test.cc b/test/cpp/fluid/memory/stats_test.cc index 52b40780d9b7b..51e6bd13636a6 100644 --- a/test/cpp/fluid/memory/stats_test.cc +++ b/test/cpp/fluid/memory/stats_test.cc @@ -98,12 +98,14 @@ class StatsTest : public ::testing::Test { } void ResetPeakValueTest() { - for(long unsigned int i = 0; i < datas_.size(); ++i) { - update_func_(stat_type_, 0, datas_[i]); + for (int64_t data : datas_) { + update_func_(stat_type_, 0, data); - EXPECT_GE(peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0)); + EXPECT_GE(peak_value_func_(stat_type_, 0), + current_value_func_(stat_type_, 0)); reset_peak_value_func_(stat_type_, 0); - EXPECT_EQ(peak_value_func_(stat_type_, 0), current_value_func_(stat_type_, 0)); + EXPECT_EQ(peak_value_func_(stat_type_, 0), + current_value_func_(stat_type_, 0)); } } From b6ea9be25ded38424f8f1810b494bcdb43d19f60 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Fri, 13 Dec 2024 21:24:44 +0800 Subject: [PATCH 08/12] added reset max memory reserved function modified: python/paddle/device/cuda/__init__.py modified: test/legacy_test/test_cuda_reset_max_memory_allocated.py new file: test/legacy_test/test_cuda_reset_max_memory_reserved.py --- python/paddle/device/cuda/__init__.py | 48 +++++++--- .../test_cuda_reset_max_memory_allocated.py | 13 +-- .../test_cuda_reset_max_memory_reserved.py | 89 +++++++++++++++++++ 3 files changed, 126 insertions(+), 24 deletions(-) create mode 100644 test/legacy_test/test_cuda_reset_max_memory_reserved.py diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index b3fdb470fe4d1..c2cfc11a495b0 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. 
from __future__ import annotations -import logging from typing import TYPE_CHECKING, NoReturn, Union from typing_extensions import TypeAlias @@ -51,6 +50,7 @@ 'get_device_capability', 'reset_peak_memory_stats', 'reset_max_memory_allocated', + 'reset_max_memory_reserved', 'memory_stats', ] @@ -338,12 +338,8 @@ def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: ''' - Reset the peak values of GPU memory allocated to the current values. (Reset the GPU memory reserved - as well) - - Warning: - This function calls `paddle.device.cuda.reset_peak_memory_stats`, which resets both allocated - and reserved peak memory stats. + Reset the peak values of GPU memory allocated to the current values. + Allocated memory refers to the GPU memory that is currently allocated to tensors. Args: device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or @@ -367,11 +363,39 @@ def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: raise ValueError( f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) - logging.warning( - "paddle.device.cuda.reset_max_memory_allocated calls paddle.device.cuda.reset_peak_memory_stats, " - "which resets both allocated and reserved peak memory stats." - ) - reset_peak_memory_stats(device) + device_id = extract_cuda_device_id(device, op_name=name) + core.device_memory_stat_reset_peak_value("Allocated", device_id) + + +def reset_max_memory_reserved(device: _CudaPlaceLike | None = None) -> None: + ''' + Reset the peak values of GPU memory reserved to the current values. + Reserved memory refers to the GPU memory that is held by the allocator of the given device. + + Args: + device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + + >>> paddle.device.cuda.reset_max_memory_reserved(paddle.CUDAPlace(0)) + >>> paddle.device.cuda.reset_max_memory_reserved(0) + >>> paddle.device.cuda.reset_max_memory_reserved("gpu:0") + ''' + + name = "paddle.device.cuda.reset_max_memory_reserved" + if not core.is_compiled_with_cuda(): + raise ValueError( + f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." 
+ ) + device_id = extract_cuda_device_id(device, op_name=name) + core.device_memory_stat_reset_peak_value("Reserved", device_id) def memory_stats(device: _CudaPlaceLike | None = None) -> dict: diff --git a/test/legacy_test/test_cuda_reset_max_memory_allocated.py b/test/legacy_test/test_cuda_reset_max_memory_allocated.py index ee326e2728fe5..ae99b6056dd70 100644 --- a/test/legacy_test/test_cuda_reset_max_memory_allocated.py +++ b/test/legacy_test/test_cuda_reset_max_memory_allocated.py @@ -19,9 +19,7 @@ from paddle.device.cuda import ( device_count, max_memory_allocated, - max_memory_reserved, memory_allocated, - memory_reserved, reset_max_memory_allocated, ) @@ -38,7 +36,7 @@ def func_test_reset_max_memory_allocated(self, device=None): ) tensor = paddle.zeros(shape) peak_memory_allocated_size_first = max_memory_allocated(device) - peak_memory_reserved_size_first = max_memory_reserved(device) + del shape del tensor @@ -58,15 +56,6 @@ def func_test_reset_max_memory_allocated(self, device=None): peak_memory_allocated_size_first, ) - peak_memory_reserved_size_second = max_memory_reserved(device) - self.assertEqual( - peak_memory_reserved_size_second, memory_reserved(device) - ) - self.assertLessEqual( - peak_memory_reserved_size_second, - peak_memory_reserved_size_first, - ) - del shape del tensor diff --git a/test/legacy_test/test_cuda_reset_max_memory_reserved.py b/test/legacy_test/test_cuda_reset_max_memory_reserved.py new file mode 100644 index 0000000000000..51d9470599c34 --- /dev/null +++ b/test/legacy_test/test_cuda_reset_max_memory_reserved.py @@ -0,0 +1,89 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle +from paddle.base import core +from paddle.device.cuda import ( + device_count, + max_memory_reserved, + memory_reserved, + reset_max_memory_reserved, +) + + +class TestResetMaxMemoryReserved(unittest.TestCase): + def func_test_reset_max_memory_reserved(self, device=None): + if core.is_compiled_with_cuda(): + alloc_time = 100 + max_alloc_size = 10000 + for i in range(alloc_time): + # first alloc + shape = paddle.randint( + low=max_alloc_size, high=max_alloc_size * 2 + ) + tensor = paddle.zeros(shape) + peak_memory_reserved_size_first = max_memory_reserved(device) + + del shape + del tensor + + # second alloc + shape = paddle.randint(low=0, high=max_alloc_size) + tensor = paddle.zeros(shape) + + # reset peak memory stats + reset_max_memory_reserved(device) + + peak_memory_reserved_size_second = max_memory_reserved(device) + self.assertEqual( + peak_memory_reserved_size_second, memory_reserved(device) + ) + self.assertLessEqual( + peak_memory_reserved_size_second, + peak_memory_reserved_size_first, + ) + + del shape + del tensor + + def test_reset_max_memory_reserved_for_all_places(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + paddle.device.set_device("gpu:" + str(i)) + self.func_test_reset_max_memory_reserved(core.CUDAPlace(i)) + self.func_test_reset_max_memory_reserved(i) + self.func_test_reset_max_memory_reserved("gpu:" + str(i)) + + def test_reset_max_memory_reserved_exception(self): + if core.is_compiled_with_cuda(): + wrong_device = [ + core.CPUPlace(), + device_count() + 1, + -2, + 0.5, + "gpu1", + ] + for device in wrong_device: + with self.assertRaises(BaseException): # noqa: B017 + reset_max_memory_reserved(device) + else: + with self.assertRaises(ValueError): + reset_max_memory_reserved() + + +if __name__ == "__main__": + unittest.main() From caad36840c27e0ab96e39f3776ff9c5af1be6c0e Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sun, 15 Dec 2024 15:07:33 +0800 Subject: [PATCH 09/12] deleted memory stats and reset peak memory stats modified: paddle/fluid/pybind/pybind.cc modified: python/paddle/device/cuda/__init__.py deleted: test/legacy_test/test_cuda_memory_stats.py deleted: test/legacy_test/test_cuda_reset_peak_memory_stats.py --- paddle/fluid/pybind/pybind.cc | 13 --- python/paddle/device/cuda/__init__.py | 82 +------------- test/legacy_test/test_cuda_memory_stats.py | 97 ----------------- .../test_cuda_reset_peak_memory_stats.py | 100 ------------------ 4 files changed, 2 insertions(+), 290 deletions(-) delete mode 100644 test/legacy_test/test_cuda_memory_stats.py delete mode 100644 test/legacy_test/test_cuda_reset_peak_memory_stats.py diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cea69960b21d3..1aa83b8fd26e9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2427,19 +2427,6 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("device_memory_stat_reset_peak_value", memory::DeviceMemoryStatResetPeakValue); - m.def("device_memory_stats", [](int dev_id) { - py::dict dict; - dict["memory.allocated.current"] = - memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); - dict["memory.allocated.peak"] = - memory::DeviceMemoryStatPeakValue("Allocated", dev_id); - dict["memory.reserved.current"] = - memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); - dict["memory.reserved.peak"] = - memory::DeviceMemoryStatPeakValue("Reserved", dev_id); - return dict; - }); - m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue); m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue); m.def("host_memory_stat_reset_peak_value", diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index c2cfc11a495b0..538c52c7837a5 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -48,10 +48,8 @@ 'get_device_properties', 'get_device_name', 'get_device_capability', - 'reset_peak_memory_stats', 'reset_max_memory_allocated', 'reset_max_memory_reserved', - 'memory_stats', ] @@ -302,44 +300,9 @@ def max_memory_reserved(device: _CudaPlaceLike | None = None) -> int: return core.device_memory_stat_peak_value("Reserved", device_id) -def reset_peak_memory_stats(device: _CudaPlaceLike | None = None) -> None: - ''' - Reset the peak values of GPU memory allocated and reserved to the current values. - - This function resets the "peak" stats tracked by the CUDA memory allocator for both - memory allocated and memory reserved. After calling this function, the peak values - will be set to the current memory usage values. - - Args: - device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. - Default: None. - - Examples: - .. code-block:: python - - >>> # doctest: +REQUIRES(env:GPU) - >>> import paddle - >>> paddle.device.set_device('gpu') - - >>> paddle.device.cuda.reset_peak_memory_stats(paddle.CUDAPlace(0)) - >>> paddle.device.cuda.reset_peak_memory_stats(0) - >>> paddle.device.cuda.reset_peak_memory_stats("gpu:0") - ''' - name = "paddle.device.cuda.reset_peak_memory_stats" - if not core.is_compiled_with_cuda(): - raise ValueError( - f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." - ) - device_id = extract_cuda_device_id(device, op_name=name) - core.device_memory_stat_reset_peak_value("Allocated", device_id) - core.device_memory_stat_reset_peak_value("Reserved", device_id) - - def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: ''' - Reset the peak values of GPU memory allocated to the current values. - Allocated memory refers to the GPU memory that is currently allocated to tensors. + Reset the peak size of GPU memory that is allocated to tensor of the given device. Args: device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or @@ -369,8 +332,7 @@ def reset_max_memory_allocated(device: _CudaPlaceLike | None = None) -> None: def reset_max_memory_reserved(device: _CudaPlaceLike | None = None) -> None: ''' - Reset the peak values of GPU memory reserved to the current values. - Reserved memory refers to the GPU memory that is held by the allocator of the given device. + Reset the peak size of GPU memory that is held by the allocator of the given device. 
Args: device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or @@ -398,46 +360,6 @@ def reset_max_memory_reserved(device: _CudaPlaceLike | None = None) -> None: core.device_memory_stat_reset_peak_value("Reserved", device_id) -def memory_stats(device: _CudaPlaceLike | None = None) -> dict: - ''' - Return the memory stats of the given device. - - Args: - device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. - Default: None. - - Return: - dict: The current memory stats of the given device, including the current size of GPU memory that is allocated to tensor, - the peak size of GPU memory that is allocated to tensor, the current size of GPU memory that is held by the allocator and - the peak size of GPU memory that is held by the allocator, in bytes. - - memory_stats["memory.allocated.current"] - memory_stats["memory.allocated.peak"] - memory_stats["memory.reserved.current"] - memory_stats["memory.reserved.peak"] - - Examples: - .. code-block:: python - - >>> # doctest: +REQUIRES(env:GPU) - >>> import paddle - >>> paddle.device.set_device('gpu') - - >>> memory_stats = paddle.device.cuda.memory_stats(paddle.CUDAPlace(0)) - >>> memory_stats = paddle.device.cuda.memory_stats(0) - >>> memory_stats = paddle.device.cuda.memory_stats("gpu:0") - ''' - - name = "paddle.device.cuda.memory_stats" - if not core.is_compiled_with_cuda(): - raise ValueError( - f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." - ) - device_id = extract_cuda_device_id(device, op_name=name) - return core.device_memory_stats(device_id) - - def memory_allocated(device: _CudaPlaceLike | None = None) -> int: ''' Return the current size of gpu memory that is allocated to tensor of the given device. diff --git a/test/legacy_test/test_cuda_memory_stats.py b/test/legacy_test/test_cuda_memory_stats.py deleted file mode 100644 index f209b62d0d5d7..0000000000000 --- a/test/legacy_test/test_cuda_memory_stats.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import paddle -from paddle.base import core -from paddle.device.cuda import ( - device_count, - max_memory_allocated, - max_memory_reserved, - memory_allocated, - memory_reserved, - memory_stats, -) - - -class TestMemoryStats(unittest.TestCase): - def func_test_memory_stats(self, device=None): - if core.is_compiled_with_cuda(): - alloc_time = 100 - max_alloc_size = 10000 - for i in range(alloc_time): - # first alloc - shape = paddle.randint( - low=max_alloc_size, high=max_alloc_size * 2 - ) - tensor = paddle.zeros(shape) - del shape - del tensor - - # second alloc - shape = paddle.randint(low=0, high=max_alloc_size) - tensor = paddle.zeros(shape) - - memory_stats = paddle.device.cuda.memory_stats(device) - - self.assertEqual( - memory_stats["memory.allocated.current"], - memory_allocated(device), - ) - self.assertEqual( - memory_stats["memory.allocated.peak"], - max_memory_allocated(device), - ) - - self.assertEqual( - memory_stats["memory.reserved.current"], - memory_reserved(device), - ) - self.assertEqual( - memory_stats["memory.reserved.peak"], - max_memory_reserved(device), - ) - - del shape - del tensor - - def test_memory_stats_for_all_places(self): - if core.is_compiled_with_cuda(): - gpu_num = device_count() - for i in range(gpu_num): - paddle.device.set_device("gpu:" + str(i)) - self.func_test_memory_stats(core.CUDAPlace(i)) - self.func_test_memory_stats(i) - self.func_test_memory_stats("gpu:" + str(i)) - - def test_memory_stats_exception(self): - if core.is_compiled_with_cuda(): - wrong_device = [ - core.CPUPlace(), - device_count() + 1, - -2, - 0.5, - "gpu1", - ] - for device in wrong_device: - with self.assertRaises(BaseException): # noqa: B017 - memory_stats(device) - else: - with self.assertRaises(ValueError): - memory_stats() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_cuda_reset_peak_memory_stats.py b/test/legacy_test/test_cuda_reset_peak_memory_stats.py deleted file mode 100644 index be66ef3e6bf23..0000000000000 --- a/test/legacy_test/test_cuda_reset_peak_memory_stats.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import paddle -from paddle.base import core -from paddle.device.cuda import ( - device_count, - max_memory_allocated, - max_memory_reserved, - memory_allocated, - memory_reserved, - reset_peak_memory_stats, -) - - -class TestResetPeakMemoryStats(unittest.TestCase): - def func_test_reset_peak_memory_stats(self, device=None): - if core.is_compiled_with_cuda(): - alloc_time = 100 - max_alloc_size = 10000 - for i in range(alloc_time): - # first alloc - shape = paddle.randint( - low=max_alloc_size, high=max_alloc_size * 2 - ) - tensor = paddle.zeros(shape) - peak_memory_allocated_size_first = max_memory_allocated(device) - peak_memory_reserved_size_first = max_memory_reserved(device) - del shape - del tensor - - # second alloc - shape = paddle.randint(low=0, high=max_alloc_size) - tensor = paddle.zeros(shape) - - # reset peak memory stats - reset_peak_memory_stats(device) - - peak_memory_allocated_size_second = max_memory_allocated(device) - self.assertEqual( - peak_memory_allocated_size_second, memory_allocated(device) - ) - self.assertLess( - peak_memory_allocated_size_second, - peak_memory_allocated_size_first, - ) - - peak_memory_reserved_size_second = max_memory_reserved(device) - self.assertEqual( - peak_memory_reserved_size_second, memory_reserved(device) - ) - self.assertLessEqual( - peak_memory_reserved_size_second, - peak_memory_reserved_size_first, - ) - - del shape - del tensor - - def test_reset_peak_memory_stats_for_all_places(self): - if core.is_compiled_with_cuda(): - gpu_num = device_count() - for i in range(gpu_num): - paddle.device.set_device("gpu:" + str(i)) - self.func_test_reset_peak_memory_stats(core.CUDAPlace(i)) - self.func_test_reset_peak_memory_stats(i) - self.func_test_reset_peak_memory_stats("gpu:" + str(i)) - - def test_reset_peak_memory_stats_exception(self): - if core.is_compiled_with_cuda(): - wrong_device = [ - core.CPUPlace(), - device_count() + 1, - -2, - 0.5, - "gpu1", - ] - for device in wrong_device: - with self.assertRaises(BaseException): # noqa: B017 - reset_peak_memory_stats(device) - else: - with self.assertRaises(ValueError): - reset_peak_memory_stats() - - -if __name__ == "__main__": - unittest.main() From 323e7b234fc5eb7ae7c505b6d1a7820fb031f4c2 Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Sun, 29 Dec 2024 21:04:21 +0800 Subject: [PATCH 10/12] formatted baddbmm version modified: paddle/phi/infermeta/ternary.cc modified: paddle/phi/infermeta/ternary.h new file: paddle/phi/kernels/baddbmm_grad_kernel.h new file: paddle/phi/kernels/baddbmm_kernel.h new file: paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc new file: paddle/phi/kernels/cpu/baddbmm_kernel.cc modified: paddle/phi/kernels/funcs/blas/blas.h modified: paddle/phi/kernels/funcs/blas/blas_impl.cu.h modified: paddle/phi/kernels/funcs/blas/blas_impl.h new file: paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu new file: paddle/phi/kernels/gpu/baddbmm_kernel.cu new file: paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h new file: paddle/phi/kernels/impl/baddbmm_kernel_impl.h modified: paddle/phi/ops/yaml/backward.yaml modified: paddle/phi/ops/yaml/ops.yaml modified: python/paddle/__init__.py modified: python/paddle/tensor/__init__.py modified: python/paddle/tensor/math.py --- paddle/phi/infermeta/ternary.cc | 90 ++++++ paddle/phi/infermeta/ternary.h | 7 + paddle/phi/kernels/baddbmm_grad_kernel.h | 33 +++ paddle/phi/kernels/baddbmm_kernel.h | 30 ++ paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc | 22 ++ paddle/phi/kernels/cpu/baddbmm_kernel.cc | 22 ++ 
paddle/phi/kernels/funcs/blas/blas.h | 27 ++ paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 265 ++++++++++++++++++ paddle/phi/kernels/funcs/blas/blas_impl.h | 91 ++++++ paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu | 28 ++ paddle/phi/kernels/gpu/baddbmm_kernel.cu | 28 ++ .../kernels/impl/baddbmm_grad_kernel_impl.h | 248 ++++++++++++++++ paddle/phi/kernels/impl/baddbmm_kernel_impl.h | 179 ++++++++++++ paddle/phi/ops/yaml/backward.yaml | 10 + paddle/phi/ops/yaml/ops.yaml | 12 + python/paddle/__init__.py | 4 + python/paddle/tensor/__init__.py | 4 + python/paddle/tensor/math.py | 157 +++++++++++ 18 files changed, 1257 insertions(+) create mode 100644 paddle/phi/kernels/baddbmm_grad_kernel.h create mode 100644 paddle/phi/kernels/baddbmm_kernel.h create mode 100644 paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/baddbmm_kernel.cc create mode 100644 paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/baddbmm_kernel.cu create mode 100644 paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/baddbmm_kernel_impl.h diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index e2566301a45b2..d04cfcd59bd57 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -150,6 +150,96 @@ void AddmmInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void BaddbmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float beta, + float alpha, + MetaTensor* out) { + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto ndim_input = input_dims.size(); + auto ndim_x = x_dims.size(); + auto ndim_y = y_dims.size(); + + VLOG(3) << "baddbmm operator input.shape=" << input_dims + << " x.shape=" << x_dims << " y.shape=" << y_dims << " beta=" << beta + << " alpha=" << alpha << " ndim_input=" << ndim_input + << " ndim_x=" << ndim_x << " ndim_y=" << ndim_y; + + PADDLE_ENFORCE_NE( + product(input_dims), + 0, + errors::PreconditionNotMet("The Input variable 'input' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(x_dims), + 0, + errors::PreconditionNotMet("The Input variable 'x' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(y_dims), + 0, + errors::PreconditionNotMet("The Input variable 'y' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + // dim check + PADDLE_ENFORCE_EQ( + ndim_input, + 3, + errors::InvalidArgument("The input tensor input's dimension must be 3. " + "But received input's dimension = [%d].", + ndim_input)); + PADDLE_ENFORCE_EQ( + ndim_x, + 3, + errors::InvalidArgument("The input tensor x's dimension must be 3. " + "But received x's dimension = [%d].", + ndim_x)); + PADDLE_ENFORCE_EQ( + ndim_y, + 3, + errors::InvalidArgument("The input tensor y's dimension must be 3. " + "But received y's dimension = [%d].", + ndim_y)); + + PADDLE_ENFORCE_EQ( + input_dims[0], + x_dims[0], + errors::InvalidArgument( + "The batch size of input and x must be the same. 
" + "But received input batch size = [%d], x batch size = [%d].", + input_dims[0], + x_dims[0])); + PADDLE_ENFORCE_EQ( + x_dims[2], + y_dims[1], + errors::InvalidArgument("The second dimension of x must be equal to the " + "first dimension of y. " + "But received x's second dimension = [%d], y's " + "first dimension = [%d].", + x_dims[2], + y_dims[1])); + + std::vector output_dims; + output_dims.push_back(x_dims[0]); + output_dims.push_back(x_dims[1]); + output_dims.push_back(y_dims[2]); + + out->set_dims(common::make_ddim(output_dims)); + out->share_lod(input); + out->set_dtype(input.dtype()); +} + void AffineChannelInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index b05e64b426212..ee7f484c5d203 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -48,6 +48,13 @@ void AddmmInferMeta(const MetaTensor& input, float alpha, MetaTensor* out); +void BaddbmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float beta, + float alpha, + MetaTensor* out); + void AffineChannelInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/kernels/baddbmm_grad_kernel.h b/paddle/phi/kernels/baddbmm_grad_kernel.h new file mode 100644 index 0000000000000..34d237e379cb6 --- /dev/null +++ b/paddle/phi/kernels/baddbmm_grad_kernel.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BaddbmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/baddbmm_kernel.h b/paddle/phi/kernels/baddbmm_kernel.h new file mode 100644 index 0000000000000..a10a89d4beb44 --- /dev/null +++ b/paddle/phi/kernels/baddbmm_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BaddbmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float beta, + float alpha, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc b/paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc new file mode 100644 index 0000000000000..cfd36c73c9cb8 --- /dev/null +++ b/paddle/phi/kernels/cpu/baddbmm_grad_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/baddbmm_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + baddbmm_grad, CPU, ALL_LAYOUT, phi::BaddbmmGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/baddbmm_kernel.cc b/paddle/phi/kernels/cpu/baddbmm_kernel.cc new file mode 100644 index 0000000000000..7b616c924bf95 --- /dev/null +++ b/paddle/phi/kernels/cpu/baddbmm_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/baddbmm_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/baddbmm_kernel_impl.h" + +PD_REGISTER_KERNEL( + baddbmm, CPU, ALL_LAYOUT, phi::BaddbmmKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 2f27682247bdc..5e42107312138 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -96,6 +96,18 @@ class Blas { T beta, T* C) const; + template + void GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T* A, + const T* B, + U beta, + T* C) const; + template void GEMM(bool transA, bool transB, @@ -292,6 +304,21 @@ class Blas { int64_t strideA, int64_t strideB) const; + template + void BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T* A, + const T* B, + U beta, + T* C, + int batchCount, + int64_t strideA, + int64_t strideB) const; + template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 5fcc3f12f2b35..63cef454a492e 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -1183,6 +1183,152 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T *A, + const T *B, + U beta, + T *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + T t_alpha = static_cast(alpha); + T t_beta = static_cast(beta); + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, + cuTransB, + cuTransA, + N, + M, + K, + &t_alpha, + B, + CUDA_R_32F, + ldb, + A, + CUDA_R_32F, + lda, + &t_beta, + C, + CUDA_R_32F, + N); + } else { +#endif // CUDA_VERSION >= 8000 + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &t_alpha, + B, + ldb, + A, + lda, + &t_beta, + C, + N); + }); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const phi::dtype::float16 *A, + const phi::dtype::float16 *B, + float beta, + phi::dtype::float16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 53, + common::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = alpha; + float h_beta = beta; + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16F, + ldb, + A, + CUDA_R_16F, + lda, + &h_beta, + C, + CUDA_R_16F, + N, + CUDA_R_32F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + h_B, + ldb, + h_A, + lda, + &h_beta, + h_C, + N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template <> inline void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1781,6 +1927,125 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 9010 } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T *A, + const T *B, + U beta, + T *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + +#if CUDA_VERSION >= 9010 + if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || + std::is_same::value) { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + VLOG(4) << "use_half_precision_compute_type: " + << FLAGS_gemm_use_half_precision_compute_type; + + auto fp = std::is_same::value ? 
CUDA_R_32F : CUDA_R_16F; +#if CUDA_VERSION >= 11000 + auto compute_type = CUBLAS_COMPUTE_32F; +#else + auto compute_type = CUDA_R_32F; +#endif + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + void *a = static_cast(&h_alpha); + void *b = static_cast(&h_beta); + // set ComputeType as CUDA_R_32F for fp16, for better accuracy + if (FLAGS_gemm_use_half_precision_compute_type == true && + std::is_same::value) { + a = static_cast(&alpha); + b = static_cast(&beta); +#if CUDA_VERSION >= 11000 + compute_type = CUBLAS_COMPUTE_16F; +#else + compute_type = CUDA_R_16F; +#endif + } + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + a, + B, + fp, + ldb, + strideB, + A, + fp, + lda, + strideA, + b, + C, + fp, + ldc, + strideC, + batchCount, + compute_type, + algo)); + }); + } else { +#endif // CUDA_VERSION >= 9010 + + T h_alpha = static_cast(alpha); + T h_beta = static_cast(beta); + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + ldb, + strideB, + A, + lda, + strideA, + &h_beta, + C, + ldc, + strideC, + batchCount); + }); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 +} + template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 098e37105a45d..4c6bcca8d63d7 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1080,6 +1080,37 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, ldc); } +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T *A, + const T *B, + U beta, + T *C) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + CBlas::GEMM(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + template <> template void Blas::GEMM(bool transA, @@ -1410,6 +1441,66 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif } +template <> +template +void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + U alpha, + const T *A, + const T *B, + U beta, + T *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + PADDLE_ENFORCE_NOT_NULL( + A, common::errors::InvalidArgument("Pointer A should not be null.")); + PADDLE_ENFORCE_NOT_NULL( + B, common::errors::InvalidArgument("Pointer B should not be null.")); + PADDLE_ENFORCE_NOT_NULL( + C, common::errors::InvalidArgument("Pointer C should not be null.")); +#ifdef PADDLE_WITH_MKLML + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + for (int k = 0; k < batchCount; ++k) { + a_array[k] = &A[k * strideA]; + b_array[k] = &B[k * strideB]; + c_array[k] = &C[k * M * N]; + } + + CBlas::GEMM_BATCH(CblasRowMajor, + &transA, + &transB, + &M, + &N, + &K, + &alpha, + a_array.data(), + &lda, + b_array.data(), + &ldb, + &beta, + c_array.data(), + &ldc, + 1 /* group_count */, + &batchCount); +#else + for (int k = 0; k < batchCount; ++k) { + auto *Ak = &A[k * strideA]; + auto *Bk = &B[k * strideB]; + auto *Ck = &C[k * M * N]; + this->template GEMM(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck); + } +#endif +} + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, diff --git a/paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu b/paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu new file mode 100644 index 0000000000000..5dcf03c7458ad --- /dev/null +++ b/paddle/phi/kernels/gpu/baddbmm_grad_kernel.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/baddbmm_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(baddbmm_grad, + GPU, + ALL_LAYOUT, + phi::BaddbmmGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/baddbmm_kernel.cu b/paddle/phi/kernels/gpu/baddbmm_kernel.cu new file mode 100644 index 0000000000000..0e41074119eee --- /dev/null +++ b/paddle/phi/kernels/gpu/baddbmm_kernel.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/baddbmm_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/baddbmm_kernel_impl.h" + +PD_REGISTER_KERNEL(baddbmm, + GPU, + ALL_LAYOUT, + phi::BaddbmmKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h new file mode 100644 index 0000000000000..0f40f8172e3f7 --- /dev/null +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -0,0 +1,248 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include "glog/logging.h" + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/baddbmm_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { + +template +struct BCopyOrScaleFunctor { + BCopyOrScaleFunctor(const float scale, const T* x, T* output, int64_t numel) + : scale_(scale), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + using MPType = typename phi::dtype::MPTypeTrait::Type; + const MPType mp_scale = static_cast(scale_); + const MPType mp_x = static_cast(x_[idx]); + output_[idx] = static_cast(mp_scale * mp_x); + } + + private: + const float scale_; + const T* x_; + T* output_; + int64_t numel_; +}; + +template +using PhiEigenTensor = EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; +using Array3 = Eigen::DSizes; + +template +void BaddbmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + bool is_float16_or_bfloat16 = false; + if (std::is_same::value || + std::is_same::value) { + is_float16_or_bfloat16 = true; + } + + auto in_dims = input.dims(); + int total_elems = 0; + + VLOG(3) << "alpha: " << alpha << " beta: " << beta; + + if (input_grad != nullptr) { + input_grad->set_lod(out_grad.lod()); + } + if (x_grad != nullptr) { + x_grad->set_lod(x.lod()); + } + if (y_grad != nullptr) { + y_grad->set_lod(y.lod()); + } + + auto blas = funcs::GetBlas(dev_ctx); + auto mt_blas = funcs::GetBlas(dev_ctx); + if (input_grad) { + dev_ctx.template Alloc(input_grad); + total_elems = in_dims[0] * in_dims[1] * in_dims[2]; + auto& place = *dev_ctx.eigen_device(); + auto eigen_dout = PhiEigenTensor::From(out_grad); + auto eigen_dinput = PhiEigenTensor::From(*input_grad); + + bool batch_compress = in_dims[0] != out_grad.dims()[0]; + bool row_compress = in_dims[1] != out_grad.dims()[1]; + bool col_compress = in_dims[2] != out_grad.dims()[2]; + auto eigen_dinput_shape = Array3( + input_grad->dims()[0], input_grad->dims()[1], input_grad->dims()[2]); + + if (batch_compress && row_compress && col_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum().eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum() + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (batch_compress && row_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array2(0, 1)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + 
.sum(Array2(0, 1)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (batch_compress && col_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array2(0, 2)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array2(0, 2)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (row_compress && col_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array2(1, 2)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array2(1, 2)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (batch_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(0)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (row_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(1)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else if (col_compress) { + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(2)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(2)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } + } else { + // The VCOPY does not support the float16, bfloat16 + if (!is_float16_or_bfloat16) { + mt_blas.VCOPY( + total_elems, out_grad.data(), input_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + BCopyOrScaleFunctor functor( + 1, out_grad.data(), input_grad->data(), total_elems); + for_range(functor); + } + } + + // The SCAL does not support the float16, bfloat16 + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, beta, input_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + BCopyOrScaleFunctor functor( + beta, input_grad->data(), input_grad->data(), total_elems); + for_range(functor); + } + } + if (x_grad) { + dev_ctx.template Alloc(x_grad); + total_elems = x.dims()[0] * x.dims()[1] * x.dims()[2]; + // x_grad = out_grad * y'. x_grad: B x M x K, out_grad : B x M x N, y : B x + // K x N + for (int i = 0; i < x.dims()[0]; ++i) { + auto out_grad_slice = out_grad.Slice(i, i + 1); + auto y_slice = y.Slice(i, i + 1); + auto x_grad_slice = x_grad->Slice(i, i + 1); + blas.MatMul(out_grad_slice, false, y_slice, true, &x_grad_slice); + } + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, alpha, x_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + BCopyOrScaleFunctor functor( + alpha, x_grad->data(), x_grad->data(), total_elems); + for_range(functor); + } + } + if (y_grad) { + dev_ctx.template Alloc(y_grad); + total_elems = y.dims()[0] * y.dims()[1] * y.dims()[2]; + // y_grad = x' * out_grad. 
y_grad: B x K x N, out_grad : B x M x N, x : B x + // M x K + for (int i = 0; i < x.dims()[0]; ++i) { + auto out_grad_slice = out_grad.Slice(i, i + 1); + auto x_slice = x.Slice(i, i + 1); + auto y_grad_slice = y_grad->Slice(i, i + 1); + blas.MatMul(x_slice, true, out_grad_slice, false, &y_grad_slice); + } + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, alpha, y_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + BCopyOrScaleFunctor functor( + alpha, y_grad->data(), y_grad->data(), total_elems); + for_range(functor); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h new file mode 100644 index 0000000000000..f788be0d10b1f --- /dev/null +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -0,0 +1,179 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "glog/logging.h" + +#include "paddle/phi/kernels/baddbmm_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +namespace phi { + +template +using PhiEigenTensor = EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; +using Array3 = Eigen::DSizes; + +template +void BaddbmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float beta, + float alpha, + DenseTensor* out) { + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + DenseTensor input_3d(input); + if (input.dims().size() == 2) { + input_dims = {1, input.dims()[0], input.dims()[1]}; + input_3d.Resize(input_dims); + } + + // broadcast mode check + if (x_dims[0] != input_dims[0]) { + PADDLE_ENFORCE_EQ(input_dims[0], + 1, + errors::InvalidArgument( + "When x_dims[0] is not equal with input_dims[0], " + "input_dims[0] must be 1 but got %s", + input_dims[0])); + PADDLE_ENFORCE_EQ(y_dims[2] == input_dims[2] || input_dims[2] == 1, + true, + errors::InvalidArgument( + "The input tensor shape mismatch, input shape=[%s], " + "x shape=[%s], y shape=[%s]", + input_dims, + x_dims, + y_dims)); + } + if (y_dims[2] != input_dims[2]) { + PADDLE_ENFORCE_EQ(input_dims[2], + 1, + errors::InvalidArgument( + "When y_dims[2] is not equal with input_dims[2], " + "input_dims[2] must be 1 but got %s", + input_dims[2])); + PADDLE_ENFORCE_EQ(x_dims[0] == input_dims[0] || input_dims[0] == 1, + true, + errors::InvalidArgument( + "The input tensor shape mismatch, input shape=[%s], " + "x shape=[%s], y shape=[%s]", + input_dims, + x_dims, + y_dims)); + } + PADDLE_ENFORCE_EQ( + x_dims[2], + y_dims[1], + errors::InvalidArgument( + "The input tensor X's width must be equal with matrix Y' height. 
" + "But received X's shape = [%s], Y's shape = [%s].", + x_dims[2], + y_dims[1])); + + dev_ctx.template Alloc(out); + auto blas = funcs::GetBlas(dev_ctx); + + // calc broadcast dim + Array3 bcast_dims; + bcast_dims[0] = x_dims[0] / input_dims[0]; + bcast_dims[1] = x_dims[1] / input_dims[1]; + bcast_dims[2] = y_dims[2] / input_dims[2]; + VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "," + << bcast_dims[2] << "]"; + + // broadcast using eigen + const DenseTensor& const_ref_input = input_3d; + auto eigen_input = PhiEigenTensor::From(const_ref_input); + auto eigen_out = PhiEigenTensor::From(*out); + auto& place = *dev_ctx.eigen_device(); + funcs::EigenBroadcast, T, 3>::Eval( + place, eigen_out, eigen_input, bcast_dims); + + // special case for float16 + if constexpr (std::is_same_v) { + float t_alpha = alpha; + float t_beta = beta; + if (x_dims[0] == 1) { + blas.GEMM(CblasNoTrans, + CblasNoTrans, + x_dims[1], + y_dims[2], + x_dims[2], + t_alpha, + x.data(), + y.data(), + t_beta, + out->data()); + } else { + blas.BatchedGEMM(CblasNoTrans, + CblasNoTrans, + x_dims[1], + y_dims[2], + x_dims[2], + t_alpha, + x.data(), + y.data(), + t_beta, + out->data(), + x_dims[0], + x_dims[1] * x_dims[2], + x_dims[2] * y_dims[2]); + } + } else { + T t_alpha = static_cast(alpha); + T t_beta = static_cast(beta); + if (x_dims[0] == 1) { + blas.GEMM(CblasNoTrans, + CblasNoTrans, + x_dims[1], + y_dims[2], + x_dims[2], + t_alpha, + x.data(), + y.data(), + t_beta, + out->data()); + } else { + blas.BatchedGEMM(CblasNoTrans, + CblasNoTrans, + x_dims[1], + y_dims[2], + x_dims[2], + t_alpha, + x.data(), + y.data(), + t_beta, + out->data(), + x_dims[0], + x_dims[1] * x_dims[2], + x_dims[2] * y_dims[2]); + // x_dims[2] == y_dims[1] + } + } +} + +} // namespace phi diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml index dae96ff02fffe..aa09c21f77fb9 100644 --- a/paddle/phi/ops/yaml/backward.yaml +++ b/paddle/phi/ops/yaml/backward.yaml @@ -223,6 +223,16 @@ func : atanh_grad inplace : (out_grad -> x_grad) +- backward_op : baddbmm_grad + forward : baddbmm (Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0) -> Tensor(out) + args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha, float beta) + output : Tensor(input_grad), Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [input, x, y] + kernel : + func : baddbmm_grad + - backward_op : batch_fc_grad forward : batch_fc (Tensor input, Tensor w, Tensor bias) -> Tensor(out) args : (Tensor input, Tensor w, Tensor bias, Tensor out_grad) diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 2818ec6d89343..6ac560334b5a2 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -490,6 +490,18 @@ inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) traits : paddle::dialect::ForwardOnlyTrait +- op : baddbmm + args : (Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0) + output : Tensor(out) + infer_meta : + func : BaddbmmInferMeta + kernel : + func : baddbmm + data_type : x + inplace: (input -> out) + backward : baddbmm_grad + # interfaces : paddle::dialect::InferSymbolicShapeInterface + - op : barrier args : (Tensor x, int ring_id=0) output : Tensor(out) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 
7cac26bda3790..b6659a5b583cb 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -386,6 +386,8 @@ atan_, atanh, atanh_, + baddbmm, + baddbmm_, bitwise_left_shift, bitwise_left_shift_, bitwise_right_shift, @@ -805,6 +807,8 @@ 'raw', 'addmm', 'addmm_', + 'baddbmm', + 'baddbmm_', 'allclose', 'isclose', 't', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index ff8eed1aa1715..6ff77cad6fcac 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -257,6 +257,8 @@ atan_, atanh, atanh_, + baddbmm, + baddbmm_, bitwise_left_shift, bitwise_left_shift_, bitwise_right_shift, @@ -612,6 +614,8 @@ 'erf', 'addmm', 'addmm_', + 'baddbmm', + 'baddbmm_', 'clip', 'clip_', 'trace', diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 426d7c979bc91..5e75c10eaff55 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2543,6 +2543,163 @@ def addmm_( return _C_ops.addmm_(input, x, y, beta, alpha) +def baddbmm( + input: Tensor, + x: Tensor, + y: Tensor, + beta: float = 1.0, + alpha: float = 1.0, + name: str | None = None, +) -> Tensor: + """ + **baddbmm** + + Perform batch matrix multiplication for input $x$ and $y$. + $input$ is added to the final result. + The equation is: + + .. math:: + Out = alpha * x * y + beta * input + + $Input$, $x$ and $y$ can carry the LoD (Level of Details) information, or not. But the output only shares the LoD information with input $input$. + + Args: + input (Tensor): The input Tensor to be added to the final result. + x (Tensor): The first input Tensor for batch matrix multiplication. + y (Tensor): The second input Tensor for batch matrix multiplication. + beta (float, optional): Coefficient of $input$, default is 1. + alpha (float, optional): Coefficient of $x*y$, default is 1. + name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The output Tensor of baddbmm. + + Examples: + .. code-block:: python + + >>> import paddle + + >>> x = paddle.ones([2, 2, 2]) + >>> y = paddle.ones([2, 2, 2]) + >>> input = paddle.ones([2, 2, 2]) + + >>> out = paddle.baddbmm(input=input, x=x, y=y, beta=0.5, alpha=5.0) + + >>> print(out) + Tensor(shape=[2, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[10.50000000, 10.50000000], + [10.50000000, 10.50000000]], + [[10.50000000, 10.50000000], + [10.50000000, 10.50000000]]]) + """ + input_shape = input.shape + x_shape = x.shape + y_shape = y.shape + if not len(x_shape) == len(y_shape) == 3: + raise ValueError( + f"The dimension of x, y should be 3 but receive x's shape: {x_shape}, y's shape: {y_shape}" + ) + if x_shape[2] != y_shape[1]: + raise ValueError( + f"The input Variable x's width must be equal with Variable y's height. But received x's shape = {x_shape}, y's shape = {y_shape}." + ) + + if len(input_shape) == 3: + if input_shape[0] != x_shape[0]: + raise ValueError( + f"The batch size of input must be equal to the batch size of x. 
But received input's batch size = {input_shape[0]}, x's batch size = {x_shape[0]}" + ) + if input_shape[1] != x_shape[1]: + if input_shape[1] != 1: + raise ValueError( + f"When x's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {input_shape[1]}" + ) + if input_shape[2] != y_shape[2]: + if input_shape[2] != 1: + raise ValueError( + f"When y's dimension[2] is not equal with input's dimension[2], input's dimension[2] must be 1 but got {input_shape[2]}" + ) + else: + raise ValueError( + f"The dimension of input should be 3 but received input's shape: {input_shape}" + ) + + if in_dynamic_or_pir_mode(): + return _C_ops.baddbmm(input, x, y, beta, alpha) + else: + inputs = {'Input': input, "X": x, "Y": y} + attrs = {'Alpha': alpha, 'Beta': beta} + + helper = LayerHelper("baddbmm", **locals()) + check_variable_and_dtype( + input, + 'Input', + ['float16', 'float32', 'float64', 'uint16'], + 'baddbmm', + ) + check_variable_and_dtype( + x, 'X', ['float16', 'float32', 'float64', 'uint16'], 'baddbmm' + ) + check_variable_and_dtype( + y, 'Y', ['float16', 'float32', 'float64', 'uint16'], 'baddbmm' + ) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="baddbmm", inputs=inputs, attrs=attrs, outputs={"Out": out} + ) + return out + + +@inplace_apis_in_dygraph_only +def baddbmm_( + input: Tensor, + x: Tensor, + y: Tensor, + beta: float = 1.0, + alpha: float = 1.0, + name: str | None = None, +) -> Tensor: + """ + Inplace version of ``baddbmm`` API, the output Tensor will be inplaced with input ``x``. + Please refer to :ref:`api_paddle_baddbmm`. + """ + input_shape = input.shape + x_shape = x.shape + y_shape = y.shape + if not len(x_shape) == len(y_shape) == 3: + raise ValueError( + f"The dimension of x, y should be 3 but receive x's shape: {x_shape}, y's shape: {y_shape}" + ) + if x_shape[2] != y_shape[1]: + raise ValueError( + f"The input Variable x's width must be equal with Variable y's height. But received x's shape = {x_shape}, y's shape = {y_shape}." + ) + + if len(input_shape) == 3: + if input_shape[0] != x_shape[0]: + raise ValueError( + f"The batch size of input must be equal to the batch size of x. 
But received input's batch size = {input_shape[0]}, x's batch size = {x_shape[0]}" + ) + if input_shape[1] != x_shape[1]: + if input_shape[1] != 1: + raise ValueError( + f"When x's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {input_shape[1]}" + ) + if input_shape[2] != y_shape[2]: + if input_shape[2] != 1: + raise ValueError( + f"When y's dimension[2] is not equal with input's dimension[2], input's dimension[2] must be 1 but got {input_shape[2]}" + ) + else: + raise ValueError( + f"The dimension of input should be 3 but received input's shape: {input_shape}" + ) + + if in_dynamic_mode(): + return _C_ops.baddbmm_(input, x, y, beta, alpha) + + def renorm(x: Tensor, p: float, axis: int, max_norm: float) -> Tensor: """ **renorm** From a0ac3ee60a0f789f480a6bbde2b5ef262a50286e Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Thu, 9 Jan 2025 21:17:04 +0800 Subject: [PATCH 11/12] added file for test modified: paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py modified: paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc modified: paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h modified: paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h modified: paddle/phi/api/ext/tensor_compat.h modified: paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h modified: paddle/phi/ops/yaml/op_compat.yaml new file: test/legacy_test/test_baddbmm_op.py --- .../decomp_interface_gen_op_list.py | 1 + .../multiary_infer_sym.cc | 61 ++++++++++++ .../infer_symbolic_shape/multiary_infer_sym.h | 2 + .../decomp_rule/decomp_rule/composite.h | 22 +++++ paddle/phi/api/ext/tensor_compat.h | 1 + .../kernels/impl/baddbmm_grad_kernel_impl.h | 10 ++ paddle/phi/ops/yaml/op_compat.yaml | 11 +++ test/legacy_test/test_baddbmm_op.py | 93 +++++++++++++++++++ 8 files changed, 201 insertions(+) create mode 100644 test/legacy_test/test_baddbmm_op.py diff --git a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py index 6b9d3c67d3a7d..6ce7cda1e5576 100644 --- a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py +++ b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py @@ -32,6 +32,7 @@ "add_n", "addmm", "any", + "baddbmm", "bce_loss", "bmm", "diag", diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc index 3f2c8397a6141..5c5a36ae186c3 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.cc @@ -163,6 +163,67 @@ bool Addmm_OpInferSymbolicShape(pir::Operation *op, return AddmmOpInferSymbolicShape(op, infer_context); } +bool BaddbmmOpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + const auto &input_shape = + infer_context->GetShapeOrDataForValue(op->operand_source(0)); + const auto &x_shape = + infer_context->GetShapeOrDataForValue(op->operand_source(1)); + const auto &y_shape = + infer_context->GetShapeOrDataForValue(op->operand_source(2)); + + auto ndim_input = input_shape.shape().size(); + auto ndim_x = x_shape.shape().size(); + auto ndim_y = y_shape.shape().size(); + + PADDLE_ENFORCE_EQ(ndim_input, + 3, + common::errors::InvalidArgument( + "The input tensor input's 
dimension must be 3. " + "But received input's dimension = [%d].", + ndim_input)); + PADDLE_ENFORCE_EQ(ndim_x, + 3, + common::errors::InvalidArgument( + "The input tensor x's dimension must be 3. " + "But received x's dimension = [%d].", + ndim_x)); + PADDLE_ENFORCE_EQ(ndim_y, + 3, + common::errors::InvalidArgument( + "The input tensor y's dimension must be 3. " + "But received y's dimension = [%d].", + ndim_y)); + + std::vector output_shape; + output_shape.push_back(x_shape.shape()[0]); // batch size + output_shape.push_back(x_shape.shape()[1]); + output_shape.push_back(y_shape.shape()[2]); + + infer_context->SetShapeOrDataForValue( + op->result(0), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(output_shape)}); + + infer_context->AddEqualCstr(x_shape.shape()[0], + y_shape.shape()[0]); // batch size + infer_context->AddEqualCstr(x_shape.shape()[2], y_shape.shape()[1]); + + infer_context->AddBroadcastableCstr(input_shape.shape()[0], + x_shape.shape()[0]); // batch size + infer_context->AddBroadcastableCstr(input_shape.shape()[1], + x_shape.shape()[1]); + infer_context->AddBroadcastableCstr(input_shape.shape()[2], + y_shape.shape()[2]); + + return true; +} + +bool Baddbmm_OpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + return BaddbmmOpInferSymbolicShape(op, infer_context); +} + bool AucOpInferSymbolicShape(pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { const auto &predict_shape = diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h index c8dd6a2b048ce..d647580d0242c 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/multiary_infer_sym.h @@ -21,6 +21,8 @@ namespace paddle::dialect { OP_DECLARE_INFER_SYMBOLIC_SHAPE(Accuracy) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Addmm) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Addmm_) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Baddbmm) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Baddbmm_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(AddN) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Auc) OP_DECLARE_INFER_SYMBOLIC_SHAPE(AssignPos) diff --git a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h index 357b2434c1f67..020f4741ef63a 100644 --- a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h +++ b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h @@ -1409,6 +1409,28 @@ Tensor addmm_decomp(const Tensor& input, full_scalar(beta, input.dtype()) * input; } +template +Tensor baddbmm_decomp(const Tensor& input, + const Tensor& x, + const Tensor& y, + const float beta, + const float alpha) { + int batch_size = x.shape()[0]; + std::vector batch_results; + + for (int i = 0; i < batch_size; ++i) { + Tensor x_batch = get_slice(x, i); + Tensor y_batch = get_slice(y, i); + Tensor result = matmul(x_batch, y_batch); + batch_results.push_back(result); + } + + Tensor x_y_mat = concat(batch_results); + + return full_scalar(alpha, x_y_mat.dtype()) * x_y_mat + + full_scalar(beta, input.dtype()) * input; +} + template Tensor eye_decomp(const paddle::Scalar& num_rows, const paddle::Scalar& num_columns, diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h index b1a140da46a89..6c09a4f7451c1 100644 --- a/paddle/phi/api/ext/tensor_compat.h +++ b/paddle/phi/api/ext/tensor_compat.h @@ -35,6 +35,7 @@ 
using experimental::asinh; using experimental::atan; using experimental::atan2; using experimental::atanh; +using experimental::baddbmm; using experimental::bernoulli; using experimental::ceil; using experimental::cholesky; diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h index 0f40f8172e3f7..238f50c555194 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -212,6 +212,12 @@ void BaddbmmGradKernel(const Context& dev_ctx, auto out_grad_slice = out_grad.Slice(i, i + 1); auto y_slice = y.Slice(i, i + 1); auto x_grad_slice = x_grad->Slice(i, i + 1); + auto x_grad_dims = x_grad_slice.dims(); + + x_grad_slice.Resize({x_grad_dims[1], x_grad_dims[2]}); + y_slice.Resize({y_slice.dims()[1], y_slice.dims()[2]}); + out_grad_slice.Resize( + {out_grad_slice.dims()[1], out_grad_slice.dims()[2]}); blas.MatMul(out_grad_slice, false, y_slice, true, &x_grad_slice); } if (!is_float16_or_bfloat16) { @@ -232,6 +238,10 @@ void BaddbmmGradKernel(const Context& dev_ctx, auto out_grad_slice = out_grad.Slice(i, i + 1); auto x_slice = x.Slice(i, i + 1); auto y_grad_slice = y_grad->Slice(i, i + 1); + out_grad_slice.Resize( + {out_grad_slice.dims()[1], out_grad_slice.dims()[2]}); + x_slice.Resize({x_slice.dims()[1], x_slice.dims()[2]}); + y_grad_slice.Resize({y_grad_slice.dims()[1], y_grad_slice.dims()[2]}); blas.MatMul(x_slice, true, out_grad_slice, false, &y_grad_slice); } if (!is_float16_or_bfloat16) { diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index 89a91aa264893..d3707eef9a361 100755 --- a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -372,6 +372,17 @@ outputs : {auc : AUC, stat_pos_out : StatPosOut, stat_neg_out : StatNegOut} +- op : baddbmm + backward : baddbmm_grad + inputs : + {input : Input, x : X, y : Y} + outputs : + out : Out + attrs : + {alpha : Alpha, beta : Beta} + extra : + attrs : [bool use_mkldnn = false] + - op : barrier inputs : {x : X} diff --git a/test/legacy_test/test_baddbmm_op.py b/test/legacy_test/test_baddbmm_op.py new file mode 100644 index 0000000000000..0be1d00141c09 --- /dev/null +++ b/test/legacy_test/test_baddbmm_op.py @@ -0,0 +1,93 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language go verning permissions and +# limitations under the License. 
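As a reading aid for the test below, a reference sketch of the arithmetic the operator is compared against; baddbmm_ref is a hypothetical helper, the forward rule follows the docstring (Out = beta * input + alpha * bmm(x, y)), and the gradient comments restate what baddbmm_grad_kernel_impl.h computes per batch:

    import numpy as np

    def baddbmm_ref(inp, x, y, beta=1.0, alpha=1.0):
        # out = beta * input + alpha * batched matmul(x, y); np.matmul on 3-D
        # arrays multiplies batch by batch, matching the kernel's contract.
        return beta * inp + alpha * np.matmul(x, y)

    # Backward, per batch (restating the grad kernel, including the alpha/beta scaling):
    #   d_input = beta  * d_out                        (summed over any broadcast axis of input)
    #   d_x     = alpha * d_out @ y.transpose(0, 2, 1)
    #   d_y     = alpha * x.transpose(0, 2, 1) @ d_out

The low-precision kernels registered above are touched by the FP16 subclass at the end of this file.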
+ +import unittest + +import numpy as np +from op_test import OpTest + +import paddle + + +class TestBaddBmmOp(OpTest): + # test basic + def setUp(self): + self.op_type = "baddbmm" + self.prim_op_type = "comp" + self.python_api = paddle.baddbmm + self.public_python_api = paddle.baddbmm + self.init_dtype_type() + self.inputs = { + 'Input': np.random.random((10, 20, 15)).astype(self.dtype), + 'X': np.random.random((10, 20, 10)).astype(self.dtype), + 'Y': np.random.random((10, 10, 15)).astype(self.dtype), + } + self.outputs = { + 'Out': self.inputs['Input'] + + np.matmul(self.inputs['X'], self.inputs['Y']) + } + + def init_dtype_type(self): + self.dtype = np.float64 + + def test_check_output(self): + self.check_output(check_pir=True, check_prim_pir=True) + + def test_check_grad_normal(self): + self.check_grad( + ['Input', 'X', 'Y'], + 'Out', + check_pir=True, + check_prim_pir=True, + ) + + def test_check_grad_x(self): + self.check_grad( + ['X'], + 'Out', + no_grad_set=None, + check_pir=True, + check_prim_pir=True, + ) + + def test_check_grad_y(self): + self.check_grad( + ['Y'], + 'Out', + no_grad_set=None, + check_pir=True, + check_prim_pir=True, + ) + + def test_check_grad_input(self): + self.check_grad( + ['Input'], + 'Out', + no_grad_set=None, + check_pir=True, + check_prim_pir=True, + ) + + +class TestBaddBmmFP16Op(TestBaddBmmOp): + def init_dtype_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-2) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From fdf6322c54eb7fada52895cbfe7f8be0a4d5169f Mon Sep 17 00:00:00 2001 From: Qin-sx Date: Fri, 10 Jan 2025 14:35:32 +0800 Subject: [PATCH 12/12] added bloat16 case modified: ../paddle/phi/kernels/funcs/blas/blas_impl.cu.h modified: ../paddle/phi/kernels/impl/baddbmm_kernel_impl.h --- paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 140 ++++++++++++++++++ paddle/phi/kernels/impl/baddbmm_kernel_impl.h | 10 +- 2 files changed, 148 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 63cef454a492e..096ab5bd857ed 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -1398,6 +1398,75 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 11000 } +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + float beta, + phi::dtype::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + common::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = alpha; + float h_beta = beta; + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + N, + CUDA_R_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(common::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template <> inline void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -2117,6 +2186,77 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 11000 } +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const phi::dtype::bfloat16 *A, + const phi::dtype::bfloat16 *B, + float beta, + phi::dtype::bfloat16 *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + + float h_alpha = alpha; + float h_beta = beta; + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + strideB, + A, + CUDA_R_16BF, + lda, + strideA, + &h_beta, + C, + CUDA_R_16BF, + ldc, + strideC, + batchCount, + CUBLAS_COMPUTE_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(common::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, diff --git a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h index f788be0d10b1f..83fedc4bcbc62 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -18,10 +18,12 @@ limitations under the License. */ #include "glog/logging.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + namespace phi { template , T, 3>::Eval( place, eigen_out, eigen_input, bcast_dims); - // special case for float16 - if constexpr (std::is_same_v) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // special case for MPType + if constexpr (std::is_same_v) { + VLOG(4) << "Function: baddbmm, Type of T: " << typeid(T).name(); + VLOG(4) << "Function: baddbmm, Type of MPType: " << typeid(MPType).name(); float t_alpha = alpha; float t_beta = beta; if (x_dims[0] == 1) {