From 7e17653f320bf989708d4b3ab3d87b62583e6a92 Mon Sep 17 00:00:00 2001
From: Niket Agarwal
Date: Mon, 6 Jan 2025 20:17:49 +0530
Subject: [PATCH] [GPU] Optimized operations in the BLAS kernels with the
 latest buffer changes.

Updated the pipeline for both fp32 and fp16.
Updated the SwiGLU, RMSNorm and Concat ops.

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Niket Agarwal
---
 nntrainer/layers/cl_layers/concat_cl.cpp   | 326 +++++++++---------
 nntrainer/layers/cl_layers/layer_impl_cl.h |   2 +
 .../layers/cl_layers/rmsnorm_layer_cl.cpp  |  59 ++--
 nntrainer/layers/cl_layers/swiglu_cl.cpp   |  66 ++--
 nntrainer/layers/cl_layers/swiglu_cl.h     |   2 +
 5 files changed, 242 insertions(+), 213 deletions(-)

diff --git a/nntrainer/layers/cl_layers/concat_cl.cpp b/nntrainer/layers/cl_layers/concat_cl.cpp
index 6715ea5e64..c0913b7ab2 100644
--- a/nntrainer/layers/cl_layers/concat_cl.cpp
+++ b/nntrainer/layers/cl_layers/concat_cl.cpp
@@ -446,47 +446,47 @@ void ConcatLayerCl::concat_cl_axis3(const float *matAdata,
     int dim = int(input1_batch_size * input1_channels * input1_height *
                   (input1_width + input2_width));
 
-    opencl::Buffer inputA(cl_context_ref.context_inst_,
-                          sizeof(float) * input1_batch_size * input1_channels *
-                            input1_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inputX(cl_context_ref.context_inst_,
-                          sizeof(float) * input1_batch_size * input1_channels *
-                            input1_height * input2_width,
-                          true, nullptr);
-
-    opencl::Buffer inOutY(cl_context_ref.context_inst_,
-                          sizeof(float) * input1_batch_size * input1_channels *
-                            input1_height * (input1_width + input2_width),
-                          true, nullptr);
-
-    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
+    result = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_channels * input1_height *
+        input1_width,
+      matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
+    result = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_channels * input1_height *
+        input2_width,
+      vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_channels * input1_height *
+        (input1_width + input2_width),
+      vecYdata);
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      1, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      2, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -530,7 +530,11 @@ void ConcatLayerCl::concat_cl_axis3(const float *matAdata,
       break;
     }
 
-    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_channels * input1_height *
+        (input1_width + input2_width),
+      vecYdata);
     if (!result) {
       break;
     }
@@ -555,47 +559,46 @@ void ConcatLayerCl::concat_cl_axis2(const float *matAdata,
     int dim = int(input1_batch_size * input1_channels * input1_width *
                   (input1_height + input2_height));
 
-    opencl::Buffer inputA(cl_context_ref.context_inst_,
-                          sizeof(float) * input1_batch_size * input1_channels *
-                            input1_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inputX(cl_context_ref.context_inst_,
-                          sizeof(float) * input1_batch_size * input1_channels *
-                            input2_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inOutY(cl_context_ref.context_inst_,
-                          sizeof(float) * input1_batch_size * input1_channels *
-                            (input1_height + input2_height) * input1_width,
-                          true, nullptr);
-
-    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
+    result = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_channels * input1_height *
+        input1_width,
+      matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
+    result = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_channels * input2_height *
+        input1_width,
+      vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_channels *
+        (input1_height + input2_height) * input1_width,
+      vecYdata);
     if (!result) {
       break;
     }
-
-    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      1, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      2, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -639,7 +642,11 @@ void ConcatLayerCl::concat_cl_axis2(const float *matAdata,
       break;
     }
 
-    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_channels *
+        (input1_height + input2_height) * input1_width,
+      vecYdata);
     if (!result) {
       break;
     }
@@ -663,47 +670,47 @@ void ConcatLayerCl::concat_cl_axis1(const float *matAdata,
     int dim = int(input1_batch_size * input1_width * input1_height *
                   (input1_channels + input2_channels));
 
-    opencl::Buffer inputA(cl_context_ref.context_inst_,
-                          sizeof(float) * input1_batch_size * input1_channels *
-                            input1_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inputX(cl_context_ref.context_inst_,
-                          sizeof(float) * input1_batch_size * input2_channels *
-                            input1_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inOutY(cl_context_ref.context_inst_,
-                          sizeof(float) * input1_batch_size * input1_width *
-                            input1_height * (input1_channels + input2_channels),
-                          true, nullptr);
-
-    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
+    result = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_channels * input1_height *
+        input1_width,
+      matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
+    result = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input2_channels * input1_height *
+        input1_width,
+      vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_width * input1_height *
+        (input1_channels + input2_channels),
+      vecYdata);
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
    }
 
-    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      1, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!result) {
       break;
    }
 
-    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      2, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -747,7 +754,11 @@ void ConcatLayerCl::concat_cl_axis1(const float *matAdata,
       break;
     }
 
-    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(float) * input1_batch_size * input1_width * input1_height *
+        (input1_channels + input2_channels),
+      vecYdata);
     if (!result) {
       break;
     }
@@ -756,13 +767,11 @@
 }
 
 #ifdef ENABLE_FP16
-void ConcatLayerCl::concat_cl_axis3_fp16(const _FP16 *matAdata,
-                                         const _FP16 *vecXdata, _FP16 *vecYdata,
-                                         unsigned int input1_batch_size,
-                                         unsigned int input1_channels,
-                                         unsigned int input1_height,
-                                         unsigned int input1_width,
-                                         unsigned int input2_width) {
+void ConcatLayerCl::concat_cl_axis3_fp16(
+  const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
+  unsigned int input1_batch_size, unsigned int input1_channels,
+  unsigned int input1_height, unsigned int input1_width,
+  unsigned int input2_width) {
 
   bool result = false;
 
@@ -774,47 +783,47 @@ void ConcatLayerCl::concat_cl_axis3_fp16(const _FP16 *matAdata,
     int dim = int(input1_batch_size * input1_channels * input1_height *
                   (input1_width + input2_width));
 
-    opencl::Buffer inputA(cl_context_ref.context_inst_,
-                          sizeof(_FP16) * input1_batch_size * input1_channels *
-                            input1_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inputX(cl_context_ref.context_inst_,
-                          sizeof(_FP16) * input1_batch_size * input1_channels *
-                            input1_height * input2_width,
-                          true, nullptr);
-
-    opencl::Buffer inOutY(cl_context_ref.context_inst_,
-                          sizeof(_FP16) * input1_batch_size * input1_channels *
-                            input1_height * (input1_width + input2_width),
-                          true, nullptr);
-
-    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
+    result = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_channels * input1_height *
+        input1_width,
+      matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
+    result = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_channels * input1_height *
+        input2_width,
+      vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_channels * input1_height *
+        (input1_width + input2_width),
+      vecYdata);
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      1, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!result) {
       break;
    }
 
-    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      2, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -858,7 +867,11 @@ void ConcatLayerCl::concat_cl_axis3_fp16(const _FP16 *matAdata,
       break;
     }
 
-    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_channels * input1_height *
+        (input1_width + input2_width),
+      vecYdata);
     if (!result) {
       break;
     }
@@ -866,13 +879,11 @@
   } while (false);
 }
 
-void ConcatLayerCl::concat_cl_axis2_fp16(const _FP16 *matAdata,
-                                         const _FP16 *vecXdata, _FP16 *vecYdata,
-                                         unsigned int input1_batch_size,
-                                         unsigned int input1_channels,
-                                         unsigned int input1_width,
-                                         unsigned int input1_height,
-                                         unsigned int input2_height) {
+void ConcatLayerCl::concat_cl_axis2_fp16(
+  const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
+  unsigned int input1_batch_size, unsigned int input1_channels,
+  unsigned int input1_width, unsigned int input1_height,
+  unsigned int input2_height) {
 
   bool result = false;
 
@@ -883,47 +894,46 @@ void ConcatLayerCl::concat_cl_axis2_fp16(const _FP16 *matAdata,
     int dim = int(input1_batch_size * input1_channels * input1_width *
                   (input1_height + input2_height));
 
-    opencl::Buffer inputA(cl_context_ref.context_inst_,
-                          sizeof(_FP16) * input1_batch_size * input1_channels *
-                            input1_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inputX(cl_context_ref.context_inst_,
-                          sizeof(_FP16) * input1_batch_size * input1_channels *
-                            input2_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inOutY(cl_context_ref.context_inst_,
-                          sizeof(_FP16) * input1_batch_size * input1_channels *
-                            (input1_height + input2_height) * input1_width,
-                          true, nullptr);
-
-    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
+    result = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_channels * input1_height *
+        input1_width,
+      matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
+    result = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_channels * input2_height *
+        input1_width,
+      vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_channels *
+        (input1_height + input2_height) * input1_width,
+      vecYdata);
     if (!result) {
       break;
     }
-
-    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
    }
 
-    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      1, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!result) {
       break;
    }
 
-    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      2, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -967,7 +977,11 @@ void ConcatLayerCl::concat_cl_axis2_fp16(const _FP16 *matAdata,
       break;
     }
 
-    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_channels *
+        (input1_height + input2_height) * input1_width,
+      vecYdata);
     if (!result) {
       break;
     }
@@ -975,13 +989,11 @@
   } while (false);
 }
 
-void ConcatLayerCl::concat_cl_axis1_fp16(const _FP16 *matAdata,
-                                         const _FP16 *vecXdata, _FP16 *vecYdata,
-                                         unsigned int input1_batch_size,
-                                         unsigned int input1_height,
-                                         unsigned int input1_width,
-                                         unsigned int input1_channels,
-                                         unsigned int input2_channels) {
+void ConcatLayerCl::concat_cl_axis1_fp16(
+  const __fp16 *matAdata, const __fp16 *vecXdata, __fp16 *vecYdata,
+  unsigned int input1_batch_size, unsigned int input1_height,
+  unsigned int input1_width, unsigned int input1_channels,
+  unsigned int input2_channels) {
 
   bool result = false;
 
@@ -993,47 +1005,47 @@ void ConcatLayerCl::concat_cl_axis1_fp16(const _FP16 *matAdata,
     int dim = int(input1_batch_size * input1_width * input1_height *
                   (input1_channels + input2_channels));
 
-    opencl::Buffer inputA(cl_context_ref.context_inst_,
-                          sizeof(_FP16) * input1_batch_size * input1_channels *
-                            input1_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inputX(cl_context_ref.context_inst_,
-                          sizeof(_FP16) * input1_batch_size * input2_channels *
-                            input1_height * input1_width,
-                          true, nullptr);
-
-    opencl::Buffer inOutY(cl_context_ref.context_inst_,
-                          sizeof(_FP16) * input1_batch_size * input1_width *
-                            input1_height * (input1_channels + input2_channels),
-                          true, nullptr);
-
-    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
+    result = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_channels * input1_height *
+        input1_width,
+      matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
+    result = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input2_channels * input1_height *
+        input1_width,
+      vecXdata);
     if (!result) {
       break;
    }
 
-    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_width * input1_height *
+        (input1_channels + input2_channels),
+      vecYdata);
     if (!result) {
      break;
    }
 
-    result = kernel_concat_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      1, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_concat_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+    result = kernel_concat_ptr->SetKernelArguments(
+      2, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -1077,7 +1089,11 @@ void ConcatLayerCl::concat_cl_axis1_fp16(const _FP16 *matAdata,
       break;
     }
 
-    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_,
+      sizeof(__fp16) * input1_batch_size * input1_width * input1_height *
+        (input1_channels + input2_channels),
+      vecYdata);
     if (!result) {
       break;
     }
diff --git a/nntrainer/layers/cl_layers/layer_impl_cl.h b/nntrainer/layers/cl_layers/layer_impl_cl.h
index b2fb0dde7c..e8a3485fad 100644
--- a/nntrainer/layers/cl_layers/layer_impl_cl.h
+++ b/nntrainer/layers/cl_layers/layer_impl_cl.h
@@ -61,6 +61,8 @@ class LayerImplCl : public LayerImpl {
 
 protected:
   inline static ClContext cl_context_ref;
+  inline static ClBufferManager &clbuffInstance =
+    ClBufferManager::getInstance();
 };
 
 } // namespace nntrainer
diff --git a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp
index fe05129959..2adafd7334 100644
--- a/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp
+++ b/nntrainer/layers/cl_layers/rmsnorm_layer_cl.cpp
@@ -137,37 +137,35 @@ void RMSNormLayerCl::rmsnormProcess(Tensor const &input, Tensor &result,
 
     auto kernel_rmsnorm_ptr = layer_kernel_ptrs[Kernels::RMSNORM_CL];
 
-    opencl::Buffer inputbuf(cl_context_ref.context_inst_, dim1 * sizeof(float),
-                            true, nullptr);
-
-    opencl::Buffer gammabuf(cl_context_ref.context_inst_,
-                            input.width() * sizeof(float), true, nullptr);
-    opencl::Buffer resultbuf(cl_context_ref.context_inst_, dim1 * sizeof(float),
-                             true, nullptr);
-
     const float *data = input.getData();
     float *rdata = result.getData();
     const float *gdata = gamma.getData();
 
-    ret = inputbuf.WriteData(cl_context_ref.command_queue_inst_, data);
+    ret = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * sizeof(float), data);
     if (!ret) {
       break;
    }
 
-    ret = gammabuf.WriteData(cl_context_ref.command_queue_inst_, gdata);
+    ret = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, input.width() * sizeof(float), gdata);
     if (!ret) {
       break;
    }
-    ret = kernel_rmsnorm_ptr->SetKernelArguments(0, &inputbuf, sizeof(cl_mem));
+
+    ret = kernel_rmsnorm_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!ret) {
       break;
    }
-    ret = kernel_rmsnorm_ptr->SetKernelArguments(1, &resultbuf, sizeof(cl_mem));
+    ret = kernel_rmsnorm_ptr->SetKernelArguments(
+      1, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!ret) {
       break;
    }
-    ret = kernel_rmsnorm_ptr->SetKernelArguments(2, &gammabuf, sizeof(cl_mem));
+    ret = kernel_rmsnorm_ptr->SetKernelArguments(
+      2, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!ret) {
       break;
    }
 
@@ -204,7 +202,8 @@ void RMSNormLayerCl::rmsnormProcess(Tensor const &input, Tensor &result,
       break;
     }
 
-    ret = resultbuf.ReadData(cl_context_ref.command_queue_inst_, rdata);
+    ret = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * sizeof(float), rdata);
     if (!ret) {
       break;
     }
@@ -228,40 +227,42 @@
   do {
     auto kernel_rmsnorm_ptr = layer_kernel_ptrs[Kernels::RMSNORM_CL_FP16];
 
-    opencl::Buffer inputbuf(cl_context_ref.context_inst_,
-                            dim1 * sizeof(cl_half), true, nullptr);
-
-    opencl::Buffer gammabuf(cl_context_ref.context_inst_,
-                            input.width() * sizeof(cl_half), true, nullptr);
-    opencl::Buffer resultbuf(cl_context_ref.context_inst_,
-                             dim1 * sizeof(cl_half), true, nullptr);
-
    const __fp16 *data = input.getData<__fp16>();
    __fp16 *rdata = result.getData<__fp16>();
    const __fp16 *gdata = gamma.getData<__fp16>();
-    ret = inputbuf.WriteData(cl_context_ref.command_queue_inst_, data);
+
+    ret = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * sizeof(cl_half), data);
     if (!ret) {
       break;
    }
 
-    ret = gammabuf.WriteData(cl_context_ref.command_queue_inst_, gdata);
+    ret = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, input.width() * sizeof(cl_half),
+      gdata);
     if (!ret) {
       break;
    }
-    ret = kernel_rmsnorm_ptr->SetKernelArguments(0, &inputbuf, sizeof(cl_mem));
+
+    ret = kernel_rmsnorm_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!ret) {
       break;
    }
-    ret = kernel_rmsnorm_ptr->SetKernelArguments(1, &resultbuf, sizeof(cl_mem));
+
+    ret = kernel_rmsnorm_ptr->SetKernelArguments(
+      1, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!ret) {
       break;
    }
-    ret = kernel_rmsnorm_ptr->SetKernelArguments(2, &gammabuf, sizeof(cl_mem));
+    ret = kernel_rmsnorm_ptr->SetKernelArguments(
+      2, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!ret) {
       break;
    }
 
     ret = kernel_rmsnorm_ptr->SetKernelArguments(4, &b, sizeof(int));
+
     if (!ret) {
       break;
     }
@@ -275,6 +276,7 @@ void RMSNormLayerCl::rmsnormProcess_fp16(Tensor const &input, Tensor &result,
     if (!ret) {
       break;
     }
+
     ret = kernel_rmsnorm_ptr->SetKernelArguments(6, &h, sizeof(int));
     if (!ret) {
       break;
@@ -292,7 +294,8 @@ void RMSNormLayerCl::rmsnormProcess_fp16(Tensor const &input, Tensor &result,
       break;
     }
 
-    ret = resultbuf.ReadData(cl_context_ref.command_queue_inst_, rdata);
+    ret = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * sizeof(cl_half), rdata);
     if (!ret) {
       break;
     }
diff --git a/nntrainer/layers/cl_layers/swiglu_cl.cpp b/nntrainer/layers/cl_layers/swiglu_cl.cpp
index cacbcf892a..111640c011 100644
--- a/nntrainer/layers/cl_layers/swiglu_cl.cpp
+++ b/nntrainer/layers/cl_layers/swiglu_cl.cpp
@@ -104,41 +104,42 @@ void SwiGLULayerCl::swiglu_cl(const float *matAdata, const float *vecXdata,
     }
 
     int dim = int(dim1 * dim2);
 
-    opencl::Buffer inputA(cl_context_ref.context_inst_,
-                          sizeof(float) * dim1 * dim2, true, nullptr);
-    opencl::Buffer inputX(cl_context_ref.context_inst_,
-                          sizeof(float) * dim1 * dim2, true, nullptr);
-
-    opencl::Buffer inOutY(cl_context_ref.context_inst_,
-                          sizeof(float) * dim1 * dim2, true, nullptr);
-
-    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
+    result = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(float),
+      matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
+    result = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(float),
+      vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(float),
+      vecYdata);
     if (!result) {
       break;
     }
 
-    result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!result) {
      break;
    }
 
-    result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(
+      1, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(
+      2, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -152,7 +153,9 @@
       break;
     }
 
-    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(float),
+      vecYdata);
     if (!result) {
       break;
     }
@@ -174,41 +177,42 @@ void SwiGLULayerCl::swiglu_cl_fp16(const __fp16 *matAdata,
     }
 
     int dim = int(dim1 * dim2);
 
-    opencl::Buffer inputA(cl_context_ref.context_inst_,
-                          sizeof(__fp16) * dim1 * dim2, true, nullptr);
-
-    opencl::Buffer inputX(cl_context_ref.context_inst_,
-                          sizeof(__fp16) * dim1 * dim2, true, nullptr);
-
-    opencl::Buffer inOutY(cl_context_ref.context_inst_,
-                          sizeof(__fp16) * dim1 * dim2, true, nullptr);
-    result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata);
+    result = clbuffInstance.getInBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(__fp16),
+      matAdata);
     if (!result) {
       break;
     }
 
-    result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata);
+    result = clbuffInstance.getInBufferB()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(__fp16),
+      vecXdata);
     if (!result) {
       break;
     }
 
-    result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->WriteDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(__fp16),
+      vecYdata);
     if (!result) {
       break;
     }
 
-    result = kernel_swiglu_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(
+      0, clbuffInstance.getInBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_swiglu_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(
+      1, clbuffInstance.getInBufferB(), sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_swiglu_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+    result = kernel_swiglu_ptr->SetKernelArguments(
+      2, clbuffInstance.getOutBufferA(), sizeof(cl_mem));
     if (!result) {
       break;
     }
@@ -222,7 +226,9 @@
       break;
     }
 
-    result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata);
+    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(__fp16),
+      vecYdata);
     if (!result) {
       break;
     }
diff --git a/nntrainer/layers/cl_layers/swiglu_cl.h b/nntrainer/layers/cl_layers/swiglu_cl.h
index bbb74dc77a..208d6737c4 100644
--- a/nntrainer/layers/cl_layers/swiglu_cl.h
+++ b/nntrainer/layers/cl_layers/swiglu_cl.h
@@ -34,6 +34,8 @@ class SwiGLULayerCl final : public Layer {
 
 private:
   inline static ClContext cl_context_ref;
+  inline static ClBufferManager &clbuffInstance =
+    ClBufferManager::getInstance();
 
 public:
   /**
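
Reviewer note (illustration only, not part of the commit): every hunk above
applies the same transformation, so here it is once in isolation. This is a
minimal sketch that assumes the ClBufferManager singleton and the
WriteDataRegion / ReadDataRegion / SetKernelArguments signatures exactly as
used in the hunks; runTwoInputKernelSketch, the template kernel-pointer type,
the byte-count parameters, and the dispatch placeholder are hypothetical names
for illustration, and cl_context_ref is assumed in scope as in LayerImplCl.

// Before: each call constructed three throwaway opencl::Buffer objects.
// After: one shared, pre-allocated set of device buffers is reused, and only
// the byte region each call actually needs is written and read back.
static ClBufferManager &clbuffInstance = ClBufferManager::getInstance();

template <typename KernelPtr>
bool runTwoInputKernelSketch(KernelPtr kernel_ptr, const float *in_a,
                             const float *in_b, float *out, size_t a_bytes,
                             size_t b_bytes, size_t out_bytes) {
  bool result = false;
  do {
    // Stage both inputs into the shared input buffers (region-sized copies).
    result = clbuffInstance.getInBufferA()->WriteDataRegion(
      cl_context_ref.command_queue_inst_, a_bytes, in_a);
    if (!result)
      break;
    result = clbuffInstance.getInBufferB()->WriteDataRegion(
      cl_context_ref.command_queue_inst_, b_bytes, in_b);
    if (!result)
      break;
    // Bind the shared buffers directly as kernel arguments (no &local_buf).
    result = kernel_ptr->SetKernelArguments(0, clbuffInstance.getInBufferA(),
                                            sizeof(cl_mem));
    if (!result)
      break;
    result = kernel_ptr->SetKernelArguments(1, clbuffInstance.getInBufferB(),
                                            sizeof(cl_mem));
    if (!result)
      break;
    result = kernel_ptr->SetKernelArguments(2, clbuffInstance.getOutBufferA(),
                                            sizeof(cl_mem));
    if (!result)
      break;
    // ... enqueue the kernel here, exactly as in the unchanged context ...
    // Read back only the bytes the kernel produced.
    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
      cl_context_ref.command_queue_inst_, out_bytes, out);
  } while (false);
  return result;
}

The fp16 paths are identical except that the byte counts must use
sizeof(__fp16) (or sizeof(cl_half) in the RMSNorm kernels) throughout; mixing
in sizeof(float) would read or write twice the intended region.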