diff --git a/nntrainer/tensor/cl_operations/attention_kernel_strings.h b/nntrainer/tensor/cl_operations/attention_kernel_strings.h index d58fd75035..d7b0b39413 100644 --- a/nntrainer/tensor/cl_operations/attention_kernel_strings.h +++ b/nntrainer/tensor/cl_operations/attention_kernel_strings.h @@ -15,7 +15,8 @@ #define __ATTENTION_KERNEL_STRINGS_H__ #include - +// unsigned int offsetFeqsSin, +// unsigned int offsetSin namespace nntrainer { static const std::string rotary_emb_cl_kernel_ = R"( @@ -34,10 +35,11 @@ __kernel void rotary_emb_cl(__global float *input, unsigned int dim, unsigned int half_, unsigned int max_timestep, - unsigned int from) { + unsigned int from, + unsigned int offsetFreqsSin, + unsigned int offsetSin) { __global float *cos_ptr = cos_; __global float *sin_ptr = sin_; - float value = 0.0f; float transformed_value = 0.0f; @@ -50,7 +52,7 @@ __kernel void rotary_emb_cl(__global float *input, unsigned idx = (from + h)*dim; for(unsigned int i = idx; i < idx + dim; i++){ cos_ptr[i - idx] = freqs_cos[i]; - sin_ptr[i - idx] = freqs_sin[i]; + sin_ptr[i - idx + offsetSin] = freqs_sin[i + offsetFreqsSin]; } } @@ -63,7 +65,7 @@ __kernel void rotary_emb_cl(__global float *input, } else { transformed_value = input[b * channel * height * width + c * height * width + h * width + span - half_]; } - value = value * cos_ptr[k] + transformed_value * sin_ptr[k]; + value = value * cos_ptr[k] + transformed_value * sin_ptr[k + offsetSin]; output[b * channel * height * width + c * height * width + h * width + span] = value; } } @@ -90,7 +92,9 @@ __kernel void rotary_emb_cl_fp16(__global half *input, unsigned int dim, unsigned int half_, unsigned int max_timestep, - unsigned int from) { + unsigned int from, + unsigned int offsetFreqsSin, + unsigned int offsetSin) { __global float *cos_ptr = cos_; __global float *sin_ptr = sin_; @@ -106,7 +110,7 @@ __kernel void rotary_emb_cl_fp16(__global half *input, unsigned idx = (from + h)*dim; for(int i = idx; i < idx + dim; i++ ){ cos_ptr[i - idx] = freqs_cos[i]; - sin_ptr[i - idx] = freqs_sin[i]; + sin_ptr[i - idx + offsetSin] = freqs_sin[i + offsetFreqsSin]; } } @@ -119,7 +123,7 @@ __kernel void rotary_emb_cl_fp16(__global half *input, } else { transformed_value = (float)input[b * channel * height * width + c * height * width + h * width + span - half_]; } - value = value * cos_ptr[k] + transformed_value * sin_ptr[k]; + value = value * cos_ptr[k] + transformed_value * sin_ptr[k + offsetSin]; output[b * channel * height * width + c * height * width + h * width + span] = (half)value; } } diff --git a/nntrainer/tensor/cl_operations/attention_kernels.cpp b/nntrainer/tensor/cl_operations/attention_kernels.cpp index 388cc0805f..e9e7db6c4c 100644 --- a/nntrainer/tensor/cl_operations/attention_kernels.cpp +++ b/nntrainer/tensor/cl_operations/attention_kernels.cpp @@ -46,24 +46,6 @@ void rotary_emb_cl(float *in, float *out, sizeof(float) * freqs_cos_dim * dim; // max_timestep * dim size_t dim6_size = sizeof(float) * freqs_sin_dim * dim; - opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true, - nullptr); - - opencl::Buffer cosBuf(cl_context_ref.context_inst_, dim3_size, true, - nullptr); - - opencl::Buffer sinBuf(cl_context_ref.context_inst_, dim4_size, true, - nullptr); - - opencl::Buffer freqs_cosBuf(cl_context_ref.context_inst_, dim5_size, true, - nullptr); - - opencl::Buffer freqs_sinBuf(cl_context_ref.context_inst_, dim6_size, true, - nullptr); - std::vector freqs_cos_flat; std::vector freqs_sin_flat; for (const auto &row : freqs_cos) { @@ -73,81 +55,86 @@ void rotary_emb_cl(float *in, float *out, freqs_sin_flat.insert(freqs_sin_flat.end(), row.begin(), row.end()); } - result = inputA.WriteData(cl_context_ref.command_queue_inst_, in); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, in); if (!result) { printf("Failed to write input data\n"); break; } - result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, out); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, out); if (!result) { printf("Failed to write output data\n"); break; } - result = freqs_cosBuf.WriteData(cl_context_ref.command_queue_inst_, - freqs_cos_flat.data()); + result = clbuffInstance.getInBufferB()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim5_size, freqs_cos_flat.data()); if (!result) { printf("Failed to write freqs cos data\n"); break; } - result = freqs_sinBuf.WriteData(cl_context_ref.command_queue_inst_, - freqs_sin_flat.data()); + result = clbuffInstance.getInBufferB()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim6_size, freqs_sin_flat.data(), 0, + dim5_size); if (!result) { printf("Failed to write freqs sin data\n"); break; } - result = cosBuf.WriteData(cl_context_ref.command_queue_inst_, cos_.data()); + result = clbuffInstance.getInBufferC()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim3_size, cos_.data()); if (!result) { printf("Failed to write cos data\n"); break; } - result = sinBuf.WriteData(cl_context_ref.command_queue_inst_, sin_.data()); + result = clbuffInstance.getInBufferC()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim4_size, sin_.data(), 0, dim3_size); if (!result) { printf("Failed to write sin data\n"); break; } - result = - kernel_rotaryEmb_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_rotaryEmb_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { printf("Failed to set inputA argument\n"); break; } - result = - kernel_rotaryEmb_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem)); + result = kernel_rotaryEmb_ptr->SetKernelArguments( + 1, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { printf("Failed to set inOutRes argument\n"); break; } - result = kernel_rotaryEmb_ptr->SetKernelArguments(2, &freqs_cosBuf, - sizeof(cl_mem)); + result = kernel_rotaryEmb_ptr->SetKernelArguments( + 2, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { printf("Failed to set freqs_cosBuf argument\n"); break; } - result = kernel_rotaryEmb_ptr->SetKernelArguments(3, &freqs_sinBuf, - sizeof(cl_mem)); + result = kernel_rotaryEmb_ptr->SetKernelArguments( + 3, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { printf("Failed to set freqs_sinBuf argument\n"); break; } - result = - kernel_rotaryEmb_ptr->SetKernelArguments(4, &cosBuf, sizeof(cl_mem)); + result = kernel_rotaryEmb_ptr->SetKernelArguments( + 4, clbuffInstance.getInBufferC(), sizeof(cl_mem)); if (!result) { printf("Failed to set cosBuf argument\n"); break; } - result = - kernel_rotaryEmb_ptr->SetKernelArguments(5, &sinBuf, sizeof(cl_mem)); + result = kernel_rotaryEmb_ptr->SetKernelArguments( + 5, clbuffInstance.getInBufferC(), sizeof(cl_mem)); if (!result) { printf("Failed to set sinBuf argument\n"); break; @@ -202,6 +189,22 @@ void rotary_emb_cl(float *in, float *out, break; } + unsigned int offsetFreqsSin = freqs_cos_dim * dim; + result = kernel_rotaryEmb_ptr->SetKernelArguments(14, &offsetFreqsSin, + sizeof(int)); + if (!result) { + printf("Failed to set offsetFreqsSin argument\n"); + break; + } + + unsigned int offsetSin = cos_dim; + result = + kernel_rotaryEmb_ptr->SetKernelArguments(15, &offsetSin, sizeof(int)); + if (!result) { + printf("Failed to set offsetSin argument\n"); + break; + } + const int work_groups_count[3] = {(int)batch, (int)channel, 1}; const int work_group_size[3] = {32, 32, 1}; // test-value result = cl_context_ref.command_queue_inst_.DispatchCommand( @@ -211,12 +214,12 @@ void rotary_emb_cl(float *in, float *out, break; } - result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, out); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, out); if (!result) { printf("Failed to read data\n"); break; } - } while (false); } } // namespace nntrainer diff --git a/nntrainer/tensor/cl_operations/attention_kernels.h b/nntrainer/tensor/cl_operations/attention_kernels.h index 37a3a4428a..22b141552c 100644 --- a/nntrainer/tensor/cl_operations/attention_kernels.h +++ b/nntrainer/tensor/cl_operations/attention_kernels.h @@ -14,15 +14,18 @@ #ifndef __ATTENTION_KERNELS_H__ #define __ATTENTION_KERNELS_H__ +#include #include #include #include + #include namespace nntrainer { // get global cl_context to use in kernels static ClContext cl_context_ref; +static ClBufferManager &clbuffInstance = ClBufferManager::getInstance(); /** * @brief Rotary Embedding process diff --git a/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp b/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp index c1284b0a9c..fe0e37edd8 100644 --- a/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp +++ b/nntrainer/tensor/cl_operations/attention_kernels_fp16.cpp @@ -46,24 +46,6 @@ void rotary_emb_cl(__fp16 *in, __fp16 *out, size_t dim5_size = sizeof(float) * freqs_cos_dim * dim; size_t dim6_size = sizeof(float) * freqs_sin_dim * dim; - opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true, - nullptr); - - opencl::Buffer cosBuf(cl_context_ref.context_inst_, dim3_size, true, - nullptr); - - opencl::Buffer sinBuf(cl_context_ref.context_inst_, dim4_size, true, - nullptr); - - opencl::Buffer freqs_cosBuf(cl_context_ref.context_inst_, dim5_size, true, - nullptr); - - opencl::Buffer freqs_sinBuf(cl_context_ref.context_inst_, dim6_size, true, - nullptr); - std::vector freqs_cos_flat; std::vector freqs_sin_flat; for (const auto &row : freqs_cos) { @@ -73,81 +55,86 @@ void rotary_emb_cl(__fp16 *in, __fp16 *out, freqs_sin_flat.insert(freqs_sin_flat.end(), row.begin(), row.end()); } - result = inputA.WriteData(cl_context_ref.command_queue_inst_, in); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, in); if (!result) { printf("Failed to write input data\n"); break; } - result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, out); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, out); if (!result) { printf("Failed to write output data\n"); break; } - result = freqs_cosBuf.WriteData(cl_context_ref.command_queue_inst_, - freqs_cos_flat.data()); + result = clbuffInstance.getInBufferB()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim5_size, freqs_cos_flat.data()); if (!result) { printf("Failed to write freqs cos data\n"); break; } - result = freqs_sinBuf.WriteData(cl_context_ref.command_queue_inst_, - freqs_sin_flat.data()); + result = clbuffInstance.getInBufferB()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim6_size, freqs_sin_flat.data(), 0, + dim5_size); if (!result) { printf("Failed to write freqs sin data\n"); break; } - result = cosBuf.WriteData(cl_context_ref.command_queue_inst_, cos_.data()); + result = clbuffInstance.getInBufferC()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim3_size, cos_.data()); if (!result) { printf("Failed to write cos data\n"); break; } - result = sinBuf.WriteData(cl_context_ref.command_queue_inst_, sin_.data()); + result = clbuffInstance.getInBufferC()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim4_size, sin_.data(), 0, dim3_size); if (!result) { printf("Failed to write sin data\n"); break; } - result = - kernel_rotaryEmb_fp16_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { printf("Failed to set inputA argument\n"); break; } - result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments(1, &inOutRes, - sizeof(cl_mem)); + result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments( + 1, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { printf("Failed to set inOutRes argument\n"); break; } - result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments(2, &freqs_cosBuf, - sizeof(cl_mem)); + result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments( + 2, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { printf("Failed to set freqs_cosBuf argument\n"); break; } - result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments(3, &freqs_sinBuf, - sizeof(cl_mem)); + result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments( + 3, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { printf("Failed to set freqs_sinBuf argument\n"); break; } - result = - kernel_rotaryEmb_fp16_ptr->SetKernelArguments(4, &cosBuf, sizeof(cl_mem)); + result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments( + 4, clbuffInstance.getInBufferC(), sizeof(cl_mem)); if (!result) { printf("Failed to set cosBuf argument\n"); break; } - result = - kernel_rotaryEmb_fp16_ptr->SetKernelArguments(5, &sinBuf, sizeof(cl_mem)); + result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments( + 5, clbuffInstance.getInBufferC(), sizeof(cl_mem)); if (!result) { printf("Failed to set sinBuf argument\n"); break; @@ -209,6 +196,22 @@ void rotary_emb_cl(__fp16 *in, __fp16 *out, break; } + unsigned int offsetFreqsSin = freqs_cos_dim * dim; + result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments(14, &offsetFreqsSin, + sizeof(int)); + if (!result) { + printf("Failed to set offsetFreqsSin argument\n"); + break; + } + + unsigned int offsetSin = cos_dim; + result = kernel_rotaryEmb_fp16_ptr->SetKernelArguments(15, &offsetSin, + sizeof(int)); + if (!result) { + printf("Failed to set offsetSin argument\n"); + break; + } + const int work_groups_count[3] = {(int)batch, (int)channel, 1}; const int work_group_size[3] = {32, 32, 1}; // test-value result = cl_context_ref.command_queue_inst_.DispatchCommand( @@ -218,7 +221,8 @@ void rotary_emb_cl(__fp16 *in, __fp16 *out, break; } - result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, out); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, out); if (!result) { printf("Failed to read data\n"); break; diff --git a/nntrainer/tensor/cl_operations/blas_kernels.cpp b/nntrainer/tensor/cl_operations/blas_kernels.cpp index 6c7751b8b0..59f8bcbca6 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels.cpp @@ -304,30 +304,27 @@ void addition_cl(const float *input, float *res, unsigned int size_input, size_t dim1_size = sizeof(float) * size_input; size_t dim2_size = sizeof(float) * size_res; - opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true, - nullptr); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, input); - result = inputA.WriteData(cl_context_ref.command_queue_inst_, input); if (!result) { break; } - result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, res); if (!result) { break; } - result = - kernel_addition_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_addition_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = - kernel_addition_ptr->SetKernelArguments(1, &inOutRes, sizeof(cl_mem)); + result = kernel_addition_ptr->SetKernelArguments( + 1, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -351,7 +348,8 @@ void addition_cl(const float *input, float *res, unsigned int size_input, break; } - result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, res); if (!result) { break; } diff --git a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp index bdff42c135..94bfc9fe30 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp @@ -315,30 +315,28 @@ void addition_cl(const _FP16 *input, _FP16 *res, unsigned int size_input, size_t dim1_size = sizeof(cl_half) * size_input; size_t dim2_size = sizeof(cl_half) * size_res; - opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true, - nullptr); + result = clbuffInstance.getInBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, input); - result = inputA.WriteData(cl_context_ref.command_queue_inst_, input); if (!result) { break; } - result = inOutRes.WriteData(cl_context_ref.command_queue_inst_, res); + result = clbuffInstance.getOutBufferA()->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, res); if (!result) { break; } - result = - kernel_addition_fp16_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_addition_fp16_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = kernel_addition_fp16_ptr->SetKernelArguments(1, &inOutRes, - sizeof(cl_mem)); + result = kernel_addition_fp16_ptr->SetKernelArguments( + 1, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -363,7 +361,8 @@ void addition_cl(const _FP16 *input, _FP16 *res, unsigned int size_input, break; } - result = inOutRes.ReadData(cl_context_ref.command_queue_inst_, res); + result = clbuffInstance.getOutBufferA()->ReadDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, res); if (!result) { break; } diff --git a/test/unittest/unittest_blas_kernels_cl.cpp b/test/unittest/unittest_blas_kernels_cl.cpp index ab1c8a03fa..1c89d94339 100644 --- a/test/unittest/unittest_blas_kernels_cl.cpp +++ b/test/unittest/unittest_blas_kernels_cl.cpp @@ -581,6 +581,59 @@ TEST(blas_kernels, addition_i) { EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon); EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); } +TEST(blas_kernels, addition_i_fp16) { + + int batch = 12; + int channel = 1; + int height = 26; + int width = 26; + + int batch_b = 1; + + const float alpha = 1e-1; + const int MOD = 10; + + // nntrainer::TensorDim::TensorType t_type_nchw_fp32 = { + // nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}; + nntrainer::TensorDim::TensorType t_type_nchw_fp16 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}; + + nntrainer::Tensor A_fp16(batch, channel, height, width, t_type_nchw_fp16); + nntrainer::Tensor B_fp16(batch_b, channel, height, width, t_type_nchw_fp16); + nntrainer::Tensor C_fp16(batch, channel, height, width, t_type_nchw_fp16); + nntrainer::Tensor D_fp16(batch_b, channel, height, width, t_type_nchw_fp16); + + GEN_TEST_INPUT(A_fp16, ((i * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_C(B_fp16, ((i * (batch_b * height * channel) + + j * (batch_b * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(C_fp16, ((i * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_C(D_fp16, ((i * (batch_b * height * channel) + + j * (batch_b * height) + k * (width) + l + 1) % + MOD) * + alpha); + + A_fp16.add_i(B_fp16); + add_i_cl(C_fp16, D_fp16); + + float mseErrorNeon_fp16 = mse<__fp16>( + A_fp16.getData<__fp16>(), C_fp16.getData<__fp16>(), A_fp16.size()); + + double cosSimNeon_fp16 = cosine_similarity<__fp16>( + A_fp16.getData<__fp16>(), C_fp16.getData<__fp16>(), A_fp16.size()); + + const float epsilon = 1e-3 * width; + + EXPECT_IN_RANGE(mseErrorNeon_fp16, 0, epsilon); + EXPECT_IN_RANGE((float)cosSimNeon_fp16, 0.99, 1); +} GTEST_API_ int main(int argc, char **argv) { int result = -1;