-
Notifications
You must be signed in to change notification settings - Fork 77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OpenCL/GPU] Optimized Blas and Attention kernels with the latest GPU Pipeline. #2859
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,8 @@ | |
#define __ATTENTION_KERNEL_STRINGS_H__ | ||
|
||
#include <string> | ||
|
||
// unsigned int offsetFreqsSin, | ||
// unsigned int offsetSin | ||
namespace nntrainer { | ||
static const std::string rotary_emb_cl_kernel_ = R"( | ||
|
||
|
@@ -34,10 +35,11 @@ __kernel void rotary_emb_cl(__global float *input, | |
unsigned int dim, | ||
unsigned int half_, | ||
unsigned int max_timestep, | ||
unsigned int from) { | ||
unsigned int from, | ||
unsigned int offsetFreqsSin, | ||
unsigned int offsetSin) { | ||
__global float *cos_ptr = cos_; | ||
__global float *sin_ptr = sin_; | ||
|
||
float value = 0.0f; | ||
float transformed_value = 0.0f; | ||
|
||
|
@@ -50,7 +52,7 @@ __kernel void rotary_emb_cl(__global float *input, | |
unsigned idx = (from + h)*dim; | ||
for(unsigned int i = idx; i < idx + dim; i++){ | ||
cos_ptr[i - idx] = freqs_cos[i]; | ||
sin_ptr[i - idx] = freqs_sin[i]; | ||
sin_ptr[i - idx + offsetSin] = freqs_sin[i + offsetFreqsSin]; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you explain this part?
Also, wouldn't this result in accessing invalid memory space for freqs_sin? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hello, as per the latest GPU pipeline changes, we are using a generalized set of buffers instead of creating new buffers every time a kernel is called. As of now there are only 5 generalized buffers: 3 for input buffers and 2 for output buffers. Please refer to this PR for more details: #2816 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you for the clarification! |
||
} | ||
} | ||
|
||
|
@@ -63,7 +65,7 @@ __kernel void rotary_emb_cl(__global float *input, | |
} else { | ||
transformed_value = input[b * channel * height * width + c * height * width + h * width + span - half_]; | ||
} | ||
value = value * cos_ptr[k] + transformed_value * sin_ptr[k]; | ||
value = value * cos_ptr[k] + transformed_value * sin_ptr[k + offsetSin]; | ||
output[b * channel * height * width + c * height * width + h * width + span] = value; | ||
} | ||
} | ||
|
@@ -90,7 +92,9 @@ __kernel void rotary_emb_cl_fp16(__global half *input, | |
unsigned int dim, | ||
unsigned int half_, | ||
unsigned int max_timestep, | ||
unsigned int from) { | ||
unsigned int from, | ||
unsigned int offsetFreqsSin, | ||
unsigned int offsetSin) { | ||
__global float *cos_ptr = cos_; | ||
__global float *sin_ptr = sin_; | ||
|
||
|
@@ -106,7 +110,7 @@ __kernel void rotary_emb_cl_fp16(__global half *input, | |
unsigned idx = (from + h)*dim; | ||
for(int i = idx; i < idx + dim; i++ ){ | ||
cos_ptr[i - idx] = freqs_cos[i]; | ||
sin_ptr[i - idx] = freqs_sin[i]; | ||
sin_ptr[i - idx + offsetSin] = freqs_sin[i + offsetFreqsSin]; | ||
} | ||
} | ||
|
||
|
@@ -119,7 +123,7 @@ __kernel void rotary_emb_cl_fp16(__global half *input, | |
} else { | ||
transformed_value = (float)input[b * channel * height * width + c * height * width + h * width + span - half_]; | ||
} | ||
value = value * cos_ptr[k] + transformed_value * sin_ptr[k]; | ||
value = value * cos_ptr[k] + transformed_value * sin_ptr[k + offsetSin]; | ||
output[b * channel * height * width + c * height * width + h * width + span] = (half)value; | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let's remove it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll update in the latest commit.