Skip to content

Commit

Permalink
[ GPU/OpenCL ] Split register kernel from forwarding function
Browse files Browse the repository at this point in the history
- This commit is a draft.
- This commit splits kernel registration from the forwarding function.
- This is WIP. This commit contains example updates for concat_cl and
fc_layer_cl.

Self evaluation:

Build test: [X]Passed [ ]Failed [ ]Skipped
Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Eunju Yang <[email protected]>
  • Loading branch information
EunjuYang committed Nov 4, 2024
1 parent b1a3c75 commit f43b253
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 59 deletions.
15 changes: 9 additions & 6 deletions nntrainer/cl_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ std::once_flag global_cl_context_init_flag;

static void add_default_object(ClContext &cc) {

FullyConnectedLayerCl::registerClKernels();
cc.registerFactory(nntrainer::createLayer<FullyConnectedLayerCl>,
FullyConnectedLayerCl::type,
ml::train::LayerType::LAYER_FC);
Expand All @@ -40,15 +41,17 @@ static void add_default_object(ClContext &cc) {
// AdditionLayerCL::type,
// ml::train::LayerType::LAYER_ADDITION);

cc.registerFactory(nntrainer::createLayer<SwiGLULayerCl>, SwiGLULayerCl::type,
ml::train::LayerType::LAYER_SWIGLU);
// cc.registerFactory(nntrainer::createLayer<SwiGLULayerCl>,
// SwiGLULayerCl::type,
// ml::train::LayerType::LAYER_SWIGLU);

cc.registerFactory(nntrainer::createLayer<ReshapeLayerCl>,
ReshapeLayerCl::type, ml::train::LayerType::LAYER_RESHAPE);
// cc.registerFactory(nntrainer::createLayer<ReshapeLayerCl>,
// ReshapeLayerCl::type, ml::train::LayerType::LAYER_RESHAPE);

cc.registerFactory(nntrainer::createLayer<RMSNormLayerCl>,
RMSNormLayerCl::type, ml::train::LayerType::LAYER_RMSNORM);
// cc.registerFactory(nntrainer::createLayer<RMSNormLayerCl>,
// RMSNormLayerCl::type, ml::train::LayerType::LAYER_RMSNORM);

ConcatLayerCl::registerClKernels();
cc.registerFactory(nntrainer::createLayer<ConcatLayerCl>, ConcatLayerCl::type,
ml::train::LayerType::LAYER_CONCAT);
}
Expand Down
100 changes: 57 additions & 43 deletions nntrainer/layers/cl_layers/concat_cl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,50 @@ static constexpr size_t SINGLE_INOUT_IDX = 0;
static constexpr size_t INPUT_IDX_1 = 0;
static constexpr size_t INPUT_IDX_2 = 1;

std::vector<ClContext::SharedPtrClKernel> ConcatLayerCl::layer_kernel_ptrs;

bool ConcatLayerCl::registerClKernels() {
ClContext::SharedPtrClKernel kernel_concat_ptr = nullptr;

kernel_concat_ptr =
cl_context_ref.registerClKernel(concat_cl_axis1_kernel_, "concat_cl_axis1");
NNTR_THROW_IF(!kernel_concat_ptr, std::runtime_error)
<< "OpenCL Error: Fail to register concat_cl_axis1 kernel";
layer_kernel_ptrs.emplace_back(kernel_concat_ptr);

kernel_concat_ptr =
cl_context_ref.registerClKernel(concat_cl_axis2_kernel_, "concat_cl_axis2");
NNTR_THROW_IF(!kernel_concat_ptr, std::runtime_error)
<< "OpenCL Error: Fail to register concat_cl_axis2 kernel";
layer_kernel_ptrs.emplace_back(kernel_concat_ptr);

kernel_concat_ptr =
cl_context_ref.registerClKernel(concat_cl_axis3_kernel_, "concat_cl_axis3");
NNTR_THROW_IF(!kernel_concat_ptr, std::runtime_error)
<< "OpenCL Error: Fail to register concat_cl_axis3 kernel";
layer_kernel_ptrs.emplace_back(kernel_concat_ptr);

kernel_concat_ptr = cl_context_ref.registerClKernel(
concat_cl_axis1_kernel_fp16_, "concat_cl_axis1_fp16");
NNTR_THROW_IF(!kernel_concat_ptr, std::runtime_error)
<< "OpenCL Error: Fail to register concat_cl_axis1_fp16 kernel";
layer_kernel_ptrs.emplace_back(kernel_concat_ptr);

kernel_concat_ptr = cl_context_ref.registerClKernel(
concat_cl_axis2_kernel_fp16_, "concat_cl_axis2_fp16");
NNTR_THROW_IF(!kernel_concat_ptr, std::runtime_error)
<< "OpenCL Error: Fail to register concat_cl_axis2_fp16 kernel";
layer_kernel_ptrs.emplace_back(kernel_concat_ptr);

kernel_concat_ptr = cl_context_ref.registerClKernel(
concat_cl_axis3_kernel_fp16_, "concat_cl_axis3_fp16");
NNTR_THROW_IF(!kernel_concat_ptr, std::runtime_error)
<< "OpenCL Error: Fail to register concat_cl_axis3_fp16 kernel";
layer_kernel_ptrs.emplace_back(kernel_concat_ptr);

return true;
}

void ConcatLayerCl::finalize(InitLayerContext &context) {
auto &concat_dimension_prop = std::get<props::ConcatDimension>(concat_props);
/** for backward compatibility, default concat dimension will be channel */
Expand Down Expand Up @@ -302,13 +346,6 @@ void ConcatLayerCl::incremental_forwarding(RunLayerContext &context,
ConcatProcess(in1, in2, out);
}

opencl::Kernel ConcatLayerCl::kernel_concat_axis3;
opencl::Kernel ConcatLayerCl::kernel_concat_axis3_fp16;
opencl::Kernel ConcatLayerCl::kernel_concat_axis2;
opencl::Kernel ConcatLayerCl::kernel_concat_axis2_fp16;
opencl::Kernel ConcatLayerCl::kernel_concat_axis1;
opencl::Kernel ConcatLayerCl::kernel_concat_axis1_fp16;

void ConcatLayerCl::ConcatProcess(Tensor const &in1, Tensor const &in2,
Tensor &result) {

Expand Down Expand Up @@ -375,12 +412,8 @@ void ConcatLayerCl::concat_cl_axis3(const float *matAdata,
bool result = false;

do {
ClContext::SharedPtrClKernel kernel_concat_ptr =
cl_context_ref.registerClKernel(concat_cl_axis3_kernel_,
"concat_cl_axis3");
if (!kernel_concat_ptr) {
break;
}

const auto &kernel_concat_ptr = layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS3];

int dim = int(input1_batch_size * input1_channels * input1_height *
(input1_width + input2_width));
Expand Down Expand Up @@ -486,12 +519,9 @@ void ConcatLayerCl::concat_cl_axis3_fp16(
bool result = false;

do {
ClContext::SharedPtrClKernel kernel_concat_ptr =
cl_context_ref.registerClKernel(concat_cl_axis3_kernel_fp16_,
"concat_cl_axis3_fp16");
if (!kernel_concat_ptr) {
break;
}

const auto &kernel_concat_ptr =
layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS3_FP16];

int dim = int(input1_batch_size * input1_channels * input1_height *
(input1_width + input2_width));
Expand Down Expand Up @@ -599,12 +629,8 @@ void ConcatLayerCl::concat_cl_axis2(const float *matAdata,
bool result = false;

do {
ClContext::SharedPtrClKernel kernel_concat_ptr =
cl_context_ref.registerClKernel(concat_cl_axis2_kernel_,
"concat_cl_axis2");
if (!kernel_concat_ptr) {
break;
}

const auto &kernel_concat_ptr = layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS2];

int dim = int(input1_batch_size * input1_channels * input1_width *
(input1_height + input2_height));
Expand Down Expand Up @@ -710,12 +736,8 @@ void ConcatLayerCl::concat_cl_axis2_fp16(
bool result = false;

do {
ClContext::SharedPtrClKernel kernel_concat_ptr =
cl_context_ref.registerClKernel(concat_cl_axis2_kernel_fp16_,
"concat_cl_axis2_fp16");
if (!kernel_concat_ptr) {
break;
}
const auto &kernel_concat_ptr =
layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS2_FP16];

int dim = int(input1_batch_size * input1_channels * input1_width *
(input1_height + input2_height));
Expand Down Expand Up @@ -823,12 +845,7 @@ void ConcatLayerCl::concat_cl_axis1(const float *matAdata,
bool result = false;

do {
ClContext::SharedPtrClKernel kernel_concat_ptr =
cl_context_ref.registerClKernel(concat_cl_axis1_kernel_,
"concat_cl_axis1");
if (!kernel_concat_ptr) {
break;
}
const auto &kernel_concat_ptr = layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS1];

int dim = int(input1_batch_size * input1_width * input1_height *
(input1_channels + input2_channels));
Expand Down Expand Up @@ -934,12 +951,9 @@ void ConcatLayerCl::concat_cl_axis1_fp16(
bool result = false;

do {
ClContext::SharedPtrClKernel kernel_concat_ptr =
cl_context_ref.registerClKernel(concat_cl_axis1_kernel_fp16_,
"concat_cl_axis1_fp16");
if (!kernel_concat_ptr) {
break;
}

const auto &kernel_concat_ptr =
layer_kernel_ptrs[Kernels::CONCAT_CL_AXIS1_FP16];

int dim = int(input1_batch_size * input1_width * input1_height *
(input1_channels + input2_channels));
Expand Down
26 changes: 19 additions & 7 deletions nntrainer/layers/cl_layers/concat_cl.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,12 @@ class ConcatLayerCl : public Layer {
*/
void setProperty(const std::vector<std::string> &values) override;

inline static const std::string type = "concat";
/**
 * @brief Register the OpenCL kernels used by ConcatLayerCl and keep their
 * handles in layer_kernel_ptrs.
 * @return true when all kernels have been registered
 * @note The definition reports a failed registration through
 * NNTR_THROW_IF(..., std::runtime_error) instead of returning false.
 */
static bool registerClKernels();

static opencl::Kernel kernel_concat_axis3;
static opencl::Kernel kernel_concat_axis3_fp16;
static opencl::Kernel kernel_concat_axis2;
static opencl::Kernel kernel_concat_axis2_fp16;
static opencl::Kernel kernel_concat_axis1;
static opencl::Kernel kernel_concat_axis1_fp16;
inline static const std::string type = "concat";

/**
* @brief Process data and dimensions for concat
Expand Down Expand Up @@ -233,6 +231,20 @@ class ConcatLayerCl : public Layer {
#endif
private:
std::tuple<props::ConcatDimension> concat_props;

/**
 * Number of OpenCL kernels this layer registers.
 * NOTE(review): this is maintained by hand and has to match the number of
 * enumerators in Kernels.
 */
const static int num_layer_kernels = 6;

static std::vector<ClContext::SharedPtrClKernel>
layer_kernel_ptrs; /**< kernel handles of this layer, filled by
                        registerClKernels() and indexed by the Kernels enum */

/**
 * @brief Positions of the concat kernels inside layer_kernel_ptrs.
 *
 * The values are used directly as vector indices, so they must follow the
 * order in which registerClKernels() stores the kernel handles.
 */
enum Kernels {
  CONCAT_CL_AXIS1 = 0,      /**< handle of the "concat_cl_axis1" kernel */
  CONCAT_CL_AXIS2 = 1,      /**< handle of the "concat_cl_axis2" kernel */
  CONCAT_CL_AXIS3 = 2,      /**< handle of the "concat_cl_axis3" kernel */
  CONCAT_CL_AXIS1_FP16 = 3, /**< handle of the "concat_cl_axis1_fp16" kernel */
  CONCAT_CL_AXIS2_FP16 = 4, /**< handle of the "concat_cl_axis2_fp16" kernel */
  CONCAT_CL_AXIS3_FP16 = 5, /**< handle of the "concat_cl_axis3_fp16" kernel */
};
};

} // namespace nntrainer
Expand Down
2 changes: 1 addition & 1 deletion nntrainer/layers/cl_layers/fc_layer_cl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ static constexpr size_t SINGLE_INOUT_IDX = 0;
enum FCParams { weight, bias };

FullyConnectedLayerCl::FullyConnectedLayerCl() :
LayerImpl(), fc_props(props::Unit()) {
LayerImplCl(), fc_props(props::Unit()) {
weight_idx.fill(std::numeric_limits<unsigned>::max());
}

Expand Down
11 changes: 9 additions & 2 deletions nntrainer/layers/cl_layers/fc_layer_cl.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
#ifdef __cplusplus

#include <common_properties.h>
#include <layer_impl.h>
#include <layer_impl_cl.h>

namespace nntrainer {

/**
* @class FullyConnectedLayerCl
* @brief fully connected layer
*/
class FullyConnectedLayerCl : public LayerImpl {
class FullyConnectedLayerCl : public LayerImplCl {
public:
/**
* @brief Constructor of Fully Connected Layer
Expand Down Expand Up @@ -101,12 +101,19 @@ class FullyConnectedLayerCl : public LayerImpl {
*/
void setProperty(const std::vector<std::string> &values) override;

/**
 * @brief Register the OpenCL kernels owned by this layer.
 *
 * This layer currently has no layer-specific kernels to register
 * (num_layer_kernels is 0), so this is a no-op that reports success.
 *
 * @return always true
 */
static bool registerClKernels() { return true; }

inline static const std::string type = "fully_connected";

private:
std::tuple<props::Unit>
fc_props; /**< fc layer properties : unit - number of output neurons */
std::array<unsigned int, 2> weight_idx; /**< indices of the weights */

const static int num_layer_kernels = 0; /** < number of layer kernels; zero
                                             because registerClKernels()
                                             registers nothing for this layer */

/**
 * NOTE(review): no out-of-class definition of this static member is visible
 * in this change; one is required before the member is odr-used — confirm.
 */
static std::vector<ClContext::SharedPtrClKernel>
layer_kernel_ptrs; /**< kernel list relevant with this layer */
};
} // namespace nntrainer

Expand Down

0 comments on commit f43b253

Please sign in to comment.