Skip to content

Commit

Permalink
Merge pull request #1740 from CEED/jeremy/gen-rollback
Browse files Browse the repository at this point in the history
GPU - gen fallback to shared if error
  • Loading branch information
jeremylt authored Feb 7, 2025
2 parents 6b50d2b + c9192ac commit ea41f46
Show file tree
Hide file tree
Showing 12 changed files with 304 additions and 152 deletions.
81 changes: 75 additions & 6 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
//------------------------------------------------------------------------------
// Build single operator kernel
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
bool is_tensor = true, is_at_points = false, use_3d_slices = false;
Ceed ceed;
CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
Expand All @@ -927,18 +927,77 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
CeedOperator_Cuda_gen *data;
std::ostringstream code;

CeedCallBackend(CeedOperatorGetData(op, &data));
{
bool is_setup_done;

CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
if (is_setup_done) return CEED_ERROR_SUCCESS;
if (is_setup_done) {
*is_good_build = !data->use_fallback;
return CEED_ERROR_SUCCESS;
}
}

// Check field compatibility
// The gen backend only supports operators where every basis comes from the
// /gpu/cuda/shared backend and the bases are either all tensor or all
// non-tensor; otherwise we signal the caller to use the fallback operator.
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
{
  bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;

  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool        is_tensor = true;
      const char *resource;
      char       *resource_root;
      Ceed        basis_ceed;

      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
      is_all_tensor    = is_all_tensor && is_tensor;
      // Fix: original referenced undefined 'is_all_not_tensor'; the declared
      // accumulator (also used in the output-field loop below) is 'is_all_nontensor'
      is_all_nontensor = is_all_nontensor && !is_tensor;
      CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
      CeedCallBackend(CeedGetResource(basis_ceed, &resource));
      CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
      has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
      CeedCallBackend(CeedFree(&resource_root));
      CeedCallBackend(CeedDestroy(&basis_ceed));
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }

  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool        is_tensor = true;
      const char *resource;
      char       *resource_root;
      Ceed        basis_ceed;

      CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
      is_all_tensor    = is_all_tensor && is_tensor;
      is_all_nontensor = is_all_nontensor && !is_tensor;

      CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
      CeedCallBackend(CeedGetResource(basis_ceed, &resource));
      CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
      has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
      CeedCallBackend(CeedFree(&resource_root));
      CeedCallBackend(CeedDestroy(&basis_ceed));
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }
  // -- Fallback to ref if not all bases are shared
  if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
    *is_good_build = false;
    return CEED_ERROR_SUCCESS;
  }
}
CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
CeedCallBackend(CeedOperatorGetData(op, &data));
CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

// Get operator data
Expand Down Expand Up @@ -1207,8 +1266,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
code << "// -----------------------------------------------------------------------------\n\n";

// Compile
CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
{
bool is_compile_good = false;

CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
if (is_compile_good) {
*is_good_build = true;
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
} else {
*is_good_build = false;
data->use_fallback = true;
}
}
CeedCallBackend(CeedOperatorSetSetupDone(op));
CeedCallBackend(CeedDestroy(&ceed));
CeedCallBackend(CeedQFunctionDestroy(&qf));
Expand Down
2 changes: 1 addition & 1 deletion backends/cuda-gen/ceed-cuda-gen-operator-build.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
// This file is part of CEED: http://github.com/ceed
#pragma once

CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op);
CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build);
78 changes: 21 additions & 57 deletions backends/cuda-gen/ceed-cuda-gen-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
// Apply and add to output
//------------------------------------------------------------------------------
static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
bool is_at_points, is_tensor;
bool is_at_points, is_tensor, is_run_good = true;
Ceed ceed;
Ceed_Cuda *cuda_data;
CeedInt num_elem, num_input_fields, num_output_fields;
Expand All @@ -111,62 +111,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
CeedOperatorField *op_input_fields, *op_output_fields;
CeedOperator_Cuda_gen *data;

// Check for shared bases
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
// Creation of the operator
{
bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;

for (CeedInt i = 0; i < num_input_fields; i++) {
CeedBasis basis;

CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
if (basis != CEED_BASIS_NONE) {
bool is_tensor = true;
const char *resource;
char *resource_root;
Ceed basis_ceed;

CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
is_all_tensor &= is_tensor;
is_all_nontensor &= !is_tensor;
CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
CeedCallBackend(CeedGetResource(basis_ceed, &resource));
CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
CeedCallBackend(CeedFree(&resource_root));
CeedCallBackend(CeedDestroy(&basis_ceed));
}
CeedCallBackend(CeedBasisDestroy(&basis));
}
bool is_good_build = false;

for (CeedInt i = 0; i < num_output_fields; i++) {
CeedBasis basis;

CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
if (basis != CEED_BASIS_NONE) {
bool is_tensor = true;
const char *resource;
char *resource_root;
Ceed basis_ceed;

CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
is_all_tensor &= is_tensor;
is_all_nontensor &= !is_tensor;

CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
CeedCallBackend(CeedGetResource(basis_ceed, &resource));
CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
CeedCallBackend(CeedFree(&resource_root));
CeedCallBackend(CeedDestroy(&basis_ceed));
}
CeedCallBackend(CeedBasisDestroy(&basis));
}
// -- Fallback to ref if not all bases are shared
if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_good_build));
if (!is_good_build) {
CeedOperator op_fallback;

CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due unsupported bases");
CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to code generation issue");
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
return CEED_ERROR_SUCCESS;
Expand All @@ -179,11 +132,9 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

// Creation of the operator
CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op));

// Input vectors
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
Expand Down Expand Up @@ -293,7 +244,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
}
CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);

CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs));
CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));

// Restore input arrays
for (CeedInt i = 0; i < num_input_fields; i++) {
Expand Down Expand Up @@ -349,8 +300,21 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,

// Restore context data
CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

// Cleanup
CeedCallBackend(CeedDestroy(&ceed));
CeedCallBackend(CeedQFunctionDestroy(&qf));

// Fallback if run was bad (out of resources)
if (!is_run_good) {
CeedOperator op_fallback;

data->use_fallback = true;
CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to kernel execution issue");
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
return CEED_ERROR_SUCCESS;
}
return CEED_ERROR_SUCCESS;
}

Expand Down
1 change: 1 addition & 0 deletions backends/cuda-gen/ceed-cuda-gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <cuda.h>

typedef struct {
bool use_fallback;
CeedInt dim;
CeedInt Q_1d;
CeedInt max_P_1d;
Expand Down
68 changes: 53 additions & 15 deletions backends/cuda/ceed-cuda-compile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
//------------------------------------------------------------------------------
// Compile CUDA kernel
//------------------------------------------------------------------------------
int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
const CeedInt num_defines, va_list args) {
size_t ptx_size;
char *ptx;
const int num_opts = 4;
Expand All @@ -50,8 +51,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed

// Get kernel specific options, such as kernel constants
if (num_defines > 0) {
va_list args;
va_start(args, num_defines);
char *name;
int val;

Expand All @@ -60,7 +59,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
val = va_arg(args, int);
code << "#define " << name << " " << val << "\n";
}
va_end(args);
}

// Standard libCEED definitions for CUDA backends
Expand Down Expand Up @@ -133,7 +131,8 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
}
CeedCallBackend(CeedFree(&opts));
if (result != NVRTC_SUCCESS) {
*is_compile_good = result == NVRTC_SUCCESS;
if (!*is_compile_good && throw_error) {
char *log;
size_t log_size;

Expand All @@ -159,6 +158,25 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Compile CUDA kernel source into a CUmodule, raising a backend error on
// compilation failure (thin throwing wrapper over CeedCompileCore_Cuda).
// Variadic pairs are (const char *name, int value) kernel #defines.
//------------------------------------------------------------------------------
int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
  int     ierr;
  bool    is_compile_good = true;
  va_list args;

  va_start(args, num_defines);
  // Capture the status instead of wrapping the call in CeedCallBackend so that
  // va_end() is always reached — returning between va_start()/va_end() is
  // undefined behavior per the C standard.
  ierr = CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args);
  va_end(args);
  CeedCallBackend(ierr);
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Attempt to compile CUDA kernel source into a CUmodule without erroring on
// compilation failure; success is reported via is_compile_good so the caller
// (e.g. cuda-gen) can fall back to another backend.
// Variadic pairs are (const char *name, int value) kernel #defines.
//------------------------------------------------------------------------------
int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) {
  int     ierr;
  va_list args;

  va_start(args, num_defines);
  // Capture the status instead of wrapping the call in CeedCallBackend so that
  // va_end() is always reached — returning between va_start()/va_end() is
  // undefined behavior per the C standard.
  ierr = CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args);
  va_end(args);
  CeedCallBackend(ierr);
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Get CUDA kernel
//------------------------------------------------------------------------------
Expand Down Expand Up @@ -200,24 +218,44 @@ int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, con
//------------------------------------------------------------------------------
// Run CUDA kernel for spatial dimension with shared memory
//------------------------------------------------------------------------------
int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
const int block_size_z, const int shared_mem_size, void **args) {
// Launch a CUDA kernel with a 3D block and dynamic shared memory.
// On CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES the launch is reported via
// *is_good_run; when throw_error is true a backend error with occupancy
// diagnostics is raised instead of returning success. All other CUDA launch
// failures always raise through CeedChk_Cu.
static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
                                           const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run,
                                           void **args) {
#if CUDA_VERSION >= 9000
  // Opt in to large dynamic shared memory allocations (>48 KiB) where supported
  cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
#endif
  // Define the output unconditionally so callers that do not pre-initialize
  // the flag never read an indeterminate value on the success path
  *is_good_run = true;

  CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);

  if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
    *is_good_run = false;
    if (throw_error) {
      int max_threads_per_block, shared_size_bytes, num_regs;

      // Gather kernel resource attributes to make the error actionable
      cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
      cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
      cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
                       max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
    }
  } else CeedChk_Cu(ceed, result);
  return CEED_ERROR_SUCCESS;
}

// Launch a CUDA kernel with a 3D block and dynamic shared memory, raising a
// backend error on any launch failure (throwing variant of the shared core).
int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
                                const int block_size_z, const int shared_mem_size, void **args) {
  // The core requires an output flag even in throwing mode; its value is
  // unused here since failures surface as errors
  bool launch_ok = true;

  CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size,
                                                  /* throw_error = */ true, &launch_ok, args));
  return CEED_ERROR_SUCCESS;
}

// Launch a CUDA kernel with a 3D block and dynamic shared memory, reporting
// CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES via is_good_run instead of erroring so
// the caller may fall back (non-throwing variant of the shared core).
int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
                                   const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
  CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size,
                                                  /* throw_error = */ false, is_good_run, args));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
3 changes: 3 additions & 0 deletions backends/cuda/ceed-cuda-compile.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; }

CEED_INTERN int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...);
CEED_INTERN int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...);

CEED_INTERN int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel);

Expand All @@ -24,3 +25,5 @@ CEED_INTERN int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, int grid_siz

CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
int shared_mem_size, void **args);
CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
int shared_mem_size, bool *is_good_run, void **args);
Loading

0 comments on commit ea41f46

Please sign in to comment.