From ff2fb5cb1132dd69a3d57d07412193c0f57b50d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Wed, 1 Sep 2021 10:54:22 +0200 Subject: [PATCH 1/3] add clang-format file Add alpaka's clang format file and add to `IncludeCategories:` a section about cupla includes. --- .clang-format | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..37aa603c --- /dev/null +++ b/.clang-format @@ -0,0 +1,116 @@ +--- +# General options +Language: Cpp +Standard: c++17 +DisableFormat: false + +AccessModifierOffset: -4 +AlignAfterOpenBracket: AlwaysBreak +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: false +AlignEscapedNewlines: Right +AlignOperands: false +AlignTrailingComments: false +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BreakBeforeBinaryOperators: All +BreakBeforeBraces: Allman +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeComma +BreakInheritanceList: BeforeComma +BreakStringLiterals: true +ColumnLimit: 119 +CommentPragmas: '^ COMMENT pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentCaseLabels: false +IndentGotoLabels: true +IndentPPDirectives: AfterHash +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 2 +NamespaceIndentation: All +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 1000 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: Never +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +TabWidth: 4 +UseCRLF: false +UseTab: Never + +# Project specific options +IncludeCategories: + # Local headers (in "") above all else + - Regex: '"([A-Za-z0-9.\/-_])+"' + Priority: 1 + # after local headers + - Regex: '"cupla/([A-Za-z0-9.\/-_])+"' + Priority: 2 + # after local headers + - Regex: '' + Priority: 3 + # C++ standard library headers are the last group to be included + - Regex: '<([A-Za-z0-9\/-_])+>' + Priority: 4 + +# Future options - not supported in clang-format 11 +# AlignConsecutiveBitFields: false +# AllowShortEnumsOnASingleLine: false +# BitFieldColonSpacing: Both +# IndentCaseBlocks: true +# IndentExternBlock: AfterExternBlock +# OperandAlignmentStyle: Align +... From d3a9ba7733937dd6d4c8e9f75eb79aea066a9678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Wed, 1 Sep 2021 11:00:19 +0200 Subject: [PATCH 2/3] apply clang-format Format code. ``` find src example include test -iname "*.def" \ -o -iname "*.h" -o -iname "*.cpp" -o -iname "*.hpp" \ | xargs clang-format-11 -i ``` --- example/CUDASamples/asyncAPI/src/asyncAPI.cpp | 54 +- .../asyncAPI_tuned/src/asyncAPI.cpp | 56 +- .../blackScholes/src/BlackScholes.cpp | 180 ++- .../blackScholes/src/BlackScholes_gold.cpp | 66 +- example/CUDASamples/common/exception.h | 115 +- example/CUDASamples/common/helper_cuda.h | 1300 +++++++++-------- example/CUDASamples/common/helper_functions.h | 24 +- example/CUDASamples/common/helper_image.h | 504 ++++--- example/CUDASamples/common/helper_string.h | 474 +++--- example/CUDASamples/common/helper_timer.h | 343 ++--- .../cuplaVectorAdd/src/vectorAdd.cpp | 176 +-- .../CUDASamples/matrixMul/src/matrixMul.cpp | 276 ++-- .../CUDASamples/vectorAdd/src/vectorAdd.cpp | 173 ++- include/cuda_to_cupla.hpp | 4 +- include/cupla.hpp | 2 +- include/cupla/api/common.hpp | 47 +- include/cupla/api/device.hpp | 28 +- include/cupla/api/event.hpp | 42 +- include/cupla/api/memory.hpp | 190 +-- include/cupla/api/stream.hpp | 30 +- include/cupla/c/datatypes/cuplaArray.hpp | 15 +- include/cupla/c/datatypes/cuplaExtent.hpp | 284 ++-- .../cupla/c/datatypes/cuplaMemcpy3DParms.hpp | 33 +- include/cupla/c/datatypes/cuplaPitchedPtr.hpp | 48 +- include/cupla/c/datatypes/cuplaPos.hpp | 281 ++-- include/cupla/config/AnyOacc.hpp | 20 +- include/cupla/config/AnyOmp5.hpp | 20 +- include/cupla/config/CpuOmp2Blocks.hpp | 20 +- include/cupla/config/CpuOmp2Threads.hpp | 20 +- include/cupla/config/CpuSerial.hpp | 20 +- include/cupla/config/CpuTbbBlocks.hpp | 20 +- include/cupla/config/CpuThreads.hpp | 20 +- include/cupla/config/GpuCudaRt.hpp | 20 +- include/cupla/config/GpuHipRt.hpp | 20 +- include/cupla/cudaToCupla/driverTypes.hpp | 32 +- include/cupla/cudaToCupla/runtime.hpp | 6 +- include/cupla/datatypes/Array.hpp | 57 +- include/cupla/datatypes/dim3.hpp | 67 +- include/cupla/datatypes/uint.hpp | 293 ++-- include/cupla/defines.hpp | 107 +- include/cupla/device/Atomic.hpp | 229 ++- include/cupla/device/Hierarchy.hpp | 23 +- include/cupla/device/Index.hpp | 141 +- include/cupla/device/SharedMemory.hpp | 8 +- include/cupla/device/Synchronization.hpp | 55 +- include/cupla/device/math/Abs.hpp | 23 +- include/cupla/device/math/Common.hpp | 209 +-- include/cupla/device/math/Comparison.hpp | 27 +- include/cupla/device/math/Erf.hpp | 23 +- include/cupla/device/math/Exp.hpp | 23 +- include/cupla/device/math/Log.hpp | 23 +- include/cupla/device/math/Mod.hpp | 27 +- include/cupla/device/math/Pow.hpp | 23 +- include/cupla/device/math/Root.hpp | 37 +- include/cupla/device/math/Round.hpp | 61 +- include/cupla/device/math/Trigo.hpp | 47 +- include/cupla/device_functions.hpp | 4 +- include/cupla/kernel.hpp | 398 +++-- include/cupla/manager/Device.hpp | 222 ++- include/cupla/manager/Driver.hpp | 48 +- include/cupla/manager/Event.hpp | 355 ++--- include/cupla/manager/Memory.hpp | 180 +-- include/cupla/manager/Stream.hpp | 217 ++- include/cupla/namespace.hpp | 100 +- include/cupla/traits/IsThreadSeqAcc.hpp | 85 +- include/cupla/types.hpp | 288 ++-- include/cupla_driver_types.hpp | 60 +- include/cupla_runtime.hpp | 37 +- src/common.cpp | 76 +- src/device.cpp | 163 +-- src/event.cpp | 172 +-- src/manager/Driver.cpp | 59 +- src/memory.cpp | 1171 +++++---------- src/stream.cpp | 120 +- test/system/config/kernel.cpp | 36 +- test/system/config/main.cpp | 36 +- 76 files changed, 4413 insertions(+), 5880 deletions(-) diff --git a/example/CUDASamples/asyncAPI/src/asyncAPI.cpp b/example/CUDASamples/asyncAPI/src/asyncAPI.cpp index 595d4b1a..f61aa532 100644 --- a/example/CUDASamples/asyncAPI/src/asyncAPI.cpp +++ b/example/CUDASamples/asyncAPI/src/asyncAPI.cpp @@ -27,26 +27,22 @@ // includes, project #include -#include // helper utility functions +#include // helper utility functions struct increment_kernel { - -template< - typename T_Acc -> -ALPAKA_FN_ACC -void operator()(T_Acc const & acc, int *g_data, int inc_value) const -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - g_data[idx] = g_data[idx] + inc_value; -} + template + ALPAKA_FN_ACC void operator()(T_Acc const& acc, int* g_data, int inc_value) const + { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + g_data[idx] = g_data[idx] + inc_value; + } }; -int correct_output(int *data, const int n, const int x) +int correct_output(int* data, const int n, const int x) { - for (int i = 0; i < n; i++) - if (data[i] != x) + for(int i = 0; i < n; i++) + if(data[i] != x) { printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x); return 0; @@ -55,44 +51,44 @@ int correct_output(int *data, const int n, const int x) return 1; } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { -// int devID; -// cudaDeviceProp deviceProps; + // int devID; + // cudaDeviceProp deviceProps; printf("[%s] - Starting...\n", argv[0]); // This will pick the best possible CUDA capable device -// devID = findCudaDevice(argc, (const char **)argv); + // devID = findCudaDevice(argc, (const char **)argv); // get device name -// checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); -// printf("CUDA device [%s]\n", deviceProps.name); + // checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); + // printf("CUDA device [%s]\n", deviceProps.name); int n = 16 * 1024 * 1024; int nbytes = n * sizeof(int); int value = 26; // allocate host memory - int *a = 0; - checkCudaErrors(cudaMallocHost((void **)&a, nbytes)); + int* a = 0; + checkCudaErrors(cudaMallocHost((void**) &a, nbytes)); memset(a, 0, nbytes); // allocate device memory - int *d_a=0; - checkCudaErrors(cudaMalloc((void **)&d_a, nbytes)); + int* d_a = 0; + checkCudaErrors(cudaMalloc((void**) &d_a, nbytes)); checkCudaErrors(cudaMemset(d_a, 255, nbytes)); // set kernel launch configuration dim3 threads = dim3(512, 1); - dim3 blocks = dim3(n / threads.x, 1); + dim3 blocks = dim3(n / threads.x, 1); // create cuda event handles cudaEvent_t start, stop; checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); - StopWatchInterface *timer = NULL; + StopWatchInterface* timer = NULL; sdkCreateTimer(&timer); sdkResetTimer(&timer); @@ -109,9 +105,9 @@ int main(int argc, char *argv[]) sdkStopTimer(&timer); // have CPU do some work while waiting for stage 1 to finish - unsigned long int counter=0; + unsigned long int counter = 0; - while (cudaEventQuery(stop) == cudaErrorNotReady) + while(cudaEventQuery(stop) == cudaErrorNotReady) { counter++; } @@ -124,7 +120,7 @@ int main(int argc, char *argv[]) printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter); // check the output for correctness - bool bFinalResults = (bool)correct_output(a, n, value); + bool bFinalResults = (bool) correct_output(a, n, value); // release resources checkCudaErrors(cudaEventDestroy(start)); diff --git a/example/CUDASamples/asyncAPI_tuned/src/asyncAPI.cpp b/example/CUDASamples/asyncAPI_tuned/src/asyncAPI.cpp index aa93adfd..19cdbb30 100644 --- a/example/CUDASamples/asyncAPI_tuned/src/asyncAPI.cpp +++ b/example/CUDASamples/asyncAPI_tuned/src/asyncAPI.cpp @@ -27,28 +27,24 @@ // includes, project #include -#include // helper utility functions +#include // helper utility functions struct increment_kernel { + template + ALPAKA_FN_ACC void operator()(T_Acc const& acc, int* g_data, int inc_value) const + { + int idx = blockIdx.x * (blockDim.x * elemDim.x) + threadIdx.x; -template< - typename T_Acc -> -ALPAKA_FN_ACC -void operator()(T_Acc const & acc, int *g_data, int inc_value) const -{ - int idx = blockIdx.x * (blockDim.x * elemDim.x) + threadIdx.x; - - for(int i = 0; i < elemDim.x; ++i) - g_data[idx + i] = g_data[idx + i] + inc_value; -} + for(int i = 0; i < elemDim.x; ++i) + g_data[idx + i] = g_data[idx + i] + inc_value; + } }; -int correct_output(int *data, const int n, const int x) +int correct_output(int* data, const int n, const int x) { - for (int i = 0; i < n; i++) - if (data[i] != x) + for(int i = 0; i < n; i++) + if(data[i] != x) { printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x); return 0; @@ -57,44 +53,44 @@ int correct_output(int *data, const int n, const int x) return 1; } -int main(int argc, char *argv[]) +int main(int argc, char* argv[]) { -// int devID; -// cudaDeviceProp deviceProps; + // int devID; + // cudaDeviceProp deviceProps; printf("[%s] - Starting...\n", argv[0]); // This will pick the best possible CUDA capable device -// devID = findCudaDevice(argc, (const char **)argv); + // devID = findCudaDevice(argc, (const char **)argv); // get device name -// checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); -// printf("CUDA device [%s]\n", deviceProps.name); + // checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); + // printf("CUDA device [%s]\n", deviceProps.name); int n = 16 * 1024 * 1024; int nbytes = n * sizeof(int); int value = 26; // allocate host memory - int *a = 0; - checkCudaErrors(cudaMallocHost((void **)&a, nbytes)); + int* a = 0; + checkCudaErrors(cudaMallocHost((void**) &a, nbytes)); memset(a, 0, nbytes); // allocate device memory - int *d_a=0; - checkCudaErrors(cudaMalloc((void **)&d_a, nbytes)); + int* d_a = 0; + checkCudaErrors(cudaMalloc((void**) &d_a, nbytes)); checkCudaErrors(cudaMemset(d_a, 255, nbytes)); // set kernel launch configuration dim3 threads = dim3(512, 1); - dim3 blocks = dim3(n / threads.x, 1); + dim3 blocks = dim3(n / threads.x, 1); // create cuda event handles cudaEvent_t start, stop; checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); - StopWatchInterface *timer = NULL; + StopWatchInterface* timer = NULL; sdkCreateTimer(&timer); sdkResetTimer(&timer); @@ -111,9 +107,9 @@ int main(int argc, char *argv[]) sdkStopTimer(&timer); // have CPU do some work while waiting for stage 1 to finish - unsigned long int counter=0; + unsigned long int counter = 0; - while (cudaEventQuery(stop) == cudaErrorNotReady) + while(cudaEventQuery(stop) == cudaErrorNotReady) { counter++; } @@ -126,7 +122,7 @@ int main(int argc, char *argv[]) printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter); // check the output for correctness - bool bFinalResults = (bool)correct_output(a, n, value); + bool bFinalResults = (bool) correct_output(a, n, value); // release resources checkCudaErrors(cudaEventDestroy(start)); diff --git a/example/CUDASamples/blackScholes/src/BlackScholes.cpp b/example/CUDASamples/blackScholes/src/BlackScholes.cpp index d09a3a59..be128391 100644 --- a/example/CUDASamples/blackScholes/src/BlackScholes.cpp +++ b/example/CUDASamples/blackScholes/src/BlackScholes.cpp @@ -16,23 +16,21 @@ */ #include - -#include // helper functions for string parsing -#include // helper functions CUDA error checking and initialization +#include // helper functions CUDA error checking and initialization +#include // helper functions for string parsing //////////////////////////////////////////////////////////////////////////////// // Process an array of optN options on CPU //////////////////////////////////////////////////////////////////////////////// extern "C" void BlackScholesCPU( - float *h_CallResult, - float *h_PutResult, - float *h_StockPrice, - float *h_OptionStrike, - float *h_OptionYears, + float* h_CallResult, + float* h_PutResult, + float* h_StockPrice, + float* h_OptionStrike, + float* h_OptionYears, float Riskfree, float Volatility, - int optN -); + int optN); //////////////////////////////////////////////////////////////////////////////// // Process an array of OptN options on GPU @@ -45,7 +43,7 @@ extern "C" void BlackScholesCPU( //////////////////////////////////////////////////////////////////////////////// float RandFloat(float low, float high) { - float t = (float)rand() / (float)RAND_MAX; + float t = (float) rand() / (float) RAND_MAX; return (1.0f - t) * low + t * high; } @@ -53,91 +51,85 @@ float RandFloat(float low, float high) // Data configuration //////////////////////////////////////////////////////////////////////////////// const int OPT_N = 4000000; -const int NUM_ITERATIONS = 500; +const int NUM_ITERATIONS = 500; size_t OPT_SZ = OPT_N * sizeof(float); -const float RISKFREE = 0.02f; -const float VOLATILITY = 0.30f; +const float RISKFREE = 0.02f; +const float VOLATILITY = 0.30f; -#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define DIV_UP(a, b) (((a) + (b) -1) / (b)) //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) +int main(int argc, char** argv) { // Start logs printf("[%s] - Starting...\n", argv[0]); //'h_' prefix - CPU (host) memory space float - //Results calculated by host for reference - *h_CallResultCPU, - *h_PutResultCPU, - //host copy of device results - *h_CallResultGPU, - *h_PutResultGPU, - //host instance of input data - *h_StockPrice, - *h_OptionStrike, - *h_OptionYears; + // Results calculated by host for reference + *h_CallResultCPU, + *h_PutResultCPU, + // host copy of device results + *h_CallResultGPU, *h_PutResultGPU, + // host instance of input data + *h_StockPrice, *h_OptionStrike, *h_OptionYears; //'d_' prefix - device memory space float - //Results calculated by device - *d_CallResult, - *d_PutResult, - //device instance of input data - *d_StockPrice, - *d_OptionStrike, - *d_OptionYears; - - double - delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; - - StopWatchInterface *hTimer = NULL; + // Results calculated by device + *d_CallResult, + *d_PutResult, + // device instance of input data + *d_StockPrice, *d_OptionStrike, *d_OptionYears; + + double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; + + StopWatchInterface* hTimer = NULL; int i; - //findCudaDevice(argc, (const char **)argv); + // findCudaDevice(argc, (const char **)argv); sdkCreateTimer(&hTimer); printf("Initializing data...\n"); printf("...allocating CPU memory for options.\n"); - h_CallResultCPU = (float *)malloc(OPT_SZ); - h_PutResultCPU = (float *)malloc(OPT_SZ); - h_CallResultGPU = (float *)malloc(OPT_SZ); - h_PutResultGPU = (float *)malloc(OPT_SZ); - h_StockPrice = (float *)malloc(OPT_SZ); - h_OptionStrike = (float *)malloc(OPT_SZ); - h_OptionYears = (float *)malloc(OPT_SZ); + h_CallResultCPU = (float*) malloc(OPT_SZ); + h_PutResultCPU = (float*) malloc(OPT_SZ); + h_CallResultGPU = (float*) malloc(OPT_SZ); + h_PutResultGPU = (float*) malloc(OPT_SZ); + h_StockPrice = (float*) malloc(OPT_SZ); + h_OptionStrike = (float*) malloc(OPT_SZ); + h_OptionYears = (float*) malloc(OPT_SZ); printf("...allocating GPU memory for options.\n"); - checkCudaErrors(cudaMalloc((void **)&d_CallResult, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_PutResult, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_StockPrice, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ)); - checkCudaErrors(cudaMalloc((void **)&d_OptionYears, OPT_SZ)); + checkCudaErrors(cudaMalloc((void**) &d_CallResult, OPT_SZ)); + checkCudaErrors(cudaMalloc((void**) &d_PutResult, OPT_SZ)); + checkCudaErrors(cudaMalloc((void**) &d_StockPrice, OPT_SZ)); + checkCudaErrors(cudaMalloc((void**) &d_OptionStrike, OPT_SZ)); + checkCudaErrors(cudaMalloc((void**) &d_OptionYears, OPT_SZ)); printf("...generating input data in CPU mem.\n"); srand(5347); - //Generate options set - for (i = 0; i < OPT_N; i++) + // Generate options set + for(i = 0; i < OPT_N; i++) { h_CallResultCPU[i] = 0.0f; - h_PutResultCPU[i] = -1.0f; - h_StockPrice[i] = RandFloat(5.0f, 30.0f); - h_OptionStrike[i] = RandFloat(1.0f, 100.0f); - h_OptionYears[i] = RandFloat(0.25f, 10.0f); + h_PutResultCPU[i] = -1.0f; + h_StockPrice[i] = RandFloat(5.0f, 30.0f); + h_OptionStrike[i] = RandFloat(1.0f, 100.0f); + h_OptionYears[i] = RandFloat(0.25f, 10.0f); } printf("...copying input data to device mem.\n"); - //Copy options data to device memory for further processing - checkCudaErrors(cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice)); + // Copy options data to device memory for further processing + checkCudaErrors(cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice)); printf("Data init done.\n\n"); @@ -146,43 +138,49 @@ int main(int argc, char **argv) sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); - for (i = 0; i < NUM_ITERATIONS; i++) + for(i = 0; i < NUM_ITERATIONS; i++) { - CUPLA_KERNEL_OPTI(BlackScholesGPU)(DIV_UP((OPT_N/2), 128), 128/*480, 128*/,0,0)( - (float2 *)d_CallResult, - (float2 *)d_PutResult, - (float2 *)d_StockPrice, - (float2 *)d_OptionStrike, - (float2 *)d_OptionYears, + CUPLA_KERNEL_OPTI(BlackScholesGPU) + (DIV_UP((OPT_N / 2), 128), 128 /*480, 128*/, 0, 0)( + (float2*) d_CallResult, + (float2*) d_PutResult, + (float2*) d_StockPrice, + (float2*) d_OptionStrike, + (float2*) d_OptionYears, RISKFREE, VOLATILITY, - OPT_N - ); - //getLastCudaError("BlackScholesGPU() execution failed\n"); + OPT_N); + // getLastCudaError("BlackScholesGPU() execution failed\n"); } checkCudaErrors(cudaDeviceSynchronize()); sdkStopTimer(&hTimer); gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; - //Both call and put is calculated + // Both call and put is calculated printf("Options count : %i \n", 2 * OPT_N); printf("BlackScholes device time : %f msec\n", gpuTime); - printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); - printf("Gigaoptions per second : %f \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); - - printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n", - (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128); + printf("Effective memory bandwidth: %f GB/s\n", ((double) (5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); + printf("Gigaoptions per second : %f \n\n", ((double) (2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); + + printf( + "BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = " + "%u\n", + (((double) (2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), + gpuTime * 1e-3, + (2 * OPT_N), + 1, + 128); printf("\nReading back device results...\n"); - //Read back device results to compare them to host results + // Read back device results to compare them to host results checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost)); printf("Checking the results...\n"); printf("...running host calculations.\n\n"); - //Calculate options values on host + // Calculate options values on host BlackScholesCPU( h_CallResultCPU, h_PutResultCPU, @@ -191,28 +189,27 @@ int main(int argc, char **argv) h_OptionYears, RISKFREE, VOLATILITY, - OPT_N - ); + OPT_N); printf("Comparing the results...\n"); - //Calculate max absolute difference and L1 distance - //between CPU and GPU results + // Calculate max absolute difference and L1 distance + // between CPU and GPU results sum_delta = 0; - sum_ref = 0; + sum_ref = 0; max_delta = 0; - for (i = 0; i < OPT_N; i++) + for(i = 0; i < OPT_N; i++) { - ref = h_CallResultCPU[i]; + ref = h_CallResultCPU[i]; delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); - if (delta > max_delta) + if(delta > max_delta) { max_delta = delta; } sum_delta += delta; - sum_ref += fabs(ref); + sum_ref += fabs(ref); } L1norm = sum_delta / sum_ref; @@ -247,13 +244,14 @@ int main(int argc, char **argv) // flushed before the application exits cudaDeviceReset(); - if (L1norm > 1e-6) + if(L1norm > 1e-6) { printf("Test failed!\n"); exit(EXIT_FAILURE); } - printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is " + "enabled.\n\n"); printf("Test passed\n"); exit(EXIT_SUCCESS); } diff --git a/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp b/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp index a6be31a8..737185f3 100644 --- a/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp +++ b/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp @@ -10,9 +10,8 @@ */ - -#include #include +#include /////////////////////////////////////////////////////////////////////////////// @@ -20,21 +19,18 @@ /////////////////////////////////////////////////////////////////////////////// static double CND(double d) { - const double A1 = 0.31938153; - const double A2 = -0.356563782; - const double A3 = 1.781477937; - const double A4 = -1.821255978; - const double A5 = 1.330274429; + const double A1 = 0.31938153; + const double A2 = -0.356563782; + const double A3 = 1.781477937; + const double A4 = -1.821255978; + const double A5 = 1.330274429; const double RSQRT2PI = 0.39894228040143267793994605993438; - double - K = 1.0 / (1.0 + 0.2316419 * cupla::abs(d)); + double K = 1.0 / (1.0 + 0.2316419 * cupla::abs(d)); - double - cnd = RSQRT2PI * cupla::exp(- 0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + double cnd = RSQRT2PI * cupla::exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); - if (d > 0) + if(d > 0) cnd = 1.0 - cnd; return cnd; @@ -45,27 +41,27 @@ static double CND(double d) // Black-Scholes formula for both call and put /////////////////////////////////////////////////////////////////////////////// static void BlackScholesBodyCPU( - float &callResult, - float &putResult, - float Sf, //Stock price - float Xf, //Option strike - float Tf, //Option years - float Rf, //Riskless rate - float Vf //Volatility rate + float& callResult, + float& putResult, + float Sf, // Stock price + float Xf, // Option strike + float Tf, // Option years + float Rf, // Riskless rate + float Vf // Volatility rate ) { double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf; double sqrtT = cupla::sqrt(T); - double d1 = (cupla::log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); - double d2 = d1 - V * sqrtT; + double d1 = (cupla::log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); + double d2 = d1 - V * sqrtT; double CNDD1 = CND(d1); double CNDD2 = CND(d2); - //Calculate Call and Put simultaneously - double expRT = exp(- R * T); - callResult = (float)(S * CNDD1 - X * expRT * CNDD2); - putResult = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1)); + // Calculate Call and Put simultaneously + double expRT = exp(-R * T); + callResult = (float) (S * CNDD1 - X * expRT * CNDD2); + putResult = (float) (X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1)); } @@ -73,17 +69,16 @@ static void BlackScholesBodyCPU( // Process an array of optN options //////////////////////////////////////////////////////////////////////////////// extern "C" void BlackScholesCPU( - float *h_CallResult, - float *h_PutResult, - float *h_StockPrice, - float *h_OptionStrike, - float *h_OptionYears, + float* h_CallResult, + float* h_PutResult, + float* h_StockPrice, + float* h_OptionStrike, + float* h_OptionYears, float Riskfree, float Volatility, - int optN -) + int optN) { - for (int opt = 0; opt < optN; opt++) + for(int opt = 0; opt < optN; opt++) BlackScholesBodyCPU( h_CallResult[opt], h_PutResult[opt], @@ -91,6 +86,5 @@ extern "C" void BlackScholesCPU( h_OptionStrike[opt], h_OptionYears[opt], Riskfree, - Volatility - ); + Volatility); } diff --git a/example/CUDASamples/common/exception.h b/example/CUDASamples/common/exception.h index adda4bce..a61fa0af 100644 --- a/example/CUDASamples/common/exception.h +++ b/example/CUDASamples/common/exception.h @@ -1,13 +1,13 @@ /* -* Copyright 1993-2013 NVIDIA Corporation. All rights reserved. -* -* Please refer to the NVIDIA end user license agreement (EULA) associated -* with this source code for terms and conditions that govern your use of -* this software. Any use, reproduction, disclosure, or distribution of -* this software and related documentation outside the terms of the EULA -* is strictly prohibited. -* -*/ + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ /* CUda UTility Library */ #ifndef _EXCEPTION_H_ @@ -15,8 +15,9 @@ // includes, system #include -#include #include +#include + #include //! Exception wrapper. @@ -24,38 +25,31 @@ template class Exception : public Std_Exception { - public: - - //! @brief Static construction interface - //! @return Alwayss throws ( Located_Exception) - //! @param file file in which the Exception occurs - //! @param line line in which the Exception occurs - //! @param detailed details on the code fragment causing the Exception - static void throw_it(const char *file, - const int line, - const char *detailed = "-"); - - //! Static construction interface - //! @return Alwayss throws ( Located_Exception) - //! @param file file in which the Exception occurs - //! @param line line in which the Exception occurs - //! @param detailed details on the code fragment causing the Exception - static void throw_it(const char *file, - const int line, - const std::string &detailed); - - //! Destructor - virtual ~Exception() throw(); - - private: - - //! Constructor, default (private) - Exception(); - - //! Constructor, standard - //! @param str string returned by what() - Exception(const std::string &str); - +public: + //! @brief Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char* file, const int line, const char* detailed = "-"); + + //! Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char* file, const int line, const std::string& detailed); + + //! Destructor + virtual ~Exception() throw(); + +private: + //! Constructor, default (private) + Exception(); + + //! Constructor, standard + //! @param str string returned by what() + Exception(const std::string& str); }; //////////////////////////////////////////////////////////////////////////////// @@ -63,8 +57,7 @@ class Exception : public Std_Exception //! @param ex exception to handle //////////////////////////////////////////////////////////////////////////////// template -inline void -handleException(const Exception_Typ &ex) +inline void handleException(const Exception_Typ& ex) { std::cerr << ex.what() << std::endl; @@ -74,16 +67,13 @@ handleException(const Exception_Typ &ex) //! Convenience macros //! Exception caused by dynamic program behavior, e.g. file does not exist -#define RUNTIME_EXCEPTION( msg) \ - Exception::throw_it( __FILE__, __LINE__, msg) +#define RUNTIME_EXCEPTION(msg) Exception::throw_it(__FILE__, __LINE__, msg) //! Logic exception in program, e.g. an assert failed -#define LOGIC_EXCEPTION( msg) \ - Exception::throw_it( __FILE__, __LINE__, msg) +#define LOGIC_EXCEPTION(msg) Exception::throw_it(__FILE__, __LINE__, msg) //! Out of range exception -#define RANGE_EXCEPTION( msg) \ - Exception::throw_it( __FILE__, __LINE__, msg) +#define RANGE_EXCEPTION(msg) Exception::throw_it(__FILE__, __LINE__, msg) //////////////////////////////////////////////////////////////////////////////// //! Implementation @@ -96,9 +86,7 @@ handleException(const Exception_Typ &ex) //! @param Exception causing code fragment (file and line) and detailed infos. //////////////////////////////////////////////////////////////////////////////// /*static*/ template -void -Exception:: -throw_it(const char *file, const int line, const char *detailed) +void Exception::throw_it(const char* file, const int line, const char* detailed) { std::stringstream s; @@ -115,9 +103,7 @@ throw_it(const char *file, const int line, const char *detailed) //! @param Exception causing code fragment (file and line) and detailed infos. //////////////////////////////////////////////////////////////////////////////// /*static*/ template -void -Exception:: -throw_it(const char *file, const int line, const std::string &msg) +void Exception::throw_it(const char* file, const int line, const std::string& msg) { throw_it(file, line, msg.c_str()); } @@ -126,26 +112,27 @@ throw_it(const char *file, const int line, const std::string &msg) //! Constructor, default (private). //////////////////////////////////////////////////////////////////////////////// template -Exception::Exception() : - Std_Exception("Unknown Exception.\n") -{ } +Exception::Exception() : Std_Exception("Unknown Exception.\n") +{ +} //////////////////////////////////////////////////////////////////////////////// //! Constructor, standard (private). //! String returned by what(). //////////////////////////////////////////////////////////////////////////////// template -Exception::Exception(const std::string &s) : - Std_Exception(s) -{ } +Exception::Exception(const std::string& s) : Std_Exception(s) +{ +} //////////////////////////////////////////////////////////////////////////////// //! Destructor //////////////////////////////////////////////////////////////////////////////// template -Exception::~Exception() throw() { } +Exception::~Exception() throw() +{ +} // functions, exported #endif // #ifndef _EXCEPTION_H_ - diff --git a/example/CUDASamples/common/helper_cuda.h b/example/CUDASamples/common/helper_cuda.h index 59fc77a4..ad4a3d17 100644 --- a/example/CUDASamples/common/helper_cuda.h +++ b/example/CUDASamples/common/helper_cuda.h @@ -17,14 +17,13 @@ #pragma once -#include +#include #include +#include #include -#include - #ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 +# define EXIT_WAIVED 2 #endif // Note, it is required that your SDK sample to include the proper header files, please @@ -33,258 +32,258 @@ // CUDA Runtime error messages #ifdef __DRIVER_TYPES_H__ -static const char *_cudaGetErrorEnum(cudaError_t error) +static const char* _cudaGetErrorEnum(cudaError_t error) { - switch (error) + switch(error) { - case cudaSuccess: - return "cudaSuccess"; -/* - case cudaErrorMissingConfiguration: - return "cudaErrorMissingConfiguration"; -*/ - case cudaErrorMemoryAllocation: - return "cudaErrorMemoryAllocation"; + case cudaSuccess: + return "cudaSuccess"; + /* + case cudaErrorMissingConfiguration: + return "cudaErrorMissingConfiguration"; + */ + case cudaErrorMemoryAllocation: + return "cudaErrorMemoryAllocation"; - case cudaErrorInitializationError: - return "cudaErrorInitializationError"; -/* - case cudaErrorLaunchFailure: - return "cudaErrorLaunchFailure"; + case cudaErrorInitializationError: + return "cudaErrorInitializationError"; + /* + case cudaErrorLaunchFailure: + return "cudaErrorLaunchFailure"; - case cudaErrorPriorLaunchFailure: - return "cudaErrorPriorLaunchFailure"; + case cudaErrorPriorLaunchFailure: + return "cudaErrorPriorLaunchFailure"; - case cudaErrorLaunchTimeout: - return "cudaErrorLaunchTimeout"; + case cudaErrorLaunchTimeout: + return "cudaErrorLaunchTimeout"; - case cudaErrorLaunchOutOfResources: - return "cudaErrorLaunchOutOfResources"; + case cudaErrorLaunchOutOfResources: + return "cudaErrorLaunchOutOfResources"; - case cudaErrorInvalidDeviceFunction: - return "cudaErrorInvalidDeviceFunction"; + case cudaErrorInvalidDeviceFunction: + return "cudaErrorInvalidDeviceFunction"; - case cudaErrorInvalidConfiguration: - return "cudaErrorInvalidConfiguration"; + case cudaErrorInvalidConfiguration: + return "cudaErrorInvalidConfiguration"; - case cudaErrorInvalidDevice: - return "cudaErrorInvalidDevice"; + case cudaErrorInvalidDevice: + return "cudaErrorInvalidDevice"; - case cudaErrorInvalidValue: - return "cudaErrorInvalidValue"; + case cudaErrorInvalidValue: + return "cudaErrorInvalidValue"; - case cudaErrorInvalidPitchValue: - return "cudaErrorInvalidPitchValue"; + case cudaErrorInvalidPitchValue: + return "cudaErrorInvalidPitchValue"; - case cudaErrorInvalidSymbol: - return "cudaErrorInvalidSymbol"; + case cudaErrorInvalidSymbol: + return "cudaErrorInvalidSymbol"; - case cudaErrorMapBufferObjectFailed: - return "cudaErrorMapBufferObjectFailed"; + case cudaErrorMapBufferObjectFailed: + return "cudaErrorMapBufferObjectFailed"; - case cudaErrorUnmapBufferObjectFailed: - return "cudaErrorUnmapBufferObjectFailed"; + case cudaErrorUnmapBufferObjectFailed: + return "cudaErrorUnmapBufferObjectFailed"; - case cudaErrorInvalidHostPointer: - return "cudaErrorInvalidHostPointer"; + case cudaErrorInvalidHostPointer: + return "cudaErrorInvalidHostPointer"; - case cudaErrorInvalidDevicePointer: - return "cudaErrorInvalidDevicePointer"; + case cudaErrorInvalidDevicePointer: + return "cudaErrorInvalidDevicePointer"; - case cudaErrorInvalidTexture: - return "cudaErrorInvalidTexture"; + case cudaErrorInvalidTexture: + return "cudaErrorInvalidTexture"; - case cudaErrorInvalidTextureBinding: - return "cudaErrorInvalidTextureBinding"; + case cudaErrorInvalidTextureBinding: + return "cudaErrorInvalidTextureBinding"; - case cudaErrorInvalidChannelDescriptor: - return "cudaErrorInvalidChannelDescriptor"; + case cudaErrorInvalidChannelDescriptor: + return "cudaErrorInvalidChannelDescriptor"; - case cudaErrorInvalidMemcpyDirection: - return "cudaErrorInvalidMemcpyDirection"; + case cudaErrorInvalidMemcpyDirection: + return "cudaErrorInvalidMemcpyDirection"; - case cudaErrorAddressOfConstant: - return "cudaErrorAddressOfConstant"; + case cudaErrorAddressOfConstant: + return "cudaErrorAddressOfConstant"; - case cudaErrorTextureFetchFailed: - return "cudaErrorTextureFetchFailed"; + case cudaErrorTextureFetchFailed: + return "cudaErrorTextureFetchFailed"; - case cudaErrorTextureNotBound: - return "cudaErrorTextureNotBound"; + case cudaErrorTextureNotBound: + return "cudaErrorTextureNotBound"; - case cudaErrorSynchronizationError: - return "cudaErrorSynchronizationError"; + case cudaErrorSynchronizationError: + return "cudaErrorSynchronizationError"; - case cudaErrorInvalidFilterSetting: - return "cudaErrorInvalidFilterSetting"; + case cudaErrorInvalidFilterSetting: + return "cudaErrorInvalidFilterSetting"; - case cudaErrorInvalidNormSetting: - return "cudaErrorInvalidNormSetting"; + case cudaErrorInvalidNormSetting: + return "cudaErrorInvalidNormSetting"; - case cudaErrorMixedDeviceExecution: - return "cudaErrorMixedDeviceExecution"; + case cudaErrorMixedDeviceExecution: + return "cudaErrorMixedDeviceExecution"; - case cudaErrorCudartUnloading: - return "cudaErrorCudartUnloading"; + case cudaErrorCudartUnloading: + return "cudaErrorCudartUnloading"; - case cudaErrorUnknown: - return "cudaErrorUnknown"; + case cudaErrorUnknown: + return "cudaErrorUnknown"; - case cudaErrorNotYetImplemented: - return "cudaErrorNotYetImplemented"; + case cudaErrorNotYetImplemented: + return "cudaErrorNotYetImplemented"; - case cudaErrorMemoryValueTooLarge: - return "cudaErrorMemoryValueTooLarge"; + case cudaErrorMemoryValueTooLarge: + return "cudaErrorMemoryValueTooLarge"; - case cudaErrorInvalidResourceHandle: - return "cudaErrorInvalidResourceHandle"; -*/ - case cudaErrorNotReady: - return "cudaErrorNotReady"; -/* - case cudaErrorInsufficientDriver: - return "cudaErrorInsufficientDriver"; + case cudaErrorInvalidResourceHandle: + return "cudaErrorInvalidResourceHandle"; + */ + case cudaErrorNotReady: + return "cudaErrorNotReady"; + /* + case cudaErrorInsufficientDriver: + return "cudaErrorInsufficientDriver"; - case cudaErrorSetOnActiveProcess: - return "cudaErrorSetOnActiveProcess"; + case cudaErrorSetOnActiveProcess: + return "cudaErrorSetOnActiveProcess"; - case cudaErrorInvalidSurface: - return "cudaErrorInvalidSurface"; + case cudaErrorInvalidSurface: + return "cudaErrorInvalidSurface"; - case cudaErrorNoDevice: - return "cudaErrorNoDevice"; + case cudaErrorNoDevice: + return "cudaErrorNoDevice"; - case cudaErrorECCUncorrectable: - return "cudaErrorECCUncorrectable"; + case cudaErrorECCUncorrectable: + return "cudaErrorECCUncorrectable"; - case cudaErrorSharedObjectSymbolNotFound: - return "cudaErrorSharedObjectSymbolNotFound"; + case cudaErrorSharedObjectSymbolNotFound: + return "cudaErrorSharedObjectSymbolNotFound"; - case cudaErrorSharedObjectInitFailed: - return "cudaErrorSharedObjectInitFailed"; + case cudaErrorSharedObjectInitFailed: + return "cudaErrorSharedObjectInitFailed"; - case cudaErrorUnsupportedLimit: - return "cudaErrorUnsupportedLimit"; + case cudaErrorUnsupportedLimit: + return "cudaErrorUnsupportedLimit"; - case cudaErrorDuplicateVariableName: - return "cudaErrorDuplicateVariableName"; + case cudaErrorDuplicateVariableName: + return "cudaErrorDuplicateVariableName"; - case cudaErrorDuplicateTextureName: - return "cudaErrorDuplicateTextureName"; + case cudaErrorDuplicateTextureName: + return "cudaErrorDuplicateTextureName"; - case cudaErrorDuplicateSurfaceName: - return "cudaErrorDuplicateSurfaceName"; + case cudaErrorDuplicateSurfaceName: + return "cudaErrorDuplicateSurfaceName"; - case cudaErrorDevicesUnavailable: - return "cudaErrorDevicesUnavailable"; + case cudaErrorDevicesUnavailable: + return "cudaErrorDevicesUnavailable"; - case cudaErrorInvalidKernelImage: - return "cudaErrorInvalidKernelImage"; + case cudaErrorInvalidKernelImage: + return "cudaErrorInvalidKernelImage"; - case cudaErrorNoKernelImageForDevice: - return "cudaErrorNoKernelImageForDevice"; + case cudaErrorNoKernelImageForDevice: + return "cudaErrorNoKernelImageForDevice"; - case cudaErrorIncompatibleDriverContext: - return "cudaErrorIncompatibleDriverContext"; + case cudaErrorIncompatibleDriverContext: + return "cudaErrorIncompatibleDriverContext"; - case cudaErrorPeerAccessAlreadyEnabled: - return "cudaErrorPeerAccessAlreadyEnabled"; + case cudaErrorPeerAccessAlreadyEnabled: + return "cudaErrorPeerAccessAlreadyEnabled"; - case cudaErrorPeerAccessNotEnabled: - return "cudaErrorPeerAccessNotEnabled"; + case cudaErrorPeerAccessNotEnabled: + return "cudaErrorPeerAccessNotEnabled"; - case cudaErrorDeviceAlreadyInUse: - return "cudaErrorDeviceAlreadyInUse"; + case cudaErrorDeviceAlreadyInUse: + return "cudaErrorDeviceAlreadyInUse"; - case cudaErrorProfilerDisabled: - return "cudaErrorProfilerDisabled"; + case cudaErrorProfilerDisabled: + return "cudaErrorProfilerDisabled"; - case cudaErrorProfilerNotInitialized: - return "cudaErrorProfilerNotInitialized"; + case cudaErrorProfilerNotInitialized: + return "cudaErrorProfilerNotInitialized"; - case cudaErrorProfilerAlreadyStarted: - return "cudaErrorProfilerAlreadyStarted"; + case cudaErrorProfilerAlreadyStarted: + return "cudaErrorProfilerAlreadyStarted"; - case cudaErrorProfilerAlreadyStopped: - return "cudaErrorProfilerAlreadyStopped"; -*/ + case cudaErrorProfilerAlreadyStopped: + return "cudaErrorProfilerAlreadyStopped"; + */ /* Since CUDA 4.0*/ -/* - case cudaErrorAssert: - return "cudaErrorAssert"; + /* + case cudaErrorAssert: + return "cudaErrorAssert"; - case cudaErrorTooManyPeers: - return "cudaErrorTooManyPeers"; + case cudaErrorTooManyPeers: + return "cudaErrorTooManyPeers"; - case cudaErrorHostMemoryAlreadyRegistered: - return "cudaErrorHostMemoryAlreadyRegistered"; + case cudaErrorHostMemoryAlreadyRegistered: + return "cudaErrorHostMemoryAlreadyRegistered"; - case cudaErrorHostMemoryNotRegistered: - return "cudaErrorHostMemoryNotRegistered"; -*/ + case cudaErrorHostMemoryNotRegistered: + return "cudaErrorHostMemoryNotRegistered"; + */ /* Since CUDA 5.0 */ -/* - case cudaErrorOperatingSystem: - return "cudaErrorOperatingSystem"; + /* + case cudaErrorOperatingSystem: + return "cudaErrorOperatingSystem"; - case cudaErrorPeerAccessUnsupported: - return "cudaErrorPeerAccessUnsupported"; + case cudaErrorPeerAccessUnsupported: + return "cudaErrorPeerAccessUnsupported"; - case cudaErrorLaunchMaxDepthExceeded: - return "cudaErrorLaunchMaxDepthExceeded"; + case cudaErrorLaunchMaxDepthExceeded: + return "cudaErrorLaunchMaxDepthExceeded"; - case cudaErrorLaunchFileScopedTex: - return "cudaErrorLaunchFileScopedTex"; + case cudaErrorLaunchFileScopedTex: + return "cudaErrorLaunchFileScopedTex"; - case cudaErrorLaunchFileScopedSurf: - return "cudaErrorLaunchFileScopedSurf"; + case cudaErrorLaunchFileScopedSurf: + return "cudaErrorLaunchFileScopedSurf"; - case cudaErrorSyncDepthExceeded: - return "cudaErrorSyncDepthExceeded"; + case cudaErrorSyncDepthExceeded: + return "cudaErrorSyncDepthExceeded"; - case cudaErrorLaunchPendingCountExceeded: - return "cudaErrorLaunchPendingCountExceeded"; + case cudaErrorLaunchPendingCountExceeded: + return "cudaErrorLaunchPendingCountExceeded"; - case cudaErrorNotPermitted: - return "cudaErrorNotPermitted"; + case cudaErrorNotPermitted: + return "cudaErrorNotPermitted"; - case cudaErrorNotSupported: - return "cudaErrorNotSupported"; -*/ + case cudaErrorNotSupported: + return "cudaErrorNotSupported"; + */ /* Since CUDA 6.0 */ -/* - case cudaErrorHardwareStackError: - return "cudaErrorHardwareStackError"; + /* + case cudaErrorHardwareStackError: + return "cudaErrorHardwareStackError"; - case cudaErrorIllegalInstruction: - return "cudaErrorIllegalInstruction"; + case cudaErrorIllegalInstruction: + return "cudaErrorIllegalInstruction"; - case cudaErrorMisalignedAddress: - return "cudaErrorMisalignedAddress"; + case cudaErrorMisalignedAddress: + return "cudaErrorMisalignedAddress"; - case cudaErrorInvalidAddressSpace: - return "cudaErrorInvalidAddressSpace"; + case cudaErrorInvalidAddressSpace: + return "cudaErrorInvalidAddressSpace"; - case cudaErrorInvalidPc: - return "cudaErrorInvalidPc"; + case cudaErrorInvalidPc: + return "cudaErrorInvalidPc"; - case cudaErrorIllegalAddress: - return "cudaErrorIllegalAddress"; -*/ + case cudaErrorIllegalAddress: + return "cudaErrorIllegalAddress"; + */ /* Since CUDA 6.5*/ -/* - case cudaErrorInvalidPtx: - return "cudaErrorInvalidPtx"; + /* + case cudaErrorInvalidPtx: + return "cudaErrorInvalidPtx"; - case cudaErrorInvalidGraphicsContext: - return "cudaErrorInvalidGraphicsContext"; + case cudaErrorInvalidGraphicsContext: + return "cudaErrorInvalidGraphicsContext"; - case cudaErrorStartupFailure: - return "cudaErrorStartupFailure"; + case cudaErrorStartupFailure: + return "cudaErrorStartupFailure"; - case cudaErrorApiFailureBase: - return "cudaErrorApiFailureBase"; -*/ + case cudaErrorApiFailureBase: + return "cudaErrorApiFailureBase"; + */ } return ""; @@ -293,150 +292,150 @@ static const char *_cudaGetErrorEnum(cudaError_t error) #ifdef __cuda_cuda_h__ // CUDA Driver API errors -static const char *_cudaGetErrorEnum(CUresult error) +static const char* _cudaGetErrorEnum(CUresult error) { - switch (error) + switch(error) { - case CUDA_SUCCESS: - return "CUDA_SUCCESS"; + case CUDA_SUCCESS: + return "CUDA_SUCCESS"; - case CUDA_ERROR_INVALID_VALUE: - return "CUDA_ERROR_INVALID_VALUE"; + case CUDA_ERROR_INVALID_VALUE: + return "CUDA_ERROR_INVALID_VALUE"; - case CUDA_ERROR_OUT_OF_MEMORY: - return "CUDA_ERROR_OUT_OF_MEMORY"; + case CUDA_ERROR_OUT_OF_MEMORY: + return "CUDA_ERROR_OUT_OF_MEMORY"; - case CUDA_ERROR_NOT_INITIALIZED: - return "CUDA_ERROR_NOT_INITIALIZED"; + case CUDA_ERROR_NOT_INITIALIZED: + return "CUDA_ERROR_NOT_INITIALIZED"; - case CUDA_ERROR_DEINITIALIZED: - return "CUDA_ERROR_DEINITIALIZED"; + case CUDA_ERROR_DEINITIALIZED: + return "CUDA_ERROR_DEINITIALIZED"; - case CUDA_ERROR_PROFILER_DISABLED: - return "CUDA_ERROR_PROFILER_DISABLED"; + case CUDA_ERROR_PROFILER_DISABLED: + return "CUDA_ERROR_PROFILER_DISABLED"; - case CUDA_ERROR_PROFILER_NOT_INITIALIZED: - return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; + case CUDA_ERROR_PROFILER_NOT_INITIALIZED: + return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; - case CUDA_ERROR_PROFILER_ALREADY_STARTED: - return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; + case CUDA_ERROR_PROFILER_ALREADY_STARTED: + return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; - case CUDA_ERROR_PROFILER_ALREADY_STOPPED: - return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; + case CUDA_ERROR_PROFILER_ALREADY_STOPPED: + return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; - case CUDA_ERROR_NO_DEVICE: - return "CUDA_ERROR_NO_DEVICE"; + case CUDA_ERROR_NO_DEVICE: + return "CUDA_ERROR_NO_DEVICE"; - case CUDA_ERROR_INVALID_DEVICE: - return "CUDA_ERROR_INVALID_DEVICE"; + case CUDA_ERROR_INVALID_DEVICE: + return "CUDA_ERROR_INVALID_DEVICE"; - case CUDA_ERROR_INVALID_IMAGE: - return "CUDA_ERROR_INVALID_IMAGE"; + case CUDA_ERROR_INVALID_IMAGE: + return "CUDA_ERROR_INVALID_IMAGE"; - case CUDA_ERROR_INVALID_CONTEXT: - return "CUDA_ERROR_INVALID_CONTEXT"; + case CUDA_ERROR_INVALID_CONTEXT: + return "CUDA_ERROR_INVALID_CONTEXT"; - case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: - return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; + case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: + return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; - case CUDA_ERROR_MAP_FAILED: - return "CUDA_ERROR_MAP_FAILED"; + case CUDA_ERROR_MAP_FAILED: + return "CUDA_ERROR_MAP_FAILED"; - case CUDA_ERROR_UNMAP_FAILED: - return "CUDA_ERROR_UNMAP_FAILED"; + case CUDA_ERROR_UNMAP_FAILED: + return "CUDA_ERROR_UNMAP_FAILED"; - case CUDA_ERROR_ARRAY_IS_MAPPED: - return "CUDA_ERROR_ARRAY_IS_MAPPED"; + case CUDA_ERROR_ARRAY_IS_MAPPED: + return "CUDA_ERROR_ARRAY_IS_MAPPED"; - case CUDA_ERROR_ALREADY_MAPPED: - return "CUDA_ERROR_ALREADY_MAPPED"; + case CUDA_ERROR_ALREADY_MAPPED: + return "CUDA_ERROR_ALREADY_MAPPED"; - case CUDA_ERROR_NO_BINARY_FOR_GPU: - return "CUDA_ERROR_NO_BINARY_FOR_GPU"; + case CUDA_ERROR_NO_BINARY_FOR_GPU: + return "CUDA_ERROR_NO_BINARY_FOR_GPU"; - case CUDA_ERROR_ALREADY_ACQUIRED: - return "CUDA_ERROR_ALREADY_ACQUIRED"; + case CUDA_ERROR_ALREADY_ACQUIRED: + return "CUDA_ERROR_ALREADY_ACQUIRED"; - case CUDA_ERROR_NOT_MAPPED: - return "CUDA_ERROR_NOT_MAPPED"; + case CUDA_ERROR_NOT_MAPPED: + return "CUDA_ERROR_NOT_MAPPED"; - case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: - return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; + case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: + return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; - case CUDA_ERROR_NOT_MAPPED_AS_POINTER: - return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; + case CUDA_ERROR_NOT_MAPPED_AS_POINTER: + return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; - case CUDA_ERROR_ECC_UNCORRECTABLE: - return "CUDA_ERROR_ECC_UNCORRECTABLE"; + case CUDA_ERROR_ECC_UNCORRECTABLE: + return "CUDA_ERROR_ECC_UNCORRECTABLE"; - case CUDA_ERROR_UNSUPPORTED_LIMIT: - return "CUDA_ERROR_UNSUPPORTED_LIMIT"; + case CUDA_ERROR_UNSUPPORTED_LIMIT: + return "CUDA_ERROR_UNSUPPORTED_LIMIT"; - case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: - return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; + case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: + return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; - case CUDA_ERROR_INVALID_SOURCE: - return "CUDA_ERROR_INVALID_SOURCE"; + case CUDA_ERROR_INVALID_SOURCE: + return "CUDA_ERROR_INVALID_SOURCE"; - case CUDA_ERROR_FILE_NOT_FOUND: - return "CUDA_ERROR_FILE_NOT_FOUND"; + case CUDA_ERROR_FILE_NOT_FOUND: + return "CUDA_ERROR_FILE_NOT_FOUND"; - case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: - return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; + case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: + return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; - case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: - return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; + case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: + return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; - case CUDA_ERROR_OPERATING_SYSTEM: - return "CUDA_ERROR_OPERATING_SYSTEM"; + case CUDA_ERROR_OPERATING_SYSTEM: + return "CUDA_ERROR_OPERATING_SYSTEM"; - case CUDA_ERROR_INVALID_HANDLE: - return "CUDA_ERROR_INVALID_HANDLE"; + case CUDA_ERROR_INVALID_HANDLE: + return "CUDA_ERROR_INVALID_HANDLE"; - case CUDA_ERROR_NOT_FOUND: - return "CUDA_ERROR_NOT_FOUND"; + case CUDA_ERROR_NOT_FOUND: + return "CUDA_ERROR_NOT_FOUND"; - case CUDA_ERROR_NOT_READY: - return "CUDA_ERROR_NOT_READY"; + case CUDA_ERROR_NOT_READY: + return "CUDA_ERROR_NOT_READY"; - case CUDA_ERROR_LAUNCH_FAILED: - return "CUDA_ERROR_LAUNCH_FAILED"; + case CUDA_ERROR_LAUNCH_FAILED: + return "CUDA_ERROR_LAUNCH_FAILED"; - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; - case CUDA_ERROR_LAUNCH_TIMEOUT: - return "CUDA_ERROR_LAUNCH_TIMEOUT"; + case CUDA_ERROR_LAUNCH_TIMEOUT: + return "CUDA_ERROR_LAUNCH_TIMEOUT"; - case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: - return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; + case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: + return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; - case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; + case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; - case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; - case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: - return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: + return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; - case CUDA_ERROR_CONTEXT_IS_DESTROYED: - return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + case CUDA_ERROR_CONTEXT_IS_DESTROYED: + return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; - case CUDA_ERROR_ASSERT: - return "CUDA_ERROR_ASSERT"; + case CUDA_ERROR_ASSERT: + return "CUDA_ERROR_ASSERT"; - case CUDA_ERROR_TOO_MANY_PEERS: - return "CUDA_ERROR_TOO_MANY_PEERS"; + case CUDA_ERROR_TOO_MANY_PEERS: + return "CUDA_ERROR_TOO_MANY_PEERS"; - case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: + return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; - case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: + return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; - case CUDA_ERROR_UNKNOWN: - return "CUDA_ERROR_UNKNOWN"; + case CUDA_ERROR_UNKNOWN: + return "CUDA_ERROR_UNKNOWN"; } return ""; @@ -445,33 +444,33 @@ static const char *_cudaGetErrorEnum(CUresult error) #ifdef CUBLAS_API_H_ // cuBLAS API errors -static const char *_cudaGetErrorEnum(cublasStatus_t error) +static const char* _cudaGetErrorEnum(cublasStatus_t error) { - switch (error) + switch(error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; } return ""; @@ -480,57 +479,57 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error) #ifdef _CUFFT_H_ // cuFFT API errors -static const char *_cudaGetErrorEnum(cufftResult error) +static const char* _cudaGetErrorEnum(cufftResult error) { - switch (error) + switch(error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; } return ""; @@ -540,36 +539,36 @@ static const char *_cudaGetErrorEnum(cufftResult error) #ifdef CUSPARSEAPI // cuSPARSE API errors -static const char *_cudaGetErrorEnum(cusparseStatus_t error) +static const char* _cudaGetErrorEnum(cusparseStatus_t error) { - switch (error) + switch(error) { - case CUSPARSE_STATUS_SUCCESS: - return "CUSPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: - return "CUSPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: - return "CUSPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: - return "CUSPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: - return "CUSPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: - return "CUSPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: - return "CUSPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: - return "CUSPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; } return ""; @@ -578,48 +577,48 @@ static const char *_cudaGetErrorEnum(cusparseStatus_t error) #ifdef CURAND_H_ // cuRAND API errors -static const char *_cudaGetErrorEnum(curandStatus_t error) +static const char* _cudaGetErrorEnum(curandStatus_t error) { - switch (error) + switch(error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; } return ""; @@ -628,254 +627,253 @@ static const char *_cudaGetErrorEnum(curandStatus_t error) #ifdef NV_NPPIDEFS_H // NPP API errors -static const char *_cudaGetErrorEnum(NppStatus error) +static const char* _cudaGetErrorEnum(NppStatus error) { - switch (error) + switch(error) { - case NPP_NOT_SUPPORTED_MODE_ERROR: - return "NPP_NOT_SUPPORTED_MODE_ERROR"; + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; - case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; - case NPP_RESIZE_NO_OPERATION_ERROR: - return "NPP_RESIZE_NO_OPERATION_ERROR"; + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; - case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: - return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 +# if((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - case NPP_BAD_ARG_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; - case NPP_COEFF_ERROR: - return "NPP_COEFFICIENT_ERROR"; + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; - case NPP_RECT_ERROR: - return "NPP_RECTANGLE_ERROR"; + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; - case NPP_QUAD_ERROR: - return "NPP_QUADRANGLE_ERROR"; + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; - case NPP_MEM_ALLOC_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; - case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - case NPP_INVALID_INPUT: - return "NPP_INVALID_INPUT"; + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; - case NPP_POINTER_ERROR: - return "NPP_POINTER_ERROR"; + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; - case NPP_WARNING: - return "NPP_WARNING"; + case NPP_WARNING: + return "NPP_WARNING"; - case NPP_ODD_ROI_WARNING: - return "NPP_ODD_ROI_WARNING"; -#else - - // These are for CUDA 5.5 or higher - case NPP_BAD_ARGUMENT_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +# else - case NPP_COEFFICIENT_ERROR: - return "NPP_COEFFICIENT_ERROR"; + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; - case NPP_RECTANGLE_ERROR: - return "NPP_RECTANGLE_ERROR"; + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; - case NPP_QUADRANGLE_ERROR: - return "NPP_QUADRANGLE_ERROR"; + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; - case NPP_MEMORY_ALLOCATION_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; - case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; - case NPP_INVALID_HOST_POINTER_ERROR: - return "NPP_INVALID_HOST_POINTER_ERROR"; + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - case NPP_INVALID_DEVICE_POINTER_ERROR: - return "NPP_INVALID_DEVICE_POINTER_ERROR"; -#endif + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; - case NPP_LUT_NUMBER_OF_LEVELS_ERROR: - return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +# endif - case NPP_TEXTURE_BIND_ERROR: - return "NPP_TEXTURE_BIND_ERROR"; + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; - case NPP_WRONG_INTERSECTION_ROI_ERROR: - return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; - case NPP_NOT_EVEN_STEP_ERROR: - return "NPP_NOT_EVEN_STEP_ERROR"; + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; - case NPP_INTERPOLATION_ERROR: - return "NPP_INTERPOLATION_ERROR"; + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; - case NPP_RESIZE_FACTOR_ERROR: - return "NPP_RESIZE_FACTOR_ERROR"; + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; - case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: - return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - case NPP_MEMFREE_ERR: - return "NPP_MEMFREE_ERR"; +# if((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - case NPP_MEMSET_ERR: - return "NPP_MEMSET_ERR"; + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; - case NPP_MEMCPY_ERR: - return "NPP_MEMCPY_ERROR"; + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; - case NPP_MIRROR_FLIP_ERR: - return "NPP_MIRROR_FLIP_ERR"; -#else + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; - case NPP_MEMFREE_ERROR: - return "NPP_MEMFREE_ERROR"; + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +# else - case NPP_MEMSET_ERROR: - return "NPP_MEMSET_ERROR"; + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; - case NPP_MEMCPY_ERROR: - return "NPP_MEMCPY_ERROR"; + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; - case NPP_MIRROR_FLIP_ERROR: - return "NPP_MIRROR_FLIP_ERROR"; -#endif + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; - case NPP_ALIGNMENT_ERROR: - return "NPP_ALIGNMENT_ERROR"; + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +# endif - case NPP_STEP_ERROR: - return "NPP_STEP_ERROR"; + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; - case NPP_SIZE_ERROR: - return "NPP_SIZE_ERROR"; + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; - case NPP_NULL_POINTER_ERROR: - return "NPP_NULL_POINTER_ERROR"; + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; - case NPP_CUDA_KERNEL_EXECUTION_ERROR: - return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; - case NPP_NOT_IMPLEMENTED_ERROR: - return "NPP_NOT_IMPLEMENTED_ERROR"; + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; - case NPP_ERROR: - return "NPP_ERROR"; + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; - case NPP_SUCCESS: - return "NPP_SUCCESS"; + case NPP_ERROR: + return "NPP_ERROR"; - case NPP_WRONG_INTERSECTION_QUAD_WARNING: - return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + case NPP_SUCCESS: + return "NPP_SUCCESS"; - case NPP_MISALIGNED_DST_ROI_WARNING: - return "NPP_MISALIGNED_DST_ROI_WARNING"; + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; - case NPP_AFFINE_QUAD_INCORRECT_WARNING: - return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; - case NPP_DOUBLE_SIZE_WARNING: - return "NPP_DOUBLE_SIZE_WARNING"; + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; - case NPP_WRONG_INTERSECTION_ROI_WARNING: - return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 - /* These are 6.0 or higher */ - case NPP_LUT_PALETTE_BITSIZE_ERROR: - return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; - case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; +# if((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; - case NPP_QUALITY_INDEX_ERROR: - return "NPP_QUALITY_INDEX_ERROR"; + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; - case NPP_CHANNEL_ORDER_ERROR: - return "NPP_CHANNEL_ORDER_ERROR"; + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; - case NPP_ZERO_MASK_VALUE_ERROR: - return "NPP_ZERO_MASK_VALUE_ERROR"; + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; - case NPP_NUMBER_OF_CHANNELS_ERROR: - return "NPP_NUMBER_OF_CHANNELS_ERROR"; + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; - case NPP_COI_ERROR: - return "NPP_COI_ERROR"; + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; - case NPP_DIVISOR_ERROR: - return "NPP_DIVISOR_ERROR"; + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; - case NPP_CHANNEL_ERROR: - return "NPP_CHANNEL_ERROR"; + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; - case NPP_STRIDE_ERROR: - return "NPP_STRIDE_ERROR"; + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; - case NPP_ANCHOR_ERROR: - return "NPP_ANCHOR_ERROR"; + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; - case NPP_MASK_SIZE_ERROR: - return "NPP_MASK_SIZE_ERROR"; + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; - case NPP_MOMENT_00_ZERO_ERROR: - return "NPP_MOMENT_00_ZERO_ERROR"; + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; - case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: - return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; - case NPP_THRESHOLD_ERROR: - return "NPP_THRESHOLD_ERROR"; + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; - case NPP_CONTEXT_MATCH_ERROR: - return "NPP_CONTEXT_MATCH_ERROR"; + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; - case NPP_FFT_FLAG_ERROR: - return "NPP_FFT_FLAG_ERROR"; + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; - case NPP_FFT_ORDER_ERROR: - return "NPP_FFT_ORDER_ERROR"; + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; - case NPP_SCALE_RANGE_ERROR: - return "NPP_SCALE_RANGE_ERROR"; + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; - case NPP_DATA_TYPE_ERROR: - return "NPP_DATA_TYPE_ERROR"; + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; - case NPP_OUT_OFF_RANGE_ERROR: - return "NPP_OUT_OFF_RANGE_ERROR"; + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; - case NPP_DIVIDE_BY_ZERO_ERROR: - return "NPP_DIVIDE_BY_ZERO_ERROR"; + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; - case NPP_RANGE_ERROR: - return "NPP_RANGE_ERROR"; + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; - case NPP_NO_MEMORY_ERROR: - return "NPP_NO_MEMORY_ERROR"; + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; - case NPP_ERROR_RESERVED: - return "NPP_ERROR_RESERVED"; + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; - case NPP_NO_OPERATION_WARNING: - return "NPP_NO_OPERATION_WARNING"; + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; - case NPP_DIVIDE_BY_ZERO_WARNING: - return "NPP_DIVIDE_BY_ZERO_WARNING"; -#endif + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +# endif } return ""; @@ -883,22 +881,28 @@ static const char *_cudaGetErrorEnum(NppStatus error) #endif #ifdef __DRIVER_TYPES_H__ -#ifndef DEVICE_RESET -#define DEVICE_RESET cudaDeviceReset(); -#endif +# ifndef DEVICE_RESET +# define DEVICE_RESET cudaDeviceReset(); +# endif #else -#ifndef DEVICE_RESET -#define DEVICE_RESET -#endif +# ifndef DEVICE_RESET +# define DEVICE_RESET +# endif #endif -template< typename T > -void check(T result, char const *const func, const char *const file, int const line) +template +void check(T result, char const* const func, const char* const file, int const line) { - if (result) + if(result) { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", - file, line, static_cast(result), _cudaGetErrorEnum(result), func); + fprintf( + stderr, + "CUDA error at %s:%d code=%d(%s) \"%s\" \n", + file, + line, + static_cast(result), + _cudaGetErrorEnum(result), + func); DEVICE_RESET // Make sure we call CUDA Device Reset before exiting exit(EXIT_FAILURE); @@ -907,19 +911,25 @@ void check(T result, char const *const func, const char *const file, int const l #ifdef __DRIVER_TYPES_H__ // This will output the proper CUDA error strings in the event that a CUDA host call returns an error -#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) +# define checkCudaErrors(val) check((val), # val, __FILE__, __LINE__) // This will output the proper error string when calling cudaGetLastError -#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__) +# define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) -inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) +inline void __getLastCudaError(const char* errorMessage, const char* file, const int line) { cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) + if(cudaSuccess != err) { - fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", - file, line, errorMessage, (int)err, cudaGetErrorString(err)); + fprintf( + stderr, + "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", + file, + line, + errorMessage, + (int) err, + cudaGetErrorString(err)); DEVICE_RESET exit(EXIT_FAILURE); } @@ -927,7 +937,7 @@ inline void __getLastCudaError(const char *errorMessage, const char *file, const #endif #ifndef MAX -#define MAX(a,b) (a > b ? a : b) +# define MAX(a, b) (a > b ? a : b) #endif // Beginning of GPU Architecture definitions @@ -940,27 +950,25 @@ inline int _ConvertSMVer2Cores(int major, int minor) int Cores; } sSMtoCores; - sSMtoCores nGpuArchCoresPerSM[] = - { - { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class - { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class - { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class - { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class - { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class - { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class - { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class - { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class - { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class - { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class - { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class - { -1, -1 } - }; + sSMtoCores nGpuArchCoresPerSM[] + = {{0x10, 8}, // Tesla Generation (SM 1.0) G80 class + {0x11, 8}, // Tesla Generation (SM 1.1) G8x class + {0x12, 8}, // Tesla Generation (SM 1.2) G9x class + {0x13, 8}, // Tesla Generation (SM 1.3) GT200 class + {0x20, 32}, // Fermi Generation (SM 2.0) GF100 class + {0x21, 48}, // Fermi Generation (SM 2.1) GF10x class + {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class + {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class + {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class + {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class + {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class + {-1, -1}}; int index = 0; - while (nGpuArchCoresPerSM[index].SM != -1) + while(nGpuArchCoresPerSM[index].SM != -1) { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) + if(nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { return nGpuArchCoresPerSM[index].Cores; } @@ -969,8 +977,12 @@ inline int _ConvertSMVer2Cores(int major, int minor) } // If we don't find the values, we default use the previous one to run properly - printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores); - return nGpuArchCoresPerSM[index-1].Cores; + printf( + "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", + major, + minor, + nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; } // end of GPU Architecture definitions @@ -981,18 +993,18 @@ inline int gpuDeviceInit(int devID) int device_count; checkCudaErrors(cudaGetDeviceCount(&device_count)); - if (device_count == 0) + if(device_count == 0) { fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); exit(EXIT_FAILURE); } - if (devID < 0) + if(devID < 0) { devID = 0; } - if (devID > device_count-1) + if(devID > device_count - 1) { fprintf(stderr, "\n"); fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count); @@ -1001,23 +1013,23 @@ inline int gpuDeviceInit(int devID) return -devID; } -/* cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + /* cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - if (deviceProp.computeMode == cudaComputeModeProhibited) - { - fprintf(stderr, "Error: device is running in , no threads can use ::cudaSetDevice().\n"); - return -1; - } + if (deviceProp.computeMode == cudaComputeModeProhibited) + { + fprintf(stderr, "Error: device is running in , no threads can use + ::cudaSetDevice().\n"); return -1; + } - if (deviceProp.major < 1) - { - fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); - exit(EXIT_FAILURE); - } -*/ + if (deviceProp.major < 1) + { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + */ checkCudaErrors(cudaSetDevice(devID)); -// printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); + // printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); return devID; } @@ -1025,9 +1037,9 @@ inline int gpuDeviceInit(int devID) // This function returns the best GPU (with maximum GFLOPS) inline int gpuGetMaxGflopsDeviceId() { - int current_device = 0, sm_per_multiproc = 0; - int max_perf_device = 0; - int device_count = 0, best_SM_arch = 0; + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0, best_SM_arch = 0; int devices_prohibited = 0; unsigned long long max_compute_perf = 0; @@ -1036,21 +1048,21 @@ inline int gpuGetMaxGflopsDeviceId() checkCudaErrors(cudaGetDeviceCount(&device_count)); - if (device_count == 0) + if(device_count == 0) { fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n"); exit(EXIT_FAILURE); } // Find the best major SM Architecture GPU device - while (current_device < device_count) + while(current_device < device_count) { cudaGetDeviceProperties(&deviceProp, current_device); // If this GPU is not running on Compute Mode prohibited, then we can add it to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) + if(deviceProp.computeMode != cudaComputeModeProhibited) { - if (deviceProp.major > 0 && deviceProp.major < 9999) + if(deviceProp.major > 0 && deviceProp.major < 9999) { best_SM_arch = MAX(best_SM_arch, deviceProp.major); } @@ -1063,23 +1075,23 @@ inline int gpuGetMaxGflopsDeviceId() current_device++; } - if (devices_prohibited == device_count) + if(devices_prohibited == device_count) { - fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n"); - exit(EXIT_FAILURE); + fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); } // Find the best CUDA capable GPU device current_device = 0; - while (current_device < device_count) + while(current_device < device_count) { cudaGetDeviceProperties(&deviceProp, current_device); // If this GPU is not running on Compute Mode prohibited, then we can add it to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) + if(deviceProp.computeMode != cudaComputeModeProhibited) { - if (deviceProp.major == 9999 && deviceProp.minor == 9999) + if(deviceProp.major == 9999 && deviceProp.minor == 9999) { sm_per_multiproc = 1; } @@ -1088,24 +1100,25 @@ inline int gpuGetMaxGflopsDeviceId() sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); } - unsigned long long compute_perf = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; + unsigned long long compute_perf + = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; - if (compute_perf > max_compute_perf) + if(compute_perf > max_compute_perf) { // If we find GPU with SM major > 2, search only these - if (best_SM_arch > 2) + if(best_SM_arch > 2) { // If our device==dest_SM_arch, choose this, or else pass - if (deviceProp.major == best_SM_arch) + if(deviceProp.major == best_SM_arch) { - max_compute_perf = compute_perf; - max_perf_device = current_device; + max_compute_perf = compute_perf; + max_perf_device = current_device; } } else { - max_compute_perf = compute_perf; - max_perf_device = current_device; + max_compute_perf = compute_perf; + max_perf_device = current_device; } } } @@ -1118,17 +1131,17 @@ inline int gpuGetMaxGflopsDeviceId() // Initialization code to find the best CUDA Device -inline int findCudaDevice(int argc, const char **argv) +inline int findCudaDevice(int argc, const char** argv) { -// cudaDeviceProp deviceProp; + // cudaDeviceProp deviceProp; int devID = 0; // If the command-line has a device number specified, use it - if (checkCmdLineFlag(argc, argv, "device")) + if(checkCmdLineFlag(argc, argv, "device")) { devID = getCmdLineArgumentInt(argc, argv, "device="); - if (devID < 0) + if(devID < 0) { printf("Invalid command line parameter\n "); exit(EXIT_FAILURE); @@ -1137,7 +1150,7 @@ inline int findCudaDevice(int argc, const char **argv) { devID = gpuDeviceInit(devID); - if (devID < 0) + if(devID < 0) { printf("exiting...\n"); exit(EXIT_FAILURE); @@ -1149,8 +1162,9 @@ inline int findCudaDevice(int argc, const char **argv) // Otherwise pick the device with highest Gflops/s devID = gpuGetMaxGflopsDeviceId(); checkCudaErrors(cudaSetDevice(devID)); -// checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); -// printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); + // checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + // printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, + // deviceProp.major, deviceProp.minor); } return devID; @@ -1159,29 +1173,29 @@ inline int findCudaDevice(int argc, const char **argv) // General check for CUDA GPU SM Capabilities inline bool checkCudaCapabilities(int major_version, int minor_version) { -/* cudaDeviceProp deviceProp; - deviceProp.major = 0; - deviceProp.minor = 0; - */ + /* cudaDeviceProp deviceProp; + deviceProp.major = 0; + deviceProp.minor = 0; + */ int dev; checkCudaErrors(cudaGetDevice(&dev)); -// checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + // checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); return true; -/* - if ((deviceProp.major > major_version) || - (deviceProp.major == major_version && deviceProp.minor >= minor_version)) - { - printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor); - return true; - } - else - { - printf(" No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version); - return false; - } -*/ + /* + if ((deviceProp.major > major_version) || + (deviceProp.major == major_version && deviceProp.minor >= minor_version)) + { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, + deviceProp.minor); return true; + } + else + { + printf(" No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, + minor_version); return false; + } + */ } #endif diff --git a/example/CUDASamples/common/helper_functions.h b/example/CUDASamples/common/helper_functions.h index 11538ba7..20b6b17c 100644 --- a/example/CUDASamples/common/helper_functions.h +++ b/example/CUDASamples/common/helper_functions.h @@ -14,29 +14,29 @@ #define HELPER_FUNCTIONS_H #ifdef WIN32 -#pragma warning(disable:4996) +# pragma warning(disable : 4996) #endif // includes, project -#include -#include +#include +#include +#include #include +#include + #include #include #include - -#include -#include -#include -#include +#include +#include // includes, timer, string parsing, image helpers -#include // helper functions for timers -#include // helper functions for string parsing -#include // helper functions for image compare, dump, data comparisons +#include // helper functions for image compare, dump, data comparisons +#include // helper functions for string parsing +#include // helper functions for timers #ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 +# define EXIT_WAIVED 2 #endif #endif // HELPER_FUNCTIONS_H diff --git a/example/CUDASamples/common/helper_image.h b/example/CUDASamples/common/helper_image.h index 4e8b25cd..6412cf1e 100644 --- a/example/CUDASamples/common/helper_image.h +++ b/example/CUDASamples/common/helper_image.h @@ -13,25 +13,25 @@ #ifndef HELPER_IMAGE_H #define HELPER_IMAGE_H -#include +#include #include -#include #include -#include +#include +#include #include #include #include #ifndef MIN -#define MIN(a,b) ((a < b) ? a : b) +# define MIN(a, b) ((a < b) ? a : b) #endif #ifndef MAX -#define MAX(a,b) ((a > b) ? a : b) +# define MAX(a, b) ((a > b) ? a : b) #endif #ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 +# define EXIT_WAIVED 2 #endif #include @@ -55,7 +55,7 @@ namespace //! Conversion operator //! @return converted value //! @param val value to convert - float operator()(const unsigned char &val) + float operator()(const unsigned char& val) { return static_cast(val); } @@ -68,7 +68,7 @@ namespace //! Conversion operator //! @return converted value //! @param val value to convert - float operator()(const unsigned char &val) + float operator()(const unsigned char& val) { return static_cast(val) / 255.0f; } @@ -85,7 +85,7 @@ namespace //! Conversion operator (essentially a passthru //! @return converted value //! @param val value to convert - unsigned char operator()(const unsigned char &val) + unsigned char operator()(const unsigned char& val) { return val; } @@ -98,42 +98,40 @@ namespace //! Conversion operator //! @return converted value //! @param val value to convert - unsigned char operator()(const float &val) + unsigned char operator()(const float& val) { return static_cast(val * 255.0f); } }; -} +} // namespace #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -#ifndef FOPEN -#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result != 0) -#endif -#ifndef SSCANF -#define SSCANF sscanf_s -#endif +# ifndef FOPEN +# define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +# endif +# ifndef FOPEN_FAIL +# define FOPEN_FAIL(result) (result != 0) +# endif +# ifndef SSCANF +# define SSCANF sscanf_s +# endif #else -#ifndef FOPEN -#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result == NULL) -#endif -#ifndef SSCANF -#define SSCANF sscanf -#endif +# ifndef FOPEN +# define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +# endif +# ifndef FOPEN_FAIL +# define FOPEN_FAIL(result) (result == NULL) +# endif +# ifndef SSCANF +# define SSCANF sscanf +# endif #endif -inline bool -__loadPPM(const char *file, unsigned char **data, - unsigned int *w, unsigned int *h, unsigned int *channels) +inline bool __loadPPM(const char* file, unsigned char** data, unsigned int* w, unsigned int* h, unsigned int* channels) { - FILE *fp = NULL; + FILE* fp = NULL; - if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) + if(FOPEN_FAIL(FOPEN(fp, file, "rb"))) { std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; return false; @@ -142,17 +140,17 @@ __loadPPM(const char *file, unsigned char **data, // check header char header[PGMHeaderSize]; - if (fgets(header, PGMHeaderSize, fp) == NULL) + if(fgets(header, PGMHeaderSize, fp) == NULL) { std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; return false; } - if (strncmp(header, "P5", 2) == 0) + if(strncmp(header, "P5", 2) == 0) { *channels = 1; } - else if (strncmp(header, "P6", 2) == 0) + else if(strncmp(header, "P6", 2) == 0) { *channels = 3; } @@ -169,50 +167,50 @@ __loadPPM(const char *file, unsigned char **data, unsigned int maxval = 0; unsigned int i = 0; - while (i < 3) + while(i < 3) { - if (fgets(header, PGMHeaderSize, fp) == NULL) + if(fgets(header, PGMHeaderSize, fp) == NULL) { std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; return false; } - if (header[0] == '#') + if(header[0] == '#') { continue; } - if (i == 0) + if(i == 0) { i += SSCANF(header, "%u %u %u", &width, &height, &maxval); } - else if (i == 1) + else if(i == 1) { i += SSCANF(header, "%u %u", &height, &maxval); } - else if (i == 2) + else if(i == 2) { i += SSCANF(header, "%u", &maxval); } } // check if given handle for the data is initialized - if (NULL != *data) + if(NULL != *data) { - if (*w != width || *h != height) + if(*w != width || *h != height) { std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; } } else { - *data = (unsigned char *) malloc(sizeof(unsigned char) * width * height **channels); + *data = (unsigned char*) malloc(sizeof(unsigned char) * width * height * *channels); *w = width; *h = height; } // read and close file - if (fread(*data, sizeof(unsigned char), width * height **channels, fp) == 0) + if(fread(*data, sizeof(unsigned char), width * height * *channels, fp) == 0) { std::cerr << "__LoadPPM() read data returned error." << std::endl; } @@ -222,25 +220,24 @@ __loadPPM(const char *file, unsigned char **data, return true; } -template -inline bool -sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h) +template +inline bool sdkLoadPGM(const char* file, T** data, unsigned int* w, unsigned int* h) { - unsigned char *idata = NULL; + unsigned char* idata = NULL; unsigned int channels; - if (true != __loadPPM(file, &idata, w, h, &channels)) + if(true != __loadPPM(file, &idata, w, h, &channels)) { return false; } - unsigned int size = *w **h * channels; + unsigned int size = *w * *h * channels; // initialize mem if necessary // the correct size is checked / set in loadPGMc() - if (NULL == *data) + if(NULL == *data) { - *data = (T *) malloc(sizeof(T) * size); + *data = (T*) malloc(sizeof(T) * size); } // copy and cast data @@ -251,24 +248,22 @@ sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h) return true; } -template -inline bool -sdkLoadPPM4(const char *file, T **data, - unsigned int *w,unsigned int *h) +template +inline bool sdkLoadPPM4(const char* file, T** data, unsigned int* w, unsigned int* h) { - unsigned char *idata = 0; + unsigned char* idata = 0; unsigned int channels; - if (__loadPPM(file, &idata, w, h, &channels)) + if(__loadPPM(file, &idata, w, h, &channels)) { // pad 4th component - int size = *w **h; + int size = *w * *h; // keep the original pointer - unsigned char *idata_orig = idata; - *data = (T *) malloc(sizeof(T) * size * 4); - unsigned char *ptr = *data; + unsigned char* idata_orig = idata; + *data = (T*) malloc(sizeof(T) * size * 4); + unsigned char* ptr = *data; - for (int i=0; i 0); @@ -296,17 +289,17 @@ __savePPM(const char *file, unsigned char *data, std::fstream fh(file, std::fstream::out | std::fstream::binary); - if (fh.bad()) + if(fh.bad()) { std::cerr << "__savePPM() : Opening file failed." << std::endl; return false; } - if (channels == 1) + if(channels == 1) { fh << "P5\n"; } - else if (channels == 3) + else if(channels == 3) { fh << "P6\n"; } @@ -318,14 +311,14 @@ __savePPM(const char *file, unsigned char *data, fh << w << "\n" << h << "\n" << 0xff << std::endl; - for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i) + for(unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) { fh << data[i]; } fh.flush(); - if (fh.bad()) + if(fh.bad()) { std::cerr << "__savePPM() : Writing data failed." << std::endl; return false; @@ -337,12 +330,10 @@ __savePPM(const char *file, unsigned char *data, } template -inline bool -sdkSavePGM(const char *file, T *data, unsigned int w, unsigned int h) +inline bool sdkSavePGM(const char* file, T* data, unsigned int w, unsigned int h) { unsigned int size = w * h; - unsigned char *idata = - (unsigned char *) malloc(sizeof(unsigned char) * size); + unsigned char* idata = (unsigned char*) malloc(sizeof(unsigned char) * size); std::transform(data, data + size, idata, ConverterToUByte()); @@ -355,16 +346,14 @@ sdkSavePGM(const char *file, T *data, unsigned int w, unsigned int h) return result; } -inline bool -sdkSavePPM4ub(const char *file, unsigned char *data, - unsigned int w, unsigned int h) +inline bool sdkSavePPM4ub(const char* file, unsigned char* data, unsigned int w, unsigned int h) { // strip 4th component int size = w * h; - unsigned char *ndata = (unsigned char *) malloc(sizeof(unsigned char) * size*3); - unsigned char *ptr = ndata; + unsigned char* ndata = (unsigned char*) malloc(sizeof(unsigned char) * size * 3); + unsigned char* ptr = ndata; - for (int i=0; i -inline bool -sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose) +inline bool sdkReadFile(const char* filename, T** data, unsigned int* len, bool verbose) { // check input arguments assert(NULL != filename); assert(NULL != len); // intermediate storage for the data read - std::vector data_read; + std::vector data_read; // open file for reading - FILE *fh = NULL; + FILE* fh = NULL; // check if filestream is valid - if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) + if(FOPEN_FAIL(FOPEN(fh, filename, "r"))) { printf("Unable to open input file: %s\n", filename); return false; @@ -410,7 +398,7 @@ sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose) // read all data elements T token; - while (!feof(fh)) + while(!feof(fh)) { fscanf(fh, "%f", &token); data_read.push_back(token); @@ -421,14 +409,14 @@ sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose) fclose(fh); // check if the given handle is already initialized - if (NULL != *data) + if(NULL != *data) { - if (*len != data_read.size()) + if(*len != data_read.size()) { std::cerr << "sdkReadFile() : Initialized memory given but " << "size mismatch with signal read " - << "(data read / data init = " << (unsigned int)data_read.size() - << " / " << *len << ")" << std::endl; + << "(data read / data init = " << (unsigned int) data_read.size() << " / " << *len << ")" + << std::endl; return false; } @@ -436,7 +424,7 @@ sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose) else { // allocate storage for the data read - *data = (T *) malloc(sizeof(T) * data_read.size()); + *data = (T*) malloc(sizeof(T) * data_read.size()); // store signal size *len = static_cast(data_read.size()); } @@ -456,17 +444,22 @@ sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose) //! @param len number of data elements in data, -1 on error ////////////////////////////////////////////////////////////////////////////// template -inline bool -sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned int block_num, unsigned int block_size, bool verbose) +inline bool sdkReadFileBlocks( + const char* filename, + T** data, + unsigned int* len, + unsigned int block_num, + unsigned int block_size, + bool verbose) { // check input arguments assert(NULL != filename); assert(NULL != len); // open file for reading - FILE *fh = fopen(filename, "rb"); + FILE* fh = fopen(filename, "rb"); - if (fh == NULL && verbose) + if(fh == NULL && verbose) { std::cerr << "sdkReadFile() : Opening file failed." << std::endl; return false; @@ -474,11 +467,11 @@ sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned in // check if the given handle is already initialized // allocate storage for the data read - data[block_num] = (T *) malloc(block_size); + data[block_num] = (T*) malloc(block_size); // read all data elements fseek(fh, block_num * block_size, SEEK_SET); - *len = fread(data[block_num], sizeof(T), block_size/sizeof(T), fh); + *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh); fclose(fh); @@ -494,9 +487,13 @@ sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned in //! @param epsilon epsilon for comparison ////////////////////////////////////////////////////////////////////////////// template -inline bool -sdkWriteFile(const char *filename, const T *data, unsigned int len, - const S epsilon, bool verbose, bool append = false) +inline bool sdkWriteFile( + const char* filename, + const T* data, + unsigned int len, + const S epsilon, + bool verbose, + bool append = false) { assert(NULL != filename); assert(NULL != data); @@ -505,7 +502,7 @@ sdkWriteFile(const char *filename, const T *data, unsigned int len, // if (append) { std::fstream fh(filename, std::fstream::out | std::fstream::ate); - if (verbose) + if(verbose) { std::cerr << "sdkWriteFile() : Open file " << filename << " for write/append." << std::endl; } @@ -519,9 +516,9 @@ sdkWriteFile(const char *filename, const T *data, unsigned int len, */ // check if filestream is valid - if (! fh.good()) + if(!fh.good()) { - if (verbose) + if(verbose) { std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; } @@ -533,15 +530,15 @@ sdkWriteFile(const char *filename, const T *data, unsigned int len, fh << "# " << epsilon << "\n"; // write data - for (unsigned int i = 0; (i < len) && (fh.good()); ++i) + for(unsigned int i = 0; (i < len) && (fh.good()); ++i) { fh << data[i] << ' '; } // Check if writing succeeded - if (! fh.good()) + if(!fh.good()) { - if (verbose) + if(verbose) { std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; } @@ -564,18 +561,21 @@ sdkWriteFile(const char *filename, const T *data, unsigned int len, //! @param epsilon epsilon to use for the comparison ////////////////////////////////////////////////////////////////////////////// template -inline bool -compareData(const T *reference, const T *data, const unsigned int len, - const S epsilon, const float threshold) +inline bool compareData( + const T* reference, + const T* data, + const unsigned int len, + const S epsilon, + const float threshold) { assert(epsilon >= 0); bool result = true; unsigned int error_count = 0; - for (unsigned int i = 0; i < len; ++i) + for(unsigned int i = 0; i < len; ++i) { - float diff = (float)reference[i] - (float)data[i]; + float diff = (float) reference[i] - (float) data[i]; bool comp = (diff <= epsilon) && (diff >= -epsilon); result &= comp; @@ -594,23 +594,23 @@ compareData(const T *reference, const T *data, const unsigned int len, #endif } - if (threshold == 0.0f) + if(threshold == 0.0f) { return (result) ? true : false; } else { - if (error_count) + if(error_count) { - printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count); + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float) error_count * 100 / (float) len, error_count); } - return (len*threshold > error_count) ? true : false; + return (len * threshold > error_count) ? true : false; } } #ifndef __MIN_EPSILON_ERROR -#define __MIN_EPSILON_ERROR 1e-3f +# define __MIN_EPSILON_ERROR 1e-3f #endif ////////////////////////////////////////////////////////////////////////////// @@ -623,24 +623,27 @@ compareData(const T *reference, const T *data, const unsigned int len, //! @param epsilon threshold % of (# of bytes) for pass/fail ////////////////////////////////////////////////////////////////////////////// template -inline bool -compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned int len, - const S epsilon, const float threshold) +inline bool compareDataAsFloatThreshold( + const T* reference, + const T* data, + const unsigned int len, + const S epsilon, + const float threshold) { assert(epsilon >= 0); // If we set epsilon to be 0, let's set a minimum threshold - float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR); + float max_error = MAX((float) epsilon, __MIN_EPSILON_ERROR); int error_count = 0; bool result = true; - for (unsigned int i = 0; i < len; ++i) + for(unsigned int i = 0; i < len; ++i) { - float diff = fabs((float)reference[i] - (float)data[i]); + float diff = fabs((float) reference[i] - (float) data[i]); bool comp = (diff < max_error); result &= comp; - if (! comp) + if(!comp) { error_count++; #if 0 @@ -658,9 +661,9 @@ compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned in } } - if (threshold == 0.0f) + if(threshold == 0.0f) { - if (error_count) + if(error_count) { printf("total # of errors = %d\n", error_count); } @@ -669,28 +672,32 @@ compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned in } else { - if (error_count) + if(error_count) { - printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count); + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float) error_count * 100 / (float) len, error_count); } - return ((len*threshold > error_count) ? true : false); + return ((len * threshold > error_count) ? true : false); } } -inline -void sdkDumpBin(void *data, unsigned int bytes, const char *filename) +inline void sdkDumpBin(void* data, unsigned int bytes, const char* filename) { printf("sdkDumpBin: <%s>\n", filename); - FILE *fp; + FILE* fp; FOPEN(fp, filename, "wb"); fwrite(data, bytes, 1, fp); fflush(fp); fclose(fp); } -inline -bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path) +inline bool sdkCompareBin2BinUint( + const char* src_file, + const char* ref_file, + unsigned int nelements, + const float epsilon, + const float threshold, + char* exec_path) { unsigned int *src_buffer, *ref_buffer; FILE *src_fp = NULL, *ref_fp = NULL; @@ -698,15 +705,15 @@ bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned unsigned long error_count = 0; size_t fsize = 0; - if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) + if(FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { printf("compareBin2Bin unable to open src_file: %s\n", src_file); error_count++; } - char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + char* ref_file_path = sdkFindFilePath(ref_file, exec_path); - if (ref_file_path == NULL) + if(ref_file_path == NULL) { printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, exec_path); printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); @@ -714,37 +721,41 @@ bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned printf(" FAILED\n"); error_count++; - if (src_fp) + if(src_fp) { fclose(src_fp); } - if (ref_fp) + if(ref_fp) { fclose(ref_fp); } } else { - if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) + if(FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { printf("compareBin2Bin unable to open ref_file: %s\n", ref_file_path); error_count++; } - if (src_fp && ref_fp) + if(src_fp && ref_fp) { - src_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); - ref_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); + src_buffer = (unsigned int*) malloc(nelements * sizeof(unsigned int)); + ref_buffer = (unsigned int*) malloc(nelements * sizeof(unsigned int)); fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); - printf("> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold); - printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize); - printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize); + printf( + "> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", + nelements, + epsilon, + threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, (int) fsize); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int) fsize); - if (!compareData(ref_buffer, src_buffer, nelements, epsilon, threshold)) + if(!compareData(ref_buffer, src_buffer, nelements, epsilon, threshold)) { error_count++; } @@ -757,32 +768,37 @@ bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned } else { - if (src_fp) + if(src_fp) { fclose(src_fp); } - if (ref_fp) + if(ref_fp) { fclose(ref_fp); } } } - if (error_count == 0) + if(error_count == 0) { printf(" OK\n"); } else { - printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + printf(" FAILURE: %d errors...\n", (unsigned int) error_count); } - return (error_count == 0); // returns true if all pixels pass + return (error_count == 0); // returns true if all pixels pass } -inline -bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path) +inline bool sdkCompareBin2BinFloat( + const char* src_file, + const char* ref_file, + unsigned int nelements, + const float epsilon, + const float threshold, + char* exec_path) { float *src_buffer, *ref_buffer; FILE *src_fp = NULL, *ref_fp = NULL; @@ -790,15 +806,15 @@ bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned unsigned long error_count = 0; - if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) + if(FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { printf("compareBin2Bin unable to open src_file: %s\n", src_file); error_count = 1; } - char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + char* ref_file_path = sdkFindFilePath(ref_file, exec_path); - if (ref_file_path == NULL) + if(ref_file_path == NULL) { printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, exec_path); printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", exec_path); @@ -806,37 +822,41 @@ bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned printf(" FAILED\n"); error_count++; - if (src_fp) + if(src_fp) { fclose(src_fp); } - if (ref_fp) + if(ref_fp) { fclose(ref_fp); } } else { - if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) + if(FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { printf("compareBin2Bin unable to open ref_file: %s\n", ref_file_path); error_count = 1; } - if (src_fp && ref_fp) + if(src_fp && ref_fp) { - src_buffer = (float *)malloc(nelements*sizeof(float)); - ref_buffer = (float *)malloc(nelements*sizeof(float)); + src_buffer = (float*) malloc(nelements * sizeof(float)); + ref_buffer = (float*) malloc(nelements * sizeof(float)); fsize = fread(src_buffer, nelements, sizeof(float), src_fp); fsize = fread(ref_buffer, nelements, sizeof(float), ref_fp); - printf("> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold); - printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize); - printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize); + printf( + "> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", + nelements, + epsilon, + threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, (int) fsize); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int) fsize); - if (!compareDataAsFloatThreshold(ref_buffer, src_buffer, nelements, epsilon, threshold)) + if(!compareDataAsFloatThreshold(ref_buffer, src_buffer, nelements, epsilon, threshold)) { error_count++; } @@ -849,42 +869,39 @@ bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned } else { - if (src_fp) + if(src_fp) { fclose(src_fp); } - if (ref_fp) + if(ref_fp) { fclose(ref_fp); } } } - if (error_count == 0) + if(error_count == 0) { printf(" OK\n"); } else { - printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + printf(" FAILURE: %d errors...\n", (unsigned int) error_count); } - return (error_count == 0); // returns true if all pixels pass + return (error_count == 0); // returns true if all pixels pass } -inline bool -sdkCompareL2fe(const float *reference, const float *data, - const unsigned int len, const float epsilon) +inline bool sdkCompareL2fe(const float* reference, const float* data, const unsigned int len, const float epsilon) { assert(epsilon >= 0); float error = 0; float ref = 0; - for (unsigned int i = 0; i < len; ++i) + for(unsigned int i = 0; i < len; ++i) { - float diff = reference[i] - data[i]; error += diff * diff; ref += reference[i] * reference[i]; @@ -892,7 +909,7 @@ sdkCompareL2fe(const float *reference, const float *data, float normRef = sqrtf(ref); - if (fabs(ref) < 1e-7) + if(fabs(ref) < 1e-7) { #ifdef _DEBUG std::cerr << "ERROR, reference l2-norm is 0\n"; @@ -905,10 +922,9 @@ sdkCompareL2fe(const float *reference, const float *data, bool result = error < epsilon; #ifdef _DEBUG - if (! result) + if(!result) { - std::cerr << "ERROR, l2-norm error " - << error << " is greater than epsilon " << epsilon << "\n"; + std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon " << epsilon << "\n"; } #endif @@ -916,31 +932,27 @@ sdkCompareL2fe(const float *reference, const float *data, return result; } -inline bool -sdkLoadPPMub(const char *file, unsigned char **data, - unsigned int *w,unsigned int *h) +inline bool sdkLoadPPMub(const char* file, unsigned char** data, unsigned int* w, unsigned int* h) { unsigned int channels; return __loadPPM(file, data, w, h, &channels); } -inline bool -sdkLoadPPM4ub(const char *file, unsigned char **data, - unsigned int *w, unsigned int *h) +inline bool sdkLoadPPM4ub(const char* file, unsigned char** data, unsigned int* w, unsigned int* h) { - unsigned char *idata = 0; + unsigned char* idata = 0; unsigned int channels; - if (__loadPPM(file, &idata, w, h, &channels)) + if(__loadPPM(file, &idata, w, h, &channels)) { // pad 4th component - int size = *w **h; + int size = *w * *h; // keep the original pointer - unsigned char *idata_orig = idata; - *data = (unsigned char *) malloc(sizeof(unsigned char) * size * 4); - unsigned char *ptr = *data; + unsigned char* idata_orig = idata; + *data = (unsigned char*) malloc(sizeof(unsigned char) * size * 4); + unsigned char* ptr = *data; - for (int i=0; i Compare (a)rendered: <" << src_file << ">\n"; std::cerr << "> (b)reference: <" << ref_file << ">\n"; } - if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) + if(sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) { - if (verboseErrors) + if(verboseErrors) { - std::cerr << "PPMvsPPM: unable to load ref image file: "<< ref_file << "\n"; + std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file << "\n"; } return false; } - if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) + if(sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) { std::cerr << "PPMvsPPM: unable to load src image file: " << src_file << "\n"; return false; } - if (src_height != ref_height || src_width != ref_width) + if(src_height != ref_height || src_width != ref_width) { - if (verboseErrors) std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width << - "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n"; + if(verboseErrors) + std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width << "," << src_height << ")vs(" + << ref_width << "," << ref_height << ")\n"; } - if (verboseErrors) std::cerr << "PPMvsPPM: comparing images size (" << src_width << - "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n"; + if(verboseErrors) + std::cerr << "PPMvsPPM: comparing images size (" << src_width << "," << src_height << ") epsilon(" << epsilon + << "), threshold(" << threshold * 100 << "%)\n"; - if (compareData(ref_data, src_data, src_width*src_height*4, epsilon, threshold) == false) + if(compareData(ref_data, src_data, src_width * src_height * 4, epsilon, threshold) == false) { - error_count=1; + error_count = 1; } - if (error_count == 0) + if(error_count == 0) { - if (verboseErrors) + if(verboseErrors) { std::cerr << " OK\n\n"; } } else { - if (verboseErrors) + if(verboseErrors) { - std::cerr << " FAILURE! "< Compare (a)rendered: <" << src_file << ">\n"; std::cerr << "> (b)reference: <" << ref_file << ">\n"; } - if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) + if(sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) { - if (verboseErrors) + if(verboseErrors) { - std::cerr << "PGMvsPGM: unable to load ref image file: "<< ref_file << "\n"; + std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file << "\n"; } return false; } - if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) + if(sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) { std::cerr << "PGMvsPGM: unable to load src image file: " << src_file << "\n"; return false; } - if (src_height != ref_height || src_width != ref_width) + if(src_height != ref_height || src_width != ref_width) { - if (verboseErrors) std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width << - "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n"; + if(verboseErrors) + std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width << "," << src_height << ")vs(" + << ref_width << "," << ref_height << ")\n"; } - if (verboseErrors) std::cerr << "PGMvsPGM: comparing images size (" << src_width << - "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n"; + if(verboseErrors) + std::cerr << "PGMvsPGM: comparing images size (" << src_width << "," << src_height << ") epsilon(" << epsilon + << "), threshold(" << threshold * 100 << "%)\n"; - if (compareData(ref_data, src_data, src_width*src_height, epsilon, threshold) == false) + if(compareData(ref_data, src_data, src_width * src_height, epsilon, threshold) == false) { - error_count=1; + error_count = 1; } - if (error_count == 0) + if(error_count == 0) { - if (verboseErrors) + if(verboseErrors) { std::cerr << " OK\n\n"; } } else { - if (verboseErrors) + if(verboseErrors) { - std::cerr << " FAILURE! "< -#include #include #include -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -#ifndef _CRT_SECURE_NO_DEPRECATE -#define _CRT_SECURE_NO_DEPRECATE -#endif -#ifndef STRCASECMP -#define STRCASECMP _stricmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP _strnicmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) -#endif +#include +#include -#ifndef FOPEN -#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result != 0) -#endif -#ifndef SSCANF -#define SSCANF sscanf_s -#endif -#ifndef SPRINTF -#define SPRINTF sprintf_s -#endif +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +# ifndef _CRT_SECURE_NO_DEPRECATE +# define _CRT_SECURE_NO_DEPRECATE +# endif +# ifndef STRCASECMP +# define STRCASECMP _stricmp +# endif +# ifndef STRNCASECMP +# define STRNCASECMP _strnicmp +# endif +# ifndef STRCPY +# define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +# endif + +# ifndef FOPEN +# define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +# endif +# ifndef FOPEN_FAIL +# define FOPEN_FAIL(result) (result != 0) +# endif +# ifndef SSCANF +# define SSCANF sscanf_s +# endif +# ifndef SPRINTF +# define SPRINTF sprintf_s +# endif #else // Linux Includes -#include -#include - -#ifndef STRCASECMP -#define STRCASECMP strcasecmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP strncasecmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result == NULL) -#endif -#ifndef SSCANF -#define SSCANF sscanf -#endif -#ifndef SPRINTF -#define SPRINTF sprintf -#endif +# include +# include + +# ifndef STRCASECMP +# define STRCASECMP strcasecmp +# endif +# ifndef STRNCASECMP +# define STRNCASECMP strncasecmp +# endif +# ifndef STRCPY +# define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +# endif + +# ifndef FOPEN +# define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +# endif +# ifndef FOPEN_FAIL +# define FOPEN_FAIL(result) (result == NULL) +# endif +# ifndef SSCANF +# define SSCANF sscanf +# endif +# ifndef SPRINTF +# define SPRINTF sprintf +# endif #endif #ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 +# define EXIT_WAIVED 2 #endif // CUDA Utility Helper Functions -inline int stringRemoveDelimiter(char delimiter, const char *string) +inline int stringRemoveDelimiter(char delimiter, const char* string) { int string_start = 0; - while (string[string_start] == delimiter) + while(string[string_start] == delimiter) { string_start++; } - if (string_start >= (int)strlen(string)-1) + if(string_start >= (int) strlen(string) - 1) { return 0; } @@ -94,19 +95,20 @@ inline int stringRemoveDelimiter(char delimiter, const char *string) return string_start; } -inline int getFileExtension(char *filename, char **extension) +inline int getFileExtension(char* filename, char** extension) { - int string_length = (int)strlen(filename); + int string_length = (int) strlen(filename); - while (filename[string_length--] != '.') + while(filename[string_length--] != '.') { - if (string_length == 0) + if(string_length == 0) break; } - if (string_length > 0) string_length += 2; + if(string_length > 0) + string_length += 2; - if (string_length == 0) + if(string_length == 0) *extension = NULL; else *extension = &filename[string_length]; @@ -115,23 +117,23 @@ inline int getFileExtension(char *filename, char **extension) } -inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) +inline bool checkCmdLineFlag(const int argc, const char** argv, const char* string_ref) { bool bFound = false; - if (argc >= 1) + if(argc >= 1) { - for (int i=1; i < argc; i++) + for(int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; + const char* string_argv = &argv[i][string_start]; - const char *equal_pos = strchr(string_argv, '='); - int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); + const char* equal_pos = strchr(string_argv, '='); + int argv_length = (int) (equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); - int length = (int)strlen(string_ref); + int length = (int) strlen(string_ref); - if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) + if(length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) { bFound = true; continue; @@ -143,29 +145,29 @@ inline bool checkCmdLineFlag(const int argc, const char **argv, const char *stri } // This function wraps the CUDA Driver API into a template function -template -inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value) +template +inline bool getCmdLineArgumentValue(const int argc, const char** argv, const char* string_ref, T* value) { bool bFound = false; - if (argc >= 1) + if(argc >= 1) { - for (int i=1; i < argc; i++) + for(int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); + const char* string_argv = &argv[i][string_start]; + int length = (int) strlen(string_ref); - if (!STRNCASECMP(string_argv, string_ref, length)) + if(!STRNCASECMP(string_argv, string_ref, length)) { - if (length+1 <= (int)strlen(string_argv)) + if(length + 1 <= (int) strlen(string_argv)) { int auto_inc = (string_argv[length] == '=') ? 1 : 0; - *value = (T)atoi(&string_argv[length + auto_inc]); + *value = (T) atoi(&string_argv[length + auto_inc]); } bFound = true; - i=argc; + i = argc; } } } @@ -173,22 +175,22 @@ inline bool getCmdLineArgumentValue(const int argc, const char **argv, const cha return bFound; } -inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) +inline int getCmdLineArgumentInt(const int argc, const char** argv, const char* string_ref) { bool bFound = false; int value = -1; - if (argc >= 1) + if(argc >= 1) { - for (int i=1; i < argc; i++) + for(int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); + const char* string_argv = &argv[i][string_start]; + int length = (int) strlen(string_ref); - if (!STRNCASECMP(string_argv, string_ref, length)) + if(!STRNCASECMP(string_argv, string_ref, length)) { - if (length+1 <= (int)strlen(string_argv)) + if(length + 1 <= (int) strlen(string_argv)) { int auto_inc = (string_argv[length] == '=') ? 1 : 0; value = atoi(&string_argv[length + auto_inc]); @@ -204,7 +206,7 @@ inline int getCmdLineArgumentInt(const int argc, const char **argv, const char * } } - if (bFound) + if(bFound) { return value; } @@ -214,25 +216,25 @@ inline int getCmdLineArgumentInt(const int argc, const char **argv, const char * } } -inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref) +inline float getCmdLineArgumentFloat(const int argc, const char** argv, const char* string_ref) { bool bFound = false; float value = -1; - if (argc >= 1) + if(argc >= 1) { - for (int i=1; i < argc; i++) + for(int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); + const char* string_argv = &argv[i][string_start]; + int length = (int) strlen(string_ref); - if (!STRNCASECMP(string_argv, string_ref, length)) + if(!STRNCASECMP(string_argv, string_ref, length)) { - if (length+1 <= (int)strlen(string_argv)) + if(length + 1 <= (int) strlen(string_argv)) { int auto_inc = (string_argv[length] == '=') ? 1 : 0; - value = (float)atof(&string_argv[length + auto_inc]); + value = (float) atof(&string_argv[length + auto_inc]); } else { @@ -245,7 +247,7 @@ inline float getCmdLineArgumentFloat(const int argc, const char **argv, const ch } } - if (bFound) + if(bFound) { return value; } @@ -255,29 +257,28 @@ inline float getCmdLineArgumentFloat(const int argc, const char **argv, const ch } } -inline bool getCmdLineArgumentString(const int argc, const char **argv, - const char *string_ref, char **string_retval) +inline bool getCmdLineArgumentString(const int argc, const char** argv, const char* string_ref, char** string_retval) { bool bFound = false; - if (argc >= 1) + if(argc >= 1) { - for (int i=1; i < argc; i++) + for(int i = 1; i < argc; i++) { int string_start = stringRemoveDelimiter('-', argv[i]); - char *string_argv = (char *)&argv[i][string_start]; - int length = (int)strlen(string_ref); + char* string_argv = (char*) &argv[i][string_start]; + int length = (int) strlen(string_ref); - if (!STRNCASECMP(string_argv, string_ref, length)) + if(!STRNCASECMP(string_argv, string_ref, length)) { - *string_retval = &string_argv[length+1]; + *string_retval = &string_argv[length + 1]; bFound = true; continue; } } } - if (!bFound) + if(!bFound) { *string_retval = NULL; } @@ -293,137 +294,142 @@ inline bool getCmdLineArgumentString(const int argc, const char **argv, //! @param filename name of the file //! @param executable_path optional absolute path of the executable ////////////////////////////////////////////////////////////////////////////// -inline char *sdkFindFilePath(const char *filename, const char *executable_path) +inline char* sdkFindFilePath(const char* filename, const char* executable_path) { // defines a variable that is replaced with the name of the executable // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files) - // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc - const char *searchPath[] = - { - "./", // same dir - "./common/", // "/common/" subdir - "./common/data/", // "/common/data/" subdir - "./data/", // "/data/" subdir - "./src/", // "/src/" subdir - "./src//data/", // "/src//data/" subdir - "./inc/", // "/inc/" subdir - "./0_Simple/", // "/0_Simple/" subdir - "./1_Utilities/", // "/1_Utilities/" subdir - "./2_Graphics/", // "/2_Graphics/" subdir - "./3_Imaging/", // "/3_Imaging/" subdir - "./4_Financial/", // "/4_Financial/" subdir - "./5_Simulations/", // "/5_Simulations/" subdir - "./6_Advanced/", // "/6_Advanced/" subdir - "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir - "./8_Android/", // "/8_Android/" subdir - "./samples/", // "/samples/" subdir - - "../", // up 1 in tree - "../common/", // up 1 in tree, "/common/" subdir - "../common/data/", // up 1 in tree, "/common/data/" subdir - "../data/", // up 1 in tree, "/data/" subdir - "../src/", // up 1 in tree, "/src/" subdir - "../inc/", // up 1 in tree, "/inc/" subdir - - "../0_Simple//data/", // up 1 in tree, "/0_Simple//" subdir - "../1_Utilities//data/", // up 1 in tree, "/1_Utilities//" subdir - "../2_Graphics//data/", // up 1 in tree, "/2_Graphics//" subdir - "../3_Imaging//data/", // up 1 in tree, "/3_Imaging//" subdir - "../4_Financial//data/", // up 1 in tree, "/4_Financial//" subdir - "../5_Simulations//data/", // up 1 in tree, "/5_Simulations//" subdir - "../6_Advanced//data/", // up 1 in tree, "/6_Advanced//" subdir - "../7_CUDALibraries//data/",// up 1 in tree, "/7_CUDALibraries//" subdir - "../8_Android//data/", // up 1 in tree, "/8_Android//" subdir - "../samples//data/", // up 1 in tree, "/samples//" subdir - "../../", // up 2 in tree - "../../common/", // up 2 in tree, "/common/" subdir - "../../common/data/", // up 2 in tree, "/common/data/" subdir - "../../data/", // up 2 in tree, "/data/" subdir - "../../src/", // up 2 in tree, "/src/" subdir - "../../inc/", // up 2 in tree, "/inc/" subdir - "../../sandbox//data/", // up 2 in tree, "/sandbox//" subdir - "../../0_Simple//data/", // up 2 in tree, "/0_Simple//" subdir - "../../1_Utilities//data/", // up 2 in tree, "/1_Utilities//" subdir - "../../2_Graphics//data/", // up 2 in tree, "/2_Graphics//" subdir - "../../3_Imaging//data/", // up 2 in tree, "/3_Imaging//" subdir - "../../4_Financial//data/", // up 2 in tree, "/4_Financial//" subdir - "../../5_Simulations//data/", // up 2 in tree, "/5_Simulations//" subdir - "../../6_Advanced//data/", // up 2 in tree, "/6_Advanced//" subdir + // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching + // the .exe or .bat, etc + const char* searchPath[] = { + "./", // same dir + "./common/", // "/common/" subdir + "./common/data/", // "/common/data/" subdir + "./data/", // "/data/" subdir + "./src/", // "/src/" subdir + "./src//data/", // "/src//data/" subdir + "./inc/", // "/inc/" subdir + "./0_Simple/", // "/0_Simple/" subdir + "./1_Utilities/", // "/1_Utilities/" subdir + "./2_Graphics/", // "/2_Graphics/" subdir + "./3_Imaging/", // "/3_Imaging/" subdir + "./4_Financial/", // "/4_Financial/" subdir + "./5_Simulations/", // "/5_Simulations/" subdir + "./6_Advanced/", // "/6_Advanced/" subdir + "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir + "./8_Android/", // "/8_Android/" subdir + "./samples/", // "/samples/" subdir + + "../", // up 1 in tree + "../common/", // up 1 in tree, "/common/" subdir + "../common/data/", // up 1 in tree, "/common/data/" subdir + "../data/", // up 1 in tree, "/data/" subdir + "../src/", // up 1 in tree, "/src/" subdir + "../inc/", // up 1 in tree, "/inc/" subdir + + "../0_Simple//data/", // up 1 in tree, "/0_Simple//" subdir + "../1_Utilities//data/", // up 1 in tree, "/1_Utilities//" subdir + "../2_Graphics//data/", // up 1 in tree, "/2_Graphics//" subdir + "../3_Imaging//data/", // up 1 in tree, "/3_Imaging//" subdir + "../4_Financial//data/", // up 1 in tree, "/4_Financial//" subdir + "../5_Simulations//data/", // up 1 in tree, "/5_Simulations//" subdir + "../6_Advanced//data/", // up 1 in tree, "/6_Advanced//" subdir + "../7_CUDALibraries//data/", // up 1 in tree, "/7_CUDALibraries//" subdir + "../8_Android//data/", // up 1 in tree, "/8_Android//" subdir + "../samples//data/", // up 1 in tree, "/samples//" subdir + "../../", // up 2 in tree + "../../common/", // up 2 in tree, "/common/" subdir + "../../common/data/", // up 2 in tree, "/common/data/" subdir + "../../data/", // up 2 in tree, "/data/" subdir + "../../src/", // up 2 in tree, "/src/" subdir + "../../inc/", // up 2 in tree, "/inc/" subdir + "../../sandbox//data/", // up 2 in tree, "/sandbox//" subdir + "../../0_Simple//data/", // up 2 in tree, "/0_Simple//" subdir + "../../1_Utilities//data/", // up 2 in tree, "/1_Utilities//" subdir + "../../2_Graphics//data/", // up 2 in tree, "/2_Graphics//" subdir + "../../3_Imaging//data/", // up 2 in tree, "/3_Imaging//" subdir + "../../4_Financial//data/", // up 2 in tree, "/4_Financial//" subdir + "../../5_Simulations//data/", // up 2 in tree, "/5_Simulations//" subdir + "../../6_Advanced//data/", // up 2 in tree, "/6_Advanced//" subdir "../../7_CUDALibraries//data/", // up 2 in tree, "/7_CUDALibraries//" subdir - "../../8_Android//data/", // up 2 in tree, "/8_Android//" subdir - "../../samples//data/", // up 2 in tree, "/samples//" subdir - "../../../", // up 3 in tree - "../../../src//", // up 3 in tree, "/src//" subdir - "../../../src//data/", // up 3 in tree, "/src//data/" subdir - "../../../src//src/", // up 3 in tree, "/src//src/" subdir - "../../../src//inc/", // up 3 in tree, "/src//inc/" subdir - "../../../sandbox//", // up 3 in tree, "/sandbox//" subdir - "../../../sandbox//data/", // up 3 in tree, "/sandbox//data/" subdir - "../../../sandbox//src/", // up 3 in tree, "/sandbox//src/" subdir - "../../../sandbox//inc/", // up 3 in tree, "/sandbox//inc/" subdir - "../../../0_Simple//data/", // up 3 in tree, "/0_Simple//" subdir - "../../../1_Utilities//data/", // up 3 in tree, "/1_Utilities//" subdir - "../../../2_Graphics//data/", // up 3 in tree, "/2_Graphics//" subdir - "../../../3_Imaging//data/", // up 3 in tree, "/3_Imaging//" subdir - "../../../4_Financial//data/", // up 3 in tree, "/4_Financial//" subdir - "../../../5_Simulations//data/", // up 3 in tree, "/5_Simulations//" subdir - "../../../6_Advanced//data/", // up 3 in tree, "/6_Advanced//" subdir - "../../../7_CUDALibraries//data/", // up 3 in tree, "/7_CUDALibraries//" subdir - "../../../8_Android//data/", // up 3 in tree, "/8_Android//" subdir - "../../../samples//data/", // up 3 in tree, "/samples//" subdir - "../../../common/", // up 3 in tree, "../../../common/" subdir - "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir - "../../../data/", // up 3 in tree, "../../../data/" subdir - "../../../../", // up 4 in tree - "../../../../src//", // up 4 in tree, "/src//" subdir - "../../../../src//data/", // up 4 in tree, "/src//data/" subdir - "../../../../src//src/", // up 4 in tree, "/src//src/" subdir - "../../../../src//inc/", // up 4 in tree, "/src//inc/" subdir - "../../../../sandbox//", // up 4 in tree, "/sandbox//" subdir + "../../8_Android//data/", // up 2 in tree, "/8_Android//" subdir + "../../samples//data/", // up 2 in tree, "/samples//" subdir + "../../../", // up 3 in tree + "../../../src//", // up 3 in tree, "/src//" subdir + "../../../src//data/", // up 3 in tree, "/src//data/" subdir + "../../../src//src/", // up 3 in tree, "/src//src/" subdir + "../../../src//inc/", // up 3 in tree, "/src//inc/" subdir + "../../../sandbox//", // up 3 in tree, "/sandbox//" subdir + "../../../sandbox//data/", // up 3 in tree, "/sandbox//data/" subdir + "../../../sandbox//src/", // up 3 in tree, "/sandbox//src/" subdir + "../../../sandbox//inc/", // up 3 in tree, "/sandbox//inc/" subdir + "../../../0_Simple//data/", // up 3 in tree, "/0_Simple//" subdir + "../../../1_Utilities//data/", // up 3 in tree, "/1_Utilities//" subdir + "../../../2_Graphics//data/", // up 3 in tree, "/2_Graphics//" subdir + "../../../3_Imaging//data/", // up 3 in tree, "/3_Imaging//" subdir + "../../../4_Financial//data/", // up 3 in tree, "/4_Financial//" subdir + "../../../5_Simulations//data/", // up 3 in tree, "/5_Simulations//" subdir + "../../../6_Advanced//data/", // up 3 in tree, "/6_Advanced//" subdir + "../../../7_CUDALibraries//data/", // up 3 in tree, "/7_CUDALibraries//" + // subdir + "../../../8_Android//data/", // up 3 in tree, "/8_Android//" subdir + "../../../samples//data/", // up 3 in tree, "/samples//" subdir + "../../../common/", // up 3 in tree, "../../../common/" subdir + "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir + "../../../data/", // up 3 in tree, "../../../data/" subdir + "../../../../", // up 4 in tree + "../../../../src//", // up 4 in tree, "/src//" subdir + "../../../../src//data/", // up 4 in tree, "/src//data/" subdir + "../../../../src//src/", // up 4 in tree, "/src//src/" subdir + "../../../../src//inc/", // up 4 in tree, "/src//inc/" subdir + "../../../../sandbox//", // up 4 in tree, "/sandbox//" subdir "../../../../sandbox//data/", // up 4 in tree, "/sandbox//data/" subdir - "../../../../sandbox//src/", // up 4 in tree, "/sandbox//src/" subdir - "../../../../sandbox//inc/", // up 4 in tree, "/sandbox//inc/" subdir - "../../../../0_Simple//data/", // up 4 in tree, "/0_Simple//" subdir - "../../../../1_Utilities//data/", // up 4 in tree, "/1_Utilities//" subdir - "../../../../2_Graphics//data/", // up 4 in tree, "/2_Graphics//" subdir - "../../../../3_Imaging//data/", // up 4 in tree, "/3_Imaging//" subdir - "../../../../4_Financial//data/", // up 4 in tree, "/4_Financial//" subdir - "../../../../5_Simulations//data/",// up 4 in tree, "/5_Simulations//" subdir - "../../../../6_Advanced//data/", // up 4 in tree, "/6_Advanced//" subdir - "../../../../7_CUDALibraries//data/", // up 4 in tree, "/7_CUDALibraries//" subdir - "../../../../8_Android//data/", // up 4 in tree, "/8_Android//" subdir - "../../../../samples//data/", // up 4 in tree, "/samples//" subdir - "../../../../common/", // up 4 in tree, "../../../common/" subdir - "../../../../common/data/", // up 4 in tree, "../../../common/data/" subdir - "../../../../data/", // up 4 in tree, "../../../data/" subdir - "../../../../../", // up 5 in tree - "../../../../../src//", // up 5 in tree, "/src//" subdir - "../../../../../src//data/", // up 5 in tree, "/src//data/" subdir - "../../../../../src//src/", // up 5 in tree, "/src//src/" subdir - "../../../../../src//inc/", // up 5 in tree, "/src//inc/" subdir - "../../../../../sandbox//", // up 5 in tree, "/sandbox//" subdir + "../../../../sandbox//src/", // up 4 in tree, "/sandbox//src/" subdir + "../../../../sandbox//inc/", // up 4 in tree, "/sandbox//inc/" subdir + "../../../../0_Simple//data/", // up 4 in tree, "/0_Simple//" subdir + "../../../../1_Utilities//data/", // up 4 in tree, "/1_Utilities//" subdir + "../../../../2_Graphics//data/", // up 4 in tree, "/2_Graphics//" subdir + "../../../../3_Imaging//data/", // up 4 in tree, "/3_Imaging//" subdir + "../../../../4_Financial//data/", // up 4 in tree, "/4_Financial//" subdir + "../../../../5_Simulations//data/", // up 4 in tree, "/5_Simulations//" + // subdir + "../../../../6_Advanced//data/", // up 4 in tree, "/6_Advanced//" subdir + "../../../../7_CUDALibraries//data/", // up 4 in tree, "/7_CUDALibraries//" + // subdir + "../../../../8_Android//data/", // up 4 in tree, "/8_Android//" subdir + "../../../../samples//data/", // up 4 in tree, "/samples//" subdir + "../../../../common/", // up 4 in tree, "../../../common/" subdir + "../../../../common/data/", // up 4 in tree, "../../../common/data/" subdir + "../../../../data/", // up 4 in tree, "../../../data/" subdir + "../../../../../", // up 5 in tree + "../../../../../src//", // up 5 in tree, "/src//" subdir + "../../../../../src//data/", // up 5 in tree, "/src//data/" subdir + "../../../../../src//src/", // up 5 in tree, "/src//src/" subdir + "../../../../../src//inc/", // up 5 in tree, "/src//inc/" subdir + "../../../../../sandbox//", // up 5 in tree, "/sandbox//" subdir "../../../../../sandbox//data/", // up 5 in tree, "/sandbox//data/" subdir - "../../../../../sandbox//src/", // up 5 in tree, "/sandbox//src/" subdir - "../../../../../sandbox//inc/", // up 5 in tree, "/sandbox//inc/" subdir - "../../../../../0_Simple//data/", // up 5 in tree, "/0_Simple//" subdir - "../../../../../1_Utilities//data/", // up 5 in tree, "/1_Utilities//" subdir - "../../../../../2_Graphics//data/", // up 5 in tree, "/2_Graphics//" subdir - "../../../../../3_Imaging//data/", // up 5 in tree, "/3_Imaging//" subdir - "../../../../../4_Financial//data/", // up 5 in tree, "/4_Financial//" subdir - "../../../../../5_Simulations//data/",// up 5 in tree, "/5_Simulations//" subdir - "../../../../../6_Advanced//data/", // up 5 in tree, "/6_Advanced//" subdir - "../../../../../7_CUDALibraries//data/", // up 5 in tree, "/7_CUDALibraries//" subdir - "../../../../../8_Android//data/", // up 5 in tree, "/8_Android//" subdir - "../../../../../samples//data/", // up 5 in tree, "/samples//" subdir - "../../../../../common/", // up 5 in tree, "../../../common/" subdir - "../../../../../common/data/", // up 5 in tree, "../../../common/data/" subdir + "../../../../../sandbox//src/", // up 5 in tree, "/sandbox//src/" subdir + "../../../../../sandbox//inc/", // up 5 in tree, "/sandbox//inc/" subdir + "../../../../../0_Simple//data/", // up 5 in tree, "/0_Simple//" subdir + "../../../../../1_Utilities//data/", // up 5 in tree, "/1_Utilities//" subdir + "../../../../../2_Graphics//data/", // up 5 in tree, "/2_Graphics//" subdir + "../../../../../3_Imaging//data/", // up 5 in tree, "/3_Imaging//" subdir + "../../../../../4_Financial//data/", // up 5 in tree, "/4_Financial//" subdir + "../../../../../5_Simulations//data/", // up 5 in tree, "/5_Simulations//" + // subdir + "../../../../../6_Advanced//data/", // up 5 in tree, "/6_Advanced//" subdir + "../../../../../7_CUDALibraries//data/", // up 5 in tree, + // "/7_CUDALibraries//" subdir + "../../../../../8_Android//data/", // up 5 in tree, "/8_Android//" subdir + "../../../../../samples//data/", // up 5 in tree, "/samples//" subdir + "../../../../../common/", // up 5 in tree, "../../../common/" subdir + "../../../../../common/data/", // up 5 in tree, "../../../common/data/" subdir }; // Extract the executable name std::string executable_name; - if (executable_path != 0) + if(executable_path != 0) { executable_name = std::string(executable_path); @@ -432,7 +438,7 @@ inline char *sdkFindFilePath(const char *filename, const char *executable_path) size_t delimiter_pos = executable_name.find_last_of('\\'); executable_name.erase(0, delimiter_pos + 1); - if (executable_name.rfind(".exe") != std::string::npos) + if(executable_name.rfind(".exe") != std::string::npos) { // we strip .exe, only if the .exe is found executable_name.resize(executable_name.size() - 4); @@ -441,21 +447,21 @@ inline char *sdkFindFilePath(const char *filename, const char *executable_path) #else // Linux & OSX path delimiter size_t delimiter_pos = executable_name.find_last_of('/'); - executable_name.erase(0,delimiter_pos+1); + executable_name.erase(0, delimiter_pos + 1); #endif } // Loop over all search paths and return the first hit - for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i) + for(unsigned int i = 0; i < sizeof(searchPath) / sizeof(char*); ++i) { std::string path(searchPath[i]); size_t executable_name_pos = path.find(""); // If there is executable_name variable in the searchPath // replace it with the value - if (executable_name_pos != std::string::npos) + if(executable_name_pos != std::string::npos) { - if (executable_path != 0) + if(executable_path != 0) { path.replace(executable_name_pos, strlen(""), executable_name); } @@ -472,20 +478,20 @@ inline char *sdkFindFilePath(const char *filename, const char *executable_path) // Test if the file exists path.append(filename); - FILE *fp; + FILE* fp; FOPEN(fp, path.c_str(), "rb"); - if (fp != NULL) + if(fp != NULL) { fclose(fp); // File found // returning an allocated array here for backwards compatibility reasons - char *file_path = (char *) malloc(path.length() + 1); + char* file_path = (char*) malloc(path.length() + 1); STRCPY(file_path, path.length() + 1, path.c_str()); return file_path; } - if (fp) + if(fp) { fclose(fp); } diff --git a/example/CUDASamples/common/helper_timer.h b/example/CUDASamples/common/helper_timer.h index 39ddc77f..ceb1f3d5 100644 --- a/example/CUDASamples/common/helper_timer.h +++ b/example/CUDASamples/common/helper_timer.h @@ -14,7 +14,7 @@ #define HELPER_TIMER_H #ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 +# define EXIT_WAIVED 2 #endif // includes, system @@ -27,28 +27,28 @@ // But rather in a self contained class interface class StopWatchInterface { - public: - StopWatchInterface() {}; - virtual ~StopWatchInterface() {}; +public: + StopWatchInterface(){}; + virtual ~StopWatchInterface(){}; - public: - //! Start time measurement - virtual void start() = 0; +public: + //! Start time measurement + virtual void start() = 0; - //! Stop time measurement - virtual void stop() = 0; + //! Stop time measurement + virtual void stop() = 0; - //! Reset time counters to zero - virtual void reset() = 0; + //! Reset time counters to zero + virtual void reset() = 0; - //! Time in msec. after start. If the stop watch is still running (i.e. there - //! was no call to stop()) then the elapsed time is returned, otherwise the - //! time between the last start() and stop call is returned - virtual float getTime() = 0; + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; - //! Mean time to date based on the number of times the stopwatch has been - //! _stopped_ (ie finished sessions) and the current total time - virtual float getAverageTime() = 0; + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; }; @@ -57,85 +57,90 @@ class StopWatchInterface ////////////////////////////////////////////////////////////////// #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) // includes, system -#define WINDOWS_LEAN_AND_MEAN -#include -#undef min -#undef max +# define WINDOWS_LEAN_AND_MEAN +# include +# undef min +# undef max //! Windows specific implementation of StopWatch class StopWatchWin : public StopWatchInterface { - public: - //! Constructor, default - StopWatchWin() : - start_time(), end_time(), - diff_time(0.0f), total_time(0.0f), - running(false), clock_sessions(0), freq(0), freq_set(false) +public: + //! Constructor, default + StopWatchWin() + : start_time() + , end_time() + , diff_time(0.0f) + , total_time(0.0f) + , running(false) + , clock_sessions(0) + , freq(0) + , freq_set(false) + { + if(!freq_set) { - if (! freq_set) - { - // helper variable - LARGE_INTEGER temp; + // helper variable + LARGE_INTEGER temp; - // get the tick frequency from the OS - QueryPerformanceFrequency((LARGE_INTEGER *) &temp); + // get the tick frequency from the OS + QueryPerformanceFrequency((LARGE_INTEGER*) &temp); - // convert to type in which it is needed - freq = ((double) temp.QuadPart) / 1000.0; + // convert to type in which it is needed + freq = ((double) temp.QuadPart) / 1000.0; - // rememeber query - freq_set = true; - } - }; + // rememeber query + freq_set = true; + } + }; - // Destructor - ~StopWatchWin() { }; + // Destructor + ~StopWatchWin(){}; - public: - //! Start time measurement - inline void start(); +public: + //! Start time measurement + inline void start(); - //! Stop time measurement - inline void stop(); + //! Stop time measurement + inline void stop(); - //! Reset time counters to zero - inline void reset(); + //! Reset time counters to zero + inline void reset(); - //! Time in msec. after start. If the stop watch is still running (i.e. there - //! was no call to stop()) then the elapsed time is returned, otherwise the - //! time between the last start() and stop call is returned - inline float getTime(); + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); - //! Mean time to date based on the number of times the stopwatch has been - //! _stopped_ (ie finished sessions) and the current total time - inline float getAverageTime(); + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); - private: - // member variables +private: + // member variables - //! Start of measurement - LARGE_INTEGER start_time; - //! End of measurement - LARGE_INTEGER end_time; + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; - //! Time difference between the last start and stop - float diff_time; + //! Time difference between the last start and stop + float diff_time; - //! TOTAL time difference between starts and stops - float total_time; + //! TOTAL time difference between starts and stops + float total_time; - //! flag if the stop watch is running - bool running; + //! flag if the stop watch is running + bool running; - //! Number of times clock has been started - //! and stopped to allow averaging - int clock_sessions; + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; - //! tick frequency - double freq; + //! tick frequency + double freq; - //! flag if the frequency has been set - bool freq_set; + //! flag if the frequency has been set + bool freq_set; }; // functions, inlined @@ -143,10 +148,9 @@ class StopWatchWin : public StopWatchInterface //////////////////////////////////////////////////////////////////////////////// //! Start time measurement //////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchWin::start() +inline void StopWatchWin::start() { - QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + QueryPerformanceCounter((LARGE_INTEGER*) &start_time); running = true; } @@ -154,12 +158,10 @@ StopWatchWin::start() //! Stop time measurement and increment add to the current diff_time summation //! variable. Also increment the number of times this clock has been run. //////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchWin::stop() +inline void StopWatchWin::stop() { - QueryPerformanceCounter((LARGE_INTEGER *) &end_time); - diff_time = (float) - (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq); + QueryPerformanceCounter((LARGE_INTEGER*) &end_time); + diff_time = (float) (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq); total_time += diff_time; clock_sessions++; @@ -170,16 +172,15 @@ StopWatchWin::stop() //! Reset the timer to 0. Does not change the timer running state but does //! recapture this point in time as the current start time if it is running. //////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchWin::reset() +inline void StopWatchWin::reset() { diff_time = 0; total_time = 0; clock_sessions = 0; - if (running) + if(running) { - QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + QueryPerformanceCounter((LARGE_INTEGER*) &start_time); } } @@ -190,18 +191,16 @@ StopWatchWin::reset() //! current diff_time sum, otherwise the current summed time difference alone //! is returned. //////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchWin::getTime() +inline float StopWatchWin::getTime() { // Return the TOTAL time to date float retval = total_time; - if (running) + if(running) { LARGE_INTEGER temp; - QueryPerformanceCounter((LARGE_INTEGER *) &temp); - retval += (float) - (((double)(temp.QuadPart - start_time.QuadPart)) / freq); + QueryPerformanceCounter((LARGE_INTEGER*) &temp); + retval += (float) (((double) (temp.QuadPart - start_time.QuadPart)) / freq); } return retval; @@ -211,76 +210,70 @@ StopWatchWin::getTime() //! Time in msec. for a single run based on the total number of COMPLETED runs //! and the total time. //////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchWin::getAverageTime() +inline float StopWatchWin::getAverageTime() { - return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f; + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; } #else // Declarations for Stopwatch on Linux and Mac OSX // includes, system -#include -#include +# include + +# include //! Windows specific implementation of StopWatch class StopWatchLinux : public StopWatchInterface { - public: - //! Constructor, default - StopWatchLinux() : - start_time(), diff_time(0.0), total_time(0.0), - running(false), clock_sessions(0) - { }; - - // Destructor - virtual ~StopWatchLinux() - { }; - - public: - //! Start time measurement - inline void start(); +public: + //! Constructor, default + StopWatchLinux() : start_time(), diff_time(0.0), total_time(0.0), running(false), clock_sessions(0){}; - //! Stop time measurement - inline void stop(); + // Destructor + virtual ~StopWatchLinux(){}; - //! Reset time counters to zero - inline void reset(); +public: + //! Start time measurement + inline void start(); - //! Time in msec. after start. If the stop watch is still running (i.e. there - //! was no call to stop()) then the elapsed time is returned, otherwise the - //! time between the last start() and stop call is returned - inline float getTime(); + //! Stop time measurement + inline void stop(); - //! Mean time to date based on the number of times the stopwatch has been - //! _stopped_ (ie finished sessions) and the current total time - inline float getAverageTime(); + //! Reset time counters to zero + inline void reset(); - private: + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); - // helper functions + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); - //! Get difference between start time and current time - inline float getDiffTime(); +private: + // helper functions - private: + //! Get difference between start time and current time + inline float getDiffTime(); - // member variables +private: + // member variables - //! Start of measurement - struct timeval start_time; + //! Start of measurement + struct timeval start_time; - //! Time difference between the last start and stop - float diff_time; + //! Time difference between the last start and stop + float diff_time; - //! TOTAL time difference between starts and stops - float total_time; + //! TOTAL time difference between starts and stops + float total_time; - //! flag if the stop watch is running - bool running; + //! flag if the stop watch is running + bool running; - //! Number of times clock has been started - //! and stopped to allow averaging - int clock_sessions; + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; }; // functions, inlined @@ -288,8 +281,7 @@ class StopWatchLinux : public StopWatchInterface //////////////////////////////////////////////////////////////////////////////// //! Start time measurement //////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchLinux::start() +inline void StopWatchLinux::start() { gettimeofday(&start_time, 0); running = true; @@ -299,8 +291,7 @@ StopWatchLinux::start() //! Stop time measurement and increment add to the current diff_time summation //! variable. Also increment the number of times this clock has been run. //////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchLinux::stop() +inline void StopWatchLinux::stop() { diff_time = getDiffTime(); total_time += diff_time; @@ -312,14 +303,13 @@ StopWatchLinux::stop() //! Reset the timer to 0. Does not change the timer running state but does //! recapture this point in time as the current start time if it is running. //////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchLinux::reset() +inline void StopWatchLinux::reset() { diff_time = 0; total_time = 0; clock_sessions = 0; - if (running) + if(running) { gettimeofday(&start_time, 0); } @@ -331,13 +321,12 @@ StopWatchLinux::reset() //! current diff_time sum, otherwise the current summed time difference alone //! is returned. //////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchLinux::getTime() +inline float StopWatchLinux::getTime() { // Return the TOTAL time to date float retval = total_time; - if (running) + if(running) { retval += getDiffTime(); } @@ -349,23 +338,20 @@ StopWatchLinux::getTime() //! Time in msec. for a single run based on the total number of COMPLETED runs //! and the total time. //////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchLinux::getAverageTime() +inline float StopWatchLinux::getAverageTime() { - return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f; + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; } //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchLinux::getDiffTime() +inline float StopWatchLinux::getDiffTime() { struct timeval t_time; gettimeofday(&t_time, 0); // time difference in milli-seconds - return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec) - + (0.001 * (t_time.tv_usec - start_time.tv_usec))); + return (float) (1000.0 * (t_time.tv_sec - start_time.tv_sec) + (0.001 * (t_time.tv_usec - start_time.tv_usec))); } #endif // WIN32 @@ -377,14 +363,13 @@ StopWatchLinux::getDiffTime() //! @return true if a time has been created, otherwise false //! @param name of the new timer, 0 if the creation failed //////////////////////////////////////////////////////////////////////////////// -inline bool -sdkCreateTimer(StopWatchInterface **timer_interface) +inline bool sdkCreateTimer(StopWatchInterface** timer_interface) { - //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); + // printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - *timer_interface = (StopWatchInterface *)new StopWatchWin(); + *timer_interface = (StopWatchInterface*) new StopWatchWin(); #else - *timer_interface = (StopWatchInterface *)new StopWatchLinux(); + *timer_interface = (StopWatchInterface*) new StopWatchLinux(); #endif return (*timer_interface != NULL) ? true : false; } @@ -395,11 +380,10 @@ sdkCreateTimer(StopWatchInterface **timer_interface) //! @return true if a time has been deleted, otherwise false //! @param name of the timer to delete //////////////////////////////////////////////////////////////////////////////// -inline bool -sdkDeleteTimer(StopWatchInterface **timer_interface) +inline bool sdkDeleteTimer(StopWatchInterface** timer_interface) { - //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) + // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if(*timer_interface) { delete *timer_interface; *timer_interface = NULL; @@ -412,11 +396,10 @@ sdkDeleteTimer(StopWatchInterface **timer_interface) //! Start the time with name \a name //! @param name name of the timer to start //////////////////////////////////////////////////////////////////////////////// -inline bool -sdkStartTimer(StopWatchInterface **timer_interface) +inline bool sdkStartTimer(StopWatchInterface** timer_interface) { - //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) + // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if(*timer_interface) { (*timer_interface)->start(); } @@ -428,11 +411,10 @@ sdkStartTimer(StopWatchInterface **timer_interface) //! Stop the time with name \a name. Does not reset. //! @param name name of the timer to stop //////////////////////////////////////////////////////////////////////////////// -inline bool -sdkStopTimer(StopWatchInterface **timer_interface) +inline bool sdkStopTimer(StopWatchInterface** timer_interface) { // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) + if(*timer_interface) { (*timer_interface)->stop(); } @@ -444,11 +426,10 @@ sdkStopTimer(StopWatchInterface **timer_interface) //! Resets the timer's counter. //! @param name name of the timer to reset. //////////////////////////////////////////////////////////////////////////////// -inline bool -sdkResetTimer(StopWatchInterface **timer_interface) +inline bool sdkResetTimer(StopWatchInterface** timer_interface) { // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) + if(*timer_interface) { (*timer_interface)->reset(); } @@ -463,11 +444,10 @@ sdkResetTimer(StopWatchInterface **timer_interface) //! Excludes the current running time if the timer is currently running. //! @param name name of the timer to return the time of //////////////////////////////////////////////////////////////////////////////// -inline float -sdkGetAverageTimerValue(StopWatchInterface **timer_interface) +inline float sdkGetAverageTimerValue(StopWatchInterface** timer_interface) { // printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface); - if (*timer_interface) + if(*timer_interface) { return (*timer_interface)->getAverageTime(); } @@ -482,11 +462,10 @@ sdkGetAverageTimerValue(StopWatchInterface **timer_interface) //! or timer creation. //! @param name name of the timer to obtain the value of. //////////////////////////////////////////////////////////////////////////////// -inline float -sdkGetTimerValue(StopWatchInterface **timer_interface) +inline float sdkGetTimerValue(StopWatchInterface** timer_interface) { // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); - if (*timer_interface) + if(*timer_interface) { return (*timer_interface)->getTime(); } diff --git a/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp b/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp index 4c91d53f..15644865 100644 --- a/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp +++ b/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp @@ -15,40 +15,50 @@ * of the programming guide with some additions like error checking. */ -#include #include //std:cout + +#include // For the CUDA runtime routines (prefixed with "cupla_") #include -//Timer for test purpose +// Timer for test purpose #include -#include #include + +#include /** * CUDA Kernel Device code * * Computes the vector addition of A and B into C. The 3 vectors have the same * number of elements numElements. */ -struct vectorAdd { +struct vectorAdd +{ template - ALPAKA_FN_HOST_ACC - void operator()(T_Acc const &acc, const float *A, const float *B, float *C, const int numElements) const { - int begin = cupla::blockDim(acc).x * cupla::blockIdx(acc).x * cupla::threadDim(acc).x + cupla::threadIdx(acc).x * cupla::threadDim(acc).x; - if (begin < numElements) { - int end = (begin + cupla::threadDim(acc).x < numElements) ? begin+cupla::threadDim(acc).x : numElements; - for (int i=begin; i 1e-5) + if(fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { fprintf(stderr, "Result verification failed at element %d!\n", i); exit(EXIT_FAILURE); @@ -169,7 +179,7 @@ main(int argc, char *argv[]) // Free device global memory err = cuplaFree(d_A); - if (err != cuplaSuccess) + if(err != cuplaSuccess) { fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cuplaGetErrorString(err)); exit(EXIT_FAILURE); @@ -177,14 +187,14 @@ main(int argc, char *argv[]) err = cuplaFree(d_B); - if (err != cuplaSuccess) + if(err != cuplaSuccess) { fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cuplaGetErrorString(err)); exit(EXIT_FAILURE); } err = cuplaFree(d_C); - if (err != cuplaSuccess) + if(err != cuplaSuccess) { fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cuplaGetErrorString(err)); exit(EXIT_FAILURE); @@ -203,86 +213,88 @@ main(int argc, char *argv[]) // flushed before the application exits err = cuplaDeviceReset(); - if (err != cuplaSuccess) + if(err != cuplaSuccess) { fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cuplaGetErrorString(err)); exit(EXIT_FAILURE); } printf("Done\n"); - using boost::lexical_cast; using boost::bad_lexical_cast; + using boost::lexical_cast; std::vector args; - while (*++argv){ - try{ + while(*++argv) + { + try + { args.push_back(lexical_cast(*argv)); } - catch( const bad_lexical_cast &){ + catch(const bad_lexical_cast&) + { args.push_back(0); } } - //run benchmartest + // run benchmartest int first = 50000; int last = 100000; - int stepSize= 50000; - if (args.size() >1){ - first=args[0]; - last=args[1]; + int stepSize = 50000; + if(args.size() > 1) + { + first = args[0]; + last = args[1]; } - if (args.size()>2){ - stepSize=args[2]; + if(args.size() > 2) + { + stepSize = args[2]; } benchmarkTest(first, last, stepSize); cuplaDeviceReset(); return 0; } -void -benchmarkTest(int first, int last, int stepSize) +void benchmarkTest(int first, int last, int stepSize) { - - for (int numElements = first; numElements <=last ; numElements+= stepSize) { - std::cout <<"N= " < - (end-start).count() <<"ms"<(end - start).count() << "ms" + << std::endl; + // Free Device memory cuplaFree(d_A); cuplaFree(d_B); cuplaFree(d_C); } } - diff --git a/example/CUDASamples/matrixMul/src/matrixMul.cpp b/example/CUDASamples/matrixMul/src/matrixMul.cpp index e27a2db1..85f6662d 100644 --- a/example/CUDASamples/matrixMul/src/matrixMul.cpp +++ b/example/CUDASamples/matrixMul/src/matrixMul.cpp @@ -24,8 +24,8 @@ */ // System includes -#include #include +#include // CUDA runtime #include @@ -36,94 +36,89 @@ * Matrix multiplication (CUDA Kernel) on the device: C = A * B * wA is A's width and wB is B's width */ -template +template struct matrixMulCUDA { + template + ALPAKA_FN_HOST_ACC void operator()(T_Acc const& acc, float* C, float* A, float* B, int wA, int wB) const + { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; -template -ALPAKA_FN_HOST_ACC -void operator()(T_Acc const& acc,float *C, float *A, float *B, int wA, int wB) const -{ - // Block index - int bx = blockIdx.x; - int by = blockIdx.y; - - // Thread index - int tx = threadIdx.x; - int ty = threadIdx.y; - - // Index of the first sub-matrix of A processed by the block - int aBegin = wA * BLOCK_SIZE * by; - - // Index of the last sub-matrix of A processed by the block - int aEnd = aBegin + wA - 1; - - // Step size used to iterate through the sub-matrices of A - int aStep = BLOCK_SIZE; - - // Index of the first sub-matrix of B processed by the block - int bBegin = BLOCK_SIZE * bx; - - // Step size used to iterate through the sub-matrices of B - int bStep = BLOCK_SIZE * wB; + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - float Csub = 0; + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * by; - sharedMem(As, cupla::Array,BLOCK_SIZE>); - sharedMem(Bs, cupla::Array,BLOCK_SIZE>); + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int a = aBegin, b = bBegin; - a <= aEnd; - a += aStep, b += bStep) - { + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; - // Declaration of the shared memory array As used to - // store the sub-matrix of A - //__shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * bx; - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - //__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of each matrix - As[ty][tx] = A[a + wA * ty + tx]; - Bs[ty][tx] = B[b + wB * ty + tx]; + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; - // Synchronize to make sure the matrices are loaded - __syncthreads(); + sharedMem(As, cupla::Array, BLOCK_SIZE>); + sharedMem(Bs, cupla::Array, BLOCK_SIZE>); - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for(int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) + { + // Declaration of the shared memory array As used to + // store the sub-matrix of A + //__shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + //__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[ty][tx] = A[a + wA * ty + tx]; + Bs[ty][tx] = B[b + wB * ty + tx]; + + // Synchronize to make sure the matrices are loaded + __syncthreads(); + + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix #pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) - { - Csub += As[ty][k] * Bs[k][tx]; + for(int k = 0; k < BLOCK_SIZE; ++k) + { + Csub += As[ty][k] * Bs[k][tx]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); } - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; + C[c + wB * ty + tx] = Csub; } - - // Write the block sub-matrix to device memory; - // each thread writes one element - int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; - C[c + wB * ty + tx] = Csub; -} }; -void constantInit(float *data, int size, float val) +void constantInit(float* data, int size, float val) { - for (int i = 0; i < size; ++i) + for(int i = 0; i < size; ++i) { data[i] = val; } @@ -132,15 +127,15 @@ void constantInit(float *data, int size, float val) /** * Run a simple test of matrix multiplication using CUDA */ -int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB) +int matrixMultiply(int argc, char** argv, int block_size, dim3& dimsA, dim3& dimsB) { // Allocate host memory for matrices A and B unsigned int size_A = dimsA.x * dimsA.y; unsigned int mem_size_A = sizeof(float) * size_A; - float *h_A = (float *)malloc(mem_size_A); + float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = dimsB.x * dimsB.y; unsigned int mem_size_B = sizeof(float) * size_B; - float *h_B = (float *)malloc(mem_size_B); + float* h_B = (float*) malloc(mem_size_B); // Initialize host memory const float valB = 0.01f; @@ -153,9 +148,9 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim // Allocate host matrix C dim3 dimsC(dimsB.x, dimsA.y, 1); unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); - float *h_C = (float *) malloc(mem_size_C); + float* h_C = (float*) malloc(mem_size_C); - if (h_C == NULL) + if(h_C == NULL) { fprintf(stderr, "Failed to allocate host matrix C!\n"); exit(EXIT_FAILURE); @@ -163,25 +158,25 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim cudaError_t error; - error = cudaMalloc((void **) &d_A, mem_size_A); + error = cudaMalloc((void**) &d_A, mem_size_A); - if (error != cudaSuccess) + if(error != cudaSuccess) { printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } - error = cudaMalloc((void **) &d_B, mem_size_B); + error = cudaMalloc((void**) &d_B, mem_size_B); - if (error != cudaSuccess) + if(error != cudaSuccess) { printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } - error = cudaMalloc((void **) &d_C, mem_size_C); + error = cudaMalloc((void**) &d_C, mem_size_C); - if (error != cudaSuccess) + if(error != cudaSuccess) { printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); @@ -190,7 +185,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim // copy host memory to device error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); - if (error != cudaSuccess) + if(error != cudaSuccess) { printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); @@ -198,7 +193,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); - if (error != cudaSuccess) + if(error != cudaSuccess) { printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); @@ -212,13 +207,13 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim printf("Computing result using CUDA Kernel...\n"); // Performs warmup operation using matrixMul CUDA kernel - if (block_size == 16) + if(block_size == 16) { - CUPLA_KERNEL(matrixMulCUDA<16>)( grid, threads )(d_C, d_A, d_B, dimsA.x, dimsB.x); + CUPLA_KERNEL(matrixMulCUDA<16>)(grid, threads)(d_C, d_A, d_B, dimsA.x, dimsB.x); } else { - CUPLA_KERNEL(matrixMulCUDA<32>)( grid, threads )(d_C, d_A, d_B, dimsA.x, dimsB.x); + CUPLA_KERNEL(matrixMulCUDA<32>)(grid, threads)(d_C, d_A, d_B, dimsA.x, dimsB.x); } printf("done\n"); @@ -229,7 +224,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim cudaEvent_t start; error = cudaEventCreate(&start); - if (error != cudaSuccess) + if(error != cudaSuccess) { fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); @@ -238,7 +233,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim cudaEvent_t stop; error = cudaEventCreate(&stop); - if (error != cudaSuccess) + if(error != cudaSuccess) { fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); @@ -247,31 +242,31 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim // Record the start event error = cudaEventRecord(start, NULL); - if (error != cudaSuccess) + if(error != cudaSuccess) { fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Execute the kernel - int nIter = 1; //300; + int nIter = 1; // 300; - for (int j = 0; j < nIter; j++) + for(int j = 0; j < nIter; j++) { - if (block_size == 16) + if(block_size == 16) { - CUPLA_KERNEL(matrixMulCUDA<16>)( grid, threads )(d_C, d_A, d_B, dimsA.x, dimsB.x); + CUPLA_KERNEL(matrixMulCUDA<16>)(grid, threads)(d_C, d_A, d_B, dimsA.x, dimsB.x); } else { - CUPLA_KERNEL(matrixMulCUDA<32>)( grid, threads )(d_C, d_A, d_B, dimsA.x, dimsB.x); + CUPLA_KERNEL(matrixMulCUDA<32>)(grid, threads)(d_C, d_A, d_B, dimsA.x, dimsB.x); } } // Record the stop event error = cudaEventRecord(stop, NULL); - if (error != cudaSuccess) + if(error != cudaSuccess) { fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); @@ -280,7 +275,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim // Wait for the stop event to complete error = cudaEventSynchronize(stop); - if (error != cudaSuccess) + if(error != cudaSuccess) { fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); @@ -289,7 +284,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim float msecTotal = 0.0f; error = cudaEventElapsedTime(&msecTotal, start, stop); - if (error != cudaSuccess) + if(error != cudaSuccess) { fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); @@ -297,7 +292,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim // Compute and print the performance float msecPerMatrixMul = msecTotal / nIter; - double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x; + double flopsPerMatrixMul = 2.0 * (double) dimsA.x * (double) dimsA.y * (double) dimsB.x; double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n", @@ -309,7 +304,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim // Copy result from device to host error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); - if (error != cudaSuccess) + if(error != cudaSuccess) { printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); @@ -320,18 +315,18 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim // test relative error by the formula // |_cpu - _gpu|/<|x|, |y|> < eps - double eps = 1.e-6 ; // machine zero + double eps = 1.e-6; // machine zero - for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) + for(int i = 0; i < (int) (dimsC.x * dimsC.y); i++) { double abs_err = fabs(h_C[i] - (dimsA.x * valB)); double dot_length = dimsA.x; double abs_val = fabs(h_C[i]); - double rel_err = abs_err/abs_val/dot_length ; + double rel_err = abs_err / abs_val / dot_length; - if (rel_err > eps) + if(rel_err > eps) { - printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps); + printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps); correct = false; } } @@ -355,7 +350,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim // flushed before the application exits cudaDeviceReset(); - if (correct) + if(correct) { return EXIT_SUCCESS; } @@ -369,12 +364,11 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim /** * Program main */ -int main(int argc, char **argv) +int main(int argc, char** argv) { printf("[Matrix Multiply Using CUDA] - Starting...\n"); - if (checkCmdLineFlag(argc, (const char **)argv, "help") || - checkCmdLineFlag(argc, (const char **)argv, "?")) + if(checkCmdLineFlag(argc, (const char**) argv, "help") || checkCmdLineFlag(argc, (const char**) argv, "?")) { printf("Usage -device=n (n >= 0 for deviceID)\n"); printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); @@ -387,73 +381,73 @@ int main(int argc, char **argv) // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line int devID = 0; - if (checkCmdLineFlag(argc, (const char **)argv, "device")) + if(checkCmdLineFlag(argc, (const char**) argv, "device")) { - devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); + devID = getCmdLineArgumentInt(argc, (const char**) argv, "device"); cudaSetDevice(devID); } cudaError_t error; -// cudaDeviceProp deviceProp; + // cudaDeviceProp deviceProp; error = cudaGetDevice(&devID); - if (error != cudaSuccess) + if(error != cudaSuccess) { printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__); } -/* - error = cudaGetDeviceProperties(&deviceProp, devID); + /* + error = cudaGetDeviceProperties(&deviceProp, devID); - if (deviceProp.computeMode == cudaComputeModeProhibited) - { - fprintf(stderr, "Error: device is running in , no threads can use ::cudaSetDevice().\n"); - exit(EXIT_SUCCESS); - } + if (deviceProp.computeMode == cudaComputeModeProhibited) + { + fprintf(stderr, "Error: device is running in , no threads can use + ::cudaSetDevice().\n"); exit(EXIT_SUCCESS); + } - if (error != cudaSuccess) - { - printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); - } - else - { - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); - } - */ + if (error != cudaSuccess) + { + printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); + } + else + { + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, + deviceProp.minor); + } + */ // Use a larger block size for Fermi and above int block_size = 16; // (deviceProp.major < 2) ? 16 : 32; - dim3 dimsA(5*2*block_size, 5*2*block_size, 1); - dim3 dimsB(5*4*block_size, 5*2*block_size, 1); + dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1); + dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1); // width of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "wA")) + if(checkCmdLineFlag(argc, (const char**) argv, "wA")) { - dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); + dimsA.x = getCmdLineArgumentInt(argc, (const char**) argv, "wA"); } // height of Matrix A - if (checkCmdLineFlag(argc, (const char **)argv, "hA")) + if(checkCmdLineFlag(argc, (const char**) argv, "hA")) { - dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); + dimsA.y = getCmdLineArgumentInt(argc, (const char**) argv, "hA"); } // width of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "wB")) + if(checkCmdLineFlag(argc, (const char**) argv, "wB")) { - dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); + dimsB.x = getCmdLineArgumentInt(argc, (const char**) argv, "wB"); } // height of Matrix B - if (checkCmdLineFlag(argc, (const char **)argv, "hB")) + if(checkCmdLineFlag(argc, (const char**) argv, "hB")) { - dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); + dimsB.y = getCmdLineArgumentInt(argc, (const char**) argv, "hB"); } - if (dimsA.x != dimsB.y) + if(dimsA.x != dimsB.y) { - printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", - dimsA.x, dimsB.y); + printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y); exit(EXIT_FAILURE); } diff --git a/example/CUDASamples/vectorAdd/src/vectorAdd.cpp b/example/CUDASamples/vectorAdd/src/vectorAdd.cpp index 1db118f9..1e37224c 100644 --- a/example/CUDASamples/vectorAdd/src/vectorAdd.cpp +++ b/example/CUDASamples/vectorAdd/src/vectorAdd.cpp @@ -15,40 +15,49 @@ * of the programming guide with some additions like error checking. */ -#include #include //std:cout + +#include // For the CUDA runtime routines (prefixed with "cuda_") #include -//Timer for test purpose +// Timer for test purpose #include -#include #include + +#include /** * CUDA Kernel Device code * * Computes the vector addition of A and B into C. The 3 vectors have the same * number of elements numElements. */ -struct vectorAdd { +struct vectorAdd +{ template - ALPAKA_FN_HOST_ACC - void operator()(T_Acc const &acc, const float *A, const float *B, float *C, const int numElements) const { + ALPAKA_FN_HOST_ACC void operator()( + T_Acc const& acc, + const float* A, + const float* B, + float* C, + const int numElements) const + { int begin = blockDim.x * blockIdx.x * elemDim.x + threadIdx.x * elemDim.x; - if (begin < numElements) { - int end = (begin + elemDim.x < numElements) ? begin+elemDim.x : numElements; - for (int i=begin; i 1e-5) + if(fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { fprintf(stderr, "Result verification failed at element %d!\n", i); exit(EXIT_FAILURE); @@ -169,7 +178,7 @@ main(int argc, char *argv[]) // Free device global memory err = cudaFree(d_A); - if (err != cudaSuccess) + if(err != cudaSuccess) { fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); @@ -177,14 +186,14 @@ main(int argc, char *argv[]) err = cudaFree(d_B); - if (err != cudaSuccess) + if(err != cudaSuccess) { fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaFree(d_C); - if (err != cudaSuccess) + if(err != cudaSuccess) { fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); @@ -203,86 +212,88 @@ main(int argc, char *argv[]) // flushed before the application exits err = cudaDeviceReset(); - if (err != cudaSuccess) + if(err != cudaSuccess) { fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } printf("Done\n"); - using boost::lexical_cast; using boost::bad_lexical_cast; + using boost::lexical_cast; std::vector args; - while (*++argv){ - try{ + while(*++argv) + { + try + { args.push_back(lexical_cast(*argv)); } - catch( const bad_lexical_cast &){ + catch(const bad_lexical_cast&) + { args.push_back(0); } } - //run benchmartest + // run benchmartest int first = 50000; int last = 100000; - int stepSize= 50000; - if (args.size() >1){ - first=args[0]; - last=args[1]; + int stepSize = 50000; + if(args.size() > 1) + { + first = args[0]; + last = args[1]; } - if (args.size()>2){ - stepSize=args[2]; + if(args.size() > 2) + { + stepSize = args[2]; } benchmarkTest(first, last, stepSize); cudaDeviceReset(); return 0; } -void -benchmarkTest(int first, int last, int stepSize) +void benchmarkTest(int first, int last, int stepSize) { - - for (int numElements = first; numElements <=last ; numElements+= stepSize) { - std::cout <<"N= " < - (end-start).count() <<"ms"<(end - start).count() << "ms" + << std::endl; + // Free Device memory cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); } } - diff --git a/include/cuda_to_cupla.hpp b/include/cuda_to_cupla.hpp index 2070e92c..99bd48ef 100644 --- a/include/cuda_to_cupla.hpp +++ b/include/cuda_to_cupla.hpp @@ -22,8 +22,6 @@ #pragma once #include "cupla.hpp" - -#include "cupla/device_functions.hpp" - #include "cupla/cudaToCupla/driverTypes.hpp" #include "cupla/cudaToCupla/runtime.hpp" +#include "cupla/device_functions.hpp" diff --git a/include/cupla.hpp b/include/cupla.hpp index 85284e6d..74701944 100644 --- a/include/cupla.hpp +++ b/include/cupla.hpp @@ -21,5 +21,5 @@ #pragma once -#include "cupla_runtime.hpp" #include "cupla/device_functions.hpp" +#include "cupla_runtime.hpp" diff --git a/include/cupla/api/common.hpp b/include/cupla/api/common.hpp index fdcaac9f..12c9e7c2 100644 --- a/include/cupla/api/common.hpp +++ b/include/cupla/api/common.hpp @@ -21,41 +21,36 @@ #pragma once -#include - #include "cupla/namespace.hpp" #include "cupla/types.hpp" #include "cupla_driver_types.hpp" +#include + inline namespace CUPLA_ACCELERATOR_NAMESPACE { + const char* cuplaGetErrorName(cuplaError_t); -const char * -cuplaGetErrorName(cuplaError_t); + const char* cuplaGetErrorString(cuplaError_t); -const char * -cuplaGetErrorString(cuplaError_t); - -/** returns the last error from a runtime call. - * - * This call reset the error code to cuplaSuccess - * @warning If a non CUDA Alpaka backend is used this function will return always cuplaSuccess - * - * @return cuplaSuccess if there was no error else the corresponding error type - */ -cuplaError_t -cuplaGetLastError(); + /** returns the last error from a runtime call. + * + * This call reset the error code to cuplaSuccess + * @warning If a non CUDA Alpaka backend is used this function will return always cuplaSuccess + * + * @return cuplaSuccess if there was no error else the corresponding error type + */ + cuplaError_t cuplaGetLastError(); -/** returns the last error from a runtime call. - * - * This call does not reset the error code. - * @warning If a non CUDA Alpaka backend is used this function will return always cuplaSuccess - * - * @return cuplaSuccess if there was no error else the corresponding error type - */ -cuplaError_t -cuplaPeekAtLastError(); + /** returns the last error from a runtime call. + * + * This call does not reset the error code. + * @warning If a non CUDA Alpaka backend is used this function will return always cuplaSuccess + * + * @return cuplaSuccess if there was no error else the corresponding error type + */ + cuplaError_t cuplaPeekAtLastError(); -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/include/cupla/api/device.hpp b/include/cupla/api/device.hpp index c2ae1be6..8a58d2aa 100644 --- a/include/cupla/api/device.hpp +++ b/include/cupla/api/device.hpp @@ -21,34 +21,24 @@ #pragma once -#include - #include "cupla/namespace.hpp" #include "cupla/types.hpp" #include "cupla_driver_types.hpp" +#include + inline namespace CUPLA_ACCELERATOR_NAMESPACE { + cuplaError_t cuplaGetDeviceCount(int* count); -cuplaError_t -cuplaGetDeviceCount( int * count); - -cuplaError_t -cuplaSetDevice( int idx); + cuplaError_t cuplaSetDevice(int idx); -cuplaError_t -cuplaGetDevice( int * deviceId ); + cuplaError_t cuplaGetDevice(int* deviceId); -cuplaError_t -cuplaDeviceReset( ); + cuplaError_t cuplaDeviceReset(); -cuplaError_t -cuplaDeviceSynchronize( ); + cuplaError_t cuplaDeviceSynchronize(); -cuplaError_t -cuplaMemGetInfo( - size_t * free, - size_t * total -); + cuplaError_t cuplaMemGetInfo(size_t* free, size_t* total); -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/include/cupla/api/event.hpp b/include/cupla/api/event.hpp index 6b93421f..b95c1ada 100644 --- a/include/cupla/api/event.hpp +++ b/include/cupla/api/event.hpp @@ -21,48 +21,26 @@ #pragma once -#include - #include "cupla/namespace.hpp" #include "cupla/types.hpp" #include "cupla_driver_types.hpp" +#include + inline namespace CUPLA_ACCELERATOR_NAMESPACE { + cuplaError_t cuplaEventCreateWithFlags(cuplaEvent_t* event, unsigned int flags); -cuplaError_t -cuplaEventCreateWithFlags( - cuplaEvent_t * event, - unsigned int flags -); - -cuplaError_t -cuplaEventCreate( - cuplaEvent_t * event -); + cuplaError_t cuplaEventCreate(cuplaEvent_t* event); -cuplaError_t -cuplaEventDestroy( cuplaEvent_t event ); + cuplaError_t cuplaEventDestroy(cuplaEvent_t event); -cuplaError_t -cuplaEventRecord( - cuplaEvent_t event, - cuplaStream_t stream = 0 -); + cuplaError_t cuplaEventRecord(cuplaEvent_t event, cuplaStream_t stream = 0); -cuplaError_t -cuplaEventElapsedTime( - float * ms, - cuplaEvent_t start, - cuplaEvent_t end -); + cuplaError_t cuplaEventElapsedTime(float* ms, cuplaEvent_t start, cuplaEvent_t end); -cuplaError_t -cuplaEventSynchronize( - cuplaEvent_t event -); + cuplaError_t cuplaEventSynchronize(cuplaEvent_t event); -cuplaError_t -cuplaEventQuery( cuplaEvent_t event ); + cuplaError_t cuplaEventQuery(cuplaEvent_t event); -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/include/cupla/api/memory.hpp b/include/cupla/api/memory.hpp index 27f841fc..79ba0e70 100644 --- a/include/cupla/api/memory.hpp +++ b/include/cupla/api/memory.hpp @@ -21,140 +21,74 @@ #pragma once -#include - -#include "cupla/datatypes/dim3.hpp" -#include "cupla/datatypes/uint.hpp" #include "cupla/c/datatypes/cuplaExtent.hpp" +#include "cupla/c/datatypes/cuplaMemcpy3DParms.hpp" #include "cupla/c/datatypes/cuplaPitchedPtr.hpp" - +#include "cupla/c/datatypes/cuplaPos.hpp" +#include "cupla/datatypes/dim3.hpp" +#include "cupla/datatypes/uint.hpp" #include "cupla/namespace.hpp" #include "cupla/types.hpp" #include "cupla_driver_types.hpp" +#include + inline namespace CUPLA_ACCELERATOR_NAMESPACE { + cuplaError_t cuplaMalloc(void** ptrptr, size_t size); + + cuplaError_t cuplaMallocHost(void** ptrptr, size_t size); + + + cuplaError_t cuplaMallocPitch(void** devPtr, size_t* pitch, size_t const width, size_t const height); + + cuplaError_t cuplaMalloc3D(cuplaPitchedPtr* pitchedDevPtr, cuplaExtent const extent); + + + cuplaExtent make_cuplaExtent(size_t const w, size_t const h, size_t const d); + + cuplaPos make_cuplaPos(size_t const x, size_t const y, size_t const z); + + cuplaPitchedPtr make_cuplaPitchedPtr(void* const d, size_t const p, size_t const xsz, size_t const ysz); + + cuplaError_t cuplaFree(void* ptr); + + cuplaError_t cuplaFreeHost(void* ptr); + + cuplaError_t cuplaMemcpy(void* dst, const void* src, size_t count, enum cuplaMemcpyKind kind); + + cuplaError_t cuplaMemcpyAsync( + void* dst, + const void* src, + size_t count, + enum cuplaMemcpyKind kind, + cuplaStream_t stream = 0); + + cuplaError_t cuplaMemsetAsync(void* devPtr, int value, size_t count, cuplaStream_t stream = 0); + + cuplaError_t cuplaMemset(void* devPtr, int value, size_t count); + + cuplaError_t cuplaMemcpy2D( + void* dst, + size_t const dPitch, + void const* const src, + size_t const spitch, + size_t const width, + size_t const height, + enum cuplaMemcpyKind kind); + + cuplaError_t cuplaMemcpy2DAsync( + void* dst, + size_t const dPitch, + void const* const src, + size_t const spitch, + size_t const width, + size_t const height, + enum cuplaMemcpyKind kind, + cuplaStream_t const stream = 0); + + cuplaError_t cuplaMemcpy3DAsync(const cuplaMemcpy3DParms* const p, cuplaStream_t stream = 0); + + cuplaError_t cuplaMemcpy3D(const cuplaMemcpy3DParms* const p); -cuplaError_t -cuplaMalloc( - void **ptrptr, - size_t size -); - -cuplaError_t -cuplaMallocHost( - void **ptrptr, - size_t size -); - - -cuplaError_t -cuplaMallocPitch( - void ** devPtr, - size_t * pitch, - size_t const width, - size_t const height -); - -cuplaError_t -cuplaMalloc3D( - cuplaPitchedPtr * pitchedDevPtr, - cuplaExtent const extent -); - - -cuplaExtent -make_cuplaExtent( - size_t const w, - size_t const h, - size_t const d -); - -cuplaPos -make_cuplaPos( - size_t const x, - size_t const y, - size_t const z -); - -cuplaPitchedPtr -make_cuplaPitchedPtr( - void * const d, - size_t const p, - size_t const xsz, - size_t const ysz -); - -cuplaError_t -cuplaFree(void *ptr); - -cuplaError_t -cuplaFreeHost(void *ptr); - -cuplaError_t -cuplaMemcpy( - void *dst, - const void *src, - size_t count, - enum cuplaMemcpyKind kind -); - -cuplaError_t -cuplaMemcpyAsync( - void *dst, - const void *src, - size_t count, - enum cuplaMemcpyKind kind, - cuplaStream_t stream = 0 -); - -cuplaError_t -cuplaMemsetAsync( - void * devPtr, - int value, - size_t count, - cuplaStream_t stream = 0 -); - -cuplaError_t -cuplaMemset( - void * devPtr, - int value, - size_t count -); - -cuplaError_t -cuplaMemcpy2D( - void * dst, - size_t const dPitch, - void const * const src, - size_t const spitch, - size_t const width, - size_t const height, - enum cuplaMemcpyKind kind -); - -cuplaError_t -cuplaMemcpy2DAsync( - void * dst, - size_t const dPitch, - void const * const src, - size_t const spitch, - size_t const width, - size_t const height, - enum cuplaMemcpyKind kind, - cuplaStream_t const stream = 0 -); - -cuplaError_t -cuplaMemcpy3DAsync( - const cuplaMemcpy3DParms * const p, - cuplaStream_t stream = 0 -); - -cuplaError_t -cuplaMemcpy3D( - const cuplaMemcpy3DParms * const p -); - -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/include/cupla/api/stream.hpp b/include/cupla/api/stream.hpp index d68508de..0c99b8f3 100644 --- a/include/cupla/api/stream.hpp +++ b/include/cupla/api/stream.hpp @@ -21,36 +21,22 @@ #pragma once -#include - #include "cupla/namespace.hpp" #include "cupla/types.hpp" #include "cupla_driver_types.hpp" +#include + inline namespace CUPLA_ACCELERATOR_NAMESPACE { + cuplaError_t cuplaStreamCreate(cuplaStream_t* stream); -cuplaError_t -cuplaStreamCreate( - cuplaStream_t * stream -); - -cuplaError_t -cuplaStreamDestroy( cuplaStream_t stream ); + cuplaError_t cuplaStreamDestroy(cuplaStream_t stream); -cuplaError_t -cuplaStreamSynchronize( - cuplaStream_t stream -); + cuplaError_t cuplaStreamSynchronize(cuplaStream_t stream); -cuplaError_t -cuplaStreamWaitEvent( - cuplaStream_t stream, - cuplaEvent_t event, - unsigned int flags -); + cuplaError_t cuplaStreamWaitEvent(cuplaStream_t stream, cuplaEvent_t event, unsigned int flags); -cuplaError_t -cuplaStreamQuery( cuplaStream_t stream ); + cuplaError_t cuplaStreamQuery(cuplaStream_t stream); -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/include/cupla/c/datatypes/cuplaArray.hpp b/include/cupla/c/datatypes/cuplaArray.hpp index 34905370..588a4eb8 100644 --- a/include/cupla/c/datatypes/cuplaArray.hpp +++ b/include/cupla/c/datatypes/cuplaArray.hpp @@ -21,17 +21,16 @@ #pragma once -#include "cupla/namespace.hpp" -#include "cupla/types.hpp" +#include "cupla/c/datatypes/cuplaExtent.hpp" #include "cupla/c/datatypes/cuplaPitchedPtr.hpp" #include "cupla/c/datatypes/cuplaPos.hpp" -#include "cupla/c/datatypes/cuplaExtent.hpp" +#include "cupla/namespace.hpp" +#include "cupla/types.hpp" inline namespace CUPLA_ACCELERATOR_NAMESPACE { + struct cuplaArray + { + }; -struct cuplaArray -{ -}; - -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/include/cupla/c/datatypes/cuplaExtent.hpp b/include/cupla/c/datatypes/cuplaExtent.hpp index 59721eb1..b5559606 100644 --- a/include/cupla/c/datatypes/cuplaExtent.hpp +++ b/include/cupla/c/datatypes/cuplaExtent.hpp @@ -26,199 +26,127 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE { - -struct cuplaExtent{ - cupla::MemSizeType width, height, depth; - - cuplaExtent() = default; - - ALPAKA_FN_HOST_ACC - cuplaExtent( - cupla::MemSizeType const w, - cupla::MemSizeType const h, - cupla::MemSizeType const d - ) : - width( w ), - height( h ), - depth( d ) - {} - - template< - typename TDim, - typename TSize, - typename = typename std::enable_if< - (TDim::value == 3u) - >::type - > - ALPAKA_FN_HOST_ACC - cuplaExtent( - ::alpaka::Vec< - TDim, - TSize - > const &vec - ) - { - for( uint32_t i(0); i < 3u; ++i ) { - // alpaka vectors are z,y,x. - ( &this->width )[ i ] = vec[ ( 3u - 1u ) - i ]; - } - } - - ALPAKA_FN_HOST_ACC - operator ::alpaka::Vec< - cupla::AlpakaDim< 3u >, - cupla::MemSizeType - >(void) const + struct cuplaExtent { - ::alpaka::Vec< - cupla::AlpakaDim< 3u >, - cupla::MemSizeType - > vec( depth, height, width ); - return vec; - } -}; + cupla::MemSizeType width, height, depth; -} //namespace CUPLA_ACCELERATOR_NAMESPACE + cuplaExtent() = default; + ALPAKA_FN_HOST_ACC + cuplaExtent(cupla::MemSizeType const w, cupla::MemSizeType const h, cupla::MemSizeType const d) + : width(w) + , height(h) + , depth(d) + { + } -namespace alpaka -{ -namespace traits -{ + template::type> + ALPAKA_FN_HOST_ACC cuplaExtent(::alpaka::Vec const& vec) + { + for(uint32_t i(0); i < 3u; ++i) + { + // alpaka vectors are z,y,x. + (&this->width)[i] = vec[(3u - 1u) - i]; + } + } - //! dimension get trait specialization - template<> - struct DimType< - cuplaExtent - >{ - using type = ::alpaka::DimInt<3u>; + ALPAKA_FN_HOST_ACC + operator ::alpaka::Vec, cupla::MemSizeType>(void) const + { + ::alpaka::Vec, cupla::MemSizeType> vec(depth, height, width); + return vec; + } }; -} // namespace traits - -namespace traits -{ - - //! element type trait specialization - template<> - struct ElemType< - cuplaExtent - >{ - using type = cupla::MemSizeType; - }; +} // namespace CUPLA_ACCELERATOR_NAMESPACE -} // namespace traits -namespace extent -{ -namespace traits +namespace alpaka { - - //! extent get trait specialization - template< - typename T_Idx - > - struct GetExtent< - T_Idx, - cuplaExtent, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - - ALPAKA_FN_HOST_ACC - static auto - getExtent( cuplaExtent const & extents ) - -> cupla::MemSizeType { - return (&extents.width)[(3u - 1u) - T_Idx::value]; - } - }; - - //! extent set trait specialization - template< - typename T_Idx, - typename T_Extent - > - struct SetExtent< - T_Idx, - cuplaExtent, - T_Extent, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - ALPAKA_FN_HOST_ACC - static auto - setExtent( - cuplaExtent &extents, - T_Extent const &extent - ) - -> void + namespace traits + { + //! dimension get trait specialization + template<> + struct DimType { - (&extents.width)[(3u - 1u) - T_Idx::value] = extent; - } - }; -} // namespace traits -} // namespace extent + using type = ::alpaka::DimInt<3u>; + }; -namespace traits -{ + } // namespace traits - //! offset get trait specialization - template< - typename T_Idx - > - struct GetOffset< - T_Idx, - cuplaExtent, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - ALPAKA_FN_HOST_ACC - static auto - getOffset( cuplaExtent const & offsets ) - -> cupla::MemSizeType{ - return (&offsets.width)[(3u - 1u) - T_Idx::value]; - } - }; + namespace traits + { + //! element type trait specialization + template<> + struct ElemType + { + using type = cupla::MemSizeType; + }; + } // namespace traits - //! offset set trait specialization. - template< - typename T_Idx, - typename T_Offset - > - struct SetOffset< - T_Idx, - cuplaExtent, - T_Offset, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - ALPAKA_FN_HOST_ACC - static auto - setOffset( - cuplaExtent &offsets, - T_Offset const &offset - ) - -> void { - offsets[(3u - 1u) - T_Idx::value] = offset; - } - }; -} // namespace traits + namespace extent + { + namespace traits + { + //! extent get trait specialization + template + struct GetExtent T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto getExtent(cuplaExtent const& extents) -> cupla::MemSizeType + { + return (&extents.width)[(3u - 1u) - T_Idx::value]; + } + }; + + //! extent set trait specialization + template + struct SetExtent T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto setExtent(cuplaExtent& extents, T_Extent const& extent) -> void + { + (&extents.width)[(3u - 1u) - T_Idx::value] = extent; + } + }; + } // namespace traits + } // namespace extent + + namespace traits + { + //! offset get trait specialization + template + struct GetOffset T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto getOffset(cuplaExtent const& offsets) -> cupla::MemSizeType + { + return (&offsets.width)[(3u - 1u) - T_Idx::value]; + } + }; -namespace traits -{ - //! size type trait specialization. - template<> - struct IdxType< - cuplaExtent - >{ - using type = cupla::MemSizeType; - }; + //! offset set trait specialization. + template + struct SetOffset T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto setOffset(cuplaExtent& offsets, T_Offset const& offset) -> void + { + offsets[(3u - 1u) - T_Idx::value] = offset; + } + }; + } // namespace traits + + namespace traits + { + //! size type trait specialization. + template<> + struct IdxType + { + using type = cupla::MemSizeType; + }; -} // namespace traits -} // namespave alpaka + } // namespace traits +} // namespace alpaka diff --git a/include/cupla/c/datatypes/cuplaMemcpy3DParms.hpp b/include/cupla/c/datatypes/cuplaMemcpy3DParms.hpp index f23da699..de5caa2e 100644 --- a/include/cupla/c/datatypes/cuplaMemcpy3DParms.hpp +++ b/include/cupla/c/datatypes/cuplaMemcpy3DParms.hpp @@ -21,28 +21,27 @@ #pragma once -#include "cupla/namespace.hpp" -#include "cupla/types.hpp" #include "cupla/c/datatypes/cuplaArray.hpp" +#include "cupla/c/datatypes/cuplaExtent.hpp" #include "cupla/c/datatypes/cuplaPitchedPtr.hpp" #include "cupla/c/datatypes/cuplaPos.hpp" -#include "cupla/c/datatypes/cuplaExtent.hpp" +#include "cupla/namespace.hpp" +#include "cupla/types.hpp" inline namespace CUPLA_ACCELERATOR_NAMESPACE { + struct cuplaMemcpy3DParms + { + cuplaArray* dstArray; + cuplaPos dstPos; + cuplaPitchedPtr dstPtr; + cuplaExtent extent; + cuplaMemcpyKind kind; + cuplaArray* srcArray; + cuplaPos srcPos; + cuplaPitchedPtr srcPtr; -struct cuplaMemcpy3DParms -{ - cuplaArray* dstArray; - cuplaPos dstPos; - cuplaPitchedPtr dstPtr; - cuplaExtent extent; - cuplaMemcpyKind kind; - cuplaArray * srcArray; - cuplaPos srcPos; - cuplaPitchedPtr srcPtr; - - cuplaMemcpy3DParms() = default; -}; + cuplaMemcpy3DParms() = default; + }; -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/include/cupla/c/datatypes/cuplaPitchedPtr.hpp b/include/cupla/c/datatypes/cuplaPitchedPtr.hpp index 881349c0..ee139af6 100644 --- a/include/cupla/c/datatypes/cuplaPitchedPtr.hpp +++ b/include/cupla/c/datatypes/cuplaPitchedPtr.hpp @@ -21,33 +21,31 @@ #pragma once +#include "cupla/datatypes/uint.hpp" #include "cupla/namespace.hpp" #include "cupla/types.hpp" -#include "cupla/datatypes/uint.hpp" inline namespace CUPLA_ACCELERATOR_NAMESPACE { - -struct cuplaPitchedPtr -{ - void * ptr; - cupla::MemSizeType pitch, xsize, ysize; - - cuplaPitchedPtr() = default; - - ALPAKA_FN_HOST_ACC - cuplaPitchedPtr( - void * const d, - cupla::MemSizeType const p, - cupla::MemSizeType const xsz, - cupla::MemSizeType const ysz - ) : - ptr( d ), - pitch( p ), - xsize( xsz ), - ysize( ysz ) - {} - -}; - -} //namespace CUPLA_ACCELERATOR_NAMESPACE + struct cuplaPitchedPtr + { + void* ptr; + cupla::MemSizeType pitch, xsize, ysize; + + cuplaPitchedPtr() = default; + + ALPAKA_FN_HOST_ACC + cuplaPitchedPtr( + void* const d, + cupla::MemSizeType const p, + cupla::MemSizeType const xsz, + cupla::MemSizeType const ysz) + : ptr(d) + , pitch(p) + , xsize(xsz) + , ysize(ysz) + { + } + }; + +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/include/cupla/c/datatypes/cuplaPos.hpp b/include/cupla/c/datatypes/cuplaPos.hpp index b99a26de..1a56f46b 100644 --- a/include/cupla/c/datatypes/cuplaPos.hpp +++ b/include/cupla/c/datatypes/cuplaPos.hpp @@ -26,198 +26,123 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE { - -struct cuplaPos{ - size_t x, y, z; - - cuplaPos() = default; - - ALPAKA_FN_HOST_ACC - cuplaPos( - size_t const x_in, - size_t const y_in, - size_t const z_in - ) : - x( x_in ), - y( y_in ), - z( z_in ) - {} - - template< - typename TDim, - typename TSize, - typename = typename std::enable_if< - (TDim::value == 3u) - >::type - > - ALPAKA_FN_HOST_ACC - cuplaPos( - ::alpaka::Vec< - TDim, - TSize - > const &vec - ) - { - for( uint32_t i(0); i < 3u; ++i ) { - // alpaka vectors are z,y,x. - ( &this->x )[ i ] = vec[ ( 3u - 1u ) - i ]; - } - } - - ALPAKA_FN_HOST_ACC - operator ::alpaka::Vec< - cupla::AlpakaDim< 3u >, - cupla::MemSizeType - >(void) const + struct cuplaPos { - ::alpaka::Vec< - cupla::AlpakaDim< 3u >, - cupla::MemSizeType - > vec( x, y, z ); - return vec; - } -}; - -} //namespace CUPLA_ACCELERATOR_NAMESPACE - -namespace alpaka -{ -namespace traits -{ - - //! dimension get trait specialization - template<> - struct DimType< - cuplaPos - >{ - using type = ::alpaka::DimInt<3u>; - }; - -} // namespace traits + size_t x, y, z; -namespace traits -{ - - //! element type trait specialization - template<> - struct ElemType< - cuplaPos - >{ - using type = cupla::MemSizeType; - }; - -} // namespace traits - -namespace extent -{ -namespace traits -{ - - //! extent get trait specialization - template< - typename T_Idx - > - struct GetExtent< - T_Idx, - cuplaPos, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ + cuplaPos() = default; ALPAKA_FN_HOST_ACC - static auto - getExtent( cuplaPos const & extents ) - -> cupla::MemSizeType { - return (&extents.x)[(3u - 1u) - T_Idx::value]; - } - }; + cuplaPos(size_t const x_in, size_t const y_in, size_t const z_in) : x(x_in), y(y_in), z(z_in) + { + } + + template::type> + ALPAKA_FN_HOST_ACC cuplaPos(::alpaka::Vec const& vec) + { + for(uint32_t i(0); i < 3u; ++i) + { + // alpaka vectors are z,y,x. + (&this->x)[i] = vec[(3u - 1u) - i]; + } + } - //! extent set trait specialization - template< - typename T_Idx, - typename T_Pos - > - struct SetExtent< - T_Idx, - cuplaPos, - T_Pos, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ ALPAKA_FN_HOST_ACC - static auto - setExtent( - cuplaPos &extents, - T_Pos const &extent - ) - -> void + operator ::alpaka::Vec, cupla::MemSizeType>(void) const { - (&extents.x)[(3u - 1u) - T_Idx::value] = extent; + ::alpaka::Vec, cupla::MemSizeType> vec(x, y, z); + return vec; } }; -} // namespace traits -} // namespace extent -namespace traits +} // namespace CUPLA_ACCELERATOR_NAMESPACE + +namespace alpaka { + namespace traits + { + //! dimension get trait specialization + template<> + struct DimType + { + using type = ::alpaka::DimInt<3u>; + }; - //! offset get trait specialization - template< - typename T_Idx - > - struct GetOffset< - T_Idx, - cuplaPos, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - ALPAKA_FN_HOST_ACC - static auto - getOffset( cuplaPos const & offsets ) - -> cupla::MemSizeType{ - return (&offsets.x)[(3u - 1u) - T_Idx::value]; - } - }; + } // namespace traits + namespace traits + { + //! element type trait specialization + template<> + struct ElemType + { + using type = cupla::MemSizeType; + }; - //! offset set trait specialization. - template< - typename T_Idx, - typename T_Offset - > - struct SetOffset< - T_Idx, - cuplaPos, - T_Offset, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - ALPAKA_FN_HOST_ACC - static auto - setOffset( - cuplaPos &offsets, - T_Offset const &offset - ) - -> void { - offsets[(3u - 1u) - T_Idx::value] = offset; - } - }; -} // namespace traits + } // namespace traits -namespace traits -{ + namespace extent + { + namespace traits + { + //! extent get trait specialization + template + struct GetExtent T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto getExtent(cuplaPos const& extents) -> cupla::MemSizeType + { + return (&extents.x)[(3u - 1u) - T_Idx::value]; + } + }; + + //! extent set trait specialization + template + struct SetExtent T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto setExtent(cuplaPos& extents, T_Pos const& extent) -> void + { + (&extents.x)[(3u - 1u) - T_Idx::value] = extent; + } + }; + } // namespace traits + } // namespace extent + + namespace traits + { + //! offset get trait specialization + template + struct GetOffset T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto getOffset(cuplaPos const& offsets) -> cupla::MemSizeType + { + return (&offsets.x)[(3u - 1u) - T_Idx::value]; + } + }; - //! size type trait specialization. - template<> - struct IdxType< - cuplaPos - >{ - using type = cupla::MemSizeType; - }; -} // namespace traits -} // namespave alpaka + //! offset set trait specialization. + template + struct SetOffset T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto setOffset(cuplaPos& offsets, T_Offset const& offset) -> void + { + offsets[(3u - 1u) - T_Idx::value] = offset; + } + }; + } // namespace traits + + namespace traits + { + //! size type trait specialization. + template<> + struct IdxType + { + using type = cupla::MemSizeType; + }; + + } // namespace traits +} // namespace alpaka diff --git a/include/cupla/config/AnyOacc.hpp b/include/cupla/config/AnyOacc.hpp index e1602ad6..7000fbd1 100644 --- a/include/cupla/config/AnyOacc.hpp +++ b/include/cupla/config/AnyOacc.hpp @@ -24,20 +24,20 @@ #include #ifndef CUPLA_HEADER_ONLY -# define CUPLA_HEADER_ONLY 1 +# define CUPLA_HEADER_ONLY 1 #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# define CUPLA_HEADER_ONLY_FUNC_SPEC inline +#if(CUPLA_HEADER_ONLY == 1) +# define CUPLA_HEADER_ONLY_FUNC_SPEC inline #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# include "cupla/../../src/manager/Driver.cpp" -# include "cupla/../../src/common.cpp" -# include "cupla/../../src/device.cpp" -# include "cupla/../../src/event.cpp" -# include "cupla/../../src/memory.cpp" -# include "cupla/../../src/stream.cpp" +#if(CUPLA_HEADER_ONLY == 1) +# include "cupla/../../src/common.cpp" +# include "cupla/../../src/device.cpp" +# include "cupla/../../src/event.cpp" +# include "cupla/../../src/manager/Driver.cpp" +# include "cupla/../../src/memory.cpp" +# include "cupla/../../src/stream.cpp" #endif #include "cupla.hpp" diff --git a/include/cupla/config/AnyOmp5.hpp b/include/cupla/config/AnyOmp5.hpp index 64f547d8..45089a01 100644 --- a/include/cupla/config/AnyOmp5.hpp +++ b/include/cupla/config/AnyOmp5.hpp @@ -24,20 +24,20 @@ #include #ifndef CUPLA_HEADER_ONLY -# define CUPLA_HEADER_ONLY 1 +# define CUPLA_HEADER_ONLY 1 #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# define CUPLA_HEADER_ONLY_FUNC_SPEC inline +#if(CUPLA_HEADER_ONLY == 1) +# define CUPLA_HEADER_ONLY_FUNC_SPEC inline #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# include "cupla/../../src/manager/Driver.cpp" -# include "cupla/../../src/common.cpp" -# include "cupla/../../src/device.cpp" -# include "cupla/../../src/event.cpp" -# include "cupla/../../src/memory.cpp" -# include "cupla/../../src/stream.cpp" +#if(CUPLA_HEADER_ONLY == 1) +# include "cupla/../../src/common.cpp" +# include "cupla/../../src/device.cpp" +# include "cupla/../../src/event.cpp" +# include "cupla/../../src/manager/Driver.cpp" +# include "cupla/../../src/memory.cpp" +# include "cupla/../../src/stream.cpp" #endif #include "cupla.hpp" diff --git a/include/cupla/config/CpuOmp2Blocks.hpp b/include/cupla/config/CpuOmp2Blocks.hpp index 34881724..75aef43d 100644 --- a/include/cupla/config/CpuOmp2Blocks.hpp +++ b/include/cupla/config/CpuOmp2Blocks.hpp @@ -24,20 +24,20 @@ #include #ifndef CUPLA_HEADER_ONLY -# define CUPLA_HEADER_ONLY 1 +# define CUPLA_HEADER_ONLY 1 #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# define CUPLA_HEADER_ONLY_FUNC_SPEC inline +#if(CUPLA_HEADER_ONLY == 1) +# define CUPLA_HEADER_ONLY_FUNC_SPEC inline #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# include "cupla/../../src/manager/Driver.cpp" -# include "cupla/../../src/common.cpp" -# include "cupla/../../src/device.cpp" -# include "cupla/../../src/event.cpp" -# include "cupla/../../src/memory.cpp" -# include "cupla/../../src/stream.cpp" +#if(CUPLA_HEADER_ONLY == 1) +# include "cupla/../../src/common.cpp" +# include "cupla/../../src/device.cpp" +# include "cupla/../../src/event.cpp" +# include "cupla/../../src/manager/Driver.cpp" +# include "cupla/../../src/memory.cpp" +# include "cupla/../../src/stream.cpp" #endif #include "cupla.hpp" diff --git a/include/cupla/config/CpuOmp2Threads.hpp b/include/cupla/config/CpuOmp2Threads.hpp index 287bef45..8cd50938 100644 --- a/include/cupla/config/CpuOmp2Threads.hpp +++ b/include/cupla/config/CpuOmp2Threads.hpp @@ -24,20 +24,20 @@ #include #ifndef CUPLA_HEADER_ONLY -# define CUPLA_HEADER_ONLY 1 +# define CUPLA_HEADER_ONLY 1 #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# define CUPLA_HEADER_ONLY_FUNC_SPEC inline +#if(CUPLA_HEADER_ONLY == 1) +# define CUPLA_HEADER_ONLY_FUNC_SPEC inline #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# include "cupla/../../src/manager/Driver.cpp" -# include "cupla/../../src/common.cpp" -# include "cupla/../../src/device.cpp" -# include "cupla/../../src/event.cpp" -# include "cupla/../../src/memory.cpp" -# include "cupla/../../src/stream.cpp" +#if(CUPLA_HEADER_ONLY == 1) +# include "cupla/../../src/common.cpp" +# include "cupla/../../src/device.cpp" +# include "cupla/../../src/event.cpp" +# include "cupla/../../src/manager/Driver.cpp" +# include "cupla/../../src/memory.cpp" +# include "cupla/../../src/stream.cpp" #endif #include "cupla.hpp" diff --git a/include/cupla/config/CpuSerial.hpp b/include/cupla/config/CpuSerial.hpp index a88f3541..09e42f63 100644 --- a/include/cupla/config/CpuSerial.hpp +++ b/include/cupla/config/CpuSerial.hpp @@ -24,20 +24,20 @@ #include #ifndef CUPLA_HEADER_ONLY -# define CUPLA_HEADER_ONLY 1 +# define CUPLA_HEADER_ONLY 1 #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# define CUPLA_HEADER_ONLY_FUNC_SPEC inline +#if(CUPLA_HEADER_ONLY == 1) +# define CUPLA_HEADER_ONLY_FUNC_SPEC inline #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# include "cupla/../../src/manager/Driver.cpp" -# include "cupla/../../src/common.cpp" -# include "cupla/../../src/device.cpp" -# include "cupla/../../src/event.cpp" -# include "cupla/../../src/memory.cpp" -# include "cupla/../../src/stream.cpp" +#if(CUPLA_HEADER_ONLY == 1) +# include "cupla/../../src/common.cpp" +# include "cupla/../../src/device.cpp" +# include "cupla/../../src/event.cpp" +# include "cupla/../../src/manager/Driver.cpp" +# include "cupla/../../src/memory.cpp" +# include "cupla/../../src/stream.cpp" #endif #include "cupla.hpp" diff --git a/include/cupla/config/CpuTbbBlocks.hpp b/include/cupla/config/CpuTbbBlocks.hpp index 643c23d5..81af6924 100644 --- a/include/cupla/config/CpuTbbBlocks.hpp +++ b/include/cupla/config/CpuTbbBlocks.hpp @@ -24,20 +24,20 @@ #include #ifndef CUPLA_HEADER_ONLY -# define CUPLA_HEADER_ONLY 1 +# define CUPLA_HEADER_ONLY 1 #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# define CUPLA_HEADER_ONLY_FUNC_SPEC inline +#if(CUPLA_HEADER_ONLY == 1) +# define CUPLA_HEADER_ONLY_FUNC_SPEC inline #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# include "cupla/../../src/manager/Driver.cpp" -# include "cupla/../../src/common.cpp" -# include "cupla/../../src/device.cpp" -# include "cupla/../../src/event.cpp" -# include "cupla/../../src/memory.cpp" -# include "cupla/../../src/stream.cpp" +#if(CUPLA_HEADER_ONLY == 1) +# include "cupla/../../src/common.cpp" +# include "cupla/../../src/device.cpp" +# include "cupla/../../src/event.cpp" +# include "cupla/../../src/manager/Driver.cpp" +# include "cupla/../../src/memory.cpp" +# include "cupla/../../src/stream.cpp" #endif #include "cupla.hpp" diff --git a/include/cupla/config/CpuThreads.hpp b/include/cupla/config/CpuThreads.hpp index 036c963f..9514b949 100644 --- a/include/cupla/config/CpuThreads.hpp +++ b/include/cupla/config/CpuThreads.hpp @@ -24,20 +24,20 @@ #include #ifndef CUPLA_HEADER_ONLY -# define CUPLA_HEADER_ONLY 1 +# define CUPLA_HEADER_ONLY 1 #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# define CUPLA_HEADER_ONLY_FUNC_SPEC inline +#if(CUPLA_HEADER_ONLY == 1) +# define CUPLA_HEADER_ONLY_FUNC_SPEC inline #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# include "cupla/../../src/manager/Driver.cpp" -# include "cupla/../../src/common.cpp" -# include "cupla/../../src/device.cpp" -# include "cupla/../../src/event.cpp" -# include "cupla/../../src/memory.cpp" -# include "cupla/../../src/stream.cpp" +#if(CUPLA_HEADER_ONLY == 1) +# include "cupla/../../src/common.cpp" +# include "cupla/../../src/device.cpp" +# include "cupla/../../src/event.cpp" +# include "cupla/../../src/manager/Driver.cpp" +# include "cupla/../../src/memory.cpp" +# include "cupla/../../src/stream.cpp" #endif #include "cupla.hpp" diff --git a/include/cupla/config/GpuCudaRt.hpp b/include/cupla/config/GpuCudaRt.hpp index e6d52ad4..4b71d411 100644 --- a/include/cupla/config/GpuCudaRt.hpp +++ b/include/cupla/config/GpuCudaRt.hpp @@ -24,20 +24,20 @@ #include #ifndef CUPLA_HEADER_ONLY -# define CUPLA_HEADER_ONLY 1 +# define CUPLA_HEADER_ONLY 1 #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# define CUPLA_HEADER_ONLY_FUNC_SPEC inline +#if(CUPLA_HEADER_ONLY == 1) +# define CUPLA_HEADER_ONLY_FUNC_SPEC inline #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# include "cupla/../../src/manager/Driver.cpp" -# include "cupla/../../src/common.cpp" -# include "cupla/../../src/device.cpp" -# include "cupla/../../src/event.cpp" -# include "cupla/../../src/memory.cpp" -# include "cupla/../../src/stream.cpp" +#if(CUPLA_HEADER_ONLY == 1) +# include "cupla/../../src/common.cpp" +# include "cupla/../../src/device.cpp" +# include "cupla/../../src/event.cpp" +# include "cupla/../../src/manager/Driver.cpp" +# include "cupla/../../src/memory.cpp" +# include "cupla/../../src/stream.cpp" #endif #include "cupla.hpp" diff --git a/include/cupla/config/GpuHipRt.hpp b/include/cupla/config/GpuHipRt.hpp index 6195cdf1..1328442e 100644 --- a/include/cupla/config/GpuHipRt.hpp +++ b/include/cupla/config/GpuHipRt.hpp @@ -24,20 +24,20 @@ #include #ifndef CUPLA_HEADER_ONLY -# define CUPLA_HEADER_ONLY 1 +# define CUPLA_HEADER_ONLY 1 #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# define CUPLA_HEADER_ONLY_FUNC_SPEC inline +#if(CUPLA_HEADER_ONLY == 1) +# define CUPLA_HEADER_ONLY_FUNC_SPEC inline #endif -#if( CUPLA_HEADER_ONLY == 1 ) -# include "cupla/../../src/manager/Driver.cpp" -# include "cupla/../../src/common.cpp" -# include "cupla/../../src/device.cpp" -# include "cupla/../../src/event.cpp" -# include "cupla/../../src/memory.cpp" -# include "cupla/../../src/stream.cpp" +#if(CUPLA_HEADER_ONLY == 1) +# include "cupla/../../src/common.cpp" +# include "cupla/../../src/device.cpp" +# include "cupla/../../src/event.cpp" +# include "cupla/../../src/manager/Driver.cpp" +# include "cupla/../../src/memory.cpp" +# include "cupla/../../src/stream.cpp" #endif #include "cupla.hpp" diff --git a/include/cupla/cudaToCupla/driverTypes.hpp b/include/cupla/cudaToCupla/driverTypes.hpp index 8a798567..76090773 100644 --- a/include/cupla/cudaToCupla/driverTypes.hpp +++ b/include/cupla/cudaToCupla/driverTypes.hpp @@ -51,7 +51,7 @@ #define cudaMemcpy3DParms cuplaMemcpy3DParms #ifdef cudaEventBlockingSync -#undef cudaEventBlockingSync +# undef cudaEventBlockingSync #endif /* cudaEventBlockingSync is a define in CUDA, hence we must remove * the old definition with the cupla enum @@ -59,7 +59,7 @@ #define cudaEventBlockingSync cuplaEventBlockingSync #ifdef cudaEventDisableTiming -#undef cudaEventDisableTiming +# undef cudaEventDisableTiming #endif /* cudaEventDisableTiming is a define in CUDA therefore we must remove * the old definition with the cupla enum @@ -135,23 +135,21 @@ ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int3 make_int3(int x, int y, int z) // recast functions namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ - - template< typename A, typename B > - ALPAKA_FN_HOST_ACC - B A_as_B( A const & x ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - static_assert( sizeof(A) == sizeof(B), "reinterpretation assumes data types of same size!" ); - return reinterpret_cast< B const & >( x ); - } - -} // namespace CUPLA_ACCELERATOR_NAMESPACE + template + ALPAKA_FN_HOST_ACC B A_as_B(A const& x) + { + static_assert(sizeof(A) == sizeof(B), "reinterpretation assumes data types of same size!"); + return reinterpret_cast(x); + } + + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla #ifndef ALPAKA_ACC_GPU_CUDA_ENABLED -# define __int_as_float(...) cupla::A_as_B< int, float >( __VA_ARGS__ ) -# define __float_as_int(...) cupla::A_as_B< float, int >( __VA_ARGS__ ) -# define __longlong_as_double(...) cupla::A_as_B< long long, double >( __VA_ARGS__ ) -# define __double_as_longlong(...) cupla::A_as_B< double, long long >( __VA_ARGS__ ) +# define __int_as_float(...) cupla::A_as_B(__VA_ARGS__) +# define __float_as_int(...) cupla::A_as_B(__VA_ARGS__) +# define __longlong_as_double(...) cupla::A_as_B(__VA_ARGS__) +# define __double_as_longlong(...) cupla::A_as_B(__VA_ARGS__) #endif diff --git a/include/cupla/cudaToCupla/runtime.hpp b/include/cupla/cudaToCupla/runtime.hpp index cea4d4a9..1955f3f0 100644 --- a/include/cupla/cudaToCupla/runtime.hpp +++ b/include/cupla/cudaToCupla/runtime.hpp @@ -85,7 +85,7 @@ * are disabled in CUDA */ #if CUPLA_DEVICE_COMPILE == 0 -# define __fdividef(a,b) ((a)/(b)) -# define __expf(a) cupla::math::exp(a) -# define __logf(a) cupla::math::log(a) +# define __fdividef(a, b) ((a) / (b)) +# define __expf(a) cupla::math::exp(a) +# define __logf(a) cupla::math::log(a) #endif diff --git a/include/cupla/datatypes/Array.hpp b/include/cupla/datatypes/Array.hpp index dd0d4d6d..05fcc3f1 100644 --- a/include/cupla/datatypes/Array.hpp +++ b/include/cupla/datatypes/Array.hpp @@ -26,38 +26,25 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ - - template< - typename T_Type, - size_t T_size - > - struct Array{ - T_Type m_data[T_size]; - - template< - typename T_Idx - > - ALPAKA_FN_HOST_ACC - const T_Type & - operator[]( - const T_Idx idx - ) const { - return m_data[idx]; - } - - template< - typename T_Idx - > - ALPAKA_FN_HOST_ACC - T_Type & - operator[]( - const T_Idx idx - ){ - return m_data[idx]; - } - }; - -} //namespace CUPLA_ACCELERATOR_NAMESPACE -} //namespace cupla + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + template + struct Array + { + T_Type m_data[T_size]; + + template + ALPAKA_FN_HOST_ACC const T_Type& operator[](const T_Idx idx) const + { + return m_data[idx]; + } + + template + ALPAKA_FN_HOST_ACC T_Type& operator[](const T_Idx idx) + { + return m_data[idx]; + } + }; + + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/include/cupla/datatypes/dim3.hpp b/include/cupla/datatypes/dim3.hpp index 5f42db23..5a24d53d 100644 --- a/include/cupla/datatypes/dim3.hpp +++ b/include/cupla/datatypes/dim3.hpp @@ -21,49 +21,38 @@ #pragma once +#include "cupla/datatypes/uint.hpp" #include "cupla/namespace.hpp" #include "cupla/types.hpp" -#include "cupla/datatypes/uint.hpp" namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ - - struct dim3 + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - IdxType x, y, z; - - ALPAKA_FN_HOST_ACC - dim3( - IdxType vx = 1, - IdxType vy = 1, - IdxType vz = 1 - ) : - x(vx), - y(vy), - z(vz) - {} - - ALPAKA_FN_HOST_ACC - dim3( - const uint3& v - ) : - x(v.x), - y(v.y), - z(v.z) - {} - - ALPAKA_FN_HOST_ACC - operator uint3(void) + struct dim3 { - uint3 t; - t.x = x; - t.y = y; - t.z = z; - return t; - } - }; - -} //namespace CUPLA_ACCELERATOR_NAMESPACE -} //namespace cupla + IdxType x, y, z; + + ALPAKA_FN_HOST_ACC + dim3(IdxType vx = 1, IdxType vy = 1, IdxType vz = 1) : x(vx), y(vy), z(vz) + { + } + + ALPAKA_FN_HOST_ACC + dim3(const uint3& v) : x(v.x), y(v.y), z(v.z) + { + } + + ALPAKA_FN_HOST_ACC + operator uint3(void) + { + uint3 t; + t.x = x; + t.y = y; + t.z = z; + return t; + } + }; + + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/include/cupla/datatypes/uint.hpp b/include/cupla/datatypes/uint.hpp index 11c1d509..be77cabd 100644 --- a/include/cupla/datatypes/uint.hpp +++ b/include/cupla/datatypes/uint.hpp @@ -26,200 +26,133 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ - - struct uint3{ - IdxType x, y, z; - - uint3() = default; - - template< - typename TDim, - typename TSize, - typename = typename std::enable_if< - (TDim::value == 3u) - >::type - > - ALPAKA_FN_HOST_ACC - uint3( - ::alpaka::Vec< - TDim, - TSize - > const &vec - ){ - for (uint32_t i(0); i < 3u; ++i) { - // alpaka vectors are z,y,x. - (&(this->x))[i] = vec[(3u - 1u) - i]; + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + struct uint3 + { + IdxType x, y, z; + + uint3() = default; + + template::type> + ALPAKA_FN_HOST_ACC uint3(::alpaka::Vec const& vec) + { + for(uint32_t i(0); i < 3u; ++i) + { + // alpaka vectors are z,y,x. + (&(this->x))[i] = vec[(3u - 1u) - i]; + } } - } - -#if( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1 ) - ALPAKA_FN_HOST_ACC - uint3( - ::uint3 const & vec - ){ - for (uint32_t i(0); i < 3u; ++i) { - (&(this->x))[i] = (&(vec.x))[i]; + +#if(ALPAKA_ACC_GPU_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1) + ALPAKA_FN_HOST_ACC + uint3(::uint3 const& vec) + { + for(uint32_t i(0); i < 3u; ++i) + { + (&(this->x))[i] = (&(vec.x))[i]; + } } - } #endif - ALPAKA_FN_HOST_ACC - operator ::alpaka::Vec< - cupla::AlpakaDim< 3u >, - IdxType - >(void) const - { - ::alpaka::Vec< - cupla::AlpakaDim< 3u >, - IdxType - > vec(z, y, x); - return vec; - } - }; - -} // namespace CUPLA_ACCELERATOR_NAMESPACE + ALPAKA_FN_HOST_ACC + operator ::alpaka::Vec, IdxType>(void) const + { + ::alpaka::Vec, IdxType> vec(z, y, x); + return vec; + } + }; + + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla namespace alpaka { -namespace traits -{ - - //! dimension get trait specialization - template<> - struct DimType< - cupla::uint3 - >{ - using type = ::alpaka::DimInt<3u>; - }; - -} // namespace traits - -namespace traits -{ + namespace traits + { + //! dimension get trait specialization + template<> + struct DimType + { + using type = ::alpaka::DimInt<3u>; + }; - //! element type trait specialization - template<> - struct ElemType< - cupla::uint3 - >{ - using type = cupla::IdxType; - }; + } // namespace traits -} // namespace traits + namespace traits + { + //! element type trait specialization + template<> + struct ElemType + { + using type = cupla::IdxType; + }; -namespace extent -{ -namespace traits -{ + } // namespace traits - //! extent get trait specialization - template< - typename T_Idx - > - struct GetExtent< - T_Idx, - cupla::uint3, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - - ALPAKA_FN_HOST_ACC - static auto - getExtent( cupla::uint3 const &extents ) - -> cupla::IdxType { - return (&extents.x)[(3u - 1u) - T_Idx::value]; - } - }; - - //! extent set trait specialization - template< - typename T_Idx, - typename T_Extent - > - struct SetExtent< - T_Idx, cupla::uint3, - T_Extent, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - ALPAKA_FN_HOST_ACC - static auto - setExtent( - cupla::uint3 &extents, - T_Extent const &extent - ) - -> void + namespace extent + { + namespace traits { - (&extents.x)[(3u - 1u) - T_Idx::value] = extent; - } - }; -} // namespace traits -} // namespace extent - -namespace traits -{ + //! extent get trait specialization + template + struct GetExtent T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto getExtent(cupla::uint3 const& extents) -> cupla::IdxType + { + return (&extents.x)[(3u - 1u) - T_Idx::value]; + } + }; + + //! extent set trait specialization + template + struct SetExtent T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto setExtent(cupla::uint3& extents, T_Extent const& extent) -> void + { + (&extents.x)[(3u - 1u) - T_Idx::value] = extent; + } + }; + } // namespace traits + } // namespace extent + + namespace traits + { + //! offset get trait specialization + template + struct GetOffset T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto getOffset(cupla::uint3 const& offsets) -> cupla::IdxType + { + return (&offsets.x)[(3u - 1u) - T_Idx::value]; + } + }; - //! offset get trait specialization - template< - typename T_Idx - > - struct GetOffset< - T_Idx, - cupla::uint3, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - ALPAKA_FN_HOST_ACC - static auto - getOffset( cupla::uint3 const & offsets ) - -> cupla::IdxType{ - return (&offsets.x)[(3u - 1u) - T_Idx::value]; - } - }; - - - //! offset set trait specialization. - template< - typename T_Idx, - typename T_Offset - > - struct SetOffset< - T_Idx, - cupla::uint3, - T_Offset, - typename std::enable_if< - (3u > T_Idx::value) - >::type - >{ - ALPAKA_FN_HOST_ACC - static auto - setOffset( - cupla::uint3 &offsets, - T_Offset const &offset - ) - -> void { - offsets[(3u - 1u) - T_Idx::value] = offset; - } - }; -} // namespace traits - -namespace traits -{ - //! size type trait specialization. - template<> - struct IdxType< - cupla::uint3 - >{ - using type = cupla::IdxType; - }; + //! offset set trait specialization. + template + struct SetOffset T_Idx::value)>::type> + { + ALPAKA_FN_HOST_ACC + static auto setOffset(cupla::uint3& offsets, T_Offset const& offset) -> void + { + offsets[(3u - 1u) - T_Idx::value] = offset; + } + }; + } // namespace traits + + namespace traits + { + //! size type trait specialization. + template<> + struct IdxType + { + using type = cupla::IdxType; + }; -} // namespace traits -} // namespave alpaka + } // namespace traits +} // namespace alpaka diff --git a/include/cupla/defines.hpp b/include/cupla/defines.hpp index 708d1760..49e64681 100644 --- a/include/cupla/defines.hpp +++ b/include/cupla/defines.hpp @@ -20,103 +20,90 @@ #pragma once +#include "cupla/namespace.hpp" + #include -#include -#include "cupla/namespace.hpp" +#include #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED -# undef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED -# define ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED 1 +# undef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED +# define ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED 1 #endif #ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED -# undef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED -# define ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED 1 +# undef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED +# define ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED 1 #endif #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED -# undef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED -# define ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED 1 +# undef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED +# define ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED 1 #endif #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED -# undef ALPAKA_ACC_GPU_CUDA_ENABLED -# define ALPAKA_ACC_GPU_CUDA_ENABLED 1 +# undef ALPAKA_ACC_GPU_CUDA_ENABLED +# define ALPAKA_ACC_GPU_CUDA_ENABLED 1 #endif #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -# undef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -# define ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED 1 +# undef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +# define ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED 1 #endif #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -# undef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -# define ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED 1 +# undef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +# define ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED 1 #endif #ifdef ALPAKA_ACC_GPU_HIP_ENABLED -# undef ALPAKA_ACC_GPU_HIP_ENABLED -# define ALPAKA_ACC_GPU_HIP_ENABLED 1 +# undef ALPAKA_ACC_GPU_HIP_ENABLED +# define ALPAKA_ACC_GPU_HIP_ENABLED 1 #endif #ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED -# undef ALPAKA_ACC_ANY_BT_OMP5_ENABLED -# define ALPAKA_ACC_ANY_BT_OMP5_ENABLED 1 +# undef ALPAKA_ACC_ANY_BT_OMP5_ENABLED +# define ALPAKA_ACC_ANY_BT_OMP5_ENABLED 1 #endif #ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED -# undef ALPAKA_ACC_ANY_BT_OACC_ENABLED -# define ALPAKA_ACC_ANY_BT_OACC_ENABLED 1 +# undef ALPAKA_ACC_ANY_BT_OACC_ENABLED +# define ALPAKA_ACC_ANY_BT_OACC_ENABLED 1 #endif -#define CUPLA_NUM_SELECTED_DEVICES ( \ - ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED + \ - ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + \ - ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED + \ - ALPAKA_ACC_GPU_CUDA_ENABLED + \ - ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + \ - ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + \ - ALPAKA_ACC_GPU_HIP_ENABLED + \ - ALPAKA_ACC_ANY_BT_OMP5_ENABLED + \ - ALPAKA_ACC_ANY_BT_OACC_ENABLED \ -) - - -#if( CUPLA_NUM_SELECTED_DEVICES == 0 ) - #error "there is no accelerator selected, please run `ccmake .` and select one" +#define CUPLA_NUM_SELECTED_DEVICES \ + (ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED + ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED \ + + ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED + ALPAKA_ACC_GPU_CUDA_ENABLED + ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED \ + + ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + ALPAKA_ACC_GPU_HIP_ENABLED + ALPAKA_ACC_ANY_BT_OMP5_ENABLED \ + + ALPAKA_ACC_ANY_BT_OACC_ENABLED) + + +#if(CUPLA_NUM_SELECTED_DEVICES == 0) +# error "there is no accelerator selected, please run `ccmake .` and select one" #endif -#if( CUPLA_NUM_SELECTED_DEVICES > 2 ) - #error "please select at most two accelerators" +#if(CUPLA_NUM_SELECTED_DEVICES > 2) +# error "please select at most two accelerators" #endif // count accelerators where the thread count must be one -#define CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES ( \ - ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED + \ - ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + \ - ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED \ -) - -#define CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES ( \ - ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED + \ - ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + \ - ALPAKA_ACC_GPU_CUDA_ENABLED + \ - ALPAKA_ACC_GPU_HIP_ENABLED + \ - ALPAKA_ACC_ANY_BT_OMP5_ENABLED + \ - ALPAKA_ACC_ANY_BT_OACC_ENABLED \ -) - -#if( CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES > 1 ) - #error "it is only alowed to select one thread sequential Alpaka accelerator" +#define CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES \ + (ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED + ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) + +#define CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES \ + (ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED + ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + ALPAKA_ACC_GPU_CUDA_ENABLED \ + + ALPAKA_ACC_GPU_HIP_ENABLED + ALPAKA_ACC_ANY_BT_OMP5_ENABLED + ALPAKA_ACC_ANY_BT_OACC_ENABLED) + +#if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES > 1) +# error "it is only alowed to select one thread sequential Alpaka accelerator" #endif -#if( CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES > 1 ) - #error "it is only alowed to select one thread parallelized Alpaka accelerator" +#if(CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES > 1) +# error "it is only alowed to select one thread parallelized Alpaka accelerator" #endif #ifndef CUPLA_HEADER_ONLY_FUNC_SPEC -# define CUPLA_HEADER_ONLY_FUNC_SPEC +# define CUPLA_HEADER_ONLY_FUNC_SPEC #endif /*! device compile flag @@ -127,8 +114,8 @@ * * Value is 1 if device path is compiled else 0 */ -#if defined(__CUDA_ARCH__) || ( defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__== 1 && defined(__HIP__) ) - #define CUPLA_DEVICE_COMPILE 1 +#if defined(__CUDA_ARCH__) || (defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1 && defined(__HIP__)) +# define CUPLA_DEVICE_COMPILE 1 #else - #define CUPLA_DEVICE_COMPILE 0 +# define CUPLA_DEVICE_COMPILE 0 #endif diff --git a/include/cupla/device/Atomic.hpp b/include/cupla/device/Atomic.hpp index 7e9bc60d..66977a9a 100644 --- a/include/cupla/device/Atomic.hpp +++ b/include/cupla/device/Atomic.hpp @@ -29,151 +29,98 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ - -#define CUPLA_UNARY_ATOMIC_OP(functionName, alpakaOp) \ - /*! \ - * Compared to their CUDA/HIP counterparts, these functions take an additional last \ - * parameter to denote atomicity (synchronization) level. This parameter is \ - * of type cupla::hierarchy::{Grids|Blocks|Threads}. Grids corresponds \ - * to atomicity between different kernels, Blocks - to different blocks \ - * in the same grid/kernel, Threads - to threads of the same block. \ - * @tparam T_Hierarchy parallelism hierarchy level within the operation is atomic [type cupla::hierarchy::*] \ - * @tparam T_Acc alpaka accelerator [alpaka::*] \ - * @tparam T_Type type of the value \ - * @param acc alpaka accelerator \ - * @param ptr destination pointer \ - * @param value source value \ - * @{ \ - */ \ - template< \ - typename T_Hierarchy, \ - typename T_Acc, \ - typename T_Type \ - > \ - ALPAKA_FN_ACC ALPAKA_FN_INLINE \ - T_Type functionName( \ - T_Acc const & acc, \ - T_Type *ptr, \ - T_Type const & value \ - ) \ - { \ - return ::alpaka::atomicOp< alpakaOp >( \ - acc, \ - ptr, \ - value, \ - T_Hierarchy{} \ - ); \ - } \ - \ - /*! @param hierarchy hierarchy level within the operation is atomic \ - */ \ - template< \ - typename T_Acc, \ - typename T_Type, \ - typename T_Hierarchy = alpaka::hierarchy::Grids \ - > \ - ALPAKA_FN_ACC ALPAKA_FN_INLINE \ - T_Type functionName( \ - T_Acc const & acc, \ - T_Type *ptr, \ - T_Type const & value, \ - T_Hierarchy const & hierarchy = T_Hierarchy() \ - ) \ - { \ - return functionName< T_Hierarchy >( \ - acc, \ - ptr, \ - value \ - ); \ - } \ - /*!@} \ - */ + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { +#define CUPLA_UNARY_ATOMIC_OP(functionName, alpakaOp) \ + /*! \ + * Compared to their CUDA/HIP counterparts, these functions take an additional last \ + * parameter to denote atomicity (synchronization) level. This parameter is \ + * of type cupla::hierarchy::{Grids|Blocks|Threads}. Grids corresponds \ + * to atomicity between different kernels, Blocks - to different blocks \ + * in the same grid/kernel, Threads - to threads of the same block. \ + * @tparam T_Hierarchy parallelism hierarchy level within the operation is atomic [type cupla::hierarchy::*] \ + * @tparam T_Acc alpaka accelerator [alpaka::*] \ + * @tparam T_Type type of the value \ + * @param acc alpaka accelerator \ + * @param ptr destination pointer \ + * @param value source value \ + * @{ \ + */ \ + template \ + ALPAKA_FN_ACC ALPAKA_FN_INLINE T_Type functionName(T_Acc const& acc, T_Type* ptr, T_Type const& value) \ + { \ + return ::alpaka::atomicOp(acc, ptr, value, T_Hierarchy{}); \ + } \ + \ + /*! @param hierarchy hierarchy level within the operation is atomic \ + */ \ + template \ + ALPAKA_FN_ACC ALPAKA_FN_INLINE T_Type \ + functionName(T_Acc const& acc, T_Type* ptr, T_Type const& value, T_Hierarchy const& hierarchy = T_Hierarchy()) \ + { \ + return functionName(acc, ptr, value); \ + } \ + /*!@} \ + */ - /// atomic addition - CUPLA_UNARY_ATOMIC_OP( atomicAdd, ::alpaka::AtomicAdd ) - /// atomic subtraction - CUPLA_UNARY_ATOMIC_OP( atomicSub, ::alpaka::AtomicSub ) - /// atomic minimum - CUPLA_UNARY_ATOMIC_OP( atomicMin, ::alpaka::AtomicMin ) - /// atomic maximum - CUPLA_UNARY_ATOMIC_OP( atomicMax, ::alpaka::AtomicMax ) - /// atomic increment - CUPLA_UNARY_ATOMIC_OP( atomicInc, ::alpaka::AtomicInc ) - /// atomic decrement - CUPLA_UNARY_ATOMIC_OP( atomicDec, ::alpaka::AtomicDec ) - /// atomic bit-wise and - CUPLA_UNARY_ATOMIC_OP( atomicAnd, ::alpaka::AtomicAnd ) - /// atomic bit-wise or - CUPLA_UNARY_ATOMIC_OP( atomicOr, ::alpaka::AtomicOr ) - /// atomic exchange - CUPLA_UNARY_ATOMIC_OP( atomicExch, ::alpaka::AtomicExch ) - /// atomic bit-wise xor - CUPLA_UNARY_ATOMIC_OP( atomicXor, ::alpaka::AtomicXor ) + /// atomic addition + CUPLA_UNARY_ATOMIC_OP(atomicAdd, ::alpaka::AtomicAdd) + /// atomic subtraction + CUPLA_UNARY_ATOMIC_OP(atomicSub, ::alpaka::AtomicSub) + /// atomic minimum + CUPLA_UNARY_ATOMIC_OP(atomicMin, ::alpaka::AtomicMin) + /// atomic maximum + CUPLA_UNARY_ATOMIC_OP(atomicMax, ::alpaka::AtomicMax) + /// atomic increment + CUPLA_UNARY_ATOMIC_OP(atomicInc, ::alpaka::AtomicInc) + /// atomic decrement + CUPLA_UNARY_ATOMIC_OP(atomicDec, ::alpaka::AtomicDec) + /// atomic bit-wise and + CUPLA_UNARY_ATOMIC_OP(atomicAnd, ::alpaka::AtomicAnd) + /// atomic bit-wise or + CUPLA_UNARY_ATOMIC_OP(atomicOr, ::alpaka::AtomicOr) + /// atomic exchange + CUPLA_UNARY_ATOMIC_OP(atomicExch, ::alpaka::AtomicExch) + /// atomic bit-wise xor + CUPLA_UNARY_ATOMIC_OP(atomicXor, ::alpaka::AtomicXor) #undef CUPLA_UNARY_ATOMIC_OP - /** atomic compare and swap - * - * @{ - * @tparam T_Hierarchy parallelism hierarchy level within the operation is atomic [type cupla::hierarchy::*] - * @tparam T_Acc alpaka accelerator [alpaka::*] - * @tparam T_Type type of the value - * @param acc alpaka accelerator - * @param ptr destination pointer - * @param value source value - */ - template< - typename T_Hierarchy, - typename T_Acc, - typename T_Type - > - ALPAKA_FN_ACC ALPAKA_FN_INLINE - T_Type atomicCas( - T_Acc const & acc, - T_Type *ptr, - T_Type const & compare, - T_Type const & value - ) - { - return ::alpaka::atomicOp< ::alpaka::AtomicCas >( - acc, - ptr, - compare, - value, - T_Hierarchy{} - ); - } + /** atomic compare and swap + * + * @{ + * @tparam T_Hierarchy parallelism hierarchy level within the operation is atomic [type + * cupla::hierarchy::*] + * @tparam T_Acc alpaka accelerator [alpaka::*] + * @tparam T_Type type of the value + * @param acc alpaka accelerator + * @param ptr destination pointer + * @param value source value + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE T_Type + atomicCas(T_Acc const& acc, T_Type* ptr, T_Type const& compare, T_Type const& value) + { + return ::alpaka::atomicOp<::alpaka::AtomicCas>(acc, ptr, compare, value, T_Hierarchy{}); + } - /*! @param hierarchy hierarchy level within the operation is atomic - */ - template< - typename T_Acc, - typename T_Type, - typename T_Hierarchy = hierarchy::Grids - > - ALPAKA_FN_ACC ALPAKA_FN_INLINE - T_Type atomicCas( - T_Acc const & acc, - T_Type *ptr, - T_Type const & compare, - T_Type const & value, - T_Hierarchy const & hierarchy = T_Hierarchy() - ) - { - return atomicCas< T_Hierarchy >( - acc, - ptr, - compare, - value - ); - } - /*!@} - */ + /*! @param hierarchy hierarchy level within the operation is atomic + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE T_Type atomicCas( + T_Acc const& acc, + T_Type* ptr, + T_Type const& compare, + T_Type const& value, + T_Hierarchy const& hierarchy = T_Hierarchy()) + { + return atomicCas(acc, ptr, compare, value); + } + /*!@} + */ -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/Hierarchy.hpp b/include/cupla/device/Hierarchy.hpp index 130faa31..032f89cb 100644 --- a/include/cupla/device/Hierarchy.hpp +++ b/include/cupla/device/Hierarchy.hpp @@ -27,17 +27,16 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -namespace hierarchy -{ - - //! hierarchy definitions for atomic operation - using namespace ::alpaka::hierarchy; + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + namespace hierarchy + { + //! hierarchy definitions for atomic operation + using namespace ::alpaka::hierarchy; -} // namespace layer -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace hierarchy + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/Index.hpp b/include/cupla/device/Index.hpp index c7bd2d8a..32b98228 100644 --- a/include/cupla/device/Index.hpp +++ b/include/cupla/device/Index.hpp @@ -28,96 +28,65 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ - - /** number of blocks within the grid layer - * - * @tparam T_Acc alpaka accelerator [alpaka::*] - * @param acc alpaka accelerator - */ - template< typename T_Acc > - ALPAKA_FN_ACC ALPAKA_FN_INLINE - cupla::uint3 gridDim( T_Acc const & acc ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - return static_cast< uint3 >( - ::alpaka::getWorkDiv< - ::alpaka::Grid, - ::alpaka::Blocks - >( acc ) - ); - } + inline namespace device + { + /** number of blocks within the grid layer + * + * @tparam T_Acc alpaka accelerator [alpaka::*] + * @param acc alpaka accelerator + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 gridDim(T_Acc const& acc) + { + return static_cast(::alpaka::getWorkDiv<::alpaka::Grid, ::alpaka::Blocks>(acc)); + } - /** number of threads within the block layer - * - * @tparam T_Acc alpaka accelerator [alpaka::*] - * @param acc alpaka accelerator - */ - template< typename T_Acc > - ALPAKA_FN_ACC ALPAKA_FN_INLINE - cupla::uint3 blockDim( T_Acc const & acc ) - { - return static_cast< uint3 >( - ::alpaka::getWorkDiv< - ::alpaka::Block, - ::alpaka::Threads - >( acc ) - ); - } + /** number of threads within the block layer + * + * @tparam T_Acc alpaka accelerator [alpaka::*] + * @param acc alpaka accelerator + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 blockDim(T_Acc const& acc) + { + return static_cast(::alpaka::getWorkDiv<::alpaka::Block, ::alpaka::Threads>(acc)); + } - /** number of elements within the thread layer - * - * @tparam T_Acc alpaka accelerator [alpaka::*] - * @param acc alpaka accelerator - */ - template< typename T_Acc > - ALPAKA_FN_ACC ALPAKA_FN_INLINE - cupla::uint3 threadDim( T_Acc const & acc ) - { - return static_cast< uint3 >( - ::alpaka::getWorkDiv< - ::alpaka::Thread, - ::alpaka::Elems - >( acc ) - ); - } + /** number of elements within the thread layer + * + * @tparam T_Acc alpaka accelerator [alpaka::*] + * @param acc alpaka accelerator + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 threadDim(T_Acc const& acc) + { + return static_cast(::alpaka::getWorkDiv<::alpaka::Thread, ::alpaka::Elems>(acc)); + } - /** index of the thread within the block layer - * - * @tparam T_Acc alpaka accelerator [alpaka::*] - * @param acc alpaka accelerator - */ - template< typename T_Acc > - ALPAKA_FN_ACC ALPAKA_FN_INLINE - cupla::uint3 threadIdx( T_Acc const & acc ) - { - return static_cast< uint3 >( - ::alpaka::getIdx< - ::alpaka::Block, - ::alpaka::Threads - >( acc ) - ); - } + /** index of the thread within the block layer + * + * @tparam T_Acc alpaka accelerator [alpaka::*] + * @param acc alpaka accelerator + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 threadIdx(T_Acc const& acc) + { + return static_cast(::alpaka::getIdx<::alpaka::Block, ::alpaka::Threads>(acc)); + } - /** index of the block within the grid layer - * - * @tparam T_Acc alpaka accelerator [alpaka::*] - * @param acc alpaka accelerator - */ - template< typename T_Acc > - ALPAKA_FN_ACC ALPAKA_FN_INLINE - cupla::uint3 blockIdx( T_Acc const & acc ) - { - return static_cast< uint3 >( - ::alpaka::getIdx< - ::alpaka::Grid, - ::alpaka::Blocks - >( acc ) - ); - } + /** index of the block within the grid layer + * + * @tparam T_Acc alpaka accelerator [alpaka::*] + * @param acc alpaka accelerator + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 blockIdx(T_Acc const& acc) + { + return static_cast(::alpaka::getIdx<::alpaka::Grid, ::alpaka::Blocks>(acc)); + } -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/SharedMemory.hpp b/include/cupla/device/SharedMemory.hpp index 03d99398..6202f3f8 100644 --- a/include/cupla/device/SharedMemory.hpp +++ b/include/cupla/device/SharedMemory.hpp @@ -23,10 +23,6 @@ #include -#define sharedMem(ppName, ...) \ - __VA_ARGS__& ppName = \ - ::alpaka::declareSharedVar< __VA_ARGS__, __COUNTER__ >( acc ) +#define sharedMem(ppName, ...) __VA_ARGS__& ppName = ::alpaka::declareSharedVar<__VA_ARGS__, __COUNTER__>(acc) -#define sharedMemExtern(ppName, ...) \ - __VA_ARGS__* ppName = \ - ::alpaka::getDynSharedMem< __VA_ARGS__ >( acc ) +#define sharedMemExtern(ppName, ...) __VA_ARGS__* ppName = ::alpaka::getDynSharedMem<__VA_ARGS__>(acc) diff --git a/include/cupla/device/Synchronization.hpp b/include/cupla/device/Synchronization.hpp index a095dacb..91918b84 100644 --- a/include/cupla/device/Synchronization.hpp +++ b/include/cupla/device/Synchronization.hpp @@ -27,34 +27,31 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ - - /** synchronize threads within the block - * - * @tparam T_Acc alpaka accelerator [alpaka::*] - * @param acc alpaka accelerator - * - * @{ - */ - template< typename T_Acc > - ALPAKA_FN_ACC ALPAKA_FN_INLINE - void syncThreads( T_Acc const & acc ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - ::alpaka::syncBlockThreads( acc ); - } - - template< typename T_Acc > - ALPAKA_FN_ACC ALPAKA_FN_INLINE - void __syncthreads( T_Acc const & acc ) - { - syncThreads( acc ); - } - - //!@} - -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + inline namespace device + { + /** synchronize threads within the block + * + * @tparam T_Acc alpaka accelerator [alpaka::*] + * @param acc alpaka accelerator + * + * @{ + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void syncThreads(T_Acc const& acc) + { + ::alpaka::syncBlockThreads(acc); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void __syncthreads(T_Acc const& acc) + { + syncThreads(acc); + } + + //!@} + + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Abs.hpp b/include/cupla/device/math/Abs.hpp index b90f8a48..38fbf1f1 100644 --- a/include/cupla/device/math/Abs.hpp +++ b/include/cupla/device/math/Abs.hpp @@ -26,18 +26,17 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Computes the absolute value. - CUPLA_UNARY_MATH_FN( abs, alpaka::math::ConceptMathAbs, Abs ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Computes the absolute value. + CUPLA_UNARY_MATH_FN(abs, alpaka::math::ConceptMathAbs, Abs) -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Common.hpp b/include/cupla/device/math/Common.hpp index c6a856e0..88f59a45 100644 --- a/include/cupla/device/math/Common.hpp +++ b/include/cupla/device/math/Common.hpp @@ -21,8 +21,8 @@ #pragma once -#include "cupla/types.hpp" #include "cupla/defines.hpp" +#include "cupla/types.hpp" #include @@ -30,149 +30,100 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ -namespace detail -{ - /** Get the concept implementation of the current accelerator - * - * @tparam T_AccOrMathImpl accelerator or math implementation [type alpaka::* or alpaka::math::MathStdLib] - * @tparam T_Concept alpaka concept - * @return implementation of the concept - */ - ALPAKA_NO_HOST_ACC_WARNING - template< typename T_AccOrMathImpl, typename T_Concept > - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto getConcept() + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - using ResultMathConcept = alpaka::concepts::ImplementationBase< - T_Concept, - T_AccOrMathImpl - >; + inline namespace device + { + inline namespace math + { + namespace detail + { + /** Get the concept implementation of the current accelerator + * + * @tparam T_AccOrMathImpl accelerator or math implementation [type alpaka::* or + * alpaka::math::MathStdLib] + * @tparam T_Concept alpaka concept + * @return implementation of the concept + */ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto getConcept() + { + using ResultMathConcept = alpaka::concepts::ImplementationBase; - using AccMathConcept = alpaka::concepts::ImplementationBase< - T_Concept, - Acc - >; + using AccMathConcept = alpaka::concepts::ImplementationBase; - using AccThreadSeqMathConcept = alpaka::concepts::ImplementationBase< - T_Concept, - AccThreadSeq - >; + using AccThreadSeqMathConcept = alpaka::concepts::ImplementationBase; - // cupla Acc and AccThreadSeq should use the same math concept implementation - static_assert( - std::is_same< - AccMathConcept, - AccThreadSeqMathConcept - >::value, - "The math concept implementation for the type 'Acc' and 'AccThreadSeq' must be equal" - ); + // cupla Acc and AccThreadSeq should use the same math concept implementation + static_assert( + std::is_same::value, + "The math concept implementation for the type 'Acc' and 'AccThreadSeq' must be equal"); - return ResultMathConcept{}; - } -} // namespace detail + return ResultMathConcept{}; + } + } // namespace detail -#define CUPLA_UNARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait) \ - /** \ - * @tparam T_Type argument type \ - * @param arg input argument \ - */ \ - ALPAKA_NO_HOST_ACC_WARNING \ - template< typename T_Type > \ - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName( \ - T_Type const & arg \ - ) \ - /* return type is required for the compiler to detect host, device \ - * function qualifier correctly \ - */ \ - -> decltype( \ - alpaka::core::declval, \ - T_Type \ - >>()( \ - detail::getConcept< accOrMathImpl, alpakaMathConcept >(), \ - arg \ - ) \ - ) \ - { \ - return alpaka::math::traits::alpakaMathTrait< \ - alpaka::concepts::ImplementationBase< \ - alpakaMathConcept, \ - accOrMathImpl \ - >, \ - T_Type \ - >{}( \ - detail::getConcept< accOrMathImpl, alpakaMathConcept >(), \ - arg \ - ); \ +#define CUPLA_UNARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait) \ + /** \ + * @tparam T_Type argument type \ + * @param arg input argument \ + */ \ + ALPAKA_NO_HOST_ACC_WARNING \ + template \ + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName( \ + T_Type const& arg) /* return type is required for the compiler to detect host, device \ + * function qualifier correctly \ + */ \ + ->decltype(alpaka::core::declval, \ + T_Type>>()(detail::getConcept(), arg)) \ + { \ + return alpaka::math::traits::alpakaMathTrait< \ + alpaka::concepts::ImplementationBase, \ + T_Type>{}(detail::getConcept(), arg); \ } /* Using the free alpaka functions `alpaka::math::*` will result into `__host__ __device__` * errors, therefore the alpaka math trait must be used. */ -#define CUPLA_BINARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait) \ - /** \ - * @tparam T_Type argument type \ - * @param arg1 first input argument \ - * @param arg2 second input argument \ - */ \ - ALPAKA_NO_HOST_ACC_WARNING \ - template< \ - typename T_Type1, \ - typename T_Type2 \ - > \ - ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName( \ - T_Type1 const & arg1, \ - T_Type2 const & arg2 \ - ) \ - /* return type is required for the compiler to detect host, device \ - * function qualifier correctly \ - */ \ - -> decltype( \ - alpaka::core::declval, \ - T_Type1, \ - T_Type2 \ - >>()( \ - detail::getConcept< accOrMathImpl, alpakaMathConcept >(), \ - arg1, \ - arg2 \ - ) \ - ) \ - { \ - return alpaka::math::traits::alpakaMathTrait< \ - alpaka::concepts::ImplementationBase< \ - alpakaMathConcept, \ - accOrMathImpl \ - >, \ - T_Type1, \ - T_Type2 \ - >{}( \ - detail::getConcept< accOrMathImpl, alpakaMathConcept >(), \ - arg1, \ - arg2 \ - ); \ +#define CUPLA_BINARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait) \ + /** \ + * @tparam T_Type argument type \ + * @param arg1 first input argument \ + * @param arg2 second input argument \ + */ \ + ALPAKA_NO_HOST_ACC_WARNING \ + template \ + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName( \ + T_Type1 const& arg1, \ + T_Type2 const& arg2) /* return type is required for the compiler to detect host, device \ + * function qualifier correctly \ + */ \ + ->decltype(alpaka::core::declval, \ + T_Type1, \ + T_Type2>>()(detail::getConcept(), arg1, arg2)) \ + { \ + return alpaka::math::traits::alpakaMathTrait< \ + alpaka::concepts::ImplementationBase, \ + T_Type1, \ + T_Type2>{}(detail::getConcept(), arg1, arg2); \ } #if CUPLA_DEVICE_COMPILE == 0 - #define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_UNARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait) - #define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_BINARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait) +# define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) \ + CUPLA_UNARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait) +# define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) \ + CUPLA_BINARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait) #else - #define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_UNARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait) - #define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_BINARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait) +# define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) \ + CUPLA_UNARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait) +# define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) \ + CUPLA_BINARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait) #endif -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Comparison.hpp b/include/cupla/device/math/Comparison.hpp index 0cca2fd9..247c7ca9 100644 --- a/include/cupla/device/math/Comparison.hpp +++ b/include/cupla/device/math/Comparison.hpp @@ -26,20 +26,19 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Calculates the smaller value of two arguments. - CUPLA_BINARY_MATH_FN( min, alpaka::math::ConceptMathMin, Min ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Calculates the smaller value of two arguments. + CUPLA_BINARY_MATH_FN(min, alpaka::math::ConceptMathMin, Min) - //! Calculates the larger value of two arguments. - CUPLA_BINARY_MATH_FN( max, alpaka::math::ConceptMathMax, Max ) + //! Calculates the larger value of two arguments. + CUPLA_BINARY_MATH_FN(max, alpaka::math::ConceptMathMax, Max) -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Erf.hpp b/include/cupla/device/math/Erf.hpp index dd7be4c6..f8347a28 100644 --- a/include/cupla/device/math/Erf.hpp +++ b/include/cupla/device/math/Erf.hpp @@ -26,17 +26,16 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Computes the error function. - CUPLA_UNARY_MATH_FN( erf, alpaka::math::ConceptMathErf, Erf ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Computes the error function. + CUPLA_UNARY_MATH_FN(erf, alpaka::math::ConceptMathErf, Erf) -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Exp.hpp b/include/cupla/device/math/Exp.hpp index 6f343fb5..47e9f70b 100644 --- a/include/cupla/device/math/Exp.hpp +++ b/include/cupla/device/math/Exp.hpp @@ -26,17 +26,16 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Computes e (Euler's number, 2.7182818...) raised to the given power. - CUPLA_UNARY_MATH_FN( exp, alpaka::math::ConceptMathExp, Exp ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Computes e (Euler's number, 2.7182818...) raised to the given power. + CUPLA_UNARY_MATH_FN(exp, alpaka::math::ConceptMathExp, Exp) -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Log.hpp b/include/cupla/device/math/Log.hpp index 7b8c621b..d49ed531 100644 --- a/include/cupla/device/math/Log.hpp +++ b/include/cupla/device/math/Log.hpp @@ -28,17 +28,16 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Computes the natural (base e) logarithm. - CUPLA_UNARY_MATH_FN( log, alpaka::math::ConceptMathLog, Log ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Computes the natural (base e) logarithm. + CUPLA_UNARY_MATH_FN(log, alpaka::math::ConceptMathLog, Log) -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Mod.hpp b/include/cupla/device/math/Mod.hpp index 2ea026a2..d01c5927 100644 --- a/include/cupla/device/math/Mod.hpp +++ b/include/cupla/device/math/Mod.hpp @@ -26,20 +26,19 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Computes the floating-point remainder of the division operation x/y. - CUPLA_BINARY_MATH_FN( fmod, alpaka::math::ConceptMathFmod, Fmod ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Computes the floating-point remainder of the division operation x/y. + CUPLA_BINARY_MATH_FN(fmod, alpaka::math::ConceptMathFmod, Fmod) - //! Computes the IEEE remainder of the floating point division operation x/y. - CUPLA_BINARY_MATH_FN( remainder, alpaka::math::ConceptMathRemainder, Remainder ) + //! Computes the IEEE remainder of the floating point division operation x/y. + CUPLA_BINARY_MATH_FN(remainder, alpaka::math::ConceptMathRemainder, Remainder) -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Pow.hpp b/include/cupla/device/math/Pow.hpp index 56a4e464..6d166e75 100644 --- a/include/cupla/device/math/Pow.hpp +++ b/include/cupla/device/math/Pow.hpp @@ -26,17 +26,16 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Computes the value of base raised to the power exp. - CUPLA_BINARY_MATH_FN( pow, alpaka::math::ConceptMathPow, Pow ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Computes the value of base raised to the power exp. + CUPLA_BINARY_MATH_FN(pow, alpaka::math::ConceptMathPow, Pow) -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Root.hpp b/include/cupla/device/math/Root.hpp index eafbb769..d5d66e98 100644 --- a/include/cupla/device/math/Root.hpp +++ b/include/cupla/device/math/Root.hpp @@ -26,23 +26,22 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Computes the square root. - CUPLA_UNARY_MATH_FN( sqrt, alpaka::math::ConceptMathSqrt, Sqrt ) - - //! Computes the inverse square root. - CUPLA_UNARY_MATH_FN( rsqrt, alpaka::math::ConceptMathRsqrt, Rsqrt ) - - //! Computes the cubic root. - CUPLA_UNARY_MATH_FN( cbrt, alpaka::math::ConceptMathCbrt, Cbrt ) - -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Computes the square root. + CUPLA_UNARY_MATH_FN(sqrt, alpaka::math::ConceptMathSqrt, Sqrt) + + //! Computes the inverse square root. + CUPLA_UNARY_MATH_FN(rsqrt, alpaka::math::ConceptMathRsqrt, Rsqrt) + + //! Computes the cubic root. + CUPLA_UNARY_MATH_FN(cbrt, alpaka::math::ConceptMathCbrt, Cbrt) + + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Round.hpp b/include/cupla/device/math/Round.hpp index 35bf0834..75359638 100644 --- a/include/cupla/device/math/Round.hpp +++ b/include/cupla/device/math/Round.hpp @@ -26,41 +26,40 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Computes the smallest integer value not less than arg. - CUPLA_UNARY_MATH_FN( ceil, alpaka::math::ConceptMathCeil, Ceil ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Computes the smallest integer value not less than arg. + CUPLA_UNARY_MATH_FN(ceil, alpaka::math::ConceptMathCeil, Ceil) - //! Computes the largest integer value not greater than arg. - CUPLA_UNARY_MATH_FN( floor, alpaka::math::ConceptMathFloor, Floor ) + //! Computes the largest integer value not greater than arg. + CUPLA_UNARY_MATH_FN(floor, alpaka::math::ConceptMathFloor, Floor) - //! Computes the nearest integer not greater in magnitude than arg. - CUPLA_UNARY_MATH_FN( trunc, alpaka::math::ConceptMathTrunc, Trunc ) + //! Computes the nearest integer not greater in magnitude than arg. + CUPLA_UNARY_MATH_FN(trunc, alpaka::math::ConceptMathTrunc, Trunc) - /** Computes the nearest integer value to arg (in floating-point format). - * - * Rounding halfway cases away from zero, regardless of the current rounding mode. - */ - CUPLA_UNARY_MATH_FN( round, alpaka::math::ConceptMathRound, Round ) + /** Computes the nearest integer value to arg (in floating-point format). + * + * Rounding halfway cases away from zero, regardless of the current rounding mode. + */ + CUPLA_UNARY_MATH_FN(round, alpaka::math::ConceptMathRound, Round) - /** Computes the nearest integer value to arg (in integer format). - * - * Rounding halfway cases away from zero, regardless of the current rounding mode. - */ - CUPLA_UNARY_MATH_FN( lround, alpaka::math::ConceptMathRound, Lround ) + /** Computes the nearest integer value to arg (in integer format). + * + * Rounding halfway cases away from zero, regardless of the current rounding mode. + */ + CUPLA_UNARY_MATH_FN(lround, alpaka::math::ConceptMathRound, Lround) - /** Computes the nearest integer value to arg (in integer format). - * - * Rounding halfway cases away from zero, regardless of the current rounding mode. - */ - CUPLA_UNARY_MATH_FN( llround, alpaka::math::ConceptMathRound, Llround ) + /** Computes the nearest integer value to arg (in integer format). + * + * Rounding halfway cases away from zero, regardless of the current rounding mode. + */ + CUPLA_UNARY_MATH_FN(llround, alpaka::math::ConceptMathRound, Llround) -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device/math/Trigo.hpp b/include/cupla/device/math/Trigo.hpp index 15ce6a36..8197334e 100644 --- a/include/cupla/device/math/Trigo.hpp +++ b/include/cupla/device/math/Trigo.hpp @@ -26,35 +26,34 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -inline namespace device -{ -inline namespace math -{ - - //! Computes the sine (measured in radians). - CUPLA_UNARY_MATH_FN( sin, alpaka::math::ConceptMathSin, Sin ) + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + inline namespace device + { + inline namespace math + { + //! Computes the sine (measured in radians). + CUPLA_UNARY_MATH_FN(sin, alpaka::math::ConceptMathSin, Sin) - //! Computes the cosine (measured in radians). - CUPLA_UNARY_MATH_FN( cos, alpaka::math::ConceptMathCos, Cos ) + //! Computes the cosine (measured in radians). + CUPLA_UNARY_MATH_FN(cos, alpaka::math::ConceptMathCos, Cos) - //! Computes the tangent (measured in radians). - CUPLA_UNARY_MATH_FN( tan, alpaka::math::ConceptMathTan, Tan ) + //! Computes the tangent (measured in radians). + CUPLA_UNARY_MATH_FN(tan, alpaka::math::ConceptMathTan, Tan) - //! Computes the principal value of the arc sine. - CUPLA_UNARY_MATH_FN( asin, alpaka::math::ConceptMathAsin, Asin ) + //! Computes the principal value of the arc sine. + CUPLA_UNARY_MATH_FN(asin, alpaka::math::ConceptMathAsin, Asin) - //! Computes the principal value of the arc cosine. - CUPLA_UNARY_MATH_FN( acos, alpaka::math::ConceptMathAcos, Acos ) + //! Computes the principal value of the arc cosine. + CUPLA_UNARY_MATH_FN(acos, alpaka::math::ConceptMathAcos, Acos) - //! Computes the principal value of the arc tangent. - CUPLA_UNARY_MATH_FN( atan, alpaka::math::ConceptMathAtan, Atan ) + //! Computes the principal value of the arc tangent. + CUPLA_UNARY_MATH_FN(atan, alpaka::math::ConceptMathAtan, Atan) - //! Computes the arc tangent of y/x using the signs of arguments to determine the correct quadrant. - CUPLA_BINARY_MATH_FN( atan2, alpaka::math::ConceptMathAtan2, Atan2 ) + //! Computes the arc tangent of y/x using the signs of arguments to determine the correct quadrant. + CUPLA_BINARY_MATH_FN(atan2, alpaka::math::ConceptMathAtan2, Atan2) -} // namespace math -} // namespace device -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace math + } // namespace device + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/device_functions.hpp b/include/cupla/device_functions.hpp index fe164900..11fb5d88 100644 --- a/include/cupla/device_functions.hpp +++ b/include/cupla/device_functions.hpp @@ -21,8 +21,8 @@ #pragma once -#include "cupla/device/Synchronization.hpp" -#include "cupla/device/Index.hpp" #include "cupla/device/Atomic.hpp" +#include "cupla/device/Index.hpp" #include "cupla/device/SharedMemory.hpp" +#include "cupla/device/Synchronization.hpp" #include "cupla/device/math.hpp" diff --git a/include/cupla/kernel.hpp b/include/cupla/kernel.hpp index 597f8eff..7f745290 100644 --- a/include/cupla/kernel.hpp +++ b/include/cupla/kernel.hpp @@ -21,248 +21,193 @@ #pragma once -#include "cupla/namespace.hpp" -#include "cupla/types.hpp" - #include "cupla/datatypes/dim3.hpp" #include "cupla/datatypes/uint.hpp" -#include "cupla/manager/Stream.hpp" #include "cupla/manager/Device.hpp" +#include "cupla/manager/Stream.hpp" +#include "cupla/namespace.hpp" #include "cupla/traits/IsThreadSeqAcc.hpp" +#include "cupla/types.hpp" #include namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ - - /** get block and elements extents - * - * can swap the block and element extents depend on the selected Alpaka - * accelerator - */ - template< - typename T_Acc, - bool T_isThreadSeqAcc = traits::IsThreadSeqAcc< T_Acc >::value - > - struct GetBlockAndElemExtents - { - static void get( dim3 const & , dim3 const & ) - { } - }; - - template< typename T_Acc > - struct GetBlockAndElemExtents< - T_Acc, - true - > + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - static void get( dim3 & blockSize, dim3 & elemSize ) + /** get block and elements extents + * + * can swap the block and element extents depend on the selected Alpaka + * accelerator + */ + template::value> + struct GetBlockAndElemExtents { - std::swap( blockSize, elemSize ); - } - }; - - /** wrapper for kernel types - * - * This implements the possibility to define dynamic shared memory without - * specializing the needed alpaka trait BlockSharedMemDynSizeBytes - */ - template< - typename T_Kernel - > - struct CuplaKernel : - public T_Kernel - { - size_t const m_dynSharedMemBytes; - - CuplaKernel( size_t const & dynSharedMemBytes ) : - m_dynSharedMemBytes( dynSharedMemBytes ) - { } - }; - - /** execute a kernel - * - * @tparam T_KernelType type of the kernel - * @tparam T_Acc accelerator used to execute the kernel - * - */ - template< - typename T_KernelType, - typename T_Acc - > - class KernelExecutor - { - IdxVec3 const m_gridSize; - IdxVec3 const m_blockSize; - IdxVec3 const m_elemSize; - uint32_t const m_dynSharedMemSize; - cuplaStream_t const m_stream; - - public: - KernelExecutor( - dim3 const & gridSize, - dim3 const & blockSize, - dim3 const & elemSize, - uint32_t const & dynSharedMemSize, - cuplaStream_t const & stream - ) : - m_gridSize( gridSize.z, gridSize.y, gridSize.x ), - m_blockSize( blockSize.z, blockSize.y, blockSize.x ), - m_elemSize( elemSize.z, elemSize.y, elemSize.x ), - m_dynSharedMemSize( dynSharedMemSize ), - m_stream( stream ) - {} + static void get(dim3 const&, dim3 const&) + { + } + }; - template< typename... T_Args > - void operator()( T_Args && ... args ) const + template + struct GetBlockAndElemExtents { - ::alpaka::WorkDivMembers< - KernelDim, - IdxType - > workDiv( - m_gridSize, - m_blockSize, - m_elemSize - ); - auto const exec = ::alpaka::createTaskKernel< T_Acc >( - workDiv, - CuplaKernel< T_KernelType >{ m_dynSharedMemSize }, - std::forward< T_Args >( args )... - ); - - auto & stream = cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( m_stream ); - - ::alpaka::enqueue(stream, exec); - } - }; - - /** Cuda like configuration interface for a kernel - * - * Interface is compatible to the argument order of a cuda kernel `T_KernelType<<<...>>>` - */ - template< - typename T_KernelType - > - struct KernelCudaLike - { - auto operator()( - dim3 const & gridSize, - dim3 const & blockSize, - uint32_t const & dynSharedMemSize = 0, - cuplaStream_t const & stream = 0 - ) const - -> KernelExecutor< - T_KernelType, - cupla::Acc - > + static void get(dim3& blockSize, dim3& elemSize) + { + std::swap(blockSize, elemSize); + } + }; + + /** wrapper for kernel types + * + * This implements the possibility to define dynamic shared memory without + * specializing the needed alpaka trait BlockSharedMemDynSizeBytes + */ + template + struct CuplaKernel : public T_Kernel { - return KernelExecutor< - T_KernelType, - cupla::Acc - >(gridSize, blockSize, dim3(), dynSharedMemSize, stream); - } - }; - - /* Kernel configuration interface with element support - * - * The kernel must support the alpaka element layer. - * - * Swap the blockSize and the elemSize depending on the activated accelerator. - * This mean that in some devices the blockSize is set to one ( dim3(1,1,1) ) - * and the elemSize is set to the user defined blockSize - */ - template< - typename T_KernelType - > - struct SwitchableElementLevel - { - auto operator()( - dim3 const & gridSize, - dim3 const & blockSize, - uint32_t const & dynSharedMemSize = 0, - cuplaStream_t const & stream = 0 - ) const - -> KernelExecutor< - T_KernelType, - cupla::AccThreadSeq - > + size_t const m_dynSharedMemBytes; + + CuplaKernel(size_t const& dynSharedMemBytes) : m_dynSharedMemBytes(dynSharedMemBytes) + { + } + }; + + /** execute a kernel + * + * @tparam T_KernelType type of the kernel + * @tparam T_Acc accelerator used to execute the kernel + * + */ + template + class KernelExecutor { - dim3 tmpBlockSize = blockSize; - dim3 tmpElemSize; - GetBlockAndElemExtents::get( tmpBlockSize, tmpElemSize ); - - return KernelExecutor< - T_KernelType, - cupla::AccThreadSeq - >(gridSize, tmpBlockSize, tmpElemSize, dynSharedMemSize, stream); - } - }; - - /** Kernel configuration interface with element support - * - * The kernel must support the alpaka element level - */ - template< - typename T_KernelType - > - struct KernelWithElementLevel - { - auto operator()( - dim3 const & gridSize, - dim3 const & blockSize, - dim3 const & elemSize, - uint32_t const & dynSharedMemSize = 0, - cuplaStream_t const & stream = 0 - ) const - -> KernelExecutor< - T_KernelType, - cupla::Acc - > + IdxVec3 const m_gridSize; + IdxVec3 const m_blockSize; + IdxVec3 const m_elemSize; + uint32_t const m_dynSharedMemSize; + cuplaStream_t const m_stream; + + public: + KernelExecutor( + dim3 const& gridSize, + dim3 const& blockSize, + dim3 const& elemSize, + uint32_t const& dynSharedMemSize, + cuplaStream_t const& stream) + : m_gridSize(gridSize.z, gridSize.y, gridSize.x) + , m_blockSize(blockSize.z, blockSize.y, blockSize.x) + , m_elemSize(elemSize.z, elemSize.y, elemSize.x) + , m_dynSharedMemSize(dynSharedMemSize) + , m_stream(stream) + { + } + + template + void operator()(T_Args&&... args) const + { + ::alpaka::WorkDivMembers workDiv(m_gridSize, m_blockSize, m_elemSize); + auto const exec = ::alpaka::createTaskKernel( + workDiv, + CuplaKernel{m_dynSharedMemSize}, + std::forward(args)...); + + auto& stream = cupla::manager::Stream::get().stream(m_stream); + + ::alpaka::enqueue(stream, exec); + } + }; + + /** Cuda like configuration interface for a kernel + * + * Interface is compatible to the argument order of a cuda kernel `T_KernelType<<<...>>>` + */ + template + struct KernelCudaLike { - return KernelExecutor< - T_KernelType, - cupla::Acc - >(gridSize, blockSize, elemSize, dynSharedMemSize, stream); - } - }; - -} // namespace CUPLA_ACCELERATOR_NAMESPACE + auto operator()( + dim3 const& gridSize, + dim3 const& blockSize, + uint32_t const& dynSharedMemSize = 0, + cuplaStream_t const& stream = 0) const -> KernelExecutor + { + return KernelExecutor(gridSize, blockSize, dim3(), dynSharedMemSize, stream); + } + }; + + /* Kernel configuration interface with element support + * + * The kernel must support the alpaka element layer. + * + * Swap the blockSize and the elemSize depending on the activated accelerator. + * This mean that in some devices the blockSize is set to one ( dim3(1,1,1) ) + * and the elemSize is set to the user defined blockSize + */ + template + struct SwitchableElementLevel + { + auto operator()( + dim3 const& gridSize, + dim3 const& blockSize, + uint32_t const& dynSharedMemSize = 0, + cuplaStream_t const& stream = 0) const -> KernelExecutor + { + dim3 tmpBlockSize = blockSize; + dim3 tmpElemSize; + GetBlockAndElemExtents::get(tmpBlockSize, tmpElemSize); + + return KernelExecutor( + gridSize, + tmpBlockSize, + tmpElemSize, + dynSharedMemSize, + stream); + } + }; + + /** Kernel configuration interface with element support + * + * The kernel must support the alpaka element level + */ + template + struct KernelWithElementLevel + { + auto operator()( + dim3 const& gridSize, + dim3 const& blockSize, + dim3 const& elemSize, + uint32_t const& dynSharedMemSize = 0, + cuplaStream_t const& stream = 0) const -> KernelExecutor + { + return KernelExecutor( + gridSize, + blockSize, + elemSize, + dynSharedMemSize, + stream); + } + }; + + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla namespace alpaka { -namespace traits -{ - //! CuplaKernel has defined the extern shared memory as member - template< - typename T_UserKernel, - typename T_Acc - > - struct BlockSharedMemDynSizeBytes< - ::cupla::CuplaKernel< T_UserKernel >, - T_Acc - > + namespace traits { - template< - typename... TArgs - > - ALPAKA_FN_HOST_ACC - static auto - getBlockSharedMemDynSizeBytes( - ::cupla::CuplaKernel< T_UserKernel > const & userKernel, - TArgs const & ...) - -> ::alpaka::Idx + //! CuplaKernel has defined the extern shared memory as member + template + struct BlockSharedMemDynSizeBytes<::cupla::CuplaKernel, T_Acc> { - return userKernel.m_dynSharedMemBytes; - } - }; -} // namespace traits + template + ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes( + ::cupla::CuplaKernel const& userKernel, + TArgs const&...) -> ::alpaka::Idx + { + return userKernel.m_dynSharedMemBytes; + } + }; + } // namespace traits } // namespace alpaka @@ -270,7 +215,10 @@ namespace traits * * The alpaka element level is ignored and always set to dim3(1,1,1) */ -#define CUPLA_KERNEL(...) ::cupla::KernelCudaLike<__VA_ARGS__>{} +#define CUPLA_KERNEL(...) \ + ::cupla::KernelCudaLike<__VA_ARGS__> \ + { \ + } /** call the kernel with an hidden element layer * @@ -281,10 +229,16 @@ namespace traits * This mean that in some devices the blockSize is set to one ( dim3(1,1,1) ) * and the elemSize is set to the user defined blockSize */ -#define CUPLA_KERNEL_OPTI(...) ::cupla::SwitchableElementLevel<__VA_ARGS__>{} +#define CUPLA_KERNEL_OPTI(...) \ + ::cupla::SwitchableElementLevel<__VA_ARGS__> \ + { \ + } /** cupla kernel call with elements * * The kernel must support the alpaka element level */ -#define CUPLA_KERNEL_ELEM(...) ::cupla::KernelWithElementLevel<__VA_ARGS__>{} +#define CUPLA_KERNEL_ELEM(...) \ + ::cupla::KernelWithElementLevel<__VA_ARGS__> \ + { \ + } diff --git a/include/cupla/manager/Device.hpp b/include/cupla/manager/Device.hpp index 3476e663..9d1ad5f9 100644 --- a/include/cupla/manager/Device.hpp +++ b/include/cupla/manager/Device.hpp @@ -32,148 +32,112 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -namespace manager -{ - - template< - typename T_DeviceType - > - struct Device + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - using DeviceType = T_DeviceType; - - using DeviceMap = std::map< - int, - std::unique_ptr< - DeviceType - > - >; - - DeviceMap m_map; - int m_currentDevice; - - static Device & - get() + namespace manager { - static Device device; - return device; - } - - auto - device( - int idx = 0 - ) - -> DeviceType & - { - m_currentDevice = idx; - auto iter = m_map.find( idx ); - if( iter != m_map.end() ) - { - return *iter->second; - } - else + template + struct Device { - using Pltf = ::alpaka::Pltf< DeviceType >; + using DeviceType = T_DeviceType; - const int numDevices = count(); - if( idx >= numDevices ) - { - std::stringstream err; - err << "Unable to return device " << idx << ". There are only " << numDevices << " devices!"; - throw std::system_error( - cuplaErrorInvalidDevice, - err.str() - ); - } + using DeviceMap = std::map>; - std::unique_ptr< DeviceType > dev; + DeviceMap m_map; + int m_currentDevice; - try + static Device& get() { - /* device id is not in the list - * - * select device with idx - */ - dev.reset( - new DeviceType( - alpaka::getDevByIdx< - Pltf - >( idx ) - ) - ); + static Device device; + return device; } - catch( const std::runtime_error& e ) + + auto device(int idx = 0) -> DeviceType& { - throw std::system_error( - cuplaErrorDeviceAlreadyInUse, - e.what() - ); + m_currentDevice = idx; + auto iter = m_map.find(idx); + if(iter != m_map.end()) + { + return *iter->second; + } + else + { + using Pltf = ::alpaka::Pltf; + + const int numDevices = count(); + if(idx >= numDevices) + { + std::stringstream err; + err << "Unable to return device " << idx << ". There are only " << numDevices + << " devices!"; + throw std::system_error(cuplaErrorInvalidDevice, err.str()); + } + + std::unique_ptr dev; + + try + { + /* device id is not in the list + * + * select device with idx + */ + dev.reset(new DeviceType(alpaka::getDevByIdx(idx))); + } + catch(const std::runtime_error& e) + { + throw std::system_error(cuplaErrorDeviceAlreadyInUse, e.what()); + } + m_map.insert(std::make_pair(idx, std::move(dev))); + return *m_map[idx]; + } } - m_map.insert( - std::make_pair( idx, std::move( dev ) ) - ); - return *m_map[ idx ]; - } - } - - /**! reset the current device - * - * streams, memory and events on the current device must be - * deleted at first by the user - * - * @return true in success case else false - */ - bool reset() - { - ::alpaka::reset( this->current( ) ); - auto iter = m_map.find( this->id( ) ); - if( iter == m_map.end() ) - { - std::cerr << "device " << this->id( ) << - " can not destroyed (was never created) " << - std::endl; - return false; - } - else - { - m_map.erase( iter ); - return true; - } - } - - auto - id() - -> int - { - return m_currentDevice; - } - - auto - current() - -> DeviceType & - { - return this->device( this->id( ) ); - } + /**! reset the current device + * + * streams, memory and events on the current device must be + * deleted at first by the user + * + * @return true in success case else false + */ + bool reset() + { + ::alpaka::reset(this->current()); + auto iter = m_map.find(this->id()); + + if(iter == m_map.end()) + { + std::cerr << "device " << this->id() << " can not destroyed (was never created) " << std::endl; + return false; + } + else + { + m_map.erase(iter); + return true; + } + } - auto - count() - -> int - { - using Pltf = ::alpaka::Pltf< DeviceType >; - return static_cast< int >( ::alpaka::getDevCount< Pltf >( ) ); - } + auto id() -> int + { + return m_currentDevice; + } - protected: - Device() : m_currentDevice( 0 ) - { + auto current() -> DeviceType& + { + return this->device(this->id()); + } - } + auto count() -> int + { + using Pltf = ::alpaka::Pltf; + return static_cast(::alpaka::getDevCount()); + } - }; + protected: + Device() : m_currentDevice(0) + { + } + }; -} //namespace manager -} //namespace CUPLA_ACCELERATOR_NAMESPACE -} //namespace cupla + } // namespace manager + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/include/cupla/manager/Driver.hpp b/include/cupla/manager/Driver.hpp index 0e3f0cf1..ff6e28e4 100644 --- a/include/cupla/manager/Driver.hpp +++ b/include/cupla/manager/Driver.hpp @@ -25,29 +25,27 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -namespace manager -{ - -/** initialize the cupla environment - * - * avoid side effects with singletons in the user code - */ -class Driver -{ - -public: - static Driver& get() + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - static Driver driver; - return driver; - } -private: - - Driver(); -}; - -} //namespace manager -} //namespace CUPLA_ACCELERATOR_NAMESPACE -} //namespace cupla + namespace manager + { + /** initialize the cupla environment + * + * avoid side effects with singletons in the user code + */ + class Driver + { + public: + static Driver& get() + { + static Driver driver; + return driver; + } + + private: + Driver(); + }; + + } // namespace manager + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/include/cupla/manager/Event.hpp b/include/cupla/manager/Event.hpp index 8d3ee193..bac0b360 100644 --- a/include/cupla/manager/Event.hpp +++ b/include/cupla/manager/Event.hpp @@ -21,223 +21,174 @@ #pragma once +#include "cupla/manager/Device.hpp" #include "cupla/namespace.hpp" #include "cupla/types.hpp" -#include "cupla/manager/Device.hpp" #include "cupla_driver_types.hpp" -#include +#include #include #include #include -#include +#include namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -namespace manager -{ - -namespace detail -{ - template< - typename T_DeviceType, - typename T_QueueType - > - class EmulatedEvent + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - private: - bool hasTimer; - - using TimePoint = std::chrono::time_point< - std::chrono::high_resolution_clock - >; - - TimePoint time; - - public: - using AlpakaEvent = ::alpaka::Event< T_QueueType >; - std::unique_ptr< AlpakaEvent > event; - - EmulatedEvent( uint32_t flags ) : - hasTimer( !( flags & cuplaEventDisableTiming ) ), - event( - new AlpakaEvent( - Device< T_DeviceType >::get().current() - // The alpaka interfaces for this constructor are different depending on the backend. -#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 || ALPAKA_ACC_GPU_CUDA_ENABLED == 1 ) - ,!(flags & cuplaEventBlockingSync) -#endif - ) - ) - { - - } - - AlpakaEvent & - operator *() + namespace manager { - return *event; - } - - void record( T_QueueType & stream ) - { - ::alpaka::enqueue( stream, *event ); - if( hasTimer ) + namespace detail { - ::alpaka::wait( *event ); - time = std::chrono::high_resolution_clock::now(); - } - } - - TimePoint getTimePoint() const - { - return time; - } + template + class EmulatedEvent + { + private: + bool hasTimer; - double elapsedSince( EmulatedEvent const & startEvent ) - { - if( !hasTimer ) - std::cerr<<"event has no timing enabled"< timeElapsed_ms = time - startEvent.getTimePoint(); - return timeElapsed_ms.count(); - } - - }; -} - template< - typename T_DeviceType, - typename T_QueueType - > - struct Event - { - using DeviceType = T_DeviceType; - using QueueType = T_QueueType; + using TimePoint = std::chrono::time_point; - using EventType = detail::EmulatedEvent< - DeviceType, - QueueType - >; + TimePoint time; - using EventMap = std::map< - cuplaEvent_t, - std::unique_ptr< - EventType - > - >; + public: + using AlpakaEvent = ::alpaka::Event; + std::unique_ptr event; - using MapVector = std::vector< EventMap >; - - MapVector m_mapVector; - - static auto - get() - -> Event & - { - static Event event; - return event; - } - - auto - create( uint32_t flags ) - -> cuplaEvent_t - { - - auto& device = Device< DeviceType >::get(); - - std::unique_ptr< - EventType - > eventPtr( - new EventType( - flags - ) - ); - - cuplaEvent_t eventId = reinterpret_cast< cuplaEvent_t >( - m_id++ - ); - m_mapVector[ device.id() ].insert( - std::make_pair( eventId, std::move( eventPtr ) ) - ); - return eventId; - } - - auto - event( cuplaEvent_t eventId = 0 ) - -> EventType & - { - auto& device = Device< DeviceType >::get(); - const auto deviceId = device.id(); - auto iter = m_mapVector[ deviceId ].find( - eventId - ); - - if( iter == m_mapVector[ device.id( ) ].end() ) - { - std::cerr << "event " << eventId << - " not exists on device "<< deviceId << std::endl; - } - // @todo: check if stream was created - return *(iter->second); - } - - auto - destroy( cuplaEvent_t eventId ) - -> bool - { - auto& device = Device< DeviceType >::get(); - const auto deviceId = device.id(); - - auto iter = m_mapVector[ deviceId ].find( - eventId - ); - - if( iter == m_mapVector[ deviceId ].end() ) - { - std::cerr << "stream " << eventId << - " can not destroyed (was never created) on device " << - deviceId << - std::endl; - return false; - } - else + EmulatedEvent(uint32_t flags) + : hasTimer(!(flags & cuplaEventDisableTiming)) + , event(new AlpakaEvent( + Device::get().current() + // The alpaka interfaces for this constructor are different depending on the backend. +#if(ALPAKA_ACC_GPU_HIP_ENABLED == 1 || ALPAKA_ACC_GPU_CUDA_ENABLED == 1) + , + !(flags & cuplaEventBlockingSync) +#endif + )) + { + } + + AlpakaEvent& operator*() + { + return *event; + } + + void record(T_QueueType& stream) + { + ::alpaka::enqueue(stream, *event); + if(hasTimer) + { + ::alpaka::wait(*event); + time = std::chrono::high_resolution_clock::now(); + } + } + + TimePoint getTimePoint() const + { + return time; + } + + double elapsedSince(EmulatedEvent const& startEvent) + { + if(!hasTimer) + std::cerr << "event has no timing enabled" << std::endl; + + std::chrono::duration timeElapsed_ms = time - startEvent.getTimePoint(); + return timeElapsed_ms.count(); + } + }; + } // namespace detail + template + struct Event { - m_mapVector[ deviceId ].erase( iter ); - return true; - } - } - - /** delete all events on the current device - * - * @return true in success case else false - */ - bool - reset( ) - { - auto& device = Device< DeviceType >::get(); - const auto deviceId = device.id(); - - m_mapVector[ deviceId ].clear( ); - // reset id to allow that this instance can be reused - m_id = 0u; - - // @todo: check if clear creates errors - return true; - } - - - protected: - Event() : m_mapVector( Device< DeviceType >::get().count() ) - { - } - - //! unique if for the next stream - size_t m_id = 0u; - - }; - -} //namespace manager -} //namespace CUPLA_ACCELERATOR_NAMESPACE -} //namespace cupla + using DeviceType = T_DeviceType; + using QueueType = T_QueueType; + + using EventType = detail::EmulatedEvent; + + using EventMap = std::map>; + + using MapVector = std::vector; + + MapVector m_mapVector; + + static auto get() -> Event& + { + static Event event; + return event; + } + + auto create(uint32_t flags) -> cuplaEvent_t + { + auto& device = Device::get(); + + std::unique_ptr eventPtr(new EventType(flags)); + + cuplaEvent_t eventId = reinterpret_cast(m_id++); + m_mapVector[device.id()].insert(std::make_pair(eventId, std::move(eventPtr))); + return eventId; + } + + auto event(cuplaEvent_t eventId = 0) -> EventType& + { + auto& device = Device::get(); + const auto deviceId = device.id(); + auto iter = m_mapVector[deviceId].find(eventId); + + if(iter == m_mapVector[device.id()].end()) + { + std::cerr << "event " << eventId << " not exists on device " << deviceId << std::endl; + } + // @todo: check if stream was created + return *(iter->second); + } + + auto destroy(cuplaEvent_t eventId) -> bool + { + auto& device = Device::get(); + const auto deviceId = device.id(); + + auto iter = m_mapVector[deviceId].find(eventId); + + if(iter == m_mapVector[deviceId].end()) + { + std::cerr << "stream " << eventId << " can not destroyed (was never created) on device " + << deviceId << std::endl; + return false; + } + else + { + m_mapVector[deviceId].erase(iter); + return true; + } + } + + /** delete all events on the current device + * + * @return true in success case else false + */ + bool reset() + { + auto& device = Device::get(); + const auto deviceId = device.id(); + + m_mapVector[deviceId].clear(); + // reset id to allow that this instance can be reused + m_id = 0u; + + // @todo: check if clear creates errors + return true; + } + + + protected: + Event() : m_mapVector(Device::get().count()) + { + } + + //! unique if for the next stream + size_t m_id = 0u; + }; + + } // namespace manager + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/include/cupla/manager/Memory.hpp b/include/cupla/manager/Memory.hpp index 1d3edc64..a1de369c 100644 --- a/include/cupla/manager/Memory.hpp +++ b/include/cupla/manager/Memory.hpp @@ -21,135 +21,97 @@ #pragma once +#include "cupla/manager/Device.hpp" #include "cupla/namespace.hpp" #include "cupla/types.hpp" -#include "cupla/manager/Device.hpp" -#include #include #include #include +#include namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -namespace manager -{ - - template< - typename T_DeviceType, - typename T_Dim - > - struct Memory + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - using DeviceType = T_DeviceType; - static constexpr uint32_t dim = T_Dim::value; - - using BufType = ::alpaka::Buf< - DeviceType, - uint8_t, - T_Dim, - MemSizeType - >; - - using MemoryMap = std::map< - uint8_t*, - std::unique_ptr< - BufType - > - >; - - using MapVector = std::vector< MemoryMap >; - - MapVector m_mapVector; - - static auto - get() - -> Memory & + namespace manager { - static Memory mem; - return mem; - } + template + struct Memory + { + using DeviceType = T_DeviceType; + static constexpr uint32_t dim = T_Dim::value; + using BufType = ::alpaka::Buf; - auto - alloc( - MemVec< dim > const & extent - ) - -> BufType & - { + using MemoryMap = std::map>; - auto& device = Device< DeviceType >::get(); - - std::unique_ptr< - BufType - > bufPtr( - new BufType( - ::alpaka::allocBuf( - device.current(), - extent - ) - ) - ); - - - uint8_t *nativePtr = ::alpaka::getPtrNative(*bufPtr); - m_mapVector[ device.id() ].insert( - std::make_pair( nativePtr, std::move( bufPtr ) ) - ); - return *m_mapVector[ device.id() ][ nativePtr ]; - } - - auto - free( void * ptr) - -> bool - { - if( ptr == nullptr) - return true; + using MapVector = std::vector; - auto& device = Device< DeviceType >::get(); - const auto deviceId = device.id(); + MapVector m_mapVector; - auto iter = m_mapVector[ deviceId ].find( - static_cast< uint8_t * >( ptr ) - ); + static auto get() -> Memory& + { + static Memory mem; + return mem; + } - if( iter == m_mapVector[ deviceId ].end() ) - { - return false; - } - else - { - m_mapVector[ deviceId ].erase( iter ); - return true; - } - } - - /** delete all memory on the current device - * - * @return true in success case else false - */ - bool - reset( ) - { - auto& device = Device< DeviceType >::get(); - const auto deviceId = device.id(); - m_mapVector[ deviceId ].clear( ); + auto alloc(MemVec const& extent) -> BufType& + { + auto& device = Device::get(); - // @todo: check if clear creates errors - return true; - } + std::unique_ptr bufPtr( + new BufType(::alpaka::allocBuf(device.current(), extent))); - protected: - Memory() : m_mapVector( Device< DeviceType >::get().count() ) - { - } + uint8_t* nativePtr = ::alpaka::getPtrNative(*bufPtr); + m_mapVector[device.id()].insert(std::make_pair(nativePtr, std::move(bufPtr))); + return *m_mapVector[device.id()][nativePtr]; + } + + auto free(void* ptr) -> bool + { + if(ptr == nullptr) + return true; + + auto& device = Device::get(); + const auto deviceId = device.id(); + + auto iter = m_mapVector[deviceId].find(static_cast(ptr)); + + if(iter == m_mapVector[deviceId].end()) + { + return false; + } + else + { + m_mapVector[deviceId].erase(iter); + return true; + } + } + + /** delete all memory on the current device + * + * @return true in success case else false + */ + bool reset() + { + auto& device = Device::get(); + const auto deviceId = device.id(); + + m_mapVector[deviceId].clear(); + + // @todo: check if clear creates errors + return true; + } - }; + protected: + Memory() : m_mapVector(Device::get().count()) + { + } + }; -} //namespace manager -} //namespace CUPLA_ACCELERATOR_NAMESPACE -} //namespace cupla + } // namespace manager + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/include/cupla/manager/Stream.hpp b/include/cupla/manager/Stream.hpp index 43fc4d33..aea9362d 100644 --- a/include/cupla/manager/Stream.hpp +++ b/include/cupla/manager/Stream.hpp @@ -21,155 +21,124 @@ #pragma once +#include "cupla/manager/Device.hpp" #include "cupla/namespace.hpp" #include "cupla/types.hpp" -#include "cupla/manager/Device.hpp" #include "cupla_driver_types.hpp" #include -#include #include +#include namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -namespace manager -{ - - template< - typename T_DeviceType, - typename T_QueueType - > - struct Stream + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - using DeviceType = T_DeviceType; - using QueueType = T_QueueType; - - - using StreamMap = std::map< - cuplaStream_t, - std::unique_ptr< - QueueType - > - >; - using MapVector = std::vector< StreamMap >; - - MapVector m_mapVector; - - static auto - get() - -> Stream & + namespace manager { - static Stream stream; - return stream; - } + template + struct Stream + { + using DeviceType = T_DeviceType; + using QueueType = T_QueueType; - auto - create( ) - -> cuplaStream_t - { - return createNewStream(reinterpret_cast< cuplaStream_t >(m_id++)); - } - auto - stream( cuplaStream_t streamId = 0 ) - -> QueueType & - { - auto& device = Device< DeviceType >::get(); - const auto deviceId = device.id(); - auto iter = m_mapVector[ deviceId ].find( - streamId - ); + using StreamMap = std::map>; + using MapVector = std::vector; - if( iter == m_mapVector[ device.id( ) ].end() ) - { - if( streamId == 0 ) + MapVector m_mapVector; + + static auto get() -> Stream& { - createNewStream( streamId ); - return this->stream( streamId ); + static Stream stream; + return stream; } - else + + auto create() -> cuplaStream_t { - std::cerr << "stream " << streamId << - " not exists on device "<< deviceId << std::endl; + return createNewStream(reinterpret_cast(m_id++)); } - } - // @todo: check if stream was created - return *(iter->second); - } - - auto - destroy( cuplaStream_t streamId) - -> bool - { - auto& device = Device< DeviceType >::get(); - const auto deviceId = device.id(); - - auto iter = m_mapVector[ deviceId ].find( - streamId - ); - if( iter == m_mapVector[ deviceId ].end() ) - { - std::cerr << "stream " << streamId << - " can not destroyed (was never created) on device " << - deviceId << - std::endl; - return false; - } - else - { - m_mapVector[ deviceId ].erase( iter ); - return true; - } - } - - - /** delete all streams on the current device - * - * @return true in success case else false - */ - bool - reset( ) - { - auto& device = Device< DeviceType >::get(); - const auto deviceId = device.id(); + auto stream(cuplaStream_t streamId = 0) -> QueueType& + { + auto& device = Device::get(); + const auto deviceId = device.id(); + auto iter = m_mapVector[deviceId].find(streamId); + + if(iter == m_mapVector[device.id()].end()) + { + if(streamId == 0) + { + createNewStream(streamId); + return this->stream(streamId); + } + else + { + std::cerr << "stream " << streamId << " not exists on device " << deviceId << std::endl; + } + } + // @todo: check if stream was created + return *(iter->second); + } - m_mapVector[ deviceId ].clear( ); + auto destroy(cuplaStream_t streamId) -> bool + { + auto& device = Device::get(); + const auto deviceId = device.id(); + + auto iter = m_mapVector[deviceId].find(streamId); + + if(iter == m_mapVector[deviceId].end()) + { + std::cerr << "stream " << streamId << " can not destroyed (was never created) on device " + << deviceId << std::endl; + return false; + } + else + { + m_mapVector[deviceId].erase(iter); + return true; + } + } - // @todo: check if clear creates errors - return true; - } - protected: - Stream() : m_mapVector( Device< DeviceType >::get().count() ) - { - } + /** delete all streams on the current device + * + * @return true in success case else false + */ + bool reset() + { + auto& device = Device::get(); + const auto deviceId = device.id(); - auto - createNewStream( cuplaStream_t streamId ) - -> cuplaStream_t - { + m_mapVector[deviceId].clear(); - auto& device = Device< DeviceType >::get(); + // @todo: check if clear creates errors + return true; + } - auto streamPtr = std::make_unique< QueueType >( device.current() ); - m_mapVector[ device.id() ].insert( - std::make_pair( streamId, std::move( streamPtr ) ) - ); - return streamId; - } + protected: + Stream() : m_mapVector(Device::get().count()) + { + } - /** unique id for the next stream - * - * The enumeration starts with id one. Id zero is reserved - * for the default stream. - */ - size_t m_id = 1u; + auto createNewStream(cuplaStream_t streamId) -> cuplaStream_t + { + auto& device = Device::get(); - }; + auto streamPtr = std::make_unique(device.current()); + m_mapVector[device.id()].insert(std::make_pair(streamId, std::move(streamPtr))); + return streamId; + } -} //namespace manager -} //namespace CUPLA_ACCELERATOR_NAMESPACE -} //namespace cupla + /** unique id for the next stream + * + * The enumeration starts with id one. Id zero is reserved + * for the default stream. + */ + size_t m_id = 1u; + }; + + } // namespace manager + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/include/cupla/namespace.hpp b/include/cupla/namespace.hpp index d9030f00..5037fdfc 100644 --- a/include/cupla/namespace.hpp +++ b/include/cupla/namespace.hpp @@ -26,76 +26,76 @@ #if CUPLA_STREAM_ASYNC_ENABLED // thread parallel and thread sequential accelerator is used together -# if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES == 1 && CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES == 1) -# define CUPLA_ACCELERATOR_NAMESPACE cupla_mixed_async -# else +# if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES == 1 && CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES == 1) +# define CUPLA_ACCELERATOR_NAMESPACE cupla_mixed_async +# else -# ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_omp2_async -# endif +# ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_omp2_async +# endif -# ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_threads_async -# endif +# ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_threads_async +# endif -# ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_omp2_seq_async -# endif +# ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_omp2_seq_async +# endif -# ifdef ALPAKA_ACC_GPU_CUDA_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_cuda_async -# endif +# ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_cuda_async +# endif -# ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_seq_async -# endif +# ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_seq_async +# endif -# ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_tbb_seq_async -# endif +# ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_tbb_seq_async +# endif -# ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_omp5_omp5_async -# endif +# ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_omp5_omp5_async +# endif -# endif // mixed accelerator usage +# endif // mixed accelerator usage #else // CUPLA_STREAM_ASYNC_ENABLED // thread parallel and thread sequential accelerator is used together -# if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES == 1 && CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES == 1) -# define CUPLA_ACCELERATOR_NAMESPACE cupla_mixed_sync -# else +# if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES == 1 && CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES == 1) +# define CUPLA_ACCELERATOR_NAMESPACE cupla_mixed_sync +# else -# ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_omp2_sync -# endif +# ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_omp2_sync +# endif -# ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_threads_sync -# endif +# ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_threads_sync +# endif -# ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_omp2_seq_sync -# endif +# ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_omp2_seq_sync +# endif -# ifdef ALPAKA_ACC_GPU_CUDA_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_cuda_sync -# endif +# ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_cuda_sync +# endif -# ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_seq_sync -# endif +# ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_seq_sync +# endif -# ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_tbb_seq_sync -# endif +# ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_tbb_seq_sync +# endif -# ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED -# define CUPLA_ACCELERATOR_NAMESPACE cupla_omp5_omp5_sync -# endif +# ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED +# define CUPLA_ACCELERATOR_NAMESPACE cupla_omp5_omp5_sync +# endif -# endif // mixed accelerator usage +# endif // mixed accelerator usage #endif // CUPLA_STREAM_ASYNC_ENABLED diff --git a/include/cupla/traits/IsThreadSeqAcc.hpp b/include/cupla/traits/IsThreadSeqAcc.hpp index 5bced676..50da8d72 100644 --- a/include/cupla/traits/IsThreadSeqAcc.hpp +++ b/include/cupla/traits/IsThreadSeqAcc.hpp @@ -26,71 +26,46 @@ namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -namespace traits -{ - - /** check if thread level is full sequential - * - * \return ::value true if no threads where used in the thread level - * else false - */ - template< typename T_Acc > - struct IsThreadSeqAcc + inline namespace CUPLA_ACCELERATOR_NAMESPACE { - static constexpr bool value = false; - }; + namespace traits + { + /** check if thread level is full sequential + * + * \return ::value true if no threads where used in the thread level + * else false + */ + template + struct IsThreadSeqAcc + { + static constexpr bool value = false; + }; #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED - template< - typename T_KernelDim, - typename T_IndexType - > - struct IsThreadSeqAcc< - ::alpaka::AccCpuOmp2Blocks< - T_KernelDim, - T_IndexType - > - > - { - static constexpr bool value = true; - }; + template + struct IsThreadSeqAcc<::alpaka::AccCpuOmp2Blocks> + { + static constexpr bool value = true; + }; #endif #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - template< - typename T_KernelDim, - typename T_IndexType - > - struct IsThreadSeqAcc< - ::alpaka::AccCpuSerial< - T_KernelDim, - T_IndexType - > - > - { - static constexpr bool value = true; - }; + template + struct IsThreadSeqAcc<::alpaka::AccCpuSerial> + { + static constexpr bool value = true; + }; #endif #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - template< - typename T_KernelDim, - typename T_IndexType - > - struct IsThreadSeqAcc< - ::alpaka::AccCpuTbbBlocks< - T_KernelDim, - T_IndexType - > - > - { - static constexpr bool value = true; - }; + template + struct IsThreadSeqAcc<::alpaka::AccCpuTbbBlocks> + { + static constexpr bool value = true; + }; #endif -} // namespace traits -} // namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace traits + } // namespace CUPLA_ACCELERATOR_NAMESPACE } // namespace cupla diff --git a/include/cupla/types.hpp b/include/cupla/types.hpp index 755db275..92d3f71b 100644 --- a/include/cupla/types.hpp +++ b/include/cupla/types.hpp @@ -20,240 +20,146 @@ #pragma once -#include -#include - #include "cupla/defines.hpp" #include "cupla/namespace.hpp" +#include + +#include + namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + using MemSizeType = size_t; + using IdxType = unsigned int; - using MemSizeType = size_t; - using IdxType = unsigned int; + static constexpr uint32_t Dimensions = 3u; - static constexpr uint32_t Dimensions = 3u; + template + using AlpakaDim = ::alpaka::DimInt; - template< - uint32_t T_dim - > - using AlpakaDim = ::alpaka::DimInt< T_dim >; + using KernelDim = AlpakaDim; - using KernelDim = AlpakaDim< Dimensions >; + using IdxVec3 = ::alpaka::Vec; - using IdxVec3 = ::alpaka::Vec< - KernelDim, - IdxType - >; + template + using MemVec = ::alpaka::Vec, MemSizeType>; - template< - uint32_t T_dim - > - using MemVec = ::alpaka::Vec< - AlpakaDim< T_dim >, - MemSizeType - >; + using AccHost = ::alpaka::DevCpu; + using AccHostStream = ::alpaka::QueueCpuBlocking; - using AccHost = ::alpaka::DevCpu; - using AccHostStream = ::alpaka::QueueCpuBlocking; +#if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) \ + || defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) \ + || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) -#if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) || \ - defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) || \ - defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) || \ - defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \ - defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) - - using AccDev = ::alpaka::DevCpu; -# if (CUPLA_STREAM_ASYNC_ENABLED == 1) + using AccDev = ::alpaka::DevCpu; +# if(CUPLA_STREAM_ASYNC_ENABLED == 1) using AccStream = ::alpaka::QueueCpuNonBlocking; -# else +# else using AccStream = ::alpaka::QueueCpuBlocking; -# endif - -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED - using Acc = ::alpaka::AccCpuOmp2Threads< - KernelDim, - IdxType - >; -#endif - -#if (ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED == 1) - #if (CUPLA_NUM_SELECTED_DEVICES == 1) - using Acc = ::alpaka::AccCpuOmp2Blocks< - KernelDim, - IdxType - >; - #else - using AccThreadSeq = ::alpaka::AccCpuOmp2Blocks< - KernelDim, - IdxType - >; - #endif -#endif - -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED - using Acc = ::alpaka::AccCpuThreads< - KernelDim, - IdxType - >; -#endif - -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - #if (CUPLA_NUM_SELECTED_DEVICES == 1) - using Acc = ::alpaka::AccCpuSerial< - KernelDim, - IdxType - >; - #else - using AccThreadSeq = ::alpaka::AccCpuSerial< - KernelDim, - IdxType - >; - #endif -#endif - -#if (ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED == 1) - #if (CUPLA_NUM_SELECTED_DEVICES == 1) - using Acc = ::alpaka::AccCpuTbbBlocks< - KernelDim, - IdxType - >; - #else - using AccThreadSeq = ::alpaka::AccCpuTbbBlocks< - KernelDim, - IdxType - >; - #endif -#endif +# endif + +# ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED + using Acc = ::alpaka::AccCpuOmp2Threads; +# endif + +# if(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED == 1) +# if(CUPLA_NUM_SELECTED_DEVICES == 1) + using Acc = ::alpaka::AccCpuOmp2Blocks; +# else + using AccThreadSeq = ::alpaka::AccCpuOmp2Blocks; +# endif +# endif + +# ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + using Acc = ::alpaka::AccCpuThreads; +# endif + +# ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +# if(CUPLA_NUM_SELECTED_DEVICES == 1) + using Acc = ::alpaka::AccCpuSerial; +# else + using AccThreadSeq = ::alpaka::AccCpuSerial; +# endif +# endif + +# if(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED == 1) +# if(CUPLA_NUM_SELECTED_DEVICES == 1) + using Acc = ::alpaka::AccCpuTbbBlocks; +# else + using AccThreadSeq = ::alpaka::AccCpuTbbBlocks; +# endif +# endif #endif #ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED - using AccDev = ::alpaka::DevOmp5; -# if (CUPLA_STREAM_ASYNC_ENABLED == 1) + using AccDev = ::alpaka::DevOmp5; +# if(CUPLA_STREAM_ASYNC_ENABLED == 1) using AccStream = ::alpaka::QueueOmp5NonBlocking; -# else +# else using AccStream = ::alpaka::QueueOmp5Blocking; -# endif - using Acc = ::alpaka::AccOmp5< - KernelDim, - IdxType - >; +# endif + using Acc = ::alpaka::AccOmp5; #endif #ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED - using AccDev = ::alpaka::DevOacc; -# if (CUPLA_STREAM_ASYNC_ENABLED == 1) + using AccDev = ::alpaka::DevOacc; +# if(CUPLA_STREAM_ASYNC_ENABLED == 1) using AccStream = ::alpaka::QueueOaccNonBlocking; -# else +# else using AccStream = ::alpaka::QueueOaccBlocking; -# endif - using Acc = ::alpaka::AccOacc< - KernelDim, - IdxType - >; +# endif + using Acc = ::alpaka::AccOacc; #endif #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - using AccDev = ::alpaka::DevCudaRt; -# if (CUPLA_STREAM_ASYNC_ENABLED == 1) + using AccDev = ::alpaka::DevCudaRt; +# if(CUPLA_STREAM_ASYNC_ENABLED == 1) using AccStream = ::alpaka::QueueCudaRtNonBlocking; -# else +# else using AccStream = ::alpaka::QueueCudaRtBlocking; -# endif - using Acc = ::alpaka::AccGpuCudaRt< - KernelDim, - IdxType - >; +# endif + using Acc = ::alpaka::AccGpuCudaRt; #endif #ifdef ALPAKA_ACC_GPU_HIP_ENABLED - using AccDev = ::alpaka::DevHipRt; -# if (CUPLA_STREAM_ASYNC_ENABLED == 1) + using AccDev = ::alpaka::DevHipRt; +# if(CUPLA_STREAM_ASYNC_ENABLED == 1) using AccStream = ::alpaka::QueueHipRtNonBlocking; -# else +# else using AccStream = ::alpaka::QueueHipRtBlocking; -# endif - using Acc = ::alpaka::AccGpuHipRt< - KernelDim, - IdxType - >; +# endif + using Acc = ::alpaka::AccGpuHipRt; #endif -#if (CUPLA_NUM_SELECTED_DEVICES == 1) - /** is an Alpaka accelerator which limits the thread count per block to one - * - * if only one accelerator is selected than it can be a accelerator without - * thread restrictions - */ - using AccThreadSeq = Acc; +#if(CUPLA_NUM_SELECTED_DEVICES == 1) + /** is an Alpaka accelerator which limits the thread count per block to one + * + * if only one accelerator is selected than it can be a accelerator without + * thread restrictions + */ + using AccThreadSeq = Acc; #endif - template< - uint32_t T_dim - > - using AccBuf = ::alpaka::Buf< - AccDev, - uint8_t, - AlpakaDim< T_dim >, - MemSizeType - >; + template + using AccBuf = ::alpaka::Buf, MemSizeType>; - template< - uint32_t T_dim - > - using HostBuf = ::alpaka::Buf< - AccHost, - uint8_t, - AlpakaDim< T_dim >, - MemSizeType - >; + template + using HostBuf = ::alpaka::Buf, MemSizeType>; - template< - unsigned T_dim - > - using HostBufWrapper = - ::alpaka::ViewPlainPtr< - AccHost, - uint8_t, - AlpakaDim< T_dim >, - MemSizeType - >; + template + using HostBufWrapper = ::alpaka::ViewPlainPtr, MemSizeType>; - template< - unsigned T_dim - > - using HostViewWrapper = - ::alpaka::ViewSubView< - AccHost, - uint8_t, - AlpakaDim< T_dim >, - MemSizeType - >; + template + using HostViewWrapper = ::alpaka::ViewSubView, MemSizeType>; - template< - unsigned T_dim - > - using DeviceBufWrapper = - ::alpaka::ViewPlainPtr< - AccDev, - uint8_t, - AlpakaDim< T_dim >, - MemSizeType - >; + template + using DeviceBufWrapper = ::alpaka::ViewPlainPtr, MemSizeType>; - template< - unsigned T_dim - > - using DeviceViewWrapper = - ::alpaka::ViewSubView< - AccDev, - uint8_t, - AlpakaDim< T_dim >, - MemSizeType - >; + template + using DeviceViewWrapper = ::alpaka::ViewSubView, MemSizeType>; -} // namespace CUPLA_ACCELERATOR_NAMESPACE -} // namepsace cupla + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/include/cupla_driver_types.hpp b/include/cupla_driver_types.hpp index d0373b05..548ba62a 100644 --- a/include/cupla_driver_types.hpp +++ b/include/cupla_driver_types.hpp @@ -25,15 +25,15 @@ // emulated that cuda runtime is loaded #ifndef __DRIVER_TYPES_H__ -# define __DRIVER_TYPES_H__ +# define __DRIVER_TYPES_H__ #endif enum cuplaMemcpyKind { - cuplaMemcpyHostToHost, - cuplaMemcpyHostToDevice, - cuplaMemcpyDeviceToHost, - cuplaMemcpyDeviceToDevice + cuplaMemcpyHostToHost, + cuplaMemcpyHostToDevice, + cuplaMemcpyDeviceToHost, + cuplaMemcpyDeviceToDevice }; enum cuplaError @@ -63,42 +63,44 @@ using cuplaEvent_t = void*; /** error category for `cuplaError` */ struct CuplaErrorCode : public std::error_category { - char const * name() const noexcept override { return "cuplaError"; } + char const* name() const noexcept override + { + return "cuplaError"; + } std::string message(int ev) const override - { - return message_cstr( ev ); + { + return message_cstr(ev); } - static char const * message_cstr(int ev) - { + static char const* message_cstr(int ev) + { switch(ev) { - case cuplaSuccess: - return "cuplaSuccess"; - case cuplaErrorMemoryAllocation: - return "cuplaErrorMemoryAllocation"; - case cuplaErrorInitializationError: - return "cuplaErrorInitializationError"; - case cuplaErrorNotReady: - return "cuplaErrorNotReady"; - case cuplaErrorDeviceAlreadyInUse: - return "cuplaErrorDeviceAlreadyInUse"; - default: - return "not defined cuplaError"; + case cuplaSuccess: + return "cuplaSuccess"; + case cuplaErrorMemoryAllocation: + return "cuplaErrorMemoryAllocation"; + case cuplaErrorInitializationError: + return "cuplaErrorInitializationError"; + case cuplaErrorNotReady: + return "cuplaErrorNotReady"; + case cuplaErrorDeviceAlreadyInUse: + return "cuplaErrorDeviceAlreadyInUse"; + default: + return "not defined cuplaError"; }; } }; namespace std { - - template< > - struct is_error_code_enum< cuplaError > : public true_type{}; + template<> + struct is_error_code_enum : public true_type + { + }; } // namespace std -inline std::error_code make_error_code( const cuplaError result ) +inline std::error_code make_error_code(const cuplaError result) { - return std::error_code( static_cast(result), CuplaErrorCode() ); + return std::error_code(static_cast(result), CuplaErrorCode()); } - - diff --git a/include/cupla_runtime.hpp b/include/cupla_runtime.hpp index 07e9be20..8e869ba7 100644 --- a/include/cupla_runtime.hpp +++ b/include/cupla_runtime.hpp @@ -21,34 +21,31 @@ #pragma once -#include - -#include "cupla/namespace.hpp" -#include "cupla/kernel.hpp" - +#include "cupla/api/common.hpp" +#include "cupla/api/device.hpp" +#include "cupla/api/event.hpp" +#include "cupla/api/memory.hpp" +#include "cupla/api/stream.hpp" #include "cupla/c/datatypes/cuplaArray.hpp" -#include "cupla/datatypes/dim3.hpp" -#include "cupla/datatypes/uint.hpp" #include "cupla/c/datatypes/cuplaExtent.hpp" -#include "cupla/c/datatypes/cuplaPos.hpp" #include "cupla/c/datatypes/cuplaMemcpy3DParms.hpp" #include "cupla/c/datatypes/cuplaPitchedPtr.hpp" - +#include "cupla/c/datatypes/cuplaPos.hpp" +#include "cupla/datatypes/dim3.hpp" +#include "cupla/datatypes/uint.hpp" +#include "cupla/kernel.hpp" +#include "cupla/manager/Driver.hpp" +#include "cupla/namespace.hpp" #include "cupla/types.hpp" #include "cupla_driver_types.hpp" -#include "cupla/api/common.hpp" -#include "cupla/api/device.hpp" -#include "cupla/api/stream.hpp" -#include "cupla/api/event.hpp" -#include "cupla/api/memory.hpp" -#include "cupla/manager/Driver.hpp" +#include namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ - const auto driver = manager::Driver::get(); + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + const auto driver = manager::Driver::get(); -} //namespace cupla -} //namespace CUPLA_ACCELERATOR_NAMESPACE + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/src/common.cpp b/src/common.cpp index 48ef4399..ed08ea01 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -19,56 +19,52 @@ */ -#include "cupla/namespace.hpp" -#include "cupla_runtime.hpp" -#include "cupla/manager/Memory.hpp" +#include "cupla/api/common.hpp" + #include "cupla/manager/Device.hpp" -#include "cupla/manager/Stream.hpp" #include "cupla/manager/Event.hpp" -#include "cupla/api/common.hpp" +#include "cupla/manager/Memory.hpp" +#include "cupla/manager/Stream.hpp" +#include "cupla/namespace.hpp" +#include "cupla_runtime.hpp" inline namespace CUPLA_ACCELERATOR_NAMESPACE { + CUPLA_HEADER_ONLY_FUNC_SPEC + const char* cuplaGetErrorName(cuplaError_t e) + { + return CuplaErrorCode::message_cstr(e); + } -CUPLA_HEADER_ONLY_FUNC_SPEC -const char * -cuplaGetErrorName(cuplaError_t e) -{ - return CuplaErrorCode::message_cstr(e); -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -const char * -cuplaGetErrorString(cuplaError_t e) -{ - return CuplaErrorCode::message_cstr(e); -} + CUPLA_HEADER_ONLY_FUNC_SPEC + const char* cuplaGetErrorString(cuplaError_t e) + { + return CuplaErrorCode::message_cstr(e); + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaGetLastError() -{ -#if( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 ) - // reset the last cuda error - return (cuplaError_t)cudaGetLastError(); -#elif( ALPAKA_ACC_GPU_HIP_ENABLED == 1 ) - return (cuplaError_t)hipGetLastError(); + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaGetLastError() + { +#if(ALPAKA_ACC_GPU_CUDA_ENABLED == 1) + // reset the last cuda error + return (cuplaError_t) cudaGetLastError(); +#elif(ALPAKA_ACC_GPU_HIP_ENABLED == 1) + return (cuplaError_t) hipGetLastError(); #else - return cuplaSuccess; + return cuplaSuccess; #endif -} + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaPeekAtLastError() -{ -#if( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 ) - return (cuplaError_t)cudaPeekAtLastError(); -#elif( ALPAKA_ACC_GPU_HIP_ENABLED == 1 ) - return (cuplaError_t)hipPeekAtLastError(); + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaPeekAtLastError() + { +#if(ALPAKA_ACC_GPU_CUDA_ENABLED == 1) + return (cuplaError_t) cudaPeekAtLastError(); +#elif(ALPAKA_ACC_GPU_HIP_ENABLED == 1) + return (cuplaError_t) hipPeekAtLastError(); #else - return cuplaSuccess; + return cuplaSuccess; #endif -} + } -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/src/device.cpp b/src/device.cpp index e2127784..81826821 100644 --- a/src/device.cpp +++ b/src/device.cpp @@ -19,113 +19,84 @@ */ -#include "cupla/namespace.hpp" -#include "cupla_runtime.hpp" -#include "cupla/manager/Memory.hpp" #include "cupla/manager/Device.hpp" -#include "cupla/manager/Stream.hpp" -#include "cupla/manager/Event.hpp" + #include "cupla/api/device.hpp" +#include "cupla/manager/Event.hpp" +#include "cupla/manager/Memory.hpp" +#include "cupla/manager/Stream.hpp" +#include "cupla/namespace.hpp" +#include "cupla_runtime.hpp" + #include inline namespace CUPLA_ACCELERATOR_NAMESPACE { + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaGetDeviceCount(int* count) + { + *count = cupla::manager::Device::get().count(); + return cuplaSuccess; + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaGetDeviceCount( int * count) -{ - *count = cupla::manager::Device< cupla::AccDev >::get().count(); - return cuplaSuccess; -} + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaSetDevice(int idx) + { + try + { + cupla::manager::Device::get().device(idx); + } + catch(const std::system_error& e) + { + return static_cast(e.code().value()); + } + return cuplaSuccess; + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaSetDevice( int idx) -{ - try + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaGetDevice(int* deviceId) { - cupla::manager::Device< cupla::AccDev >::get().device( idx ); + *deviceId = cupla::manager::Device::get().id(); + return cuplaSuccess; } - catch(const std::system_error& e) + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaDeviceReset() { - return static_cast( e.code().value() ); + // wait that all work on the device is finished + cuplaDeviceSynchronize(); + + // delete all events on the current device + cupla::manager::Event::get().reset(); + + // delete all memory on the current device + cupla::manager::Memory>::get().reset(); + + cupla::manager::Memory>::get().reset(); + + cupla::manager::Memory>::get().reset(); + + // delete all streams on the current device + cupla::manager::Stream::get().reset(); + + cupla::manager::Device::get().reset(); + return cuplaSuccess; } - return cuplaSuccess; -} -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaGetDevice( int * deviceId ) -{ - *deviceId = cupla::manager::Device< cupla::AccDev >::get().id(); - return cuplaSuccess; -} + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaDeviceSynchronize() + { + ::alpaka::wait(cupla::manager::Device::get().current()); + return cuplaSuccess; + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaDeviceReset( ) -{ - // wait that all work on the device is finished - cuplaDeviceSynchronize( ); - - // delete all events on the current device - cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().reset( ); - - // delete all memory on the current device - cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<1u> - >::get().reset( ); - - cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<2u> - >::get().reset( ); - - cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<3u> - >::get().reset( ); - - // delete all streams on the current device - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().reset( ); - - cupla::manager::Device< cupla::AccDev >::get( ).reset( ); - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaDeviceSynchronize( ) -{ - ::alpaka::wait( - cupla::manager::Device< cupla::AccDev >::get( ).current( ) - ); - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMemGetInfo( - size_t * free, - size_t * total -) -{ - auto& device( - cupla::manager::Device< - cupla::AccDev - >::get().current() - ); - *total = ::alpaka::getMemBytes( device ); - *free = ::alpaka::getFreeMemBytes( device ); - return cuplaSuccess; -} - -} //namespace CUPLA_ACCELERATOR_NAMESPACE + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMemGetInfo(size_t* free, size_t* total) + { + auto& device(cupla::manager::Device::get().current()); + *total = ::alpaka::getMemBytes(device); + *free = ::alpaka::getFreeMemBytes(device); + return cuplaSuccess; + } + +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/src/event.cpp b/src/event.cpp index 9cbf779c..037506c8 100644 --- a/src/event.cpp +++ b/src/event.cpp @@ -19,133 +19,83 @@ */ -#include "cupla/namespace.hpp" -#include "cupla_runtime.hpp" -#include "cupla/manager/Memory.hpp" -#include "cupla/manager/Device.hpp" -#include "cupla/manager/Stream.hpp" #include "cupla/manager/Event.hpp" + #include "cupla/api/event.hpp" +#include "cupla/manager/Device.hpp" +#include "cupla/manager/Memory.hpp" +#include "cupla/manager/Stream.hpp" +#include "cupla/namespace.hpp" +#include "cupla_runtime.hpp" inline namespace CUPLA_ACCELERATOR_NAMESPACE { + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaEventCreateWithFlags(cuplaEvent_t* event, unsigned int flags) + { + *event = cupla::manager::Event::get().create(flags); -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaEventCreateWithFlags( - cuplaEvent_t * event, - unsigned int flags -) -{ - *event = cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().create( flags ); + return cuplaSuccess; + } - return cuplaSuccess; -} + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaEventCreate(cuplaEvent_t* event) + { + *event = cupla::manager::Event::get().create(0); -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaEventCreate( - cuplaEvent_t * event -) -{ - *event = cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().create( 0 ); + return cuplaSuccess; + } - return cuplaSuccess; -} + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaEventDestroy(cuplaEvent_t event) + { + if(cupla::manager::Event::get().destroy(event)) + return cuplaSuccess; + else + return cuplaErrorInitializationError; + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaEventDestroy( cuplaEvent_t event ) -{ - if( - cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().destroy( event ) - ) + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaEventRecord(cuplaEvent_t event, cuplaStream_t stream) + { + auto& streamObject = cupla::manager::Stream::get().stream(stream); + auto& eventObject = cupla::manager::Event::get().event(event); + + eventObject.record(streamObject); return cuplaSuccess; - else - return cuplaErrorInitializationError; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaEventRecord( - cuplaEvent_t event, - cuplaStream_t stream -) -{ - auto& streamObject = cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( stream ); - auto& eventObject = cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().event( event ); - - eventObject.record( streamObject ); - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaEventElapsedTime( - float * ms, - cuplaEvent_t start, - cuplaEvent_t end -) -{ - auto& eventStart = cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().event( start ); - auto& eventEnd = cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().event( end ); - *ms = static_cast< float >( eventEnd.elapsedSince( eventStart ) ); - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaEventSynchronize( - cuplaEvent_t event -) -{ - auto& eventObject = cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().event( event ); - ::alpaka::wait( *eventObject ); - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaEventQuery( cuplaEvent_t event ) -{ - auto& eventObject = cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().event( event ); + } + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaEventElapsedTime(float* ms, cuplaEvent_t start, cuplaEvent_t end) + { + auto& eventStart = cupla::manager::Event::get().event(start); + auto& eventEnd = cupla::manager::Event::get().event(end); + *ms = static_cast(eventEnd.elapsedSince(eventStart)); + return cuplaSuccess; + } - if( ::alpaka::isComplete( *eventObject ) ) + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaEventSynchronize(cuplaEvent_t event) { + auto& eventObject = cupla::manager::Event::get().event(event); + ::alpaka::wait(*eventObject); return cuplaSuccess; } - else + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaEventQuery(cuplaEvent_t event) { - return cuplaErrorNotReady; + auto& eventObject = cupla::manager::Event::get().event(event); + + if(::alpaka::isComplete(*eventObject)) + { + return cuplaSuccess; + } + else + { + return cuplaErrorNotReady; + } } -} -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/src/manager/Driver.cpp b/src/manager/Driver.cpp index 0486588e..a2f4a0dc 100644 --- a/src/manager/Driver.cpp +++ b/src/manager/Driver.cpp @@ -18,53 +18,38 @@ * */ -#include "cupla/namespace.hpp" -#include "cupla/types.hpp" -#include "cupla_runtime.hpp" #include "cupla/manager/Driver.hpp" -#include "cupla/manager/Memory.hpp" + #include "cupla/manager/Device.hpp" -#include "cupla/manager/Stream.hpp" #include "cupla/manager/Event.hpp" +#include "cupla/manager/Memory.hpp" +#include "cupla/manager/Stream.hpp" +#include "cupla/namespace.hpp" +#include "cupla/types.hpp" +#include "cupla_runtime.hpp" namespace cupla { -inline namespace CUPLA_ACCELERATOR_NAMESPACE -{ -namespace manager -{ - -CUPLA_HEADER_ONLY_FUNC_SPEC Driver::Driver() -{ - cupla::manager::Device< cupla::AccDev >::get( ); + inline namespace CUPLA_ACCELERATOR_NAMESPACE + { + namespace manager + { + CUPLA_HEADER_ONLY_FUNC_SPEC Driver::Driver() + { + cupla::manager::Device::get(); - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get(); + cupla::manager::Stream::get(); - cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<3u> - >::get(); + cupla::manager::Memory>::get(); - cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<2u> - >::get(); + cupla::manager::Memory>::get(); - cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<1u> - >::get(); + cupla::manager::Memory>::get(); - cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get(); -} + cupla::manager::Event::get(); + } -} //namespace manager -} //namespace CUPLA_ACCELERATOR_NAMESPACE -} //namespace cupla + } // namespace manager + } // namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace cupla diff --git a/src/memory.cpp b/src/memory.cpp index 77a8fc56..72bb3567 100644 --- a/src/memory.cpp +++ b/src/memory.cpp @@ -19,951 +19,458 @@ */ -#include "cupla/namespace.hpp" -#include "cupla_runtime.hpp" #include "cupla/manager/Memory.hpp" + +#include "cupla/api/memory.hpp" #include "cupla/manager/Device.hpp" -#include "cupla/manager/Stream.hpp" #include "cupla/manager/Event.hpp" -#include "cupla/api/memory.hpp" +#include "cupla/manager/Stream.hpp" +#include "cupla/namespace.hpp" +#include "cupla_runtime.hpp" inline namespace CUPLA_ACCELERATOR_NAMESPACE { + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMalloc(void** ptrptr, size_t size) + { + const ::alpaka::Vec, cupla::MemSizeType> extent(size); -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMalloc( - void **ptrptr, - size_t size -) -{ + auto& buf = cupla::manager::Memory>::get().alloc(extent); - const ::alpaka::Vec< - cupla::AlpakaDim<1u>, - cupla::MemSizeType - > extent( size ); - - auto& buf = cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<1u> - >::get().alloc( extent ); - - // @toto catch errors - *ptrptr = ::alpaka::getPtrNative(buf); - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMallocPitch( - void ** devPtr, - size_t * pitch, - size_t const width, - size_t const height -) -{ - const ::alpaka::Vec< - cupla::AlpakaDim< 2u >, - cupla::MemSizeType - > extent( height, width ); - - auto& buf = cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim< 2u > - >::get().alloc( extent ); - - // @toto catch errors - *devPtr = ::alpaka::getPtrNative(buf); - *pitch = ::alpaka::getPitchBytes< 1u >( buf ); - - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMalloc3D( - cuplaPitchedPtr * const pitchedDevPtr, - cuplaExtent const extent -) -{ + // @toto catch errors + *ptrptr = ::alpaka::getPtrNative(buf); + return cuplaSuccess; + } - auto& buf = cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim< 3u > - >::get().alloc( extent ); - - // @toto catch errors - *pitchedDevPtr = make_cuplaPitchedPtr( - ::alpaka::getPtrNative(buf), - ::alpaka::getPitchBytes< 2u >( buf ), - extent.width, - extent.height - ); - - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaExtent -make_cuplaExtent( - size_t const w, - size_t const h, - size_t const d -) -{ - return cuplaExtent( w, h, d ); -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaPos -make_cuplaPos( - size_t const x, - size_t const y, - size_t const z -) -{ - return cuplaPos( x, y, z ); -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaPitchedPtr -make_cuplaPitchedPtr( - void * const d, - size_t const p, - size_t const xsz, - size_t const ysz -) -{ - return cuplaPitchedPtr( d, p, xsz, ysz ); -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMallocHost( - void **ptrptr, - size_t size -) -{ - const ::alpaka::Vec< - cupla::AlpakaDim<1u>, - cupla::MemSizeType - > extent( size ); + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMallocPitch(void** devPtr, size_t* pitch, size_t const width, size_t const height) + { + const ::alpaka::Vec, cupla::MemSizeType> extent(height, width); - auto& buf = cupla::manager::Memory< - cupla::AccHost, - cupla::AlpakaDim<1u> - >::get().alloc( extent ); + auto& buf = cupla::manager::Memory>::get().alloc(extent); - prepareForAsyncCopy( buf ); + // @toto catch errors + *devPtr = ::alpaka::getPtrNative(buf); + *pitch = ::alpaka::getPitchBytes<1u>(buf); + + return cuplaSuccess; + } - // @toto catch errors - *ptrptr = ::alpaka::getPtrNative(buf); - return cuplaSuccess; -} + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMalloc3D(cuplaPitchedPtr* const pitchedDevPtr, cuplaExtent const extent) + { + auto& buf = cupla::manager::Memory>::get().alloc(extent); -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t cuplaFree(void *ptr) -{ + // @toto catch errors + *pitchedDevPtr = make_cuplaPitchedPtr( + ::alpaka::getPtrNative(buf), + ::alpaka::getPitchBytes<2u>(buf), + extent.width, + extent.height); - if( - cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<1u> - >::get().free( ptr ) - ) return cuplaSuccess; - else if( - cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<2u> - >::get().free( ptr ) - ) - return cuplaSuccess; - else if( - cupla::manager::Memory< - cupla::AccDev, - cupla::AlpakaDim<3u> - >::get().free( ptr ) - ) - return cuplaSuccess; - else - return cuplaErrorMemoryAllocation; + } -} + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaExtent make_cuplaExtent(size_t const w, size_t const h, size_t const d) + { + return cuplaExtent(w, h, d); + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t cuplaFreeHost(void *ptr) -{ + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaPos make_cuplaPos(size_t const x, size_t const y, size_t const z) + { + return cuplaPos(x, y, z); + } - if( - cupla::manager::Memory< - cupla::AccHost, - cupla::AlpakaDim<1u> - >::get().free( ptr ) - ) + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaPitchedPtr make_cuplaPitchedPtr(void* const d, size_t const p, size_t const xsz, size_t const ysz) + { + return cuplaPitchedPtr(d, p, xsz, ysz); + } + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMallocHost(void** ptrptr, size_t size) + { + const ::alpaka::Vec, cupla::MemSizeType> extent(size); + + auto& buf = cupla::manager::Memory>::get().alloc(extent); + + prepareForAsyncCopy(buf); + + // @toto catch errors + *ptrptr = ::alpaka::getPtrNative(buf); return cuplaSuccess; - else - return cuplaErrorMemoryAllocation; - -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t cuplaMemcpyAsync( - void *dst, - const void *src, - size_t count, - enum cuplaMemcpyKind kind, - cuplaStream_t stream -) -{ - const ::alpaka::Vec< - cupla::AlpakaDim<1u>, - cupla::MemSizeType - > numBytes(count); - - auto& device( - cupla::manager::Device< - cupla::AccDev - >::get().current() - ); - - auto& streamObject( - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( stream ) - ); - - switch(kind) + } + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaFree(void* ptr) { + if(cupla::manager::Memory>::get().free(ptr)) + return cuplaSuccess; + else if(cupla::manager::Memory>::get().free(ptr)) + return cuplaSuccess; + else if(cupla::manager::Memory>::get().free(ptr)) + return cuplaSuccess; + else + return cuplaErrorMemoryAllocation; + } + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaFreeHost(void* ptr) + { + if(cupla::manager::Memory>::get().free(ptr)) + return cuplaSuccess; + else + return cuplaErrorMemoryAllocation; + } + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMemcpyAsync( + void* dst, + const void* src, + size_t count, + enum cuplaMemcpyKind kind, + cuplaStream_t stream) + { + const ::alpaka::Vec, cupla::MemSizeType> numBytes(count); + + auto& device(cupla::manager::Device::get().current()); + + auto& streamObject(cupla::manager::Stream::get().stream(stream)); + + switch(kind) + { case cuplaMemcpyHostToDevice: { - auto& host( - cupla::manager::Device< - cupla::AccHost - >::get().current() - ); - - const cupla::HostBufWrapper< 1u > hBuf( - const_cast( - static_cast(src) - ), - host, - numBytes - ); - cupla::DeviceBufWrapper< 1u > dBuf( - static_cast( - dst - ), - device, - numBytes - ); + auto& host(cupla::manager::Device::get().current()); - ::alpaka::memcpy( - streamObject, - dBuf, - hBuf, - numBytes - ); + const cupla::HostBufWrapper<1u> hBuf( + const_cast(static_cast(src)), + host, + numBytes); + cupla::DeviceBufWrapper<1u> dBuf(static_cast(dst), device, numBytes); + ::alpaka::memcpy(streamObject, dBuf, hBuf, numBytes); } - break; + break; case cuplaMemcpyDeviceToHost: { - auto& host( - cupla::manager::Device< - cupla::AccHost - >::get().current() - ); - const cupla::DeviceBufWrapper< 1u > dBuf( - const_cast( - static_cast(src) - ), + auto& host(cupla::manager::Device::get().current()); + const cupla::DeviceBufWrapper<1u> dBuf( + const_cast(static_cast(src)), device, - numBytes - ); - cupla::HostBufWrapper< 1u > hBuf( - static_cast( - dst - ), - host, - numBytes - ); - - ::alpaka::memcpy( - streamObject, - hBuf, - dBuf, - numBytes - ); + numBytes); + cupla::HostBufWrapper<1u> hBuf(static_cast(dst), host, numBytes); + ::alpaka::memcpy(streamObject, hBuf, dBuf, numBytes); } - break; + break; case cuplaMemcpyDeviceToDevice: { - const cupla::DeviceBufWrapper< 1u > dSrcBuf( - const_cast( - static_cast(src) - ), + const cupla::DeviceBufWrapper<1u> dSrcBuf( + const_cast(static_cast(src)), device, - numBytes - ); - cupla::DeviceBufWrapper< 1u > dDestBuf( - static_cast( - dst - ), - device, - numBytes - ); - - ::alpaka::memcpy( - streamObject, - dDestBuf, - dSrcBuf, - numBytes - ); + numBytes); + cupla::DeviceBufWrapper<1u> dDestBuf(static_cast(dst), device, numBytes); + ::alpaka::memcpy(streamObject, dDestBuf, dSrcBuf, numBytes); } - break; + break; case cuplaMemcpyHostToHost: { - auto& hostStreamObject( - cupla::manager::Stream< - cupla::AccHost, - cupla::AccHostStream - >::get().stream( stream ) - ); - auto& host( - cupla::manager::Device< - cupla::AccHost - >::get().current() - ); - const cupla::HostBufWrapper< 1u > hSrcBuf( - const_cast( - static_cast(src) - ), + auto& hostStreamObject(cupla::manager::Stream::get().stream(stream)); + auto& host(cupla::manager::Device::get().current()); + const cupla::HostBufWrapper<1u> hSrcBuf( + const_cast(static_cast(src)), host, - numBytes - ); - cupla::HostBufWrapper< 1u > hDestBuf( - static_cast( - dst - ), - host, - numBytes - ); - - ::alpaka::memcpy( - hostStreamObject, - hDestBuf, - hSrcBuf, - numBytes - ); + numBytes); + cupla::HostBufWrapper<1u> hDestBuf(static_cast(dst), host, numBytes); + ::alpaka::memcpy(hostStreamObject, hDestBuf, hSrcBuf, numBytes); } break; + } + return cuplaSuccess; } - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMemcpy( - void *dst, - const void *src, - size_t count, - enum cuplaMemcpyKind kind -) -{ - cuplaDeviceSynchronize(); - - cuplaMemcpyAsync( - dst, - src, - count, - kind, - 0 - ); - - auto& streamObject( - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( 0 ) - ); - ::alpaka::wait( streamObject ); - - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMemsetAsync( - void * devPtr, - int value, - size_t count, - cuplaStream_t stream -) -{ - auto& device( - cupla::manager::Device< - cupla::AccDev - >::get().current() - ); - - auto& streamObject( - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( stream ) - ); - - ::alpaka::Vec< - cupla::AlpakaDim<1u>, - cupla::MemSizeType - > const - numBytes(count); - - cupla::DeviceBufWrapper< 1u > - dBuf( - static_cast< uint8_t * >( devPtr ), - device, - numBytes - ); - - ::alpaka::memset( - streamObject, - dBuf, - value, - numBytes - ); - - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMemset( - void * devPtr, - int value, - size_t count -) -{ - cuplaDeviceSynchronize(); - - cuplaMemsetAsync( - devPtr, - value, - count, - 0 - ); - - auto& streamObject( - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( 0 ) - ); - ::alpaka::wait( streamObject ); - - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMemcpy2DAsync( - void * dst, - size_t const dPitch, - void const * const src, - size_t const sPitch, - size_t const width, - size_t const height, - enum cuplaMemcpyKind kind, - cuplaStream_t const stream -) -{ - const ::alpaka::Vec< - cupla::AlpakaDim<2u>, - cupla::MemSizeType - > numBytes( height, width ); - - const ::alpaka::Vec< - cupla::AlpakaDim<2u>, - cupla::MemSizeType - > dstPitch( dPitch * height , dPitch ); - - const ::alpaka::Vec< - cupla::AlpakaDim<2u>, - cupla::MemSizeType - > srcPitch( sPitch * height , sPitch ); - - auto& device( - cupla::manager::Device< - cupla::AccDev - >::get().current() - ); - - auto& streamObject( - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( stream ) - ); - - switch(kind) + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMemcpy(void* dst, const void* src, size_t count, enum cuplaMemcpyKind kind) + { + cuplaDeviceSynchronize(); + + cuplaMemcpyAsync(dst, src, count, kind, 0); + + auto& streamObject(cupla::manager::Stream::get().stream(0)); + ::alpaka::wait(streamObject); + + return cuplaSuccess; + } + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMemsetAsync(void* devPtr, int value, size_t count, cuplaStream_t stream) { + auto& device(cupla::manager::Device::get().current()); + + auto& streamObject(cupla::manager::Stream::get().stream(stream)); + + ::alpaka::Vec, cupla::MemSizeType> const numBytes(count); + + cupla::DeviceBufWrapper<1u> dBuf(static_cast(devPtr), device, numBytes); + + ::alpaka::memset(streamObject, dBuf, value, numBytes); + + return cuplaSuccess; + } + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMemset(void* devPtr, int value, size_t count) + { + cuplaDeviceSynchronize(); + + cuplaMemsetAsync(devPtr, value, count, 0); + + auto& streamObject(cupla::manager::Stream::get().stream(0)); + ::alpaka::wait(streamObject); + + return cuplaSuccess; + } + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMemcpy2DAsync( + void* dst, + size_t const dPitch, + void const* const src, + size_t const sPitch, + size_t const width, + size_t const height, + enum cuplaMemcpyKind kind, + cuplaStream_t const stream) + { + const ::alpaka::Vec, cupla::MemSizeType> numBytes(height, width); + + const ::alpaka::Vec, cupla::MemSizeType> dstPitch(dPitch * height, dPitch); + + const ::alpaka::Vec, cupla::MemSizeType> srcPitch(sPitch * height, sPitch); + + auto& device(cupla::manager::Device::get().current()); + + auto& streamObject(cupla::manager::Stream::get().stream(stream)); + + switch(kind) + { case cuplaMemcpyHostToDevice: { - auto& host( - cupla::manager::Device< - cupla::AccHost - >::get().current() - ); - - const cupla::HostBufWrapper< 2u > hBuf( - const_cast( - static_cast(src) - ), + auto& host(cupla::manager::Device::get().current()); + + const cupla::HostBufWrapper<2u> hBuf( + const_cast(static_cast(src)), host, numBytes, - srcPitch - ); - cupla::DeviceBufWrapper< 2u > dBuf( - static_cast( - dst - ), - device, - numBytes, - dstPitch - ); - - ::alpaka::memcpy( - streamObject, - dBuf, - hBuf, - numBytes - ); + srcPitch); + cupla::DeviceBufWrapper<2u> dBuf(static_cast(dst), device, numBytes, dstPitch); + ::alpaka::memcpy(streamObject, dBuf, hBuf, numBytes); } - break; + break; case cuplaMemcpyDeviceToHost: { - auto& host( - cupla::manager::Device< - cupla::AccHost - >::get().current() - ); - const cupla::DeviceBufWrapper< 2u > dBuf( - const_cast( - static_cast(src) - ), + auto& host(cupla::manager::Device::get().current()); + const cupla::DeviceBufWrapper<2u> dBuf( + const_cast(static_cast(src)), device, numBytes, - srcPitch - ); - cupla::HostBufWrapper< 2u > hBuf( - static_cast( - dst - ), - host, - numBytes, - dstPitch - ); - - ::alpaka::memcpy( - streamObject, - hBuf, - dBuf, - numBytes - ); + srcPitch); + cupla::HostBufWrapper<2u> hBuf(static_cast(dst), host, numBytes, dstPitch); + ::alpaka::memcpy(streamObject, hBuf, dBuf, numBytes); } - break; + break; case cuplaMemcpyDeviceToDevice: { - const cupla::DeviceBufWrapper< 2u > dSrcBuf( - const_cast( - static_cast(src) - ), - device, - numBytes, - srcPitch - ); - cupla::DeviceBufWrapper< 2u > dDestBuf( - static_cast( - dst - ), + const cupla::DeviceBufWrapper<2u> dSrcBuf( + const_cast(static_cast(src)), device, numBytes, - dstPitch - ); - - ::alpaka::memcpy( - streamObject, - dDestBuf, - dSrcBuf, - numBytes - ); + srcPitch); + cupla::DeviceBufWrapper<2u> dDestBuf(static_cast(dst), device, numBytes, dstPitch); + ::alpaka::memcpy(streamObject, dDestBuf, dSrcBuf, numBytes); } break; case cuplaMemcpyHostToHost: { - auto& hostStreamObject( - cupla::manager::Stream< - cupla::AccHost, - cupla::AccHostStream - >::get().stream( stream ) - ); - auto& host( - cupla::manager::Device< - cupla::AccHost - >::get().current() - ); - const cupla::HostBufWrapper< 2u > hSrcBuf( - const_cast( - static_cast(src) - ), - host, - numBytes, - srcPitch - ); - cupla::HostBufWrapper< 2u > hDestBuf( - static_cast( - dst - ), + auto& hostStreamObject(cupla::manager::Stream::get().stream(stream)); + auto& host(cupla::manager::Device::get().current()); + const cupla::HostBufWrapper<2u> hSrcBuf( + const_cast(static_cast(src)), host, numBytes, - dstPitch - ); - - ::alpaka::memcpy( - hostStreamObject, - hDestBuf, - hSrcBuf, - numBytes - ); + srcPitch); + cupla::HostBufWrapper<2u> hDestBuf(static_cast(dst), host, numBytes, dstPitch); + ::alpaka::memcpy(hostStreamObject, hDestBuf, hSrcBuf, numBytes); } break; + } + return cuplaSuccess; } - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMemcpy2D( - void * dst, - size_t const dPitch, - void const * const src, - size_t const sPitch, - size_t const width, - size_t const height, - enum cuplaMemcpyKind kind -) -{ - cuplaDeviceSynchronize(); - - cuplaMemcpy2DAsync( - dst, - dPitch, - src, - sPitch, - width, - height, - kind, - 0 - ); - - auto& streamObject( - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( 0 ) - ); - ::alpaka::wait( streamObject ); - - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMemcpy3DAsync( - const cuplaMemcpy3DParms * const p, - cuplaStream_t stream -) -{ - const ::alpaka::Vec< - cupla::AlpakaDim<3u>, - cupla::MemSizeType - > numBytes( p->extent ); - - const ::alpaka::Vec< - cupla::AlpakaDim<3u>, - cupla::MemSizeType - > extentSrc( - p->srcPtr.xsize * p->srcPtr.ysize * ( p->extent.depth + p->srcPos.z ), - p->srcPtr.xsize * p->srcPtr.ysize, - p->srcPtr.xsize - ); - - const ::alpaka::Vec< - cupla::AlpakaDim<3u>, - cupla::MemSizeType - > extentDst( - p->dstPtr.xsize * p->dstPtr.ysize * ( p->extent.depth + p->dstPos.z ), - p->dstPtr.xsize * p->dstPtr.ysize, - p->dstPtr.xsize - ); - - const ::alpaka::Vec< - cupla::AlpakaDim<3u>, - cupla::MemSizeType - > offsetSrc( - p->srcPos.z, - p->srcPos.y, - p->srcPos.x - ); - - const ::alpaka::Vec< - cupla::AlpakaDim<3u>, - cupla::MemSizeType - > offsetDst( - p->dstPos.z, - p->dstPos.y, - p->dstPos.x - ); - - const ::alpaka::Vec< - cupla::AlpakaDim<3u>, - cupla::MemSizeType - > dstPitch( - p->dstPtr.pitch * p->dstPtr.ysize * ( p->extent.depth + p->dstPos.z ), // @todo: can't create z pitch, but is not needed by alpaka - p->dstPtr.pitch * p->dstPtr.ysize, - p->dstPtr.pitch - ); - - const ::alpaka::Vec< - cupla::AlpakaDim<3u>, - cupla::MemSizeType - > srcPitch( - p->srcPtr.pitch * p->srcPtr.ysize * ( p->extent.depth + p->srcPos.z ), // @todo: can't create z pitch, but is not needed by alpaka - p->srcPtr.pitch * p->srcPtr.ysize, - p->srcPtr.pitch - ); - - auto& device( - cupla::manager::Device< - cupla::AccDev - >::get().current() - ); - - auto& streamObject( - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( stream ) - ); - - switch(p->kind) + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMemcpy2D( + void* dst, + size_t const dPitch, + void const* const src, + size_t const sPitch, + size_t const width, + size_t const height, + enum cuplaMemcpyKind kind) + { + cuplaDeviceSynchronize(); + + cuplaMemcpy2DAsync(dst, dPitch, src, sPitch, width, height, kind, 0); + + auto& streamObject(cupla::manager::Stream::get().stream(0)); + ::alpaka::wait(streamObject); + + return cuplaSuccess; + } + + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMemcpy3DAsync(const cuplaMemcpy3DParms* const p, cuplaStream_t stream) { + const ::alpaka::Vec, cupla::MemSizeType> numBytes(p->extent); + + const ::alpaka::Vec, cupla::MemSizeType> extentSrc( + p->srcPtr.xsize * p->srcPtr.ysize * (p->extent.depth + p->srcPos.z), + p->srcPtr.xsize * p->srcPtr.ysize, + p->srcPtr.xsize); + + const ::alpaka::Vec, cupla::MemSizeType> extentDst( + p->dstPtr.xsize * p->dstPtr.ysize * (p->extent.depth + p->dstPos.z), + p->dstPtr.xsize * p->dstPtr.ysize, + p->dstPtr.xsize); + + const ::alpaka::Vec, cupla::MemSizeType> offsetSrc(p->srcPos.z, p->srcPos.y, p->srcPos.x); + + const ::alpaka::Vec, cupla::MemSizeType> offsetDst(p->dstPos.z, p->dstPos.y, p->dstPos.x); + + const ::alpaka::Vec< + cupla::AlpakaDim<3u>, + cupla::MemSizeType> + dstPitch( + p->dstPtr.pitch * p->dstPtr.ysize + * (p->extent.depth + p->dstPos.z), // @todo: can't create z pitch, but is not needed by alpaka + p->dstPtr.pitch * p->dstPtr.ysize, + p->dstPtr.pitch); + + const ::alpaka::Vec< + cupla::AlpakaDim<3u>, + cupla::MemSizeType> + srcPitch( + p->srcPtr.pitch * p->srcPtr.ysize + * (p->extent.depth + p->srcPos.z), // @todo: can't create z pitch, but is not needed by alpaka + p->srcPtr.pitch * p->srcPtr.ysize, + p->srcPtr.pitch); + + auto& device(cupla::manager::Device::get().current()); + + auto& streamObject(cupla::manager::Stream::get().stream(stream)); + + switch(p->kind) + { case cuplaMemcpyHostToDevice: { - auto& host( - cupla::manager::Device< - cupla::AccHost - >::get().current() - ); - - cupla::HostBufWrapper< 3u > hBuf( - const_cast( - static_cast(p->srcPtr.ptr) - ), + auto& host(cupla::manager::Device::get().current()); + + cupla::HostBufWrapper<3u> hBuf( + const_cast(static_cast(p->srcPtr.ptr)), host, extentSrc, - srcPitch - ); - cupla::DeviceBufWrapper< 3u > dBuf( - static_cast( - p->dstPtr.ptr - ), - device, - extentDst, - dstPitch - ); + srcPitch); + cupla::DeviceBufWrapper<3u> dBuf(static_cast(p->dstPtr.ptr), device, extentDst, dstPitch); - cupla::DeviceViewWrapper< 3u > dView( - dBuf, - extentDst - offsetDst, - offsetDst - ); + cupla::DeviceViewWrapper<3u> dView(dBuf, extentDst - offsetDst, offsetDst); ::alpaka::memcpy( streamObject, dView, - cupla::HostViewWrapper< 3u >( - hBuf, - extentSrc - offsetSrc, - offsetSrc - ), - numBytes - ); - + cupla::HostViewWrapper<3u>(hBuf, extentSrc - offsetSrc, offsetSrc), + numBytes); } - break; + break; case cuplaMemcpyDeviceToHost: { - auto& host( - cupla::manager::Device< - cupla::AccHost - >::get().current() - ); - cupla::DeviceBufWrapper< 3u > dBuf( - const_cast( - static_cast(p->srcPtr.ptr) - ), + auto& host(cupla::manager::Device::get().current()); + cupla::DeviceBufWrapper<3u> dBuf( + const_cast(static_cast(p->srcPtr.ptr)), device, extentSrc, - srcPitch - ); - cupla::HostBufWrapper< 3u > hBuf( - static_cast( - p->dstPtr.ptr - ), - host, - extentDst, - dstPitch - ); + srcPitch); + cupla::HostBufWrapper<3u> hBuf(static_cast(p->dstPtr.ptr), host, extentDst, dstPitch); - cupla::HostViewWrapper< 3u > hView( - hBuf, - extentDst - offsetDst, - offsetDst - ); + cupla::HostViewWrapper<3u> hView(hBuf, extentDst - offsetDst, offsetDst); ::alpaka::memcpy( streamObject, hView, - cupla::DeviceViewWrapper< 3u >( - dBuf, - extentSrc - offsetSrc, - offsetSrc - ), - numBytes - ); - + cupla::DeviceViewWrapper<3u>(dBuf, extentSrc - offsetSrc, offsetSrc), + numBytes); } - break; + break; case cuplaMemcpyDeviceToDevice: { - cupla::DeviceBufWrapper< 3u > dSrcBuf( - const_cast( - static_cast(p->srcPtr.ptr) - ), + cupla::DeviceBufWrapper<3u> dSrcBuf( + const_cast(static_cast(p->srcPtr.ptr)), device, extentSrc, - srcPitch - ); - cupla::DeviceBufWrapper< 3u > dDestBuf( - static_cast( - p->dstPtr.ptr - ), - device, - extentDst, - dstPitch - ); + srcPitch); + cupla::DeviceBufWrapper<3u> dDestBuf(static_cast(p->dstPtr.ptr), device, extentDst, dstPitch); - cupla::DeviceViewWrapper< 3u > dView( - dDestBuf, - extentDst - offsetDst, - offsetDst - ); + cupla::DeviceViewWrapper<3u> dView(dDestBuf, extentDst - offsetDst, offsetDst); ::alpaka::memcpy( streamObject, dView, - cupla::DeviceViewWrapper< 3u >( - dSrcBuf, - extentSrc - offsetSrc, - offsetSrc - ), - numBytes - ); - + cupla::DeviceViewWrapper<3u>(dSrcBuf, extentSrc - offsetSrc, offsetSrc), + numBytes); } break; case cuplaMemcpyHostToHost: { - auto& hostStreamObject( - cupla::manager::Stream< - cupla::AccHost, - cupla::AccHostStream - >::get().stream( stream ) - ); - - auto& host( - cupla::manager::Device< - cupla::AccHost - >::get().current() - ); - cupla::HostBufWrapper< 3u > hSrcBuf( - const_cast( - static_cast(p->srcPtr.ptr) - ), + auto& hostStreamObject(cupla::manager::Stream::get().stream(stream)); + + auto& host(cupla::manager::Device::get().current()); + cupla::HostBufWrapper<3u> hSrcBuf( + const_cast(static_cast(p->srcPtr.ptr)), host, extentSrc, - srcPitch - ); - cupla::HostBufWrapper< 3u > hDestBuf( - static_cast( - p->dstPtr.ptr - ), - host, - extentDst, - dstPitch - ); - - cupla::HostViewWrapper< 3u > hView( - hDestBuf, - extentDst - offsetDst, - offsetDst - ); + srcPitch); + cupla::HostBufWrapper<3u> hDestBuf(static_cast(p->dstPtr.ptr), host, extentDst, dstPitch); + + cupla::HostViewWrapper<3u> hView(hDestBuf, extentDst - offsetDst, offsetDst); ::alpaka::memcpy( hostStreamObject, hView, - cupla::HostViewWrapper< 3u >( - hSrcBuf, - extentSrc - offsetSrc, - offsetSrc - ), - numBytes - ); - + cupla::HostViewWrapper<3u>(hSrcBuf, extentSrc - offsetSrc, offsetSrc), + numBytes); } break; + } + return cuplaSuccess; } - return cuplaSuccess; -} - -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaMemcpy3D( - const cuplaMemcpy3DParms * const p -) -{ - cuplaDeviceSynchronize(); - cuplaMemcpy3DAsync( p, 0 ); + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaMemcpy3D(const cuplaMemcpy3DParms* const p) + { + cuplaDeviceSynchronize(); + + cuplaMemcpy3DAsync(p, 0); - auto& streamObject( - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( 0 ) - ); - ::alpaka::wait( streamObject ); + auto& streamObject(cupla::manager::Stream::get().stream(0)); + ::alpaka::wait(streamObject); - return cuplaSuccess; -} + return cuplaSuccess; + } -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/src/stream.cpp b/src/stream.cpp index e936286c..9e59c5d7 100644 --- a/src/stream.cpp +++ b/src/stream.cpp @@ -19,96 +19,62 @@ */ -#include "cupla/namespace.hpp" -#include "cupla_runtime.hpp" -#include "cupla/manager/Memory.hpp" -#include "cupla/manager/Device.hpp" #include "cupla/manager/Stream.hpp" -#include "cupla/manager/Event.hpp" #include "cupla/api/stream.hpp" +#include "cupla/manager/Device.hpp" +#include "cupla/manager/Event.hpp" +#include "cupla/manager/Memory.hpp" +#include "cupla/namespace.hpp" +#include "cupla_runtime.hpp" inline namespace CUPLA_ACCELERATOR_NAMESPACE { + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaStreamCreate(cuplaStream_t* stream) + { + *stream = cupla::manager::Stream::get().create(); -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaStreamCreate( - cuplaStream_t * stream -) -{ - *stream = cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().create(); + return cuplaSuccess; + } - return cuplaSuccess; -} + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaStreamDestroy(cuplaStream_t stream) + { + if(cupla::manager::Stream::get().destroy(stream)) + return cuplaSuccess; + else + return cuplaErrorInitializationError; + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaStreamDestroy( cuplaStream_t stream ) -{ - if( - cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().destroy( stream ) - ) + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaStreamSynchronize(cuplaStream_t stream) + { + auto& streamObject = cupla::manager::Stream::get().stream(stream); + ::alpaka::wait(streamObject); return cuplaSuccess; - else - return cuplaErrorInitializationError; -} + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaStreamSynchronize( - cuplaStream_t stream -) -{ - auto& streamObject = cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( stream ); - ::alpaka::wait( streamObject ); - return cuplaSuccess; -} + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaStreamWaitEvent(cuplaStream_t stream, cuplaEvent_t event, unsigned int) + { + auto& streamObject = cupla::manager::Stream::get().stream(stream); -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaStreamWaitEvent( - cuplaStream_t stream, - cuplaEvent_t event, - unsigned int -) -{ - auto& streamObject = cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( stream ); + auto& eventObject = *cupla::manager::Event::get().event(event); - auto& eventObject = *cupla::manager::Event< - cupla::AccDev, - cupla::AccStream - >::get().event( event ); - - ::alpaka::wait(streamObject,eventObject); - return cuplaSuccess; -} + ::alpaka::wait(streamObject, eventObject); + return cuplaSuccess; + } -CUPLA_HEADER_ONLY_FUNC_SPEC -cuplaError_t -cuplaStreamQuery( cuplaStream_t stream ) -{ - auto& streamObject = cupla::manager::Stream< - cupla::AccDev, - cupla::AccStream - >::get().stream( stream ); + CUPLA_HEADER_ONLY_FUNC_SPEC + cuplaError_t cuplaStreamQuery(cuplaStream_t stream) + { + auto& streamObject = cupla::manager::Stream::get().stream(stream); - if( alpaka::empty( streamObject ) ) - return cuplaSuccess; - else - return cuplaErrorNotReady; -} + if(alpaka::empty(streamObject)) + return cuplaSuccess; + else + return cuplaErrorNotReady; + } -} //namespace CUPLA_ACCELERATOR_NAMESPACE +} // namespace CUPLA_ACCELERATOR_NAMESPACE diff --git a/test/system/config/kernel.cpp b/test/system/config/kernel.cpp index 2768e0aa..e2bd8ca3 100644 --- a/test/system/config/kernel.cpp +++ b/test/system/config/kernel.cpp @@ -19,20 +19,20 @@ */ -#if defined( CUPLA_ACC_CpuOmp2Blocks ) -# include -#elif defined( CUPLA_ACC_CpuOmp2Threads ) -# include -#elif defined( CUPLA_ACC_CpuSerial ) -# include -#elif defined( CUPLA_ACC_CpuTbbBlocks ) -# include -#elif defined( CUPLA_ACC_CpuThreads ) -# include -#elif defined( CUPLA_ACC_GpuCudaRt ) -# include -#elif defined( CUPLA_ACC_GpuHipRt ) -# include +#if defined(CUPLA_ACC_CpuOmp2Blocks) +# include +#elif defined(CUPLA_ACC_CpuOmp2Threads) +# include +#elif defined(CUPLA_ACC_CpuSerial) +# include +#elif defined(CUPLA_ACC_CpuTbbBlocks) +# include +#elif defined(CUPLA_ACC_CpuThreads) +# include +#elif defined(CUPLA_ACC_GpuCudaRt) +# include +#elif defined(CUPLA_ACC_GpuHipRt) +# include #endif #include "cuda_to_cupla.hpp" @@ -40,9 +40,9 @@ struct IncrementKernel { template - ALPAKA_FN_ACC void operator()( T_Acc const & acc, int * ptr) const + ALPAKA_FN_ACC void operator()(T_Acc const& acc, int* ptr) const { - for( int i = 0; i < elemDim.x; ++i ) + for(int i = 0; i < elemDim.x; ++i) atomicAdd(ptr, 1); } }; @@ -51,7 +51,5 @@ struct IncrementKernel void callIncrementKernel(int* pr_d) { // increment 42 times - CUPLA_KERNEL_OPTI( - IncrementKernel - )(7, 6)(pr_d); + CUPLA_KERNEL_OPTI(IncrementKernel)(7, 6)(pr_d); } diff --git a/test/system/config/main.cpp b/test/system/config/main.cpp index e7d09039..91fe30a5 100644 --- a/test/system/config/main.cpp +++ b/test/system/config/main.cpp @@ -19,20 +19,20 @@ */ -#if defined( CUPLA_ACC_CpuOmp2Blocks ) -# include -#elif defined( CUPLA_ACC_CpuOmp2Threads ) -# include -#elif defined( CUPLA_ACC_CpuSerial ) -# include -#elif defined( CUPLA_ACC_CpuTbbBlocks ) -# include -#elif defined( CUPLA_ACC_CpuThreads ) -# include -#elif defined( CUPLA_ACC_GpuCudaRt ) -# include -#elif defined( CUPLA_ACC_GpuHipRt ) -# include +#if defined(CUPLA_ACC_CpuOmp2Blocks) +# include +#elif defined(CUPLA_ACC_CpuOmp2Threads) +# include +#elif defined(CUPLA_ACC_CpuSerial) +# include +#elif defined(CUPLA_ACC_CpuTbbBlocks) +# include +#elif defined(CUPLA_ACC_CpuThreads) +# include +#elif defined(CUPLA_ACC_GpuCudaRt) +# include +#elif defined(CUPLA_ACC_GpuHipRt) +# include #endif #include "cuda_to_cupla.hpp" @@ -42,16 +42,16 @@ extern void callIncrementKernel(int* pr_d); int main() { int res_h = 0; - int *res_ptr_d = nullptr; - cudaMalloc( (void**)&res_ptr_d, sizeof( int ) ); + int* res_ptr_d = nullptr; + cudaMalloc((void**) &res_ptr_d, sizeof(int)); // reset result to zero - cuplaMemset( res_ptr_d, 0, sizeof( int ) ); + cuplaMemset(res_ptr_d, 0, sizeof(int)); // increment 42 times callIncrementKernel(res_ptr_d); - cudaMemcpy(&res_h, res_ptr_d, sizeof( int ), cudaMemcpyDeviceToHost); + cudaMemcpy(&res_h, res_ptr_d, sizeof(int), cudaMemcpyDeviceToHost); return res_h != 42; } From 3ac491dac631be133e639a2a8aee549ef4914d92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Wed, 1 Sep 2021 11:11:01 +0200 Subject: [PATCH 3/3] CI: test code formation Test code formation with clang-format-11. --- .gitlab-ci.yml | 40 ++++++++++++++++++++++++++++++++++ script/check_cpp_code_style.sh | 11 ++++++++++ 2 files changed, 51 insertions(+) create mode 100755 script/check_cpp_code_style.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9d246505..bc61e886 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,7 +8,32 @@ include: - local: '/script/compiler_base.yml' +stages: + - validate + - compile-and-run + +################################################################################ +# Check code formation with clang-format +# pull request validation: +# - check C++ code style +pull-request-validation: + stage: validate + image: ubuntu:focal + script: + - apt update + # install clang-format-11 + - apt install -y -q gnupg2 wget + - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - + - echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-11 main" | tee -a /etc/apt/sources.list + - apt update + - DEBIAN_FRONTEND=noninteractive apt install -y clang-format-11 + # Check C++ code style + - source $CI_PROJECT_DIR/script/check_cpp_code_style.sh + tags: + - x86_64 + cuda92: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda92-gcc:1.4 variables: CUPLA_CXX: "g++-6" @@ -21,48 +46,56 @@ cuda92: extends: .base_cuda cuda100: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda100-gcc:1.4 variables: CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" extends: .base_cuda cuda101: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda101-gcc:1.4 variables: CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" extends: .base_cuda cuda102: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda102-gcc:1.4 variables: CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" extends: .base_cuda gcc1: + stage: compile-and-run variables: CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9" CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0" extends: .base_gcc gcc2: + stage: compile-and-run variables: CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9" CUPLA_BOOST_VERSIONS: "1.68.0 1.69.0 1.70.0" extends: .base_gcc gcc3: + stage: compile-and-run variables: CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9" CUPLA_BOOST_VERSIONS: "1.71.0 1.72.0 1.73.0" extends: .base_gcc clang: + stage: compile-and-run variables: CUPLA_CXX: "clang++-5.0 clang++-6.0 clang++-7 clang++-8 clang++-9 clang++-10 clang++-11" CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0" extends: .base_clang cudaClang92: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda92-clang:1.4 variables: CUPLA_CXX: "clang++-8 clang++-10 clang++-11" @@ -70,6 +103,7 @@ cudaClang92: extends: .base_cuda_clang cudaClang100: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda100-clang:1.4 variables: CUPLA_CXX: "clang++-8 clang++-9 clang++-10 clang++-11" @@ -77,6 +111,7 @@ cudaClang100: extends: .base_cuda_clang cudaClang101: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda101-clang:1.4 variables: CUPLA_CXX: "clang++-9 clang++-10 clang++-11" @@ -84,6 +119,7 @@ cudaClang101: extends: .base_cuda_clang hip42: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-rocm4.2:1.4 variables: CMAKE_MODULE_PATH: "/opt/rocm-4.2.0/hip/cmake" @@ -97,6 +133,7 @@ hip42: # build external project and use cupla via cmake add_subdirectory() # use internal alpaka addSubdirectoryInternal: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc:1.4 variables: GIT_SUBMODULE_STRATEGY: normal @@ -110,6 +147,7 @@ addSubdirectoryInternal: # build external project and use cupla via cmake add_subdirectory() # use installed alpaka addSubdirectoryExternal: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc:1.4 variables: GIT_SUBMODULE_STRATEGY: normal @@ -124,6 +162,7 @@ addSubdirectoryExternal: # build external project and use cupla via cmake find_package() # cupla was installed with disabled examples findPackageWithoutExample: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc:1.4 variables: GIT_SUBMODULE_STRATEGY: normal @@ -139,6 +178,7 @@ findPackageWithoutExample: # build external project and use cupla via cmake find_package() # cupla was installed with enabled examples findPackageWithExample: + stage: compile-and-run image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc:1.4 variables: GIT_SUBMODULE_STRATEGY: normal diff --git a/script/check_cpp_code_style.sh b/script/check_cpp_code_style.sh new file mode 100755 index 00000000..37c32c29 --- /dev/null +++ b/script/check_cpp_code_style.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -e +set -o pipefail + +cd $CI_PROJECT_DIR + +# check code style with clang format +find src example include test -iname "*.def" \ + -o -iname "*.h" -o -iname "*.cpp" -o -iname "*.hpp" \ + | xargs clang-format-11 --dry-run --Werror