From ff2fb5cb1132dd69a3d57d07412193c0f57b50d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?= <r.widera@hzdr.de>
Date: Wed, 1 Sep 2021 10:54:22 +0200
Subject: [PATCH 1/3] add clang-format file

Add alpaka's clang-format file and extend `IncludeCategories:` with a section
for cupla includes.
---
 .clang-format | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000..37aa603c
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,116 @@
+---
+# General options
+Language: Cpp
+Standard: c++17
+DisableFormat: false
+
+AccessModifierOffset: -4
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: false
+AlignEscapedNewlines: Right
+AlignOperands: false
+AlignTrailingComments: false
+AllowAllArgumentsOnNextLine: false
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBinaryOperators: All
+BreakBeforeBraces: Allman
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeComma
+BreakStringLiterals: true
+ColumnLimit: 119
+CommentPragmas:  '^ COMMENT pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks: Regroup
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentCaseLabels: false
+IndentGotoLabels: true
+IndentPPDirectives: AfterHash
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: All
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 1000
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: Never
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+TabWidth: 4
+UseCRLF: false
+UseTab: Never
+
+# Project specific options
+IncludeCategories:
+  # First matching regex wins, so "cupla/foo.hpp" must precede the generic "" rule
+  - Regex: '"cupla/([A-Za-z0-9.\/_-])+"'
+    Priority: 2
+  # Other local headers (in "") are placed above all else
+  - Regex: '"([A-Za-z0-9.\/_-])+"'
+    Priority: 1
+  # <alpaka/foo.hpp> after the cupla headers
+  - Regex: '<alpaka/([A-Za-z0-9.\/_-])+>'
+    Priority: 3
+  # C++ standard library headers are the last group to be included
+  - Regex: '<([A-Za-z0-9\/_-])+>'
+    Priority: 4
+
+# Future options - not supported in clang-format 11
+# AlignConsecutiveBitFields: false
+# AllowShortEnumsOnASingleLine: false
+# BitFieldColonSpacing: Both
+# IndentCaseBlocks: true
+# IndentExternBlock: AfterExternBlock
+# OperandAlignmentStyle: Align
+...

From d3a9ba7733937dd6d4c8e9f75eb79aea066a9678 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?= <r.widera@hzdr.de>
Date: Wed, 1 Sep 2021 11:00:19 +0200
Subject: [PATCH 2/3] apply clang-format

Format code.

```
find src example include test  -iname "*.def" \
  -o -iname "*.h" -o -iname "*.cpp" -o -iname "*.hpp" \
  | xargs clang-format-11 -i
```
---
 example/CUDASamples/asyncAPI/src/asyncAPI.cpp |   54 +-
 .../asyncAPI_tuned/src/asyncAPI.cpp           |   56 +-
 .../blackScholes/src/BlackScholes.cpp         |  180 ++-
 .../blackScholes/src/BlackScholes_gold.cpp    |   66 +-
 example/CUDASamples/common/exception.h        |  115 +-
 example/CUDASamples/common/helper_cuda.h      | 1300 +++++++++--------
 example/CUDASamples/common/helper_functions.h |   24 +-
 example/CUDASamples/common/helper_image.h     |  504 ++++---
 example/CUDASamples/common/helper_string.h    |  474 +++---
 example/CUDASamples/common/helper_timer.h     |  343 ++---
 .../cuplaVectorAdd/src/vectorAdd.cpp          |  176 +--
 .../CUDASamples/matrixMul/src/matrixMul.cpp   |  276 ++--
 .../CUDASamples/vectorAdd/src/vectorAdd.cpp   |  173 ++-
 include/cuda_to_cupla.hpp                     |    4 +-
 include/cupla.hpp                             |    2 +-
 include/cupla/api/common.hpp                  |   47 +-
 include/cupla/api/device.hpp                  |   28 +-
 include/cupla/api/event.hpp                   |   42 +-
 include/cupla/api/memory.hpp                  |  190 +--
 include/cupla/api/stream.hpp                  |   30 +-
 include/cupla/c/datatypes/cuplaArray.hpp      |   15 +-
 include/cupla/c/datatypes/cuplaExtent.hpp     |  284 ++--
 .../cupla/c/datatypes/cuplaMemcpy3DParms.hpp  |   33 +-
 include/cupla/c/datatypes/cuplaPitchedPtr.hpp |   48 +-
 include/cupla/c/datatypes/cuplaPos.hpp        |  281 ++--
 include/cupla/config/AnyOacc.hpp              |   20 +-
 include/cupla/config/AnyOmp5.hpp              |   20 +-
 include/cupla/config/CpuOmp2Blocks.hpp        |   20 +-
 include/cupla/config/CpuOmp2Threads.hpp       |   20 +-
 include/cupla/config/CpuSerial.hpp            |   20 +-
 include/cupla/config/CpuTbbBlocks.hpp         |   20 +-
 include/cupla/config/CpuThreads.hpp           |   20 +-
 include/cupla/config/GpuCudaRt.hpp            |   20 +-
 include/cupla/config/GpuHipRt.hpp             |   20 +-
 include/cupla/cudaToCupla/driverTypes.hpp     |   32 +-
 include/cupla/cudaToCupla/runtime.hpp         |    6 +-
 include/cupla/datatypes/Array.hpp             |   57 +-
 include/cupla/datatypes/dim3.hpp              |   67 +-
 include/cupla/datatypes/uint.hpp              |  293 ++--
 include/cupla/defines.hpp                     |  107 +-
 include/cupla/device/Atomic.hpp               |  229 ++-
 include/cupla/device/Hierarchy.hpp            |   23 +-
 include/cupla/device/Index.hpp                |  141 +-
 include/cupla/device/SharedMemory.hpp         |    8 +-
 include/cupla/device/Synchronization.hpp      |   55 +-
 include/cupla/device/math/Abs.hpp             |   23 +-
 include/cupla/device/math/Common.hpp          |  209 +--
 include/cupla/device/math/Comparison.hpp      |   27 +-
 include/cupla/device/math/Erf.hpp             |   23 +-
 include/cupla/device/math/Exp.hpp             |   23 +-
 include/cupla/device/math/Log.hpp             |   23 +-
 include/cupla/device/math/Mod.hpp             |   27 +-
 include/cupla/device/math/Pow.hpp             |   23 +-
 include/cupla/device/math/Root.hpp            |   37 +-
 include/cupla/device/math/Round.hpp           |   61 +-
 include/cupla/device/math/Trigo.hpp           |   47 +-
 include/cupla/device_functions.hpp            |    4 +-
 include/cupla/kernel.hpp                      |  398 +++--
 include/cupla/manager/Device.hpp              |  222 ++-
 include/cupla/manager/Driver.hpp              |   48 +-
 include/cupla/manager/Event.hpp               |  355 ++---
 include/cupla/manager/Memory.hpp              |  180 +--
 include/cupla/manager/Stream.hpp              |  217 ++-
 include/cupla/namespace.hpp                   |  100 +-
 include/cupla/traits/IsThreadSeqAcc.hpp       |   85 +-
 include/cupla/types.hpp                       |  288 ++--
 include/cupla_driver_types.hpp                |   60 +-
 include/cupla_runtime.hpp                     |   37 +-
 src/common.cpp                                |   76 +-
 src/device.cpp                                |  163 +--
 src/event.cpp                                 |  172 +--
 src/manager/Driver.cpp                        |   59 +-
 src/memory.cpp                                | 1171 +++++----------
 src/stream.cpp                                |  120 +-
 test/system/config/kernel.cpp                 |   36 +-
 test/system/config/main.cpp                   |   36 +-
 76 files changed, 4413 insertions(+), 5880 deletions(-)

diff --git a/example/CUDASamples/asyncAPI/src/asyncAPI.cpp b/example/CUDASamples/asyncAPI/src/asyncAPI.cpp
index 595d4b1a..f61aa532 100644
--- a/example/CUDASamples/asyncAPI/src/asyncAPI.cpp
+++ b/example/CUDASamples/asyncAPI/src/asyncAPI.cpp
@@ -27,26 +27,22 @@
 
 // includes, project
 #include <helper_cuda.h>
-#include <helper_functions.h> // helper utility functions 
+#include <helper_functions.h> // helper utility functions
 
 struct increment_kernel
 {
-
-template<
-    typename T_Acc
->
-ALPAKA_FN_ACC 
-void operator()(T_Acc const & acc, int *g_data, int inc_value) const
-{
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    g_data[idx] = g_data[idx] + inc_value;
-}
+    template<typename T_Acc>
+    ALPAKA_FN_ACC void operator()(T_Acc const& acc, int* g_data, int inc_value) const
+    {
+        int idx = blockIdx.x * blockDim.x + threadIdx.x;
+        g_data[idx] = g_data[idx] + inc_value;
+    }
 };
 
-int correct_output(int *data, const int n, const int x)
+int correct_output(int* data, const int n, const int x)
 {
-    for (int i = 0; i < n; i++)
-        if (data[i] != x)
+    for(int i = 0; i < n; i++)
+        if(data[i] != x)
         {
             printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
             return 0;
@@ -55,44 +51,44 @@ int correct_output(int *data, const int n, const int x)
     return 1;
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
-//    int devID;
-//    cudaDeviceProp deviceProps;
+    //    int devID;
+    //    cudaDeviceProp deviceProps;
 
     printf("[%s] - Starting...\n", argv[0]);
 
     // This will pick the best possible CUDA capable device
-//    devID = findCudaDevice(argc, (const char **)argv);
+    //    devID = findCudaDevice(argc, (const char **)argv);
 
     // get device name
-//    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-//    printf("CUDA device [%s]\n", deviceProps.name);
+    //    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
+    //    printf("CUDA device [%s]\n", deviceProps.name);
 
     int n = 16 * 1024 * 1024;
     int nbytes = n * sizeof(int);
     int value = 26;
 
     // allocate host memory
-    int *a = 0;
-    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+    int* a = 0;
+    checkCudaErrors(cudaMallocHost((void**) &a, nbytes));
     memset(a, 0, nbytes);
 
     // allocate device memory
-    int *d_a=0;
-    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+    int* d_a = 0;
+    checkCudaErrors(cudaMalloc((void**) &d_a, nbytes));
     checkCudaErrors(cudaMemset(d_a, 255, nbytes));
 
     // set kernel launch configuration
     dim3 threads = dim3(512, 1);
-    dim3 blocks  = dim3(n / threads.x, 1);
+    dim3 blocks = dim3(n / threads.x, 1);
 
     // create cuda event handles
     cudaEvent_t start, stop;
     checkCudaErrors(cudaEventCreate(&start));
     checkCudaErrors(cudaEventCreate(&stop));
 
-    StopWatchInterface *timer = NULL;
+    StopWatchInterface* timer = NULL;
     sdkCreateTimer(&timer);
     sdkResetTimer(&timer);
 
@@ -109,9 +105,9 @@ int main(int argc, char *argv[])
     sdkStopTimer(&timer);
 
     // have CPU do some work while waiting for stage 1 to finish
-    unsigned long int counter=0;
+    unsigned long int counter = 0;
 
-    while (cudaEventQuery(stop) == cudaErrorNotReady)
+    while(cudaEventQuery(stop) == cudaErrorNotReady)
     {
         counter++;
     }
@@ -124,7 +120,7 @@ int main(int argc, char *argv[])
     printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
 
     // check the output for correctness
-    bool bFinalResults = (bool)correct_output(a, n, value);
+    bool bFinalResults = (bool) correct_output(a, n, value);
 
     // release resources
     checkCudaErrors(cudaEventDestroy(start));
diff --git a/example/CUDASamples/asyncAPI_tuned/src/asyncAPI.cpp b/example/CUDASamples/asyncAPI_tuned/src/asyncAPI.cpp
index aa93adfd..19cdbb30 100644
--- a/example/CUDASamples/asyncAPI_tuned/src/asyncAPI.cpp
+++ b/example/CUDASamples/asyncAPI_tuned/src/asyncAPI.cpp
@@ -27,28 +27,24 @@
 
 // includes, project
 #include <helper_cuda.h>
-#include <helper_functions.h> // helper utility functions 
+#include <helper_functions.h> // helper utility functions
 
 struct increment_kernel
 {
+    template<typename T_Acc>
+    ALPAKA_FN_ACC void operator()(T_Acc const& acc, int* g_data, int inc_value) const
+    {
+        int idx = blockIdx.x * (blockDim.x * elemDim.x) + threadIdx.x;
 
-template<
-    typename T_Acc
->
-ALPAKA_FN_ACC 
-void operator()(T_Acc const & acc, int *g_data, int inc_value) const
-{
-    int idx = blockIdx.x * (blockDim.x * elemDim.x) + threadIdx.x;
-    
-    for(int i = 0; i < elemDim.x; ++i)
-        g_data[idx + i] = g_data[idx + i] + inc_value;
-}
+        for(int i = 0; i < elemDim.x; ++i)
+            g_data[idx + i] = g_data[idx + i] + inc_value;
+    }
 };
 
-int correct_output(int *data, const int n, const int x)
+int correct_output(int* data, const int n, const int x)
 {
-    for (int i = 0; i < n; i++)
-        if (data[i] != x)
+    for(int i = 0; i < n; i++)
+        if(data[i] != x)
         {
             printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
             return 0;
@@ -57,44 +53,44 @@ int correct_output(int *data, const int n, const int x)
     return 1;
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
-//    int devID;
-//    cudaDeviceProp deviceProps;
+    //    int devID;
+    //    cudaDeviceProp deviceProps;
 
     printf("[%s] - Starting...\n", argv[0]);
 
     // This will pick the best possible CUDA capable device
-//    devID = findCudaDevice(argc, (const char **)argv);
+    //    devID = findCudaDevice(argc, (const char **)argv);
 
     // get device name
-//    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-//    printf("CUDA device [%s]\n", deviceProps.name);
+    //    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
+    //    printf("CUDA device [%s]\n", deviceProps.name);
 
     int n = 16 * 1024 * 1024;
     int nbytes = n * sizeof(int);
     int value = 26;
 
     // allocate host memory
-    int *a = 0;
-    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+    int* a = 0;
+    checkCudaErrors(cudaMallocHost((void**) &a, nbytes));
     memset(a, 0, nbytes);
 
     // allocate device memory
-    int *d_a=0;
-    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+    int* d_a = 0;
+    checkCudaErrors(cudaMalloc((void**) &d_a, nbytes));
     checkCudaErrors(cudaMemset(d_a, 255, nbytes));
 
     // set kernel launch configuration
     dim3 threads = dim3(512, 1);
-    dim3 blocks  = dim3(n / threads.x, 1);
+    dim3 blocks = dim3(n / threads.x, 1);
 
     // create cuda event handles
     cudaEvent_t start, stop;
     checkCudaErrors(cudaEventCreate(&start));
     checkCudaErrors(cudaEventCreate(&stop));
 
-    StopWatchInterface *timer = NULL;
+    StopWatchInterface* timer = NULL;
     sdkCreateTimer(&timer);
     sdkResetTimer(&timer);
 
@@ -111,9 +107,9 @@ int main(int argc, char *argv[])
     sdkStopTimer(&timer);
 
     // have CPU do some work while waiting for stage 1 to finish
-    unsigned long int counter=0;
+    unsigned long int counter = 0;
 
-    while (cudaEventQuery(stop) == cudaErrorNotReady)
+    while(cudaEventQuery(stop) == cudaErrorNotReady)
     {
         counter++;
     }
@@ -126,7 +122,7 @@ int main(int argc, char *argv[])
     printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
 
     // check the output for correctness
-    bool bFinalResults = (bool)correct_output(a, n, value);
+    bool bFinalResults = (bool) correct_output(a, n, value);
 
     // release resources
     checkCudaErrors(cudaEventDestroy(start));
diff --git a/example/CUDASamples/blackScholes/src/BlackScholes.cpp b/example/CUDASamples/blackScholes/src/BlackScholes.cpp
index d09a3a59..be128391 100644
--- a/example/CUDASamples/blackScholes/src/BlackScholes.cpp
+++ b/example/CUDASamples/blackScholes/src/BlackScholes.cpp
@@ -16,23 +16,21 @@
  */
 
 #include <cuda_to_cupla.hpp>
-
-#include <helper_functions.h>   // helper functions for string parsing
-#include <helper_cuda.h>        // helper functions CUDA error checking and initialization
+#include <helper_cuda.h> // helper functions CUDA error checking and initialization
+#include <helper_functions.h> // helper functions for string parsing
 
 ////////////////////////////////////////////////////////////////////////////////
 // Process an array of optN options on CPU
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" void BlackScholesCPU(
-    float *h_CallResult,
-    float *h_PutResult,
-    float *h_StockPrice,
-    float *h_OptionStrike,
-    float *h_OptionYears,
+    float* h_CallResult,
+    float* h_PutResult,
+    float* h_StockPrice,
+    float* h_OptionStrike,
+    float* h_OptionYears,
     float Riskfree,
     float Volatility,
-    int optN
-);
+    int optN);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Process an array of OptN options on GPU
@@ -45,7 +43,7 @@ extern "C" void BlackScholesCPU(
 ////////////////////////////////////////////////////////////////////////////////
 float RandFloat(float low, float high)
 {
-    float t = (float)rand() / (float)RAND_MAX;
+    float t = (float) rand() / (float) RAND_MAX;
     return (1.0f - t) * low + t * high;
 }
 
@@ -53,91 +51,85 @@ float RandFloat(float low, float high)
 // Data configuration
 ////////////////////////////////////////////////////////////////////////////////
 const int OPT_N = 4000000;
-const int  NUM_ITERATIONS = 500;
+const int NUM_ITERATIONS = 500;
 
 
 size_t OPT_SZ = OPT_N * sizeof(float);
-const float      RISKFREE = 0.02f;
-const float    VOLATILITY = 0.30f;
+const float RISKFREE = 0.02f;
+const float VOLATILITY = 0.30f;
 
-#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define DIV_UP(a, b) (((a) + (b) -1) / (b))
 
 ////////////////////////////////////////////////////////////////////////////////
 // Main program
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv)
+int main(int argc, char** argv)
 {
     // Start logs
     printf("[%s] - Starting...\n", argv[0]);
 
     //'h_' prefix - CPU (host) memory space
     float
-    //Results calculated by host for reference
-    *h_CallResultCPU,
-    *h_PutResultCPU,
-    //host copy of device results
-    *h_CallResultGPU,
-    *h_PutResultGPU,
-    //host instance of input data
-    *h_StockPrice,
-    *h_OptionStrike,
-    *h_OptionYears;
+        // Results calculated by host for reference
+        *h_CallResultCPU,
+        *h_PutResultCPU,
+        // host copy of device results
+        *h_CallResultGPU, *h_PutResultGPU,
+        // host instance of input data
+        *h_StockPrice, *h_OptionStrike, *h_OptionYears;
 
     //'d_' prefix - device memory space
     float
-    //Results calculated by device
-    *d_CallResult,
-    *d_PutResult,
-    //device instance of input data
-    *d_StockPrice,
-    *d_OptionStrike,
-    *d_OptionYears;
-
-    double
-    delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;
-
-    StopWatchInterface *hTimer = NULL;
+        // Results calculated by device
+        *d_CallResult,
+        *d_PutResult,
+        // device instance of input data
+        *d_StockPrice, *d_OptionStrike, *d_OptionYears;
+
+    double delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime;
+
+    StopWatchInterface* hTimer = NULL;
     int i;
 
-    //findCudaDevice(argc, (const char **)argv);
+    // findCudaDevice(argc, (const char **)argv);
 
     sdkCreateTimer(&hTimer);
 
     printf("Initializing data...\n");
     printf("...allocating CPU memory for options.\n");
-    h_CallResultCPU = (float *)malloc(OPT_SZ);
-    h_PutResultCPU  = (float *)malloc(OPT_SZ);
-    h_CallResultGPU = (float *)malloc(OPT_SZ);
-    h_PutResultGPU  = (float *)malloc(OPT_SZ);
-    h_StockPrice    = (float *)malloc(OPT_SZ);
-    h_OptionStrike  = (float *)malloc(OPT_SZ);
-    h_OptionYears   = (float *)malloc(OPT_SZ);
+    h_CallResultCPU = (float*) malloc(OPT_SZ);
+    h_PutResultCPU = (float*) malloc(OPT_SZ);
+    h_CallResultGPU = (float*) malloc(OPT_SZ);
+    h_PutResultGPU = (float*) malloc(OPT_SZ);
+    h_StockPrice = (float*) malloc(OPT_SZ);
+    h_OptionStrike = (float*) malloc(OPT_SZ);
+    h_OptionYears = (float*) malloc(OPT_SZ);
 
     printf("...allocating GPU memory for options.\n");
-    checkCudaErrors(cudaMalloc((void **)&d_CallResult,   OPT_SZ));
-    checkCudaErrors(cudaMalloc((void **)&d_PutResult,    OPT_SZ));
-    checkCudaErrors(cudaMalloc((void **)&d_StockPrice,   OPT_SZ));
-    checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ));
-    checkCudaErrors(cudaMalloc((void **)&d_OptionYears,  OPT_SZ));
+    checkCudaErrors(cudaMalloc((void**) &d_CallResult, OPT_SZ));
+    checkCudaErrors(cudaMalloc((void**) &d_PutResult, OPT_SZ));
+    checkCudaErrors(cudaMalloc((void**) &d_StockPrice, OPT_SZ));
+    checkCudaErrors(cudaMalloc((void**) &d_OptionStrike, OPT_SZ));
+    checkCudaErrors(cudaMalloc((void**) &d_OptionYears, OPT_SZ));
 
     printf("...generating input data in CPU mem.\n");
     srand(5347);
 
-    //Generate options set
-    for (i = 0; i < OPT_N; i++)
+    // Generate options set
+    for(i = 0; i < OPT_N; i++)
     {
         h_CallResultCPU[i] = 0.0f;
-        h_PutResultCPU[i]  = -1.0f;
-        h_StockPrice[i]    = RandFloat(5.0f, 30.0f);
-        h_OptionStrike[i]  = RandFloat(1.0f, 100.0f);
-        h_OptionYears[i]   = RandFloat(0.25f, 10.0f);
+        h_PutResultCPU[i] = -1.0f;
+        h_StockPrice[i] = RandFloat(5.0f, 30.0f);
+        h_OptionStrike[i] = RandFloat(1.0f, 100.0f);
+        h_OptionYears[i] = RandFloat(0.25f, 10.0f);
     }
 
     printf("...copying input data to device mem.\n");
-    //Copy options data to device memory for further processing
-    checkCudaErrors(cudaMemcpy(d_StockPrice,  h_StockPrice,   OPT_SZ, cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike,  OPT_SZ, cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(d_OptionYears,  h_OptionYears,   OPT_SZ, cudaMemcpyHostToDevice));
+    // Copy options data to device memory for further processing
+    checkCudaErrors(cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice));
     printf("Data init done.\n\n");
 
 
@@ -146,43 +138,49 @@ int main(int argc, char **argv)
     sdkResetTimer(&hTimer);
     sdkStartTimer(&hTimer);
 
-    for (i = 0; i < NUM_ITERATIONS; i++)
+    for(i = 0; i < NUM_ITERATIONS; i++)
     {
-        CUPLA_KERNEL_OPTI(BlackScholesGPU)(DIV_UP((OPT_N/2), 128), 128/*480, 128*/,0,0)(
-            (float2 *)d_CallResult,
-            (float2 *)d_PutResult,
-            (float2 *)d_StockPrice,
-            (float2 *)d_OptionStrike,
-            (float2 *)d_OptionYears,
+        CUPLA_KERNEL_OPTI(BlackScholesGPU)
+        (DIV_UP((OPT_N / 2), 128), 128 /*480, 128*/, 0, 0)(
+            (float2*) d_CallResult,
+            (float2*) d_PutResult,
+            (float2*) d_StockPrice,
+            (float2*) d_OptionStrike,
+            (float2*) d_OptionYears,
             RISKFREE,
             VOLATILITY,
-            OPT_N
-        );
-        //getLastCudaError("BlackScholesGPU() execution failed\n");
+            OPT_N);
+        // getLastCudaError("BlackScholesGPU() execution failed\n");
     }
 
     checkCudaErrors(cudaDeviceSynchronize());
     sdkStopTimer(&hTimer);
     gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
 
-    //Both call and put is calculated
+    // Both call and put is calculated
     printf("Options count             : %i     \n", 2 * OPT_N);
     printf("BlackScholes device time  : %f msec\n", gpuTime);
-    printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
-    printf("Gigaoptions per second    : %f     \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));
-
-    printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n",
-           (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128);
+    printf("Effective memory bandwidth: %f GB/s\n", ((double) (5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3));
+    printf("Gigaoptions per second    : %f     \n\n", ((double) (2 * OPT_N) * 1E-9) / (gpuTime * 1E-3));
+
+    printf(
+        "BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = "
+        "%u\n",
+        (((double) (2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)),
+        gpuTime * 1e-3,
+        (2 * OPT_N),
+        1,
+        128);
 
     printf("\nReading back device results...\n");
-    //Read back device results to compare them to host results
+    // Read back device results to compare them to host results
     checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost));
-    checkCudaErrors(cudaMemcpy(h_PutResultGPU,  d_PutResult,  OPT_SZ, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost));
 
 
     printf("Checking the results...\n");
     printf("...running host calculations.\n\n");
-    //Calculate options values on host
+    // Calculate options values on host
     BlackScholesCPU(
         h_CallResultCPU,
         h_PutResultCPU,
@@ -191,28 +189,27 @@ int main(int argc, char **argv)
         h_OptionYears,
         RISKFREE,
         VOLATILITY,
-        OPT_N
-    );
+        OPT_N);
 
     printf("Comparing the results...\n");
-    //Calculate max absolute difference and L1 distance
-    //between CPU and GPU results
+    // Calculate max absolute difference and L1 distance
+    // between CPU and GPU results
     sum_delta = 0;
-    sum_ref   = 0;
+    sum_ref = 0;
     max_delta = 0;
 
-    for (i = 0; i < OPT_N; i++)
+    for(i = 0; i < OPT_N; i++)
     {
-        ref   = h_CallResultCPU[i];
+        ref = h_CallResultCPU[i];
         delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]);
 
-        if (delta > max_delta)
+        if(delta > max_delta)
         {
             max_delta = delta;
         }
 
         sum_delta += delta;
-        sum_ref   += fabs(ref);
+        sum_ref += fabs(ref);
     }
 
     L1norm = sum_delta / sum_ref;
@@ -247,13 +244,14 @@ int main(int argc, char **argv)
     // flushed before the application exits
     cudaDeviceReset();
 
-    if (L1norm > 1e-6)
+    if(L1norm > 1e-6)
     {
         printf("Test failed!\n");
         exit(EXIT_FAILURE);
     }
 
-    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n");
+    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is "
+           "enabled.\n\n");
     printf("Test passed\n");
     exit(EXIT_SUCCESS);
 }
diff --git a/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp b/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp
index a6be31a8..737185f3 100644
--- a/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp
+++ b/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp
@@ -10,9 +10,8 @@
  */
 
 
-
-#include <math.h>
 #include <cupla.hpp>
+#include <math.h>
 
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -20,21 +19,18 @@
 ///////////////////////////////////////////////////////////////////////////////
 static double CND(double d)
 {
-    const double       A1 = 0.31938153;
-    const double       A2 = -0.356563782;
-    const double       A3 = 1.781477937;
-    const double       A4 = -1.821255978;
-    const double       A5 = 1.330274429;
+    const double A1 = 0.31938153;
+    const double A2 = -0.356563782;
+    const double A3 = 1.781477937;
+    const double A4 = -1.821255978;
+    const double A5 = 1.330274429;
     const double RSQRT2PI = 0.39894228040143267793994605993438;
 
-    double
-    K = 1.0 / (1.0 + 0.2316419 * cupla::abs(d));
+    double K = 1.0 / (1.0 + 0.2316419 * cupla::abs(d));
 
-    double
-    cnd = RSQRT2PI * cupla::exp(- 0.5 * d * d) *
-          (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
+    double cnd = RSQRT2PI * cupla::exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
 
-    if (d > 0)
+    if(d > 0)
         cnd = 1.0 - cnd;
 
     return cnd;
@@ -45,27 +41,27 @@ static double CND(double d)
 // Black-Scholes formula for both call and put
 ///////////////////////////////////////////////////////////////////////////////
 static void BlackScholesBodyCPU(
-    float &callResult,
-    float &putResult,
-    float Sf, //Stock price
-    float Xf, //Option strike
-    float Tf, //Option years
-    float Rf, //Riskless rate
-    float Vf  //Volatility rate
+    float& callResult,
+    float& putResult,
+    float Sf, // Stock price
+    float Xf, // Option strike
+    float Tf, // Option years
+    float Rf, // Riskless rate
+    float Vf // Volatility rate
 )
 {
     double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf;
 
     double sqrtT = cupla::sqrt(T);
-    double    d1 = (cupla::log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
-    double    d2 = d1 - V * sqrtT;
+    double d1 = (cupla::log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
+    double d2 = d1 - V * sqrtT;
     double CNDD1 = CND(d1);
     double CNDD2 = CND(d2);
 
-    //Calculate Call and Put simultaneously
-    double expRT = exp(- R * T);
-    callResult   = (float)(S * CNDD1 - X * expRT * CNDD2);
-    putResult    = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1));
+    // Calculate Call and Put simultaneously
+    double expRT = exp(-R * T);
+    callResult = (float) (S * CNDD1 - X * expRT * CNDD2);
+    putResult = (float) (X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1));
 }
 
 
@@ -73,17 +69,16 @@ static void BlackScholesBodyCPU(
 // Process an array of optN options
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" void BlackScholesCPU(
-    float *h_CallResult,
-    float *h_PutResult,
-    float *h_StockPrice,
-    float *h_OptionStrike,
-    float *h_OptionYears,
+    float* h_CallResult,
+    float* h_PutResult,
+    float* h_StockPrice,
+    float* h_OptionStrike,
+    float* h_OptionYears,
     float Riskfree,
     float Volatility,
-    int optN
-)
+    int optN)
 {
-    for (int opt = 0; opt < optN; opt++)
+    for(int opt = 0; opt < optN; opt++)
         BlackScholesBodyCPU(
             h_CallResult[opt],
             h_PutResult[opt],
@@ -91,6 +86,5 @@ extern "C" void BlackScholesCPU(
             h_OptionStrike[opt],
             h_OptionYears[opt],
             Riskfree,
-            Volatility
-        );
+            Volatility);
 }
diff --git a/example/CUDASamples/common/exception.h b/example/CUDASamples/common/exception.h
index adda4bce..a61fa0af 100644
--- a/example/CUDASamples/common/exception.h
+++ b/example/CUDASamples/common/exception.h
@@ -1,13 +1,13 @@
 /*
-* Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
-*
-* Please refer to the NVIDIA end user license agreement (EULA) associated
-* with this source code for terms and conditions that govern your use of
-* this software. Any use, reproduction, disclosure, or distribution of
-* this software and related documentation outside the terms of the EULA
-* is strictly prohibited.
-*
-*/
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
 
 /* CUda UTility Library */
 #ifndef _EXCEPTION_H_
@@ -15,8 +15,9 @@
 
 // includes, system
 #include <exception>
-#include <stdexcept>
 #include <iostream>
+#include <stdexcept>
+
 #include <stdlib.h>
 
 //! Exception wrapper.
@@ -24,38 +25,31 @@
 template<class Std_Exception>
 class Exception : public Std_Exception
 {
-    public:
-
-        //! @brief Static construction interface
-        //! @return Alwayss throws ( Located_Exception<Exception>)
-        //! @param file file in which the Exception occurs
-        //! @param line line in which the Exception occurs
-        //! @param detailed details on the code fragment causing the Exception
-        static void throw_it(const char *file,
-                             const int line,
-                             const char *detailed = "-");
-
-        //! Static construction interface
-        //! @return Alwayss throws ( Located_Exception<Exception>)
-        //! @param file file in which the Exception occurs
-        //! @param line line in which the Exception occurs
-        //! @param detailed details on the code fragment causing the Exception
-        static void throw_it(const char *file,
-                             const int line,
-                             const std::string &detailed);
-
-        //! Destructor
-        virtual ~Exception() throw();
-
-    private:
-
-        //! Constructor, default (private)
-        Exception();
-
-        //! Constructor, standard
-        //! @param str string returned by what()
-        Exception(const std::string &str);
-
+public:
+    //! @brief Static construction interface
+    //! @return Always throws ( Located_Exception<Exception>)
+    //! @param file file in which the Exception occurs
+    //! @param line line in which the Exception occurs
+    //! @param detailed details on the code fragment causing the Exception
+    static void throw_it(const char* file, const int line, const char* detailed = "-");
+
+    //! Static construction interface
+    //! @return Always throws ( Located_Exception<Exception>)
+    //! @param file file in which the Exception occurs
+    //! @param line line in which the Exception occurs
+    //! @param detailed details on the code fragment causing the Exception
+    static void throw_it(const char* file, const int line, const std::string& detailed);
+
+    //! Destructor
+    virtual ~Exception() throw();
+
+private:
+    //! Constructor, default (private)
+    Exception();
+
+    //! Constructor, standard
+    //! @param str string returned by what()
+    Exception(const std::string& str);
 };
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -63,8 +57,7 @@ class Exception : public Std_Exception
 //! @param ex exception to handle
 ////////////////////////////////////////////////////////////////////////////////
 template<class Exception_Typ>
-inline void
-handleException(const Exception_Typ &ex)
+inline void handleException(const Exception_Typ& ex)
 {
     std::cerr << ex.what() << std::endl;
 
@@ -74,16 +67,13 @@ handleException(const Exception_Typ &ex)
 //! Convenience macros
 
 //! Exception caused by dynamic program behavior, e.g. file does not exist
-#define RUNTIME_EXCEPTION( msg) \
-    Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
+#define RUNTIME_EXCEPTION(msg) Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
 
 //! Logic exception in program, e.g. an assert failed
-#define LOGIC_EXCEPTION( msg) \
-    Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
+#define LOGIC_EXCEPTION(msg) Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
 
 //! Out of range exception
-#define RANGE_EXCEPTION( msg) \
-    Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
+#define RANGE_EXCEPTION(msg) Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
 
 ////////////////////////////////////////////////////////////////////////////////
 //! Implementation
@@ -96,9 +86,7 @@ handleException(const Exception_Typ &ex)
 //! @param  Exception causing code fragment (file and line) and detailed infos.
 ////////////////////////////////////////////////////////////////////////////////
 /*static*/ template<class Std_Exception>
-void
-Exception<Std_Exception>::
-throw_it(const char *file, const int line, const char *detailed)
+void Exception<Std_Exception>::throw_it(const char* file, const int line, const char* detailed)
 {
     std::stringstream s;
 
@@ -115,9 +103,7 @@ throw_it(const char *file, const int line, const char *detailed)
 //! @param  Exception causing code fragment (file and line) and detailed infos.
 ////////////////////////////////////////////////////////////////////////////////
 /*static*/ template<class Std_Exception>
-void
-Exception<Std_Exception>::
-throw_it(const char *file, const int line, const std::string &msg)
+void Exception<Std_Exception>::throw_it(const char* file, const int line, const std::string& msg)
 {
     throw_it(file, line, msg.c_str());
 }
@@ -126,26 +112,27 @@ throw_it(const char *file, const int line, const std::string &msg)
 //! Constructor, default (private).
 ////////////////////////////////////////////////////////////////////////////////
 template<class Std_Exception>
-Exception<Std_Exception>::Exception() :
-    Std_Exception("Unknown Exception.\n")
-{ }
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n")
+{
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 //! Constructor, standard (private).
 //! String returned by what().
 ////////////////////////////////////////////////////////////////////////////////
 template<class Std_Exception>
-Exception<Std_Exception>::Exception(const std::string &s) :
-    Std_Exception(s)
-{ }
+Exception<Std_Exception>::Exception(const std::string& s) : Std_Exception(s)
+{
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 //! Destructor
 ////////////////////////////////////////////////////////////////////////////////
 template<class Std_Exception>
-Exception<Std_Exception>::~Exception() throw() { }
+Exception<Std_Exception>::~Exception() throw()
+{
+}
 
 // functions, exported
 
 #endif // #ifndef _EXCEPTION_H_
-
diff --git a/example/CUDASamples/common/helper_cuda.h b/example/CUDASamples/common/helper_cuda.h
index 59fc77a4..ad4a3d17 100644
--- a/example/CUDASamples/common/helper_cuda.h
+++ b/example/CUDASamples/common/helper_cuda.h
@@ -17,14 +17,13 @@
 
 #pragma once
 
-#include <stdlib.h>
+#include <helper_string.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 
-#include <helper_string.h>
-
 #ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
+#    define EXIT_WAIVED 2
 #endif
 
 // Note, it is required that your SDK sample to include the proper header files, please
@@ -33,258 +32,258 @@
 
 // CUDA Runtime error messages
 #ifdef __DRIVER_TYPES_H__
-static const char *_cudaGetErrorEnum(cudaError_t error)
+static const char* _cudaGetErrorEnum(cudaError_t error)
 {
-    switch (error)
+    switch(error)
     {
-        case cudaSuccess:
-            return "cudaSuccess";
-/*
-        case cudaErrorMissingConfiguration:
-            return "cudaErrorMissingConfiguration";
-*/
-        case cudaErrorMemoryAllocation:
-            return "cudaErrorMemoryAllocation";
+    case cudaSuccess:
+        return "cudaSuccess";
+        /*
+                case cudaErrorMissingConfiguration:
+                    return "cudaErrorMissingConfiguration";
+        */
+    case cudaErrorMemoryAllocation:
+        return "cudaErrorMemoryAllocation";
 
-        case cudaErrorInitializationError:
-            return "cudaErrorInitializationError";
-/*
-        case cudaErrorLaunchFailure:
-            return "cudaErrorLaunchFailure";
+    case cudaErrorInitializationError:
+        return "cudaErrorInitializationError";
+        /*
+                case cudaErrorLaunchFailure:
+                    return "cudaErrorLaunchFailure";
 
-        case cudaErrorPriorLaunchFailure:
-            return "cudaErrorPriorLaunchFailure";
+                case cudaErrorPriorLaunchFailure:
+                    return "cudaErrorPriorLaunchFailure";
 
-        case cudaErrorLaunchTimeout:
-            return "cudaErrorLaunchTimeout";
+                case cudaErrorLaunchTimeout:
+                    return "cudaErrorLaunchTimeout";
 
-        case cudaErrorLaunchOutOfResources:
-            return "cudaErrorLaunchOutOfResources";
+                case cudaErrorLaunchOutOfResources:
+                    return "cudaErrorLaunchOutOfResources";
 
-        case cudaErrorInvalidDeviceFunction:
-            return "cudaErrorInvalidDeviceFunction";
+                case cudaErrorInvalidDeviceFunction:
+                    return "cudaErrorInvalidDeviceFunction";
 
-        case cudaErrorInvalidConfiguration:
-            return "cudaErrorInvalidConfiguration";
+                case cudaErrorInvalidConfiguration:
+                    return "cudaErrorInvalidConfiguration";
 
-        case cudaErrorInvalidDevice:
-            return "cudaErrorInvalidDevice";
+                case cudaErrorInvalidDevice:
+                    return "cudaErrorInvalidDevice";
 
-        case cudaErrorInvalidValue:
-            return "cudaErrorInvalidValue";
+                case cudaErrorInvalidValue:
+                    return "cudaErrorInvalidValue";
 
-        case cudaErrorInvalidPitchValue:
-            return "cudaErrorInvalidPitchValue";
+                case cudaErrorInvalidPitchValue:
+                    return "cudaErrorInvalidPitchValue";
 
-        case cudaErrorInvalidSymbol:
-            return "cudaErrorInvalidSymbol";
+                case cudaErrorInvalidSymbol:
+                    return "cudaErrorInvalidSymbol";
 
-        case cudaErrorMapBufferObjectFailed:
-            return "cudaErrorMapBufferObjectFailed";
+                case cudaErrorMapBufferObjectFailed:
+                    return "cudaErrorMapBufferObjectFailed";
 
-        case cudaErrorUnmapBufferObjectFailed:
-            return "cudaErrorUnmapBufferObjectFailed";
+                case cudaErrorUnmapBufferObjectFailed:
+                    return "cudaErrorUnmapBufferObjectFailed";
 
-        case cudaErrorInvalidHostPointer:
-            return "cudaErrorInvalidHostPointer";
+                case cudaErrorInvalidHostPointer:
+                    return "cudaErrorInvalidHostPointer";
 
-        case cudaErrorInvalidDevicePointer:
-            return "cudaErrorInvalidDevicePointer";
+                case cudaErrorInvalidDevicePointer:
+                    return "cudaErrorInvalidDevicePointer";
 
-        case cudaErrorInvalidTexture:
-            return "cudaErrorInvalidTexture";
+                case cudaErrorInvalidTexture:
+                    return "cudaErrorInvalidTexture";
 
-        case cudaErrorInvalidTextureBinding:
-            return "cudaErrorInvalidTextureBinding";
+                case cudaErrorInvalidTextureBinding:
+                    return "cudaErrorInvalidTextureBinding";
 
-        case cudaErrorInvalidChannelDescriptor:
-            return "cudaErrorInvalidChannelDescriptor";
+                case cudaErrorInvalidChannelDescriptor:
+                    return "cudaErrorInvalidChannelDescriptor";
 
-        case cudaErrorInvalidMemcpyDirection:
-            return "cudaErrorInvalidMemcpyDirection";
+                case cudaErrorInvalidMemcpyDirection:
+                    return "cudaErrorInvalidMemcpyDirection";
 
-        case cudaErrorAddressOfConstant:
-            return "cudaErrorAddressOfConstant";
+                case cudaErrorAddressOfConstant:
+                    return "cudaErrorAddressOfConstant";
 
-        case cudaErrorTextureFetchFailed:
-            return "cudaErrorTextureFetchFailed";
+                case cudaErrorTextureFetchFailed:
+                    return "cudaErrorTextureFetchFailed";
 
-        case cudaErrorTextureNotBound:
-            return "cudaErrorTextureNotBound";
+                case cudaErrorTextureNotBound:
+                    return "cudaErrorTextureNotBound";
 
-        case cudaErrorSynchronizationError:
-            return "cudaErrorSynchronizationError";
+                case cudaErrorSynchronizationError:
+                    return "cudaErrorSynchronizationError";
 
-        case cudaErrorInvalidFilterSetting:
-            return "cudaErrorInvalidFilterSetting";
+                case cudaErrorInvalidFilterSetting:
+                    return "cudaErrorInvalidFilterSetting";
 
-        case cudaErrorInvalidNormSetting:
-            return "cudaErrorInvalidNormSetting";
+                case cudaErrorInvalidNormSetting:
+                    return "cudaErrorInvalidNormSetting";
 
-        case cudaErrorMixedDeviceExecution:
-            return "cudaErrorMixedDeviceExecution";
+                case cudaErrorMixedDeviceExecution:
+                    return "cudaErrorMixedDeviceExecution";
 
-        case cudaErrorCudartUnloading:
-            return "cudaErrorCudartUnloading";
+                case cudaErrorCudartUnloading:
+                    return "cudaErrorCudartUnloading";
 
-        case cudaErrorUnknown:
-            return "cudaErrorUnknown";
+                case cudaErrorUnknown:
+                    return "cudaErrorUnknown";
 
-        case cudaErrorNotYetImplemented:
-            return "cudaErrorNotYetImplemented";
+                case cudaErrorNotYetImplemented:
+                    return "cudaErrorNotYetImplemented";
 
-        case cudaErrorMemoryValueTooLarge:
-            return "cudaErrorMemoryValueTooLarge";
+                case cudaErrorMemoryValueTooLarge:
+                    return "cudaErrorMemoryValueTooLarge";
 
-        case cudaErrorInvalidResourceHandle:
-            return "cudaErrorInvalidResourceHandle";
-*/
-        case cudaErrorNotReady:
-            return "cudaErrorNotReady";
-/*
-        case cudaErrorInsufficientDriver:
-            return "cudaErrorInsufficientDriver";
+                case cudaErrorInvalidResourceHandle:
+                    return "cudaErrorInvalidResourceHandle";
+        */
+    case cudaErrorNotReady:
+        return "cudaErrorNotReady";
+        /*
+                case cudaErrorInsufficientDriver:
+                    return "cudaErrorInsufficientDriver";
 
-        case cudaErrorSetOnActiveProcess:
-            return "cudaErrorSetOnActiveProcess";
+                case cudaErrorSetOnActiveProcess:
+                    return "cudaErrorSetOnActiveProcess";
 
-        case cudaErrorInvalidSurface:
-            return "cudaErrorInvalidSurface";
+                case cudaErrorInvalidSurface:
+                    return "cudaErrorInvalidSurface";
 
-        case cudaErrorNoDevice:
-            return "cudaErrorNoDevice";
+                case cudaErrorNoDevice:
+                    return "cudaErrorNoDevice";
 
-        case cudaErrorECCUncorrectable:
-            return "cudaErrorECCUncorrectable";
+                case cudaErrorECCUncorrectable:
+                    return "cudaErrorECCUncorrectable";
 
-        case cudaErrorSharedObjectSymbolNotFound:
-            return "cudaErrorSharedObjectSymbolNotFound";
+                case cudaErrorSharedObjectSymbolNotFound:
+                    return "cudaErrorSharedObjectSymbolNotFound";
 
-        case cudaErrorSharedObjectInitFailed:
-            return "cudaErrorSharedObjectInitFailed";
+                case cudaErrorSharedObjectInitFailed:
+                    return "cudaErrorSharedObjectInitFailed";
 
-        case cudaErrorUnsupportedLimit:
-            return "cudaErrorUnsupportedLimit";
+                case cudaErrorUnsupportedLimit:
+                    return "cudaErrorUnsupportedLimit";
 
-        case cudaErrorDuplicateVariableName:
-            return "cudaErrorDuplicateVariableName";
+                case cudaErrorDuplicateVariableName:
+                    return "cudaErrorDuplicateVariableName";
 
-        case cudaErrorDuplicateTextureName:
-            return "cudaErrorDuplicateTextureName";
+                case cudaErrorDuplicateTextureName:
+                    return "cudaErrorDuplicateTextureName";
 
-        case cudaErrorDuplicateSurfaceName:
-            return "cudaErrorDuplicateSurfaceName";
+                case cudaErrorDuplicateSurfaceName:
+                    return "cudaErrorDuplicateSurfaceName";
 
-        case cudaErrorDevicesUnavailable:
-            return "cudaErrorDevicesUnavailable";
+                case cudaErrorDevicesUnavailable:
+                    return "cudaErrorDevicesUnavailable";
 
-        case cudaErrorInvalidKernelImage:
-            return "cudaErrorInvalidKernelImage";
+                case cudaErrorInvalidKernelImage:
+                    return "cudaErrorInvalidKernelImage";
 
-        case cudaErrorNoKernelImageForDevice:
-            return "cudaErrorNoKernelImageForDevice";
+                case cudaErrorNoKernelImageForDevice:
+                    return "cudaErrorNoKernelImageForDevice";
 
-        case cudaErrorIncompatibleDriverContext:
-            return "cudaErrorIncompatibleDriverContext";
+                case cudaErrorIncompatibleDriverContext:
+                    return "cudaErrorIncompatibleDriverContext";
 
-        case cudaErrorPeerAccessAlreadyEnabled:
-            return "cudaErrorPeerAccessAlreadyEnabled";
+                case cudaErrorPeerAccessAlreadyEnabled:
+                    return "cudaErrorPeerAccessAlreadyEnabled";
 
-        case cudaErrorPeerAccessNotEnabled:
-            return "cudaErrorPeerAccessNotEnabled";
+                case cudaErrorPeerAccessNotEnabled:
+                    return "cudaErrorPeerAccessNotEnabled";
 
-        case cudaErrorDeviceAlreadyInUse:
-            return "cudaErrorDeviceAlreadyInUse";
+                case cudaErrorDeviceAlreadyInUse:
+                    return "cudaErrorDeviceAlreadyInUse";
 
-        case cudaErrorProfilerDisabled:
-            return "cudaErrorProfilerDisabled";
+                case cudaErrorProfilerDisabled:
+                    return "cudaErrorProfilerDisabled";
 
-        case cudaErrorProfilerNotInitialized:
-            return "cudaErrorProfilerNotInitialized";
+                case cudaErrorProfilerNotInitialized:
+                    return "cudaErrorProfilerNotInitialized";
 
-        case cudaErrorProfilerAlreadyStarted:
-            return "cudaErrorProfilerAlreadyStarted";
+                case cudaErrorProfilerAlreadyStarted:
+                    return "cudaErrorProfilerAlreadyStarted";
 
-        case cudaErrorProfilerAlreadyStopped:
-            return "cudaErrorProfilerAlreadyStopped";
-*/
+                case cudaErrorProfilerAlreadyStopped:
+                    return "cudaErrorProfilerAlreadyStopped";
+        */
         /* Since CUDA 4.0*/
-/*
-        case cudaErrorAssert:
-            return "cudaErrorAssert";
+        /*
+                case cudaErrorAssert:
+                    return "cudaErrorAssert";
 
-        case cudaErrorTooManyPeers:
-            return "cudaErrorTooManyPeers";
+                case cudaErrorTooManyPeers:
+                    return "cudaErrorTooManyPeers";
 
-        case cudaErrorHostMemoryAlreadyRegistered:
-            return "cudaErrorHostMemoryAlreadyRegistered";
+                case cudaErrorHostMemoryAlreadyRegistered:
+                    return "cudaErrorHostMemoryAlreadyRegistered";
 
-        case cudaErrorHostMemoryNotRegistered:
-            return "cudaErrorHostMemoryNotRegistered";
-*/
+                case cudaErrorHostMemoryNotRegistered:
+                    return "cudaErrorHostMemoryNotRegistered";
+        */
         /* Since CUDA 5.0 */
-/*
-        case cudaErrorOperatingSystem:
-            return "cudaErrorOperatingSystem";
+        /*
+                case cudaErrorOperatingSystem:
+                    return "cudaErrorOperatingSystem";
 
-        case cudaErrorPeerAccessUnsupported:
-            return "cudaErrorPeerAccessUnsupported";
+                case cudaErrorPeerAccessUnsupported:
+                    return "cudaErrorPeerAccessUnsupported";
 
-        case cudaErrorLaunchMaxDepthExceeded:
-            return "cudaErrorLaunchMaxDepthExceeded";
+                case cudaErrorLaunchMaxDepthExceeded:
+                    return "cudaErrorLaunchMaxDepthExceeded";
 
-        case cudaErrorLaunchFileScopedTex:
-            return "cudaErrorLaunchFileScopedTex";
+                case cudaErrorLaunchFileScopedTex:
+                    return "cudaErrorLaunchFileScopedTex";
 
-        case cudaErrorLaunchFileScopedSurf:
-            return "cudaErrorLaunchFileScopedSurf";
+                case cudaErrorLaunchFileScopedSurf:
+                    return "cudaErrorLaunchFileScopedSurf";
 
-        case cudaErrorSyncDepthExceeded:
-            return "cudaErrorSyncDepthExceeded";
+                case cudaErrorSyncDepthExceeded:
+                    return "cudaErrorSyncDepthExceeded";
 
-        case cudaErrorLaunchPendingCountExceeded:
-            return "cudaErrorLaunchPendingCountExceeded";
+                case cudaErrorLaunchPendingCountExceeded:
+                    return "cudaErrorLaunchPendingCountExceeded";
 
-        case cudaErrorNotPermitted:
-            return "cudaErrorNotPermitted";
+                case cudaErrorNotPermitted:
+                    return "cudaErrorNotPermitted";
 
-        case cudaErrorNotSupported:
-            return "cudaErrorNotSupported";
-*/
+                case cudaErrorNotSupported:
+                    return "cudaErrorNotSupported";
+        */
         /* Since CUDA 6.0 */
-/*
-        case cudaErrorHardwareStackError:
-            return "cudaErrorHardwareStackError";
+        /*
+                case cudaErrorHardwareStackError:
+                    return "cudaErrorHardwareStackError";
 
-        case cudaErrorIllegalInstruction:
-            return "cudaErrorIllegalInstruction";
+                case cudaErrorIllegalInstruction:
+                    return "cudaErrorIllegalInstruction";
 
-        case cudaErrorMisalignedAddress:
-            return "cudaErrorMisalignedAddress";
+                case cudaErrorMisalignedAddress:
+                    return "cudaErrorMisalignedAddress";
 
-        case cudaErrorInvalidAddressSpace:
-            return "cudaErrorInvalidAddressSpace";
+                case cudaErrorInvalidAddressSpace:
+                    return "cudaErrorInvalidAddressSpace";
 
-        case cudaErrorInvalidPc:
-            return "cudaErrorInvalidPc";
+                case cudaErrorInvalidPc:
+                    return "cudaErrorInvalidPc";
 
-        case cudaErrorIllegalAddress:
-            return "cudaErrorIllegalAddress";
-*/
+                case cudaErrorIllegalAddress:
+                    return "cudaErrorIllegalAddress";
+        */
         /* Since CUDA 6.5*/
-/*
-        case cudaErrorInvalidPtx:
-            return "cudaErrorInvalidPtx";
+        /*
+                case cudaErrorInvalidPtx:
+                    return "cudaErrorInvalidPtx";
 
-        case cudaErrorInvalidGraphicsContext:
-            return "cudaErrorInvalidGraphicsContext";
+                case cudaErrorInvalidGraphicsContext:
+                    return "cudaErrorInvalidGraphicsContext";
 
-        case cudaErrorStartupFailure:
-            return "cudaErrorStartupFailure";
+                case cudaErrorStartupFailure:
+                    return "cudaErrorStartupFailure";
 
-        case cudaErrorApiFailureBase:
-            return "cudaErrorApiFailureBase";
-*/
+                case cudaErrorApiFailureBase:
+                    return "cudaErrorApiFailureBase";
+        */
     }
 
     return "<unknown>";
@@ -293,150 +292,150 @@ static const char *_cudaGetErrorEnum(cudaError_t error)
 
 #ifdef __cuda_cuda_h__
 // CUDA Driver API errors
-static const char *_cudaGetErrorEnum(CUresult error)
+static const char* _cudaGetErrorEnum(CUresult error)
 {
-    switch (error)
+    switch(error)
     {
-        case CUDA_SUCCESS:
-            return "CUDA_SUCCESS";
+    case CUDA_SUCCESS:
+        return "CUDA_SUCCESS";
 
-        case CUDA_ERROR_INVALID_VALUE:
-            return "CUDA_ERROR_INVALID_VALUE";
+    case CUDA_ERROR_INVALID_VALUE:
+        return "CUDA_ERROR_INVALID_VALUE";
 
-        case CUDA_ERROR_OUT_OF_MEMORY:
-            return "CUDA_ERROR_OUT_OF_MEMORY";
+    case CUDA_ERROR_OUT_OF_MEMORY:
+        return "CUDA_ERROR_OUT_OF_MEMORY";
 
-        case CUDA_ERROR_NOT_INITIALIZED:
-            return "CUDA_ERROR_NOT_INITIALIZED";
+    case CUDA_ERROR_NOT_INITIALIZED:
+        return "CUDA_ERROR_NOT_INITIALIZED";
 
-        case CUDA_ERROR_DEINITIALIZED:
-            return "CUDA_ERROR_DEINITIALIZED";
+    case CUDA_ERROR_DEINITIALIZED:
+        return "CUDA_ERROR_DEINITIALIZED";
 
-        case CUDA_ERROR_PROFILER_DISABLED:
-            return "CUDA_ERROR_PROFILER_DISABLED";
+    case CUDA_ERROR_PROFILER_DISABLED:
+        return "CUDA_ERROR_PROFILER_DISABLED";
 
-        case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
-            return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
+    case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
+        return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
 
-        case CUDA_ERROR_PROFILER_ALREADY_STARTED:
-            return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
+    case CUDA_ERROR_PROFILER_ALREADY_STARTED:
+        return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
 
-        case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
-            return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
+    case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
+        return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
 
-        case CUDA_ERROR_NO_DEVICE:
-            return "CUDA_ERROR_NO_DEVICE";
+    case CUDA_ERROR_NO_DEVICE:
+        return "CUDA_ERROR_NO_DEVICE";
 
-        case CUDA_ERROR_INVALID_DEVICE:
-            return "CUDA_ERROR_INVALID_DEVICE";
+    case CUDA_ERROR_INVALID_DEVICE:
+        return "CUDA_ERROR_INVALID_DEVICE";
 
-        case CUDA_ERROR_INVALID_IMAGE:
-            return "CUDA_ERROR_INVALID_IMAGE";
+    case CUDA_ERROR_INVALID_IMAGE:
+        return "CUDA_ERROR_INVALID_IMAGE";
 
-        case CUDA_ERROR_INVALID_CONTEXT:
-            return "CUDA_ERROR_INVALID_CONTEXT";
+    case CUDA_ERROR_INVALID_CONTEXT:
+        return "CUDA_ERROR_INVALID_CONTEXT";
 
-        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
-            return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
+    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
+        return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
 
-        case CUDA_ERROR_MAP_FAILED:
-            return "CUDA_ERROR_MAP_FAILED";
+    case CUDA_ERROR_MAP_FAILED:
+        return "CUDA_ERROR_MAP_FAILED";
 
-        case CUDA_ERROR_UNMAP_FAILED:
-            return "CUDA_ERROR_UNMAP_FAILED";
+    case CUDA_ERROR_UNMAP_FAILED:
+        return "CUDA_ERROR_UNMAP_FAILED";
 
-        case CUDA_ERROR_ARRAY_IS_MAPPED:
-            return "CUDA_ERROR_ARRAY_IS_MAPPED";
+    case CUDA_ERROR_ARRAY_IS_MAPPED:
+        return "CUDA_ERROR_ARRAY_IS_MAPPED";
 
-        case CUDA_ERROR_ALREADY_MAPPED:
-            return "CUDA_ERROR_ALREADY_MAPPED";
+    case CUDA_ERROR_ALREADY_MAPPED:
+        return "CUDA_ERROR_ALREADY_MAPPED";
 
-        case CUDA_ERROR_NO_BINARY_FOR_GPU:
-            return "CUDA_ERROR_NO_BINARY_FOR_GPU";
+    case CUDA_ERROR_NO_BINARY_FOR_GPU:
+        return "CUDA_ERROR_NO_BINARY_FOR_GPU";
 
-        case CUDA_ERROR_ALREADY_ACQUIRED:
-            return "CUDA_ERROR_ALREADY_ACQUIRED";
+    case CUDA_ERROR_ALREADY_ACQUIRED:
+        return "CUDA_ERROR_ALREADY_ACQUIRED";
 
-        case CUDA_ERROR_NOT_MAPPED:
-            return "CUDA_ERROR_NOT_MAPPED";
+    case CUDA_ERROR_NOT_MAPPED:
+        return "CUDA_ERROR_NOT_MAPPED";
 
-        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
-            return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
+    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
+        return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
 
-        case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
-            return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
+    case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
+        return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
 
-        case CUDA_ERROR_ECC_UNCORRECTABLE:
-            return "CUDA_ERROR_ECC_UNCORRECTABLE";
+    case CUDA_ERROR_ECC_UNCORRECTABLE:
+        return "CUDA_ERROR_ECC_UNCORRECTABLE";
 
-        case CUDA_ERROR_UNSUPPORTED_LIMIT:
-            return "CUDA_ERROR_UNSUPPORTED_LIMIT";
+    case CUDA_ERROR_UNSUPPORTED_LIMIT:
+        return "CUDA_ERROR_UNSUPPORTED_LIMIT";
 
-        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
-            return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
+    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
+        return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
 
-        case CUDA_ERROR_INVALID_SOURCE:
-            return "CUDA_ERROR_INVALID_SOURCE";
+    case CUDA_ERROR_INVALID_SOURCE:
+        return "CUDA_ERROR_INVALID_SOURCE";
 
-        case CUDA_ERROR_FILE_NOT_FOUND:
-            return "CUDA_ERROR_FILE_NOT_FOUND";
+    case CUDA_ERROR_FILE_NOT_FOUND:
+        return "CUDA_ERROR_FILE_NOT_FOUND";
 
-        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
-            return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
+    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
+        return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
 
-        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
-            return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
+    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
+        return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
 
-        case CUDA_ERROR_OPERATING_SYSTEM:
-            return "CUDA_ERROR_OPERATING_SYSTEM";
+    case CUDA_ERROR_OPERATING_SYSTEM:
+        return "CUDA_ERROR_OPERATING_SYSTEM";
 
-        case CUDA_ERROR_INVALID_HANDLE:
-            return "CUDA_ERROR_INVALID_HANDLE";
+    case CUDA_ERROR_INVALID_HANDLE:
+        return "CUDA_ERROR_INVALID_HANDLE";
 
-        case CUDA_ERROR_NOT_FOUND:
-            return "CUDA_ERROR_NOT_FOUND";
+    case CUDA_ERROR_NOT_FOUND:
+        return "CUDA_ERROR_NOT_FOUND";
 
-        case CUDA_ERROR_NOT_READY:
-            return "CUDA_ERROR_NOT_READY";
+    case CUDA_ERROR_NOT_READY:
+        return "CUDA_ERROR_NOT_READY";
 
-        case CUDA_ERROR_LAUNCH_FAILED:
-            return "CUDA_ERROR_LAUNCH_FAILED";
+    case CUDA_ERROR_LAUNCH_FAILED:
+        return "CUDA_ERROR_LAUNCH_FAILED";
 
-        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
-            return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
+        return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
 
-        case CUDA_ERROR_LAUNCH_TIMEOUT:
-            return "CUDA_ERROR_LAUNCH_TIMEOUT";
+    case CUDA_ERROR_LAUNCH_TIMEOUT:
+        return "CUDA_ERROR_LAUNCH_TIMEOUT";
 
-        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
-            return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
+    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
+        return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
 
-        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
-            return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
+        return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
 
-        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
-            return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
+    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
+        return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
 
-        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
-            return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
+    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
+        return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
 
-        case CUDA_ERROR_CONTEXT_IS_DESTROYED:
-            return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
+    case CUDA_ERROR_CONTEXT_IS_DESTROYED:
+        return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
 
-        case CUDA_ERROR_ASSERT:
-            return "CUDA_ERROR_ASSERT";
+    case CUDA_ERROR_ASSERT:
+        return "CUDA_ERROR_ASSERT";
 
-        case CUDA_ERROR_TOO_MANY_PEERS:
-            return "CUDA_ERROR_TOO_MANY_PEERS";
+    case CUDA_ERROR_TOO_MANY_PEERS:
+        return "CUDA_ERROR_TOO_MANY_PEERS";
 
-        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
-            return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
+    case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
+        return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
 
-        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
-            return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
+    case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
+        return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
 
-        case CUDA_ERROR_UNKNOWN:
-            return "CUDA_ERROR_UNKNOWN";
+    case CUDA_ERROR_UNKNOWN:
+        return "CUDA_ERROR_UNKNOWN";
     }
 
     return "<unknown>";
@@ -445,33 +444,33 @@ static const char *_cudaGetErrorEnum(CUresult error)
 
 #ifdef CUBLAS_API_H_
 // cuBLAS API errors
-static const char *_cudaGetErrorEnum(cublasStatus_t error)
+static const char* _cudaGetErrorEnum(cublasStatus_t error)
 {
-    switch (error)
+    switch(error)
     {
-        case CUBLAS_STATUS_SUCCESS:
-            return "CUBLAS_STATUS_SUCCESS";
+    case CUBLAS_STATUS_SUCCESS:
+        return "CUBLAS_STATUS_SUCCESS";
 
-        case CUBLAS_STATUS_NOT_INITIALIZED:
-            return "CUBLAS_STATUS_NOT_INITIALIZED";
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+        return "CUBLAS_STATUS_NOT_INITIALIZED";
 
-        case CUBLAS_STATUS_ALLOC_FAILED:
-            return "CUBLAS_STATUS_ALLOC_FAILED";
+    case CUBLAS_STATUS_ALLOC_FAILED:
+        return "CUBLAS_STATUS_ALLOC_FAILED";
 
-        case CUBLAS_STATUS_INVALID_VALUE:
-            return "CUBLAS_STATUS_INVALID_VALUE";
+    case CUBLAS_STATUS_INVALID_VALUE:
+        return "CUBLAS_STATUS_INVALID_VALUE";
 
-        case CUBLAS_STATUS_ARCH_MISMATCH:
-            return "CUBLAS_STATUS_ARCH_MISMATCH";
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+        return "CUBLAS_STATUS_ARCH_MISMATCH";
 
-        case CUBLAS_STATUS_MAPPING_ERROR:
-            return "CUBLAS_STATUS_MAPPING_ERROR";
+    case CUBLAS_STATUS_MAPPING_ERROR:
+        return "CUBLAS_STATUS_MAPPING_ERROR";
 
-        case CUBLAS_STATUS_EXECUTION_FAILED:
-            return "CUBLAS_STATUS_EXECUTION_FAILED";
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+        return "CUBLAS_STATUS_EXECUTION_FAILED";
 
-        case CUBLAS_STATUS_INTERNAL_ERROR:
-            return "CUBLAS_STATUS_INTERNAL_ERROR";
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+        return "CUBLAS_STATUS_INTERNAL_ERROR";
     }
 
     return "<unknown>";
@@ -480,57 +479,57 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error)
 
 #ifdef _CUFFT_H_
 // cuFFT API errors
-static const char *_cudaGetErrorEnum(cufftResult error)
+static const char* _cudaGetErrorEnum(cufftResult error)
 {
-    switch (error)
+    switch(error)
     {
-        case CUFFT_SUCCESS:
-            return "CUFFT_SUCCESS";
+    case CUFFT_SUCCESS:
+        return "CUFFT_SUCCESS";
 
-        case CUFFT_INVALID_PLAN:
-            return "CUFFT_INVALID_PLAN";
+    case CUFFT_INVALID_PLAN:
+        return "CUFFT_INVALID_PLAN";
 
-        case CUFFT_ALLOC_FAILED:
-            return "CUFFT_ALLOC_FAILED";
+    case CUFFT_ALLOC_FAILED:
+        return "CUFFT_ALLOC_FAILED";
 
-        case CUFFT_INVALID_TYPE:
-            return "CUFFT_INVALID_TYPE";
+    case CUFFT_INVALID_TYPE:
+        return "CUFFT_INVALID_TYPE";
 
-        case CUFFT_INVALID_VALUE:
-            return "CUFFT_INVALID_VALUE";
+    case CUFFT_INVALID_VALUE:
+        return "CUFFT_INVALID_VALUE";
 
-        case CUFFT_INTERNAL_ERROR:
-            return "CUFFT_INTERNAL_ERROR";
+    case CUFFT_INTERNAL_ERROR:
+        return "CUFFT_INTERNAL_ERROR";
 
-        case CUFFT_EXEC_FAILED:
-            return "CUFFT_EXEC_FAILED";
+    case CUFFT_EXEC_FAILED:
+        return "CUFFT_EXEC_FAILED";
 
-        case CUFFT_SETUP_FAILED:
-            return "CUFFT_SETUP_FAILED";
+    case CUFFT_SETUP_FAILED:
+        return "CUFFT_SETUP_FAILED";
 
-        case CUFFT_INVALID_SIZE:
-            return "CUFFT_INVALID_SIZE";
+    case CUFFT_INVALID_SIZE:
+        return "CUFFT_INVALID_SIZE";
 
-        case CUFFT_UNALIGNED_DATA:
-            return "CUFFT_UNALIGNED_DATA";
+    case CUFFT_UNALIGNED_DATA:
+        return "CUFFT_UNALIGNED_DATA";
 
-        case CUFFT_INCOMPLETE_PARAMETER_LIST:
-            return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+    case CUFFT_INCOMPLETE_PARAMETER_LIST:
+        return "CUFFT_INCOMPLETE_PARAMETER_LIST";
 
-        case CUFFT_INVALID_DEVICE:
-            return "CUFFT_INVALID_DEVICE";
+    case CUFFT_INVALID_DEVICE:
+        return "CUFFT_INVALID_DEVICE";
 
-        case CUFFT_PARSE_ERROR:
-            return "CUFFT_PARSE_ERROR";
+    case CUFFT_PARSE_ERROR:
+        return "CUFFT_PARSE_ERROR";
 
-        case CUFFT_NO_WORKSPACE:
-            return "CUFFT_NO_WORKSPACE";
+    case CUFFT_NO_WORKSPACE:
+        return "CUFFT_NO_WORKSPACE";
 
-        case CUFFT_NOT_IMPLEMENTED:
-            return "CUFFT_NOT_IMPLEMENTED";
+    case CUFFT_NOT_IMPLEMENTED:
+        return "CUFFT_NOT_IMPLEMENTED";
 
-        case CUFFT_LICENSE_ERROR:
-            return "CUFFT_LICENSE_ERROR";
+    case CUFFT_LICENSE_ERROR:
+        return "CUFFT_LICENSE_ERROR";
     }
 
     return "<unknown>";
@@ -540,36 +539,36 @@ static const char *_cudaGetErrorEnum(cufftResult error)
 
 #ifdef CUSPARSEAPI
 // cuSPARSE API errors
-static const char *_cudaGetErrorEnum(cusparseStatus_t error)
+static const char* _cudaGetErrorEnum(cusparseStatus_t error)
 {
-    switch (error)
+    switch(error)
     {
-        case CUSPARSE_STATUS_SUCCESS:
-            return "CUSPARSE_STATUS_SUCCESS";
+    case CUSPARSE_STATUS_SUCCESS:
+        return "CUSPARSE_STATUS_SUCCESS";
 
-        case CUSPARSE_STATUS_NOT_INITIALIZED:
-            return "CUSPARSE_STATUS_NOT_INITIALIZED";
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+        return "CUSPARSE_STATUS_NOT_INITIALIZED";
 
-        case CUSPARSE_STATUS_ALLOC_FAILED:
-            return "CUSPARSE_STATUS_ALLOC_FAILED";
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+        return "CUSPARSE_STATUS_ALLOC_FAILED";
 
-        case CUSPARSE_STATUS_INVALID_VALUE:
-            return "CUSPARSE_STATUS_INVALID_VALUE";
+    case CUSPARSE_STATUS_INVALID_VALUE:
+        return "CUSPARSE_STATUS_INVALID_VALUE";
 
-        case CUSPARSE_STATUS_ARCH_MISMATCH:
-            return "CUSPARSE_STATUS_ARCH_MISMATCH";
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+        return "CUSPARSE_STATUS_ARCH_MISMATCH";
 
-        case CUSPARSE_STATUS_MAPPING_ERROR:
-            return "CUSPARSE_STATUS_MAPPING_ERROR";
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+        return "CUSPARSE_STATUS_MAPPING_ERROR";
 
-        case CUSPARSE_STATUS_EXECUTION_FAILED:
-            return "CUSPARSE_STATUS_EXECUTION_FAILED";
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+        return "CUSPARSE_STATUS_EXECUTION_FAILED";
 
-        case CUSPARSE_STATUS_INTERNAL_ERROR:
-            return "CUSPARSE_STATUS_INTERNAL_ERROR";
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+        return "CUSPARSE_STATUS_INTERNAL_ERROR";
 
-        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+        return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
     }
 
     return "<unknown>";
@@ -578,48 +577,48 @@ static const char *_cudaGetErrorEnum(cusparseStatus_t error)
 
 #ifdef CURAND_H_
 // cuRAND API errors
-static const char *_cudaGetErrorEnum(curandStatus_t error)
+static const char* _cudaGetErrorEnum(curandStatus_t error)
 {
-    switch (error)
+    switch(error)
     {
-        case CURAND_STATUS_SUCCESS:
-            return "CURAND_STATUS_SUCCESS";
+    case CURAND_STATUS_SUCCESS:
+        return "CURAND_STATUS_SUCCESS";
 
-        case CURAND_STATUS_VERSION_MISMATCH:
-            return "CURAND_STATUS_VERSION_MISMATCH";
+    case CURAND_STATUS_VERSION_MISMATCH:
+        return "CURAND_STATUS_VERSION_MISMATCH";
 
-        case CURAND_STATUS_NOT_INITIALIZED:
-            return "CURAND_STATUS_NOT_INITIALIZED";
+    case CURAND_STATUS_NOT_INITIALIZED:
+        return "CURAND_STATUS_NOT_INITIALIZED";
 
-        case CURAND_STATUS_ALLOCATION_FAILED:
-            return "CURAND_STATUS_ALLOCATION_FAILED";
+    case CURAND_STATUS_ALLOCATION_FAILED:
+        return "CURAND_STATUS_ALLOCATION_FAILED";
 
-        case CURAND_STATUS_TYPE_ERROR:
-            return "CURAND_STATUS_TYPE_ERROR";
+    case CURAND_STATUS_TYPE_ERROR:
+        return "CURAND_STATUS_TYPE_ERROR";
 
-        case CURAND_STATUS_OUT_OF_RANGE:
-            return "CURAND_STATUS_OUT_OF_RANGE";
+    case CURAND_STATUS_OUT_OF_RANGE:
+        return "CURAND_STATUS_OUT_OF_RANGE";
 
-        case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-            return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+        return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
 
-        case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-            return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+        return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
 
-        case CURAND_STATUS_LAUNCH_FAILURE:
-            return "CURAND_STATUS_LAUNCH_FAILURE";
+    case CURAND_STATUS_LAUNCH_FAILURE:
+        return "CURAND_STATUS_LAUNCH_FAILURE";
 
-        case CURAND_STATUS_PREEXISTING_FAILURE:
-            return "CURAND_STATUS_PREEXISTING_FAILURE";
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+        return "CURAND_STATUS_PREEXISTING_FAILURE";
 
-        case CURAND_STATUS_INITIALIZATION_FAILED:
-            return "CURAND_STATUS_INITIALIZATION_FAILED";
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+        return "CURAND_STATUS_INITIALIZATION_FAILED";
 
-        case CURAND_STATUS_ARCH_MISMATCH:
-            return "CURAND_STATUS_ARCH_MISMATCH";
+    case CURAND_STATUS_ARCH_MISMATCH:
+        return "CURAND_STATUS_ARCH_MISMATCH";
 
-        case CURAND_STATUS_INTERNAL_ERROR:
-            return "CURAND_STATUS_INTERNAL_ERROR";
+    case CURAND_STATUS_INTERNAL_ERROR:
+        return "CURAND_STATUS_INTERNAL_ERROR";
     }
 
     return "<unknown>";
@@ -628,254 +627,253 @@ static const char *_cudaGetErrorEnum(curandStatus_t error)
 
 #ifdef NV_NPPIDEFS_H
 // NPP API errors
-static const char *_cudaGetErrorEnum(NppStatus error)
+static const char* _cudaGetErrorEnum(NppStatus error)
 {
-    switch (error)
+    switch(error)
     {
-        case NPP_NOT_SUPPORTED_MODE_ERROR:
-            return "NPP_NOT_SUPPORTED_MODE_ERROR";
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+        return "NPP_NOT_SUPPORTED_MODE_ERROR";
 
-        case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
-            return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+        return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
 
-        case NPP_RESIZE_NO_OPERATION_ERROR:
-            return "NPP_RESIZE_NO_OPERATION_ERROR";
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+        return "NPP_RESIZE_NO_OPERATION_ERROR";
 
-        case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
-            return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+        return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
 
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+#    if((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
 
-        case NPP_BAD_ARG_ERROR:
-            return "NPP_BAD_ARGUMENT_ERROR";
+    case NPP_BAD_ARG_ERROR:
+        return "NPP_BAD_ARGUMENT_ERROR";
 
-        case NPP_COEFF_ERROR:
-            return "NPP_COEFFICIENT_ERROR";
+    case NPP_COEFF_ERROR:
+        return "NPP_COEFFICIENT_ERROR";
 
-        case NPP_RECT_ERROR:
-            return "NPP_RECTANGLE_ERROR";
+    case NPP_RECT_ERROR:
+        return "NPP_RECTANGLE_ERROR";
 
-        case NPP_QUAD_ERROR:
-            return "NPP_QUADRANGLE_ERROR";
+    case NPP_QUAD_ERROR:
+        return "NPP_QUADRANGLE_ERROR";
 
-        case NPP_MEM_ALLOC_ERR:
-            return "NPP_MEMORY_ALLOCATION_ERROR";
+    case NPP_MEM_ALLOC_ERR:
+        return "NPP_MEMORY_ALLOCATION_ERROR";
 
-        case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
-            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+        return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
 
-        case NPP_INVALID_INPUT:
-            return "NPP_INVALID_INPUT";
+    case NPP_INVALID_INPUT:
+        return "NPP_INVALID_INPUT";
 
-        case NPP_POINTER_ERROR:
-            return "NPP_POINTER_ERROR";
+    case NPP_POINTER_ERROR:
+        return "NPP_POINTER_ERROR";
 
-        case NPP_WARNING:
-            return "NPP_WARNING";
+    case NPP_WARNING:
+        return "NPP_WARNING";
 
-        case NPP_ODD_ROI_WARNING:
-            return "NPP_ODD_ROI_WARNING";
-#else
-
-            // These are for CUDA 5.5 or higher
-        case NPP_BAD_ARGUMENT_ERROR:
-            return "NPP_BAD_ARGUMENT_ERROR";
+    case NPP_ODD_ROI_WARNING:
+        return "NPP_ODD_ROI_WARNING";
+#    else
 
-        case NPP_COEFFICIENT_ERROR:
-            return "NPP_COEFFICIENT_ERROR";
+        // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+        return "NPP_BAD_ARGUMENT_ERROR";
 
-        case NPP_RECTANGLE_ERROR:
-            return "NPP_RECTANGLE_ERROR";
+    case NPP_COEFFICIENT_ERROR:
+        return "NPP_COEFFICIENT_ERROR";
 
-        case NPP_QUADRANGLE_ERROR:
-            return "NPP_QUADRANGLE_ERROR";
+    case NPP_RECTANGLE_ERROR:
+        return "NPP_RECTANGLE_ERROR";
 
-        case NPP_MEMORY_ALLOCATION_ERR:
-            return "NPP_MEMORY_ALLOCATION_ERROR";
+    case NPP_QUADRANGLE_ERROR:
+        return "NPP_QUADRANGLE_ERROR";
 
-        case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
-            return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+    case NPP_MEMORY_ALLOCATION_ERR:
+        return "NPP_MEMORY_ALLOCATION_ERROR";
 
-        case NPP_INVALID_HOST_POINTER_ERROR:
-            return "NPP_INVALID_HOST_POINTER_ERROR";
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+        return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
 
-        case NPP_INVALID_DEVICE_POINTER_ERROR:
-            return "NPP_INVALID_DEVICE_POINTER_ERROR";
-#endif
+    case NPP_INVALID_HOST_POINTER_ERROR:
+        return "NPP_INVALID_HOST_POINTER_ERROR";
 
-        case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
-            return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+        return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#    endif
 
-        case NPP_TEXTURE_BIND_ERROR:
-            return "NPP_TEXTURE_BIND_ERROR";
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+        return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
 
-        case NPP_WRONG_INTERSECTION_ROI_ERROR:
-            return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+    case NPP_TEXTURE_BIND_ERROR:
+        return "NPP_TEXTURE_BIND_ERROR";
 
-        case NPP_NOT_EVEN_STEP_ERROR:
-            return "NPP_NOT_EVEN_STEP_ERROR";
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+        return "NPP_WRONG_INTERSECTION_ROI_ERROR";
 
-        case NPP_INTERPOLATION_ERROR:
-            return "NPP_INTERPOLATION_ERROR";
+    case NPP_NOT_EVEN_STEP_ERROR:
+        return "NPP_NOT_EVEN_STEP_ERROR";
 
-        case NPP_RESIZE_FACTOR_ERROR:
-            return "NPP_RESIZE_FACTOR_ERROR";
+    case NPP_INTERPOLATION_ERROR:
+        return "NPP_INTERPOLATION_ERROR";
 
-        case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
-            return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+    case NPP_RESIZE_FACTOR_ERROR:
+        return "NPP_RESIZE_FACTOR_ERROR";
 
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+        return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
 
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
 
-        case NPP_MEMFREE_ERR:
-            return "NPP_MEMFREE_ERR";
+#    if((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
 
-        case NPP_MEMSET_ERR:
-            return "NPP_MEMSET_ERR";
+    case NPP_MEMFREE_ERR:
+        return "NPP_MEMFREE_ERR";
 
-        case NPP_MEMCPY_ERR:
-            return "NPP_MEMCPY_ERROR";
+    case NPP_MEMSET_ERR:
+        return "NPP_MEMSET_ERR";
 
-        case NPP_MIRROR_FLIP_ERR:
-            return "NPP_MIRROR_FLIP_ERR";
-#else
+    case NPP_MEMCPY_ERR:
+        return "NPP_MEMCPY_ERROR";
 
-        case NPP_MEMFREE_ERROR:
-            return "NPP_MEMFREE_ERROR";
+    case NPP_MIRROR_FLIP_ERR:
+        return "NPP_MIRROR_FLIP_ERR";
+#    else
 
-        case NPP_MEMSET_ERROR:
-            return "NPP_MEMSET_ERROR";
+    case NPP_MEMFREE_ERROR:
+        return "NPP_MEMFREE_ERROR";
 
-        case NPP_MEMCPY_ERROR:
-            return "NPP_MEMCPY_ERROR";
+    case NPP_MEMSET_ERROR:
+        return "NPP_MEMSET_ERROR";
 
-        case NPP_MIRROR_FLIP_ERROR:
-            return "NPP_MIRROR_FLIP_ERROR";
-#endif
+    case NPP_MEMCPY_ERROR:
+        return "NPP_MEMCPY_ERROR";
 
-        case NPP_ALIGNMENT_ERROR:
-            return "NPP_ALIGNMENT_ERROR";
+    case NPP_MIRROR_FLIP_ERROR:
+        return "NPP_MIRROR_FLIP_ERROR";
+#    endif
 
-        case NPP_STEP_ERROR:
-            return "NPP_STEP_ERROR";
+    case NPP_ALIGNMENT_ERROR:
+        return "NPP_ALIGNMENT_ERROR";
 
-        case NPP_SIZE_ERROR:
-            return "NPP_SIZE_ERROR";
+    case NPP_STEP_ERROR:
+        return "NPP_STEP_ERROR";
 
-        case NPP_NULL_POINTER_ERROR:
-            return "NPP_NULL_POINTER_ERROR";
+    case NPP_SIZE_ERROR:
+        return "NPP_SIZE_ERROR";
 
-        case NPP_CUDA_KERNEL_EXECUTION_ERROR:
-            return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+    case NPP_NULL_POINTER_ERROR:
+        return "NPP_NULL_POINTER_ERROR";
 
-        case NPP_NOT_IMPLEMENTED_ERROR:
-            return "NPP_NOT_IMPLEMENTED_ERROR";
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+        return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
 
-        case NPP_ERROR:
-            return "NPP_ERROR";
+    case NPP_NOT_IMPLEMENTED_ERROR:
+        return "NPP_NOT_IMPLEMENTED_ERROR";
 
-        case NPP_SUCCESS:
-            return "NPP_SUCCESS";
+    case NPP_ERROR:
+        return "NPP_ERROR";
 
-        case NPP_WRONG_INTERSECTION_QUAD_WARNING:
-            return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+    case NPP_SUCCESS:
+        return "NPP_SUCCESS";
 
-        case NPP_MISALIGNED_DST_ROI_WARNING:
-            return "NPP_MISALIGNED_DST_ROI_WARNING";
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+        return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
 
-        case NPP_AFFINE_QUAD_INCORRECT_WARNING:
-            return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+        return "NPP_MISALIGNED_DST_ROI_WARNING";
 
-        case NPP_DOUBLE_SIZE_WARNING:
-            return "NPP_DOUBLE_SIZE_WARNING";
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+        return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
 
-        case NPP_WRONG_INTERSECTION_ROI_WARNING:
-            return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+    case NPP_DOUBLE_SIZE_WARNING:
+        return "NPP_DOUBLE_SIZE_WARNING";
 
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
-        /* These are 6.0 or higher */
-        case NPP_LUT_PALETTE_BITSIZE_ERROR:
-            return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+        return "NPP_WRONG_INTERSECTION_ROI_WARNING";
 
-        case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
-            return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+#    if((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+        return "NPP_LUT_PALETTE_BITSIZE_ERROR";
 
-        case NPP_QUALITY_INDEX_ERROR:
-            return "NPP_QUALITY_INDEX_ERROR";
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+        return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
 
-        case NPP_CHANNEL_ORDER_ERROR:
-            return "NPP_CHANNEL_ORDER_ERROR";
+    case NPP_QUALITY_INDEX_ERROR:
+        return "NPP_QUALITY_INDEX_ERROR";
 
-        case NPP_ZERO_MASK_VALUE_ERROR:
-            return "NPP_ZERO_MASK_VALUE_ERROR";
+    case NPP_CHANNEL_ORDER_ERROR:
+        return "NPP_CHANNEL_ORDER_ERROR";
 
-        case NPP_NUMBER_OF_CHANNELS_ERROR:
-            return "NPP_NUMBER_OF_CHANNELS_ERROR";
+    case NPP_ZERO_MASK_VALUE_ERROR:
+        return "NPP_ZERO_MASK_VALUE_ERROR";
 
-        case NPP_COI_ERROR:
-            return "NPP_COI_ERROR";
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+        return "NPP_NUMBER_OF_CHANNELS_ERROR";
 
-        case NPP_DIVISOR_ERROR:
-            return "NPP_DIVISOR_ERROR";
+    case NPP_COI_ERROR:
+        return "NPP_COI_ERROR";
 
-        case NPP_CHANNEL_ERROR:
-            return "NPP_CHANNEL_ERROR";
+    case NPP_DIVISOR_ERROR:
+        return "NPP_DIVISOR_ERROR";
 
-        case NPP_STRIDE_ERROR:
-            return "NPP_STRIDE_ERROR";
+    case NPP_CHANNEL_ERROR:
+        return "NPP_CHANNEL_ERROR";
 
-        case NPP_ANCHOR_ERROR:
-            return "NPP_ANCHOR_ERROR";
+    case NPP_STRIDE_ERROR:
+        return "NPP_STRIDE_ERROR";
 
-        case NPP_MASK_SIZE_ERROR:
-            return "NPP_MASK_SIZE_ERROR";
+    case NPP_ANCHOR_ERROR:
+        return "NPP_ANCHOR_ERROR";
 
-        case NPP_MOMENT_00_ZERO_ERROR:
-            return "NPP_MOMENT_00_ZERO_ERROR";
+    case NPP_MASK_SIZE_ERROR:
+        return "NPP_MASK_SIZE_ERROR";
 
-        case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
-            return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+    case NPP_MOMENT_00_ZERO_ERROR:
+        return "NPP_MOMENT_00_ZERO_ERROR";
 
-        case NPP_THRESHOLD_ERROR:
-            return "NPP_THRESHOLD_ERROR";
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+        return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
 
-        case NPP_CONTEXT_MATCH_ERROR:
-            return "NPP_CONTEXT_MATCH_ERROR";
+    case NPP_THRESHOLD_ERROR:
+        return "NPP_THRESHOLD_ERROR";
 
-        case NPP_FFT_FLAG_ERROR:
-            return "NPP_FFT_FLAG_ERROR";
+    case NPP_CONTEXT_MATCH_ERROR:
+        return "NPP_CONTEXT_MATCH_ERROR";
 
-        case NPP_FFT_ORDER_ERROR:
-            return "NPP_FFT_ORDER_ERROR";
+    case NPP_FFT_FLAG_ERROR:
+        return "NPP_FFT_FLAG_ERROR";
 
-        case NPP_SCALE_RANGE_ERROR:
-            return "NPP_SCALE_RANGE_ERROR";
+    case NPP_FFT_ORDER_ERROR:
+        return "NPP_FFT_ORDER_ERROR";
 
-        case NPP_DATA_TYPE_ERROR:
-            return "NPP_DATA_TYPE_ERROR";
+    case NPP_SCALE_RANGE_ERROR:
+        return "NPP_SCALE_RANGE_ERROR";
 
-        case NPP_OUT_OFF_RANGE_ERROR:
-            return "NPP_OUT_OFF_RANGE_ERROR";
+    case NPP_DATA_TYPE_ERROR:
+        return "NPP_DATA_TYPE_ERROR";
 
-        case NPP_DIVIDE_BY_ZERO_ERROR:
-            return "NPP_DIVIDE_BY_ZERO_ERROR";
+    case NPP_OUT_OFF_RANGE_ERROR:
+        return "NPP_OUT_OFF_RANGE_ERROR";
 
-        case NPP_RANGE_ERROR:
-            return "NPP_RANGE_ERROR";
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+        return "NPP_DIVIDE_BY_ZERO_ERROR";
 
-        case NPP_NO_MEMORY_ERROR:
-            return "NPP_NO_MEMORY_ERROR";
+    case NPP_RANGE_ERROR:
+        return "NPP_RANGE_ERROR";
 
-        case NPP_ERROR_RESERVED:
-            return "NPP_ERROR_RESERVED";
+    case NPP_NO_MEMORY_ERROR:
+        return "NPP_NO_MEMORY_ERROR";
 
-        case NPP_NO_OPERATION_WARNING:
-            return "NPP_NO_OPERATION_WARNING";
+    case NPP_ERROR_RESERVED:
+        return "NPP_ERROR_RESERVED";
 
-        case NPP_DIVIDE_BY_ZERO_WARNING:
-            return "NPP_DIVIDE_BY_ZERO_WARNING";
-#endif
+    case NPP_NO_OPERATION_WARNING:
+        return "NPP_NO_OPERATION_WARNING";
 
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+        return "NPP_DIVIDE_BY_ZERO_WARNING";
+#    endif
     }
 
     return "<unknown>";
@@ -883,22 +881,28 @@ static const char *_cudaGetErrorEnum(NppStatus error)
 #endif
 
 #ifdef __DRIVER_TYPES_H__
-#ifndef DEVICE_RESET
-#define DEVICE_RESET cudaDeviceReset();
-#endif
+#    ifndef DEVICE_RESET
+#        define DEVICE_RESET cudaDeviceReset();
+#    endif
 #else
-#ifndef DEVICE_RESET
-#define DEVICE_RESET
-#endif
+#    ifndef DEVICE_RESET
+#        define DEVICE_RESET
+#    endif
 #endif
 
-template< typename T >
-void check(T result, char const *const func, const char *const file, int const line)
+template<typename T>
+void check(T result, char const* const func, const char* const file, int const line)
 {
-    if (result)
+    if(result)
     {
-        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
-                file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+        fprintf(
+            stderr,
+            "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
+            file,
+            line,
+            static_cast<unsigned int>(result),
+            _cudaGetErrorEnum(result),
+            func);
         DEVICE_RESET
         // Make sure we call CUDA Device Reset before exiting
         exit(EXIT_FAILURE);
@@ -907,19 +911,25 @@ void check(T result, char const *const func, const char *const file, int const l
 
 #ifdef __DRIVER_TYPES_H__
 // This will output the proper CUDA error strings in the event that a CUDA host call returns an error
-#define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
+#    define checkCudaErrors(val) check((val), #    val, __FILE__, __LINE__)
 
 // This will output the proper error string when calling cudaGetLastError
-#define getLastCudaError(msg)      __getLastCudaError (msg, __FILE__, __LINE__)
+#    define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
 
-inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
+inline void __getLastCudaError(const char* errorMessage, const char* file, const int line)
 {
     cudaError_t err = cudaGetLastError();
 
-    if (cudaSuccess != err)
+    if(cudaSuccess != err)
     {
-        fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
-                file, line, errorMessage, (int)err, cudaGetErrorString(err));
+        fprintf(
+            stderr,
+            "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+            file,
+            line,
+            errorMessage,
+            (int) err,
+            cudaGetErrorString(err));
         DEVICE_RESET
         exit(EXIT_FAILURE);
     }
@@ -927,7 +937,7 @@ inline void __getLastCudaError(const char *errorMessage, const char *file, const
 #endif
 
 #ifndef MAX
-#define MAX(a,b) (a > b ? a : b)
+#    define MAX(a, b) ((a) > (b) ? (a) : (b)) // operands parenthesized; note: each one is still evaluated twice
 #endif
 
 // Beginning of GPU Architecture definitions
@@ -940,27 +950,25 @@ inline int _ConvertSMVer2Cores(int major, int minor)
         int Cores;
     } sSMtoCores;
 
-    sSMtoCores nGpuArchCoresPerSM[] =
-    {
-        { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
-        { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
-        { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
-        { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
-        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
-        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
-        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
-        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
-        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
-        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
-        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
-        {   -1, -1 }
-    };
+    sSMtoCores nGpuArchCoresPerSM[]
+        = {{0x10, 8}, // Tesla Generation (SM 1.0) G80 class
+           {0x11, 8}, // Tesla Generation (SM 1.1) G8x class
+           {0x12, 8}, // Tesla Generation (SM 1.2) G9x class
+           {0x13, 8}, // Tesla Generation (SM 1.3) GT200 class
+           {0x20, 32}, // Fermi Generation (SM 2.0) GF100 class
+           {0x21, 48}, // Fermi Generation (SM 2.1) GF10x class
+           {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+           {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+           {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+           {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+           {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+           {-1, -1}};
 
     int index = 0;
 
-    while (nGpuArchCoresPerSM[index].SM != -1)
+    while(nGpuArchCoresPerSM[index].SM != -1)
     {
-        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+        if(nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
         {
             return nGpuArchCoresPerSM[index].Cores;
         }
@@ -969,8 +977,12 @@ inline int _ConvertSMVer2Cores(int major, int minor)
     }
 
     // If we don't find the values, we default use the previous one to run properly
-    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
-    return nGpuArchCoresPerSM[index-1].Cores;
+    printf(
+        "MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
+        major,
+        minor,
+        nGpuArchCoresPerSM[index - 1].Cores);
+    return nGpuArchCoresPerSM[index - 1].Cores;
 }
 // end of GPU Architecture definitions
 
@@ -981,18 +993,18 @@ inline int gpuDeviceInit(int devID)
     int device_count;
     checkCudaErrors(cudaGetDeviceCount(&device_count));
 
-    if (device_count == 0)
+    if(device_count == 0)
     {
         fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
         exit(EXIT_FAILURE);
     }
 
-    if (devID < 0)
+    if(devID < 0)
     {
         devID = 0;
     }
 
-    if (devID > device_count-1)
+    if(devID > device_count - 1)
     {
         fprintf(stderr, "\n");
         fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
@@ -1001,23 +1013,23 @@ inline int gpuDeviceInit(int devID)
         return -devID;
     }
 
-/*  cudaDeviceProp deviceProp;
-    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+    /*  cudaDeviceProp deviceProp;
+        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
 
-    if (deviceProp.computeMode == cudaComputeModeProhibited)
-    {
-        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
-        return -1;
-    }
+        if (deviceProp.computeMode == cudaComputeModeProhibited)
+        {
+            fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use
+       ::cudaSetDevice().\n"); return -1;
+        }
 
-    if (deviceProp.major < 1)
-    {
-        fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
-        exit(EXIT_FAILURE);
-    }
-*/
+        if (deviceProp.major < 1)
+        {
+            fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+            exit(EXIT_FAILURE);
+        }
+    */
     checkCudaErrors(cudaSetDevice(devID));
-//    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
+    //    printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
 
     return devID;
 }
@@ -1025,9 +1037,9 @@ inline int gpuDeviceInit(int devID)
 // This function returns the best GPU (with maximum GFLOPS)
 inline int gpuGetMaxGflopsDeviceId()
 {
-    int current_device     = 0, sm_per_multiproc  = 0;
-    int max_perf_device    = 0;
-    int device_count       = 0, best_SM_arch      = 0;
+    int current_device = 0, sm_per_multiproc = 0;
+    int max_perf_device = 0;
+    int device_count = 0, best_SM_arch = 0;
     int devices_prohibited = 0;
 
     unsigned long long max_compute_perf = 0;
@@ -1036,21 +1048,21 @@ inline int gpuGetMaxGflopsDeviceId()
 
     checkCudaErrors(cudaGetDeviceCount(&device_count));
 
-    if (device_count == 0)
+    if(device_count == 0)
     {
         fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
         exit(EXIT_FAILURE);
     }
 
     // Find the best major SM Architecture GPU device
-    while (current_device < device_count)
+    while(current_device < device_count)
     {
         cudaGetDeviceProperties(&deviceProp, current_device);
 
         // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
-        if (deviceProp.computeMode != cudaComputeModeProhibited)
+        if(deviceProp.computeMode != cudaComputeModeProhibited)
         {
-            if (deviceProp.major > 0 && deviceProp.major < 9999)
+            if(deviceProp.major > 0 && deviceProp.major < 9999)
             {
                 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
             }
@@ -1063,23 +1075,23 @@ inline int gpuGetMaxGflopsDeviceId()
         current_device++;
     }
 
-    if (devices_prohibited == device_count)
+    if(devices_prohibited == device_count)
     {
-    	fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
-    	exit(EXIT_FAILURE);
+        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
+        exit(EXIT_FAILURE);
     }
 
     // Find the best CUDA capable GPU device
     current_device = 0;
 
-    while (current_device < device_count)
+    while(current_device < device_count)
     {
         cudaGetDeviceProperties(&deviceProp, current_device);
 
         // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
-        if (deviceProp.computeMode != cudaComputeModeProhibited)
+        if(deviceProp.computeMode != cudaComputeModeProhibited)
         {
-            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
+            if(deviceProp.major == 9999 && deviceProp.minor == 9999)
             {
                 sm_per_multiproc = 1;
             }
@@ -1088,24 +1100,25 @@ inline int gpuGetMaxGflopsDeviceId()
                 sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
             }
 
-            unsigned long long compute_perf  = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
+            unsigned long long compute_perf
+                = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
 
-            if (compute_perf  > max_compute_perf)
+            if(compute_perf > max_compute_perf)
             {
                 // If we find GPU with SM major > 2, search only these
-                if (best_SM_arch > 2)
+                if(best_SM_arch > 2)
                 {
                     // If our device==dest_SM_arch, choose this, or else pass
-                    if (deviceProp.major == best_SM_arch)
+                    if(deviceProp.major == best_SM_arch)
                     {
-                        max_compute_perf  = compute_perf;
-                        max_perf_device   = current_device;
+                        max_compute_perf = compute_perf;
+                        max_perf_device = current_device;
                     }
                 }
                 else
                 {
-                    max_compute_perf  = compute_perf;
-                    max_perf_device   = current_device;
+                    max_compute_perf = compute_perf;
+                    max_perf_device = current_device;
                 }
             }
         }
@@ -1118,17 +1131,17 @@ inline int gpuGetMaxGflopsDeviceId()
 
 
 // Initialization code to find the best CUDA Device
-inline int findCudaDevice(int argc, const char **argv)
+inline int findCudaDevice(int argc, const char** argv)
 {
-//    cudaDeviceProp deviceProp;
+    //    cudaDeviceProp deviceProp;
     int devID = 0;
 
     // If the command-line has a device number specified, use it
-    if (checkCmdLineFlag(argc, argv, "device"))
+    if(checkCmdLineFlag(argc, argv, "device"))
     {
         devID = getCmdLineArgumentInt(argc, argv, "device=");
 
-        if (devID < 0)
+        if(devID < 0)
         {
             printf("Invalid command line parameter\n ");
             exit(EXIT_FAILURE);
@@ -1137,7 +1150,7 @@ inline int findCudaDevice(int argc, const char **argv)
         {
             devID = gpuDeviceInit(devID);
 
-            if (devID < 0)
+            if(devID < 0)
             {
                 printf("exiting...\n");
                 exit(EXIT_FAILURE);
@@ -1149,8 +1162,9 @@ inline int findCudaDevice(int argc, const char **argv)
         // Otherwise pick the device with highest Gflops/s
         devID = gpuGetMaxGflopsDeviceId();
         checkCudaErrors(cudaSetDevice(devID));
-//        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
-//        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
+        //        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+        //        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name,
+        //        deviceProp.major, deviceProp.minor);
     }
 
     return devID;
@@ -1159,29 +1173,29 @@ inline int findCudaDevice(int argc, const char **argv)
 // General check for CUDA GPU SM Capabilities
 inline bool checkCudaCapabilities(int major_version, int minor_version)
 {
-/*    cudaDeviceProp deviceProp;
-    deviceProp.major = 0;
-    deviceProp.minor = 0;
- */
+    /*    cudaDeviceProp deviceProp;
+        deviceProp.major = 0;
+        deviceProp.minor = 0;
+     */
     int dev;
 
     checkCudaErrors(cudaGetDevice(&dev));
-//    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+    //    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
 
     return true;
-/*
-    if ((deviceProp.major > major_version) ||
-        (deviceProp.major == major_version && deviceProp.minor >= minor_version))
-    {
-        printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
-        return true;
-    }
-    else
-    {
-        printf("  No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
-        return false;
-    }
-*/
+    /*
+        if ((deviceProp.major > major_version) ||
+            (deviceProp.major == major_version && deviceProp.minor >= minor_version))
+        {
+            printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major,
+       deviceProp.minor); return true;
+        }
+        else
+        {
+            printf("  No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version,
+       minor_version); return false;
+        }
+    */
 }
 #endif
 
diff --git a/example/CUDASamples/common/helper_functions.h b/example/CUDASamples/common/helper_functions.h
index 11538ba7..20b6b17c 100644
--- a/example/CUDASamples/common/helper_functions.h
+++ b/example/CUDASamples/common/helper_functions.h
@@ -14,29 +14,29 @@
 #define HELPER_FUNCTIONS_H
 
 #ifdef WIN32
-#pragma warning(disable:4996)
+#    pragma warning(disable : 4996)
 #endif
 
 // includes, project
-#include <stdio.h>
-#include <stdlib.h>
+#include <algorithm>
+#include <fstream>
+#include <iostream>
 #include <string>
+#include <vector>
+
 #include <assert.h>
 #include <exception.h>
 #include <math.h>
-
-#include <fstream>
-#include <vector>
-#include <iostream>
-#include <algorithm>
+#include <stdio.h>
+#include <stdlib.h>
 
 // includes, timer, string parsing, image helpers
-#include <helper_timer.h>   // helper functions for timers
-#include <helper_string.h>  // helper functions for string parsing
-#include <helper_image.h>   // helper functions for image compare, dump, data comparisons
+#include <helper_image.h> // helper functions for image compare, dump, data comparisons
+#include <helper_string.h> // helper functions for string parsing
+#include <helper_timer.h> // helper functions for timers
 
 #ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
+#    define EXIT_WAIVED 2
 #endif
 
 #endif //  HELPER_FUNCTIONS_H
diff --git a/example/CUDASamples/common/helper_image.h b/example/CUDASamples/common/helper_image.h
index 4e8b25cd..6412cf1e 100644
--- a/example/CUDASamples/common/helper_image.h
+++ b/example/CUDASamples/common/helper_image.h
@@ -13,25 +13,25 @@
 #ifndef HELPER_IMAGE_H
 #define HELPER_IMAGE_H
 
-#include <string>
+#include <algorithm>
 #include <fstream>
-#include <vector>
 #include <iostream>
-#include <algorithm>
+#include <string>
+#include <vector>
 
 #include <assert.h>
 #include <exception.h>
 #include <math.h>
 
 #ifndef MIN
-#define MIN(a,b) ((a < b) ? a : b)
+#    define MIN(a, b) ((a < b) ? a : b)
 #endif
 #ifndef MAX
-#define MAX(a,b) ((a > b) ? a : b)
+#    define MAX(a, b) ((a > b) ? a : b)
 #endif
 
 #ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
+#    define EXIT_WAIVED 2
 #endif
 
 #include <helper_string.h>
@@ -55,7 +55,7 @@ namespace
         //! Conversion operator
         //! @return converted value
         //! @param  val  value to convert
-        float operator()(const unsigned char &val)
+        float operator()(const unsigned char& val)
         {
             return static_cast<unsigned char>(val);
         }
@@ -68,7 +68,7 @@ namespace
         //! Conversion operator
         //! @return converted value
         //! @param  val  value to convert
-        float operator()(const unsigned char &val)
+        float operator()(const unsigned char& val)
         {
             return static_cast<float>(val) / 255.0f;
         }
@@ -85,7 +85,7 @@ namespace
         //! Conversion operator (essentially a passthru
         //! @return converted value
         //! @param  val  value to convert
-        unsigned char operator()(const unsigned char &val)
+        unsigned char operator()(const unsigned char& val)
         {
             return val;
         }
@@ -98,42 +98,40 @@ namespace
         //! Conversion operator
         //! @return converted value
         //! @param  val  value to convert
-        unsigned char operator()(const float &val)
+        unsigned char operator()(const float& val)
         {
             return static_cast<unsigned char>(val * 255.0f);
         }
     };
-}
+} // namespace
 
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-#ifndef FOPEN
-#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
-#endif
-#ifndef FOPEN_FAIL
-#define FOPEN_FAIL(result) (result != 0)
-#endif
-#ifndef SSCANF
-#define SSCANF sscanf_s
-#endif
+#    ifndef FOPEN
+#        define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#    endif
+#    ifndef FOPEN_FAIL
+#        define FOPEN_FAIL(result) (result != 0)
+#    endif
+#    ifndef SSCANF
+#        define SSCANF sscanf_s
+#    endif
 #else
-#ifndef FOPEN
-#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
-#endif
-#ifndef FOPEN_FAIL
-#define FOPEN_FAIL(result) (result == NULL)
-#endif
-#ifndef SSCANF
-#define SSCANF sscanf
-#endif
+#    ifndef FOPEN
+#        define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#    endif
+#    ifndef FOPEN_FAIL
+#        define FOPEN_FAIL(result) (result == NULL)
+#    endif
+#    ifndef SSCANF
+#        define SSCANF sscanf
+#    endif
 #endif
 
-inline bool
-__loadPPM(const char *file, unsigned char **data,
-          unsigned int *w, unsigned int *h, unsigned int *channels)
+inline bool __loadPPM(const char* file, unsigned char** data, unsigned int* w, unsigned int* h, unsigned int* channels)
 {
-    FILE *fp = NULL;
+    FILE* fp = NULL;
 
-    if (FOPEN_FAIL(FOPEN(fp, file, "rb")))
+    if(FOPEN_FAIL(FOPEN(fp, file, "rb")))
     {
         std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
         return false;
@@ -142,17 +140,17 @@ __loadPPM(const char *file, unsigned char **data,
     // check header
     char header[PGMHeaderSize];
 
-    if (fgets(header, PGMHeaderSize, fp) == NULL)
+    if(fgets(header, PGMHeaderSize, fp) == NULL)
     {
         std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
         return false;
     }
 
-    if (strncmp(header, "P5", 2) == 0)
+    if(strncmp(header, "P5", 2) == 0)
     {
         *channels = 1;
     }
-    else if (strncmp(header, "P6", 2) == 0)
+    else if(strncmp(header, "P6", 2) == 0)
     {
         *channels = 3;
     }
@@ -169,50 +167,50 @@ __loadPPM(const char *file, unsigned char **data,
     unsigned int maxval = 0;
     unsigned int i = 0;
 
-    while (i < 3)
+    while(i < 3)
     {
-        if (fgets(header, PGMHeaderSize, fp) == NULL)
+        if(fgets(header, PGMHeaderSize, fp) == NULL)
         {
             std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
             return false;
         }
 
-        if (header[0] == '#')
+        if(header[0] == '#')
         {
             continue;
         }
 
-        if (i == 0)
+        if(i == 0)
         {
             i += SSCANF(header, "%u %u %u", &width, &height, &maxval);
         }
-        else if (i == 1)
+        else if(i == 1)
         {
             i += SSCANF(header, "%u %u", &height, &maxval);
         }
-        else if (i == 2)
+        else if(i == 2)
         {
             i += SSCANF(header, "%u", &maxval);
         }
     }
 
     // check if given handle for the data is initialized
-    if (NULL != *data)
+    if(NULL != *data)
     {
-        if (*w != width || *h != height)
+        if(*w != width || *h != height)
         {
             std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
         }
     }
     else
     {
-        *data = (unsigned char *) malloc(sizeof(unsigned char) * width * height **channels);
+        *data = (unsigned char*) malloc(sizeof(unsigned char) * width * height * *channels);
         *w = width;
         *h = height;
     }
 
     // read and close file
-    if (fread(*data, sizeof(unsigned char), width * height **channels, fp) == 0)
+    if(fread(*data, sizeof(unsigned char), width * height * *channels, fp) == 0)
     {
         std::cerr << "__LoadPPM() read data returned error." << std::endl;
     }
@@ -222,25 +220,24 @@ __loadPPM(const char *file, unsigned char **data,
     return true;
 }
 
-template <class T>
-inline bool
-sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h)
+template<class T>
+inline bool sdkLoadPGM(const char* file, T** data, unsigned int* w, unsigned int* h)
 {
-    unsigned char *idata = NULL;
+    unsigned char* idata = NULL;
     unsigned int channels;
 
-    if (true != __loadPPM(file, &idata, w, h, &channels))
+    if(true != __loadPPM(file, &idata, w, h, &channels))
     {
         return false;
     }
 
-    unsigned int size = *w **h * channels;
+    unsigned int size = *w * *h * channels;
 
     // initialize mem if necessary
     // the correct size is checked / set in loadPGMc()
-    if (NULL == *data)
+    if(NULL == *data)
     {
-        *data = (T *) malloc(sizeof(T) * size);
+        *data = (T*) malloc(sizeof(T) * size);
     }
 
     // copy and cast data
@@ -251,24 +248,22 @@ sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h)
     return true;
 }
 
-template <class T>
-inline bool
-sdkLoadPPM4(const char *file, T **data,
-            unsigned int *w,unsigned int *h)
+template<class T>
+inline bool sdkLoadPPM4(const char* file, T** data, unsigned int* w, unsigned int* h)
 {
-    unsigned char *idata = 0;
+    unsigned char* idata = 0;
     unsigned int channels;
 
-    if (__loadPPM(file, &idata, w, h, &channels))
+    if(__loadPPM(file, &idata, w, h, &channels))
     {
         // pad 4th component
-        int size = *w **h;
+        int size = *w * *h;
         // keep the original pointer
-        unsigned char *idata_orig = idata;
-        *data = (T *) malloc(sizeof(T) * size * 4);
-        unsigned char *ptr = *data;
+        unsigned char* idata_orig = idata;
+        *data = (T*) malloc(sizeof(T) * size * 4);
+        unsigned char* ptr = *data;
 
-        for (int i=0; i<size; i++)
+        for(int i = 0; i < size; i++)
         {
             *ptr++ = *idata++;
             *ptr++ = *idata++;
@@ -286,9 +281,7 @@ sdkLoadPPM4(const char *file, T **data,
     }
 }
 
-inline bool
-__savePPM(const char *file, unsigned char *data,
-          unsigned int w, unsigned int h, unsigned int channels)
+inline bool __savePPM(const char* file, unsigned char* data, unsigned int w, unsigned int h, unsigned int channels)
 {
     assert(NULL != data);
     assert(w > 0);
@@ -296,17 +289,17 @@ __savePPM(const char *file, unsigned char *data,
 
     std::fstream fh(file, std::fstream::out | std::fstream::binary);
 
-    if (fh.bad())
+    if(fh.bad())
     {
         std::cerr << "__savePPM() : Opening file failed." << std::endl;
         return false;
     }
 
-    if (channels == 1)
+    if(channels == 1)
     {
         fh << "P5\n";
     }
-    else if (channels == 3)
+    else if(channels == 3)
     {
         fh << "P6\n";
     }
@@ -318,14 +311,14 @@ __savePPM(const char *file, unsigned char *data,
 
     fh << w << "\n" << h << "\n" << 0xff << std::endl;
 
-    for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i)
+    for(unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i)
     {
         fh << data[i];
     }
 
     fh.flush();
 
-    if (fh.bad())
+    if(fh.bad())
     {
         std::cerr << "__savePPM() : Writing data failed." << std::endl;
         return false;
@@ -337,12 +330,10 @@ __savePPM(const char *file, unsigned char *data,
 }
 
 template<class T>
-inline bool
-sdkSavePGM(const char *file, T *data, unsigned int w, unsigned int h)
+inline bool sdkSavePGM(const char* file, T* data, unsigned int w, unsigned int h)
 {
     unsigned int size = w * h;
-    unsigned char *idata =
-        (unsigned char *) malloc(sizeof(unsigned char) * size);
+    unsigned char* idata = (unsigned char*) malloc(sizeof(unsigned char) * size);
 
     std::transform(data, data + size, idata, ConverterToUByte<T>());
 
@@ -355,16 +346,14 @@ sdkSavePGM(const char *file, T *data, unsigned int w, unsigned int h)
     return result;
 }
 
-inline bool
-sdkSavePPM4ub(const char *file, unsigned char *data,
-              unsigned int w, unsigned int h)
+inline bool sdkSavePPM4ub(const char* file, unsigned char* data, unsigned int w, unsigned int h)
 {
     // strip 4th component
     int size = w * h;
-    unsigned char *ndata = (unsigned char *) malloc(sizeof(unsigned char) * size*3);
-    unsigned char *ptr = ndata;
+    unsigned char* ndata = (unsigned char*) malloc(sizeof(unsigned char) * size * 3);
+    unsigned char* ptr = ndata;
 
-    for (int i=0; i<size; i++)
+    for(int i = 0; i < size; i++)
     {
         *ptr++ = *data++;
         *ptr++ = *data++;
@@ -387,21 +376,20 @@ sdkSavePPM4ub(const char *file, unsigned char *data,
 //! @param len  number of data elements in data, -1 on error
 //////////////////////////////////////////////////////////////////////////////
 template<class T>
-inline bool
-sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose)
+inline bool sdkReadFile(const char* filename, T** data, unsigned int* len, bool verbose)
 {
     // check input arguments
     assert(NULL != filename);
     assert(NULL != len);
 
     // intermediate storage for the data read
-    std::vector<T>  data_read;
+    std::vector<T> data_read;
 
     // open file for reading
-    FILE *fh = NULL;
+    FILE* fh = NULL;
 
     // check if filestream is valid
-    if (FOPEN_FAIL(FOPEN(fh, filename, "r")))
+    if(FOPEN_FAIL(FOPEN(fh, filename, "r")))
     {
         printf("Unable to open input file: %s\n", filename);
         return false;
@@ -410,7 +398,7 @@ sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose)
     // read all data elements
     T token;
 
-    while (!feof(fh))
+    while(!feof(fh))
     {
         fscanf(fh, "%f", &token);
         data_read.push_back(token);
@@ -421,14 +409,14 @@ sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose)
     fclose(fh);
 
     // check if the given handle is already initialized
-    if (NULL != *data)
+    if(NULL != *data)
     {
-        if (*len != data_read.size())
+        if(*len != data_read.size())
         {
             std::cerr << "sdkReadFile() : Initialized memory given but "
                       << "size  mismatch with signal read "
-                      << "(data read / data init = " << (unsigned int)data_read.size()
-                      <<  " / " << *len << ")" << std::endl;
+                      << "(data read / data init = " << (unsigned int) data_read.size() << " / " << *len << ")"
+                      << std::endl;
 
             return false;
         }
@@ -436,7 +424,7 @@ sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose)
     else
     {
         // allocate storage for the data read
-        *data = (T *) malloc(sizeof(T) * data_read.size());
+        *data = (T*) malloc(sizeof(T) * data_read.size());
         // store signal size
         *len = static_cast<unsigned int>(data_read.size());
     }
@@ -456,17 +444,22 @@ sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose)
 //! @param len  number of data elements in data, -1 on error
 //////////////////////////////////////////////////////////////////////////////
 template<class T>
-inline bool
-sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned int block_num, unsigned int block_size, bool verbose)
+inline bool sdkReadFileBlocks(
+    const char* filename,
+    T** data,
+    unsigned int* len,
+    unsigned int block_num,
+    unsigned int block_size,
+    bool verbose)
 {
     // check input arguments
     assert(NULL != filename);
     assert(NULL != len);
 
     // open file for reading
-    FILE *fh = fopen(filename, "rb");
+    FILE* fh = fopen(filename, "rb");
 
-    if (fh == NULL && verbose)
+    if(fh == NULL && verbose)
     {
         std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
         return false;
@@ -474,11 +467,11 @@ sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned in
 
     // check if the given handle is already initialized
     // allocate storage for the data read
-    data[block_num] = (T *) malloc(block_size);
+    data[block_num] = (T*) malloc(block_size);
 
     // read all data elements
     fseek(fh, block_num * block_size, SEEK_SET);
-    *len = fread(data[block_num], sizeof(T), block_size/sizeof(T), fh);
+    *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh);
 
     fclose(fh);
 
@@ -494,9 +487,13 @@ sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned in
 //! @param epsilon  epsilon for comparison
 //////////////////////////////////////////////////////////////////////////////
 template<class T, class S>
-inline bool
-sdkWriteFile(const char *filename, const T *data, unsigned int len,
-             const S epsilon, bool verbose, bool append = false)
+inline bool sdkWriteFile(
+    const char* filename,
+    const T* data,
+    unsigned int len,
+    const S epsilon,
+    bool verbose,
+    bool append = false)
 {
     assert(NULL != filename);
     assert(NULL != data);
@@ -505,7 +502,7 @@ sdkWriteFile(const char *filename, const T *data, unsigned int len,
     //    if (append) {
     std::fstream fh(filename, std::fstream::out | std::fstream::ate);
 
-    if (verbose)
+    if(verbose)
     {
         std::cerr << "sdkWriteFile() : Open file " << filename << " for write/append." << std::endl;
     }
@@ -519,9 +516,9 @@ sdkWriteFile(const char *filename, const T *data, unsigned int len,
     */
 
     // check if filestream is valid
-    if (! fh.good())
+    if(!fh.good())
     {
-        if (verbose)
+        if(verbose)
         {
             std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
         }
@@ -533,15 +530,15 @@ sdkWriteFile(const char *filename, const T *data, unsigned int len,
     fh << "# " << epsilon << "\n";
 
     // write data
-    for (unsigned int i = 0; (i < len) && (fh.good()); ++i)
+    for(unsigned int i = 0; (i < len) && (fh.good()); ++i)
     {
         fh << data[i] << ' ';
     }
 
     // Check if writing succeeded
-    if (! fh.good())
+    if(!fh.good())
     {
-        if (verbose)
+        if(verbose)
         {
             std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
         }
@@ -564,18 +561,21 @@ sdkWriteFile(const char *filename, const T *data, unsigned int len,
 //! @param epsilon    epsilon to use for the comparison
 //////////////////////////////////////////////////////////////////////////////
 template<class T, class S>
-inline bool
-compareData(const T *reference, const T *data, const unsigned int len,
-            const S epsilon, const float threshold)
+inline bool compareData(
+    const T* reference,
+    const T* data,
+    const unsigned int len,
+    const S epsilon,
+    const float threshold)
 {
     assert(epsilon >= 0);
 
     bool result = true;
     unsigned int error_count = 0;
 
-    for (unsigned int i = 0; i < len; ++i)
+    for(unsigned int i = 0; i < len; ++i)
     {
-        float diff = (float)reference[i] - (float)data[i];
+        float diff = (float) reference[i] - (float) data[i];
         bool comp = (diff <= epsilon) && (diff >= -epsilon);
         result &= comp;
 
@@ -594,23 +594,23 @@ compareData(const T *reference, const T *data, const unsigned int len,
 #endif
     }
 
-    if (threshold == 0.0f)
+    if(threshold == 0.0f)
     {
         return (result) ? true : false;
     }
     else
     {
-        if (error_count)
+        if(error_count)
         {
-            printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count);
+            printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float) error_count * 100 / (float) len, error_count);
         }
 
-        return (len*threshold > error_count) ? true : false;
+        return (len * threshold > error_count) ? true : false;
     }
 }
 
 #ifndef __MIN_EPSILON_ERROR
-#define __MIN_EPSILON_ERROR 1e-3f
+#    define __MIN_EPSILON_ERROR 1e-3f
 #endif
 
 //////////////////////////////////////////////////////////////////////////////
@@ -623,24 +623,27 @@ compareData(const T *reference, const T *data, const unsigned int len,
 //! @param epsilon    threshold % of (# of bytes) for pass/fail
 //////////////////////////////////////////////////////////////////////////////
 template<class T, class S>
-inline bool
-compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned int len,
-                            const S epsilon, const float threshold)
+inline bool compareDataAsFloatThreshold(
+    const T* reference,
+    const T* data,
+    const unsigned int len,
+    const S epsilon,
+    const float threshold)
 {
     assert(epsilon >= 0);
 
     // If we set epsilon to be 0, let's set a minimum threshold
-    float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+    float max_error = MAX((float) epsilon, __MIN_EPSILON_ERROR);
     int error_count = 0;
     bool result = true;
 
-    for (unsigned int i = 0; i < len; ++i)
+    for(unsigned int i = 0; i < len; ++i)
     {
-        float diff = fabs((float)reference[i] - (float)data[i]);
+        float diff = fabs((float) reference[i] - (float) data[i]);
         bool comp = (diff < max_error);
         result &= comp;
 
-        if (! comp)
+        if(!comp)
         {
             error_count++;
 #if 0
@@ -658,9 +661,9 @@ compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned in
         }
     }
 
-    if (threshold == 0.0f)
+    if(threshold == 0.0f)
     {
-        if (error_count)
+        if(error_count)
         {
             printf("total # of errors = %d\n", error_count);
         }
@@ -669,28 +672,32 @@ compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned in
     }
     else
     {
-        if (error_count)
+        if(error_count)
         {
-            printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count);
+            printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float) error_count * 100 / (float) len, error_count);
         }
 
-        return ((len*threshold > error_count) ? true : false);
+        return ((len * threshold > error_count) ? true : false);
     }
 }
 
-inline
-void sdkDumpBin(void *data, unsigned int bytes, const char *filename)
+inline void sdkDumpBin(void* data, unsigned int bytes, const char* filename)
 {
     printf("sdkDumpBin: <%s>\n", filename);
-    FILE *fp;
+    FILE* fp;
     FOPEN(fp, filename, "wb");
     fwrite(data, bytes, 1, fp);
     fflush(fp);
     fclose(fp);
 }
 
-inline
-bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path)
+inline bool sdkCompareBin2BinUint(
+    const char* src_file,
+    const char* ref_file,
+    unsigned int nelements,
+    const float epsilon,
+    const float threshold,
+    char* exec_path)
 {
     unsigned int *src_buffer, *ref_buffer;
     FILE *src_fp = NULL, *ref_fp = NULL;
@@ -698,15 +705,15 @@ bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned
     unsigned long error_count = 0;
     size_t fsize = 0;
 
-    if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
+    if(FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
     {
         printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n", src_file);
         error_count++;
     }
 
-    char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+    char* ref_file_path = sdkFindFilePath(ref_file, exec_path);
 
-    if (ref_file_path == NULL)
+    if(ref_file_path == NULL)
     {
         printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n", ref_file, exec_path);
         printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file);
@@ -714,37 +721,41 @@ bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned
         printf("  FAILED\n");
         error_count++;
 
-        if (src_fp)
+        if(src_fp)
         {
             fclose(src_fp);
         }
 
-        if (ref_fp)
+        if(ref_fp)
         {
             fclose(ref_fp);
         }
     }
     else
     {
-        if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
+        if(FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
         {
             printf("compareBin2Bin <unsigned int>  unable to open ref_file: %s\n", ref_file_path);
             error_count++;
         }
 
-        if (src_fp && ref_fp)
+        if(src_fp && ref_fp)
         {
-            src_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int));
-            ref_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int));
+            src_buffer = (unsigned int*) malloc(nelements * sizeof(unsigned int));
+            ref_buffer = (unsigned int*) malloc(nelements * sizeof(unsigned int));
 
             fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp);
             fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp);
 
-            printf("> compareBin2Bin <unsigned int> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold);
-            printf("   src_file <%s>, size=%d bytes\n", src_file, (int)fsize);
-            printf("   ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize);
+            printf(
+                "> compareBin2Bin <unsigned int> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n",
+                nelements,
+                epsilon,
+                threshold);
+            printf("   src_file <%s>, size=%d bytes\n", src_file, (int) fsize);
+            printf("   ref_file <%s>, size=%d bytes\n", ref_file_path, (int) fsize);
 
-            if (!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements, epsilon, threshold))
+            if(!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements, epsilon, threshold))
             {
                 error_count++;
             }
@@ -757,32 +768,37 @@ bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned
         }
         else
         {
-            if (src_fp)
+            if(src_fp)
             {
                 fclose(src_fp);
             }
 
-            if (ref_fp)
+            if(ref_fp)
             {
                 fclose(ref_fp);
             }
         }
     }
 
-    if (error_count == 0)
+    if(error_count == 0)
     {
         printf("  OK\n");
     }
     else
     {
-        printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+        printf("  FAILURE: %d errors...\n", (unsigned int) error_count);
     }
 
-    return (error_count == 0);  // returns true if all pixels pass
+    return (error_count == 0); // returns true if all pixels pass
 }
 
-inline
-bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path)
+inline bool sdkCompareBin2BinFloat(
+    const char* src_file,
+    const char* ref_file,
+    unsigned int nelements,
+    const float epsilon,
+    const float threshold,
+    char* exec_path)
 {
     float *src_buffer, *ref_buffer;
     FILE *src_fp = NULL, *ref_fp = NULL;
@@ -790,15 +806,15 @@ bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned
 
     unsigned long error_count = 0;
 
-    if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
+    if(FOPEN_FAIL(FOPEN(src_fp, src_file, "rb")))
     {
         printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
         error_count = 1;
     }
 
-    char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+    char* ref_file_path = sdkFindFilePath(ref_file, exec_path);
 
-    if (ref_file_path == NULL)
+    if(ref_file_path == NULL)
     {
         printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file, exec_path);
         printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", exec_path);
@@ -806,37 +822,41 @@ bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned
         printf("  FAILED\n");
         error_count++;
 
-        if (src_fp)
+        if(src_fp)
         {
             fclose(src_fp);
         }
 
-        if (ref_fp)
+        if(ref_fp)
         {
             fclose(ref_fp);
         }
     }
     else
     {
-        if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
+        if(FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb")))
         {
             printf("compareBin2Bin <float> unable to open ref_file: %s\n", ref_file_path);
             error_count = 1;
         }
 
-        if (src_fp && ref_fp)
+        if(src_fp && ref_fp)
         {
-            src_buffer = (float *)malloc(nelements*sizeof(float));
-            ref_buffer = (float *)malloc(nelements*sizeof(float));
+            src_buffer = (float*) malloc(nelements * sizeof(float));
+            ref_buffer = (float*) malloc(nelements * sizeof(float));
 
             fsize = fread(src_buffer, nelements, sizeof(float), src_fp);
             fsize = fread(ref_buffer, nelements, sizeof(float), ref_fp);
 
-            printf("> compareBin2Bin <float> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold);
-            printf("   src_file <%s>, size=%d bytes\n", src_file, (int)fsize);
-            printf("   ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize);
+            printf(
+                "> compareBin2Bin <float> nelements=%d, epsilon=%4.2f, threshold=%4.2f\n",
+                nelements,
+                epsilon,
+                threshold);
+            printf("   src_file <%s>, size=%d bytes\n", src_file, (int) fsize);
+            printf("   ref_file <%s>, size=%d bytes\n", ref_file_path, (int) fsize);
 
-            if (!compareDataAsFloatThreshold<float, float>(ref_buffer, src_buffer, nelements, epsilon, threshold))
+            if(!compareDataAsFloatThreshold<float, float>(ref_buffer, src_buffer, nelements, epsilon, threshold))
             {
                 error_count++;
             }
@@ -849,42 +869,39 @@ bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned
         }
         else
         {
-            if (src_fp)
+            if(src_fp)
             {
                 fclose(src_fp);
             }
 
-            if (ref_fp)
+            if(ref_fp)
             {
                 fclose(ref_fp);
             }
         }
     }
 
-    if (error_count == 0)
+    if(error_count == 0)
     {
         printf("  OK\n");
     }
     else
     {
-        printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+        printf("  FAILURE: %d errors...\n", (unsigned int) error_count);
     }
 
-    return (error_count == 0);  // returns true if all pixels pass
+    return (error_count == 0); // returns true if all pixels pass
 }
 
-inline bool
-sdkCompareL2fe(const float *reference, const float *data,
-               const unsigned int len, const float epsilon)
+inline bool sdkCompareL2fe(const float* reference, const float* data, const unsigned int len, const float epsilon)
 {
     assert(epsilon >= 0);
 
     float error = 0;
     float ref = 0;
 
-    for (unsigned int i = 0; i < len; ++i)
+    for(unsigned int i = 0; i < len; ++i)
     {
-
         float diff = reference[i] - data[i];
         error += diff * diff;
         ref += reference[i] * reference[i];
@@ -892,7 +909,7 @@ sdkCompareL2fe(const float *reference, const float *data,
 
     float normRef = sqrtf(ref);
 
-    if (fabs(ref) < 1e-7)
+    if(fabs(ref) < 1e-7)
     {
 #ifdef _DEBUG
         std::cerr << "ERROR, reference l2-norm is 0\n";
@@ -905,10 +922,9 @@ sdkCompareL2fe(const float *reference, const float *data,
     bool result = error < epsilon;
 #ifdef _DEBUG
 
-    if (! result)
+    if(!result)
     {
-        std::cerr << "ERROR, l2-norm error "
-                  << error << " is greater than epsilon " << epsilon << "\n";
+        std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon " << epsilon << "\n";
     }
 
 #endif
@@ -916,31 +932,27 @@ sdkCompareL2fe(const float *reference, const float *data,
     return result;
 }
 
-inline bool
-sdkLoadPPMub(const char *file, unsigned char **data,
-             unsigned int *w,unsigned int *h)
+inline bool sdkLoadPPMub(const char* file, unsigned char** data, unsigned int* w, unsigned int* h)
 {
     unsigned int channels;
     return __loadPPM(file, data, w, h, &channels);
 }
 
-inline bool
-sdkLoadPPM4ub(const char *file, unsigned char **data,
-              unsigned int *w, unsigned int *h)
+inline bool sdkLoadPPM4ub(const char* file, unsigned char** data, unsigned int* w, unsigned int* h)
 {
-    unsigned char *idata = 0;
+    unsigned char* idata = 0;
     unsigned int channels;
 
-    if (__loadPPM(file, &idata, w, h, &channels))
+    if(__loadPPM(file, &idata, w, h, &channels))
     {
         // pad 4th component
-        int size = *w **h;
+        int size = *w * *h;
         // keep the original pointer
-        unsigned char *idata_orig = idata;
-        *data = (unsigned char *) malloc(sizeof(unsigned char) * size * 4);
-        unsigned char *ptr = *data;
+        unsigned char* idata_orig = idata;
+        *data = (unsigned char*) malloc(sizeof(unsigned char) * size * 4);
+        unsigned char* ptr = *data;
 
-        for (int i=0; i<size; i++)
+        for(int i = 0; i < size; i++)
         {
             *ptr++ = *idata++;
             *ptr++ = *idata++;
@@ -959,18 +971,21 @@ sdkLoadPPM4ub(const char *file, unsigned char **data,
 }
 
 
-inline bool
-sdkComparePPM(const char *src_file, const char *ref_file,
-              const float epsilon, const float threshold, bool verboseErrors)
+inline bool sdkComparePPM(
+    const char* src_file,
+    const char* ref_file,
+    const float epsilon,
+    const float threshold,
+    bool verboseErrors)
 {
     unsigned char *src_data, *ref_data;
     unsigned long error_count = 0;
     unsigned int ref_width, ref_height;
     unsigned int src_width, src_height;
 
-    if (src_file == NULL || ref_file == NULL)
+    if(src_file == NULL || ref_file == NULL)
     {
-        if (verboseErrors)
+        if(verboseErrors)
         {
             std::cerr << "PPMvsPPM: src_file or ref_file is NULL.  Aborting comparison\n";
         }
@@ -978,73 +993,78 @@ sdkComparePPM(const char *src_file, const char *ref_file,
         return false;
     }
 
-    if (verboseErrors)
+    if(verboseErrors)
     {
         std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
         std::cerr << ">         (b)reference: <" << ref_file << ">\n";
     }
 
 
-    if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true)
+    if(sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true)
     {
-        if (verboseErrors)
+        if(verboseErrors)
         {
-            std::cerr << "PPMvsPPM: unable to load ref image file: "<< ref_file << "\n";
+            std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file << "\n";
         }
 
         return false;
     }
 
-    if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true)
+    if(sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true)
     {
         std::cerr << "PPMvsPPM: unable to load src image file: " << src_file << "\n";
         return false;
     }
 
-    if (src_height != ref_height || src_width != ref_width)
+    if(src_height != ref_height || src_width != ref_width)
     {
-        if (verboseErrors) std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width <<
-                                         "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n";
+        if(verboseErrors)
+            std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width << "," << src_height << ")vs("
+                      << ref_width << "," << ref_height << ")\n";
     }
 
-    if (verboseErrors) std::cerr << "PPMvsPPM: comparing images size (" << src_width <<
-                                     "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n";
+    if(verboseErrors)
+        std::cerr << "PPMvsPPM: comparing images size (" << src_width << "," << src_height << ") epsilon(" << epsilon
+                  << "), threshold(" << threshold * 100 << "%)\n";
 
-    if (compareData(ref_data, src_data, src_width*src_height*4, epsilon, threshold) == false)
+    if(compareData(ref_data, src_data, src_width * src_height * 4, epsilon, threshold) == false)
     {
-        error_count=1;
+        error_count = 1;
     }
 
-    if (error_count == 0)
+    if(error_count == 0)
     {
-        if (verboseErrors)
+        if(verboseErrors)
         {
             std::cerr << "    OK\n\n";
         }
     }
     else
     {
-        if (verboseErrors)
+        if(verboseErrors)
         {
-            std::cerr << "    FAILURE!  "<<error_count<<" errors...\n\n";
+            std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
         }
     }
 
-    return (error_count == 0)? true : false;  // returns true if all pixels pass
+    return (error_count == 0) ? true : false; // returns true if all pixels pass
 }
 
-inline bool
-sdkComparePGM(const char *src_file, const char *ref_file,
-              const float epsilon, const float threshold, bool verboseErrors)
+inline bool sdkComparePGM(
+    const char* src_file,
+    const char* ref_file,
+    const float epsilon,
+    const float threshold,
+    bool verboseErrors)
 {
     unsigned char *src_data = 0, *ref_data = 0;
     unsigned long error_count = 0;
     unsigned int ref_width, ref_height;
     unsigned int src_width, src_height;
 
-    if (src_file == NULL || ref_file == NULL)
+    if(src_file == NULL || ref_file == NULL)
     {
-        if (verboseErrors)
+        if(verboseErrors)
         {
             std::cerr << "PGMvsPGM: src_file or ref_file is NULL.  Aborting comparison\n";
         }
@@ -1052,59 +1072,61 @@ sdkComparePGM(const char *src_file, const char *ref_file,
         return false;
     }
 
-    if (verboseErrors)
+    if(verboseErrors)
     {
         std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
         std::cerr << ">         (b)reference: <" << ref_file << ">\n";
     }
 
 
-    if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true)
+    if(sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true)
     {
-        if (verboseErrors)
+        if(verboseErrors)
         {
-            std::cerr << "PGMvsPGM: unable to load ref image file: "<< ref_file << "\n";
+            std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file << "\n";
         }
 
         return false;
     }
 
-    if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true)
+    if(sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true)
     {
         std::cerr << "PGMvsPGM: unable to load src image file: " << src_file << "\n";
         return false;
     }
 
-    if (src_height != ref_height || src_width != ref_width)
+    if(src_height != ref_height || src_width != ref_width)
     {
-        if (verboseErrors) std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width <<
-                                         "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n";
+        if(verboseErrors)
+            std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width << "," << src_height << ")vs("
+                      << ref_width << "," << ref_height << ")\n";
     }
 
-    if (verboseErrors) std::cerr << "PGMvsPGM: comparing images size (" << src_width <<
-                                     "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n";
+    if(verboseErrors)
+        std::cerr << "PGMvsPGM: comparing images size (" << src_width << "," << src_height << ") epsilon(" << epsilon
+                  << "), threshold(" << threshold * 100 << "%)\n";
 
-    if (compareData(ref_data, src_data, src_width*src_height, epsilon, threshold) == false)
+    if(compareData(ref_data, src_data, src_width * src_height, epsilon, threshold) == false)
     {
-        error_count=1;
+        error_count = 1;
     }
 
-    if (error_count == 0)
+    if(error_count == 0)
     {
-        if (verboseErrors)
+        if(verboseErrors)
         {
             std::cerr << "    OK\n\n";
         }
     }
     else
     {
-        if (verboseErrors)
+        if(verboseErrors)
         {
-            std::cerr << "    FAILURE!  "<<error_count<<" errors...\n\n";
+            std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
         }
     }
 
-    return (error_count == 0)? true : false;  // returns true if all pixels pass
+    return (error_count == 0) ? true : false; // returns true if all pixels pass
 }
 
 #endif // HELPER_IMAGE_H
diff --git a/example/CUDASamples/common/helper_string.h b/example/CUDASamples/common/helper_string.h
index cdf35dfb..ccd3c051 100644
--- a/example/CUDASamples/common/helper_string.h
+++ b/example/CUDASamples/common/helper_string.h
@@ -13,80 +13,81 @@
 #ifndef STRING_HELPER_H
 #define STRING_HELPER_H
 
-#include <stdio.h>
-#include <stdlib.h>
 #include <fstream>
 #include <string>
 
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-#ifndef _CRT_SECURE_NO_DEPRECATE
-#define _CRT_SECURE_NO_DEPRECATE
-#endif
-#ifndef STRCASECMP
-#define STRCASECMP  _stricmp
-#endif
-#ifndef STRNCASECMP
-#define STRNCASECMP _strnicmp
-#endif
-#ifndef STRCPY
-#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
-#endif
+#include <stdio.h>
+#include <stdlib.h>
 
-#ifndef FOPEN
-#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
-#endif
-#ifndef FOPEN_FAIL
-#define FOPEN_FAIL(result) (result != 0)
-#endif
-#ifndef SSCANF
-#define SSCANF sscanf_s
-#endif
-#ifndef SPRINTF
-#define SPRINTF sprintf_s
-#endif
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#    ifndef _CRT_SECURE_NO_DEPRECATE
+#        define _CRT_SECURE_NO_DEPRECATE
+#    endif
+#    ifndef STRCASECMP
+#        define STRCASECMP _stricmp
+#    endif
+#    ifndef STRNCASECMP
+#        define STRNCASECMP _strnicmp
+#    endif
+#    ifndef STRCPY
+#        define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#    endif
+
+#    ifndef FOPEN
+#        define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#    endif
+#    ifndef FOPEN_FAIL
+#        define FOPEN_FAIL(result) (result != 0)
+#    endif
+#    ifndef SSCANF
+#        define SSCANF sscanf_s
+#    endif
+#    ifndef SPRINTF
+#        define SPRINTF sprintf_s
+#    endif
 #else // Linux Includes
-#include <string.h>
-#include <strings.h>
-
-#ifndef STRCASECMP
-#define STRCASECMP  strcasecmp
-#endif
-#ifndef STRNCASECMP
-#define STRNCASECMP strncasecmp
-#endif
-#ifndef STRCPY
-#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
-#endif
-
-#ifndef FOPEN
-#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
-#endif
-#ifndef FOPEN_FAIL
-#define FOPEN_FAIL(result) (result == NULL)
-#endif
-#ifndef SSCANF
-#define SSCANF sscanf
-#endif
-#ifndef SPRINTF
-#define SPRINTF sprintf
-#endif
+#    include <string.h>
+#    include <strings.h>
+
+#    ifndef STRCASECMP
+#        define STRCASECMP strcasecmp
+#    endif
+#    ifndef STRNCASECMP
+#        define STRNCASECMP strncasecmp
+#    endif
+#    ifndef STRCPY
+#        define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#    endif
+
+#    ifndef FOPEN
+#        define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#    endif
+#    ifndef FOPEN_FAIL
+#        define FOPEN_FAIL(result) (result == NULL)
+#    endif
+#    ifndef SSCANF
+#        define SSCANF sscanf
+#    endif
+#    ifndef SPRINTF
+#        define SPRINTF sprintf
+#    endif
 #endif
 
 #ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
+#    define EXIT_WAIVED 2
 #endif
 
 // CUDA Utility Helper Functions
-inline int stringRemoveDelimiter(char delimiter, const char *string)
+inline int stringRemoveDelimiter(char delimiter, const char* string)
 {
     int string_start = 0;
 
-    while (string[string_start] == delimiter)
+    while(string[string_start] == delimiter)
     {
         string_start++;
     }
 
-    if (string_start >= (int)strlen(string)-1)
+    if(string_start >= (int) strlen(string) - 1)
     {
         return 0;
     }
@@ -94,19 +95,20 @@ inline int stringRemoveDelimiter(char delimiter, const char *string)
     return string_start;
 }
 
-inline int getFileExtension(char *filename, char **extension)
+inline int getFileExtension(char* filename, char** extension)
 {
-    int string_length = (int)strlen(filename);
+    int string_length = (int) strlen(filename);
 
-    while (filename[string_length--] != '.')
+    while(filename[string_length--] != '.')
     {
-        if (string_length == 0)
+        if(string_length == 0)
             break;
     }
 
-    if (string_length > 0) string_length += 2;
+    if(string_length > 0)
+        string_length += 2;
 
-    if (string_length == 0)
+    if(string_length == 0)
         *extension = NULL;
     else
         *extension = &filename[string_length];
@@ -115,23 +117,23 @@ inline int getFileExtension(char *filename, char **extension)
 }
 
 
-inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
+inline bool checkCmdLineFlag(const int argc, const char** argv, const char* string_ref)
 {
     bool bFound = false;
 
-    if (argc >= 1)
+    if(argc >= 1)
     {
-        for (int i=1; i < argc; i++)
+        for(int i = 1; i < argc; i++)
         {
             int string_start = stringRemoveDelimiter('-', argv[i]);
-            const char *string_argv = &argv[i][string_start];
+            const char* string_argv = &argv[i][string_start];
 
-            const char *equal_pos = strchr(string_argv, '=');
-            int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+            const char* equal_pos = strchr(string_argv, '=');
+            int argv_length = (int) (equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
 
-            int length = (int)strlen(string_ref);
+            int length = (int) strlen(string_ref);
 
-            if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
+            if(length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
             {
                 bFound = true;
                 continue;
@@ -143,29 +145,29 @@ inline bool checkCmdLineFlag(const int argc, const char **argv, const char *stri
 }
 
 // This function wraps the CUDA Driver API into a template function
-template <class T>
-inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
+template<class T>
+inline bool getCmdLineArgumentValue(const int argc, const char** argv, const char* string_ref, T* value)
 {
     bool bFound = false;
 
-    if (argc >= 1)
+    if(argc >= 1)
     {
-        for (int i=1; i < argc; i++)
+        for(int i = 1; i < argc; i++)
         {
             int string_start = stringRemoveDelimiter('-', argv[i]);
-            const char *string_argv = &argv[i][string_start];
-            int length = (int)strlen(string_ref);
+            const char* string_argv = &argv[i][string_start];
+            int length = (int) strlen(string_ref);
 
-            if (!STRNCASECMP(string_argv, string_ref, length))
+            if(!STRNCASECMP(string_argv, string_ref, length))
             {
-                if (length+1 <= (int)strlen(string_argv))
+                if(length + 1 <= (int) strlen(string_argv))
                 {
                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
-                    *value = (T)atoi(&string_argv[length + auto_inc]);
+                    *value = (T) atoi(&string_argv[length + auto_inc]);
                 }
 
                 bFound = true;
-                i=argc;
+                i = argc;
             }
         }
     }
@@ -173,22 +175,22 @@ inline bool getCmdLineArgumentValue(const int argc, const char **argv, const cha
     return bFound;
 }
 
-inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
+inline int getCmdLineArgumentInt(const int argc, const char** argv, const char* string_ref)
 {
     bool bFound = false;
     int value = -1;
 
-    if (argc >= 1)
+    if(argc >= 1)
     {
-        for (int i=1; i < argc; i++)
+        for(int i = 1; i < argc; i++)
         {
             int string_start = stringRemoveDelimiter('-', argv[i]);
-            const char *string_argv = &argv[i][string_start];
-            int length = (int)strlen(string_ref);
+            const char* string_argv = &argv[i][string_start];
+            int length = (int) strlen(string_ref);
 
-            if (!STRNCASECMP(string_argv, string_ref, length))
+            if(!STRNCASECMP(string_argv, string_ref, length))
             {
-                if (length+1 <= (int)strlen(string_argv))
+                if(length + 1 <= (int) strlen(string_argv))
                 {
                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
                     value = atoi(&string_argv[length + auto_inc]);
@@ -204,7 +206,7 @@ inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *
         }
     }
 
-    if (bFound)
+    if(bFound)
     {
         return value;
     }
@@ -214,25 +216,25 @@ inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *
     }
 }
 
-inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
+inline float getCmdLineArgumentFloat(const int argc, const char** argv, const char* string_ref)
 {
     bool bFound = false;
     float value = -1;
 
-    if (argc >= 1)
+    if(argc >= 1)
     {
-        for (int i=1; i < argc; i++)
+        for(int i = 1; i < argc; i++)
         {
             int string_start = stringRemoveDelimiter('-', argv[i]);
-            const char *string_argv = &argv[i][string_start];
-            int length = (int)strlen(string_ref);
+            const char* string_argv = &argv[i][string_start];
+            int length = (int) strlen(string_ref);
 
-            if (!STRNCASECMP(string_argv, string_ref, length))
+            if(!STRNCASECMP(string_argv, string_ref, length))
             {
-                if (length+1 <= (int)strlen(string_argv))
+                if(length + 1 <= (int) strlen(string_argv))
                 {
                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
-                    value = (float)atof(&string_argv[length + auto_inc]);
+                    value = (float) atof(&string_argv[length + auto_inc]);
                 }
                 else
                 {
@@ -245,7 +247,7 @@ inline float getCmdLineArgumentFloat(const int argc, const char **argv, const ch
         }
     }
 
-    if (bFound)
+    if(bFound)
     {
         return value;
     }
@@ -255,29 +257,28 @@ inline float getCmdLineArgumentFloat(const int argc, const char **argv, const ch
     }
 }
 
-inline bool getCmdLineArgumentString(const int argc, const char **argv,
-                                     const char *string_ref, char **string_retval)
+inline bool getCmdLineArgumentString(const int argc, const char** argv, const char* string_ref, char** string_retval)
 {
     bool bFound = false;
 
-    if (argc >= 1)
+    if(argc >= 1)
     {
-        for (int i=1; i < argc; i++)
+        for(int i = 1; i < argc; i++)
         {
             int string_start = stringRemoveDelimiter('-', argv[i]);
-            char *string_argv = (char *)&argv[i][string_start];
-            int length = (int)strlen(string_ref);
+            char* string_argv = (char*) &argv[i][string_start];
+            int length = (int) strlen(string_ref);
 
-            if (!STRNCASECMP(string_argv, string_ref, length))
+            if(!STRNCASECMP(string_argv, string_ref, length))
             {
-                *string_retval = &string_argv[length+1];
+                *string_retval = &string_argv[length + 1];
                 bFound = true;
                 continue;
             }
         }
     }
 
-    if (!bFound)
+    if(!bFound)
     {
         *string_retval = NULL;
     }
@@ -293,137 +294,142 @@ inline bool getCmdLineArgumentString(const int argc, const char **argv,
 //! @param filename         name of the file
 //! @param executable_path  optional absolute path of the executable
 //////////////////////////////////////////////////////////////////////////////
-inline char *sdkFindFilePath(const char *filename, const char *executable_path)
+inline char* sdkFindFilePath(const char* filename, const char* executable_path)
 {
     // <executable_name> defines a variable that is replaced with the name of the executable
 
     // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
-    // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
-    const char *searchPath[] =
-    {
-        "./",                                       // same dir
-        "./common/",                                // "/common/" subdir
-        "./common/data/",                           // "/common/data/" subdir
-        "./data/",                                  // "/data/" subdir
-        "./src/",                                   // "/src/" subdir
-        "./src/<executable_name>/data/",            // "/src/<executable_name>/data/" subdir
-        "./inc/",                                   // "/inc/" subdir
-        "./0_Simple/",                              // "/0_Simple/" subdir
-        "./1_Utilities/",                           // "/1_Utilities/" subdir
-        "./2_Graphics/",                            // "/2_Graphics/" subdir
-        "./3_Imaging/",                             // "/3_Imaging/" subdir
-        "./4_Financial/",                           // "/4_Financial/" subdir
-        "./5_Simulations/",                         // "/5_Simulations/" subdir
-        "./6_Advanced/",                            // "/6_Advanced/" subdir
-        "./7_CUDALibraries/",                       // "/7_CUDALibraries/" subdir
-        "./8_Android/",                             // "/8_Android/" subdir
-        "./samples/",                               // "/samples/" subdir
-
-        "../",                                      // up 1 in tree
-        "../common/",                               // up 1 in tree, "/common/" subdir
-        "../common/data/",                          // up 1 in tree, "/common/data/" subdir
-        "../data/",                                 // up 1 in tree, "/data/" subdir
-        "../src/",                                  // up 1 in tree, "/src/" subdir
-        "../inc/",                                  // up 1 in tree, "/inc/" subdir
-
-        "../0_Simple/<executable_name>/data/",       // up 1 in tree, "/0_Simple/<executable_name>/" subdir
-        "../1_Utilities/<executable_name>/data/",    // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
-        "../2_Graphics/<executable_name>/data/",     // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
-        "../3_Imaging/<executable_name>/data/",      // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
-        "../4_Financial/<executable_name>/data/",    // up 1 in tree, "/4_Financial/<executable_name>/" subdir
-        "../5_Simulations/<executable_name>/data/",  // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
-        "../6_Advanced/<executable_name>/data/",     // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
-        "../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
-        "../8_Android/<executable_name>/data/",      // up 1 in tree, "/8_Android/<executable_name>/" subdir
-        "../samples/<executable_name>/data/",        // up 1 in tree, "/samples/<executable_name>/" subdir
-        "../../",                                        // up 2 in tree
-        "../../common/",                                 // up 2 in tree, "/common/" subdir
-        "../../common/data/",                            // up 2 in tree, "/common/data/" subdir
-        "../../data/",                                   // up 2 in tree, "/data/" subdir
-        "../../src/",                                    // up 2 in tree, "/src/" subdir
-        "../../inc/",                                    // up 2 in tree, "/inc/" subdir
-        "../../sandbox/<executable_name>/data/",         // up 2 in tree, "/sandbox/<executable_name>/" subdir
-        "../../0_Simple/<executable_name>/data/",        // up 2 in tree, "/0_Simple/<executable_name>/" subdir
-        "../../1_Utilities/<executable_name>/data/",     // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
-        "../../2_Graphics/<executable_name>/data/",      // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
-        "../../3_Imaging/<executable_name>/data/",       // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
-        "../../4_Financial/<executable_name>/data/",     // up 2 in tree, "/4_Financial/<executable_name>/" subdir
-        "../../5_Simulations/<executable_name>/data/",   // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
-        "../../6_Advanced/<executable_name>/data/",      // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
+    // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching
+    // the .exe or .bat, etc
+    const char* searchPath[] = {
+        "./", // same dir
+        "./common/", // "/common/" subdir
+        "./common/data/", // "/common/data/" subdir
+        "./data/", // "/data/" subdir
+        "./src/", // "/src/" subdir
+        "./src/<executable_name>/data/", // "/src/<executable_name>/data/" subdir
+        "./inc/", // "/inc/" subdir
+        "./0_Simple/", // "/0_Simple/" subdir
+        "./1_Utilities/", // "/1_Utilities/" subdir
+        "./2_Graphics/", // "/2_Graphics/" subdir
+        "./3_Imaging/", // "/3_Imaging/" subdir
+        "./4_Financial/", // "/4_Financial/" subdir
+        "./5_Simulations/", // "/5_Simulations/" subdir
+        "./6_Advanced/", // "/6_Advanced/" subdir
+        "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir
+        "./8_Android/", // "/8_Android/" subdir
+        "./samples/", // "/samples/" subdir
+
+        "../", // up 1 in tree
+        "../common/", // up 1 in tree, "/common/" subdir
+        "../common/data/", // up 1 in tree, "/common/data/" subdir
+        "../data/", // up 1 in tree, "/data/" subdir
+        "../src/", // up 1 in tree, "/src/" subdir
+        "../inc/", // up 1 in tree, "/inc/" subdir
+
+        "../0_Simple/<executable_name>/data/", // up 1 in tree, "/0_Simple/<executable_name>/" subdir
+        "../1_Utilities/<executable_name>/data/", // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../2_Graphics/<executable_name>/data/", // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../3_Imaging/<executable_name>/data/", // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../4_Financial/<executable_name>/data/", // up 1 in tree, "/4_Financial/<executable_name>/" subdir
+        "../5_Simulations/<executable_name>/data/", // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../6_Advanced/<executable_name>/data/", // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../7_CUDALibraries/<executable_name>/data/", // up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../8_Android/<executable_name>/data/", // up 1 in tree, "/8_Android/<executable_name>/" subdir
+        "../samples/<executable_name>/data/", // up 1 in tree, "/samples/<executable_name>/" subdir
+        "../../", // up 2 in tree
+        "../../common/", // up 2 in tree, "/common/" subdir
+        "../../common/data/", // up 2 in tree, "/common/data/" subdir
+        "../../data/", // up 2 in tree, "/data/" subdir
+        "../../src/", // up 2 in tree, "/src/" subdir
+        "../../inc/", // up 2 in tree, "/inc/" subdir
+        "../../sandbox/<executable_name>/data/", // up 2 in tree, "/sandbox/<executable_name>/" subdir
+        "../../0_Simple/<executable_name>/data/", // up 2 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../1_Utilities/<executable_name>/data/", // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../2_Graphics/<executable_name>/data/", // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../3_Imaging/<executable_name>/data/", // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../4_Financial/<executable_name>/data/", // up 2 in tree, "/4_Financial/<executable_name>/" subdir
+        "../../5_Simulations/<executable_name>/data/", // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../6_Advanced/<executable_name>/data/", // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
         "../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
-        "../../8_Android/<executable_name>/data/",       // up 2 in tree, "/8_Android/<executable_name>/" subdir
-        "../../samples/<executable_name>/data/",         // up 2 in tree, "/samples/<executable_name>/" subdir
-        "../../../",                                        // up 3 in tree
-        "../../../src/<executable_name>/",                  // up 3 in tree, "/src/<executable_name>/" subdir
-        "../../../src/<executable_name>/data/",             // up 3 in tree, "/src/<executable_name>/data/" subdir
-        "../../../src/<executable_name>/src/",              // up 3 in tree, "/src/<executable_name>/src/" subdir
-        "../../../src/<executable_name>/inc/",              // up 3 in tree, "/src/<executable_name>/inc/" subdir
-        "../../../sandbox/<executable_name>/",              // up 3 in tree, "/sandbox/<executable_name>/" subdir
-        "../../../sandbox/<executable_name>/data/",         // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
-        "../../../sandbox/<executable_name>/src/",          // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
-        "../../../sandbox/<executable_name>/inc/",          // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
-        "../../../0_Simple/<executable_name>/data/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
-        "../../../1_Utilities/<executable_name>/data/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
-        "../../../2_Graphics/<executable_name>/data/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
-        "../../../3_Imaging/<executable_name>/data/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
-        "../../../4_Financial/<executable_name>/data/",     // up 3 in tree, "/4_Financial/<executable_name>/" subdir
-        "../../../5_Simulations/<executable_name>/data/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
-        "../../../6_Advanced/<executable_name>/data/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
-        "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
-        "../../../8_Android/<executable_name>/data/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
-        "../../../samples/<executable_name>/data/",         // up 3 in tree, "/samples/<executable_name>/" subdir
-        "../../../common/",                                 // up 3 in tree, "../../../common/" subdir
-        "../../../common/data/",                            // up 3 in tree, "../../../common/data/" subdir
-        "../../../data/",                                   // up 3 in tree, "../../../data/" subdir
-        "../../../../",                                // up 4 in tree
-        "../../../../src/<executable_name>/",          // up 4 in tree, "/src/<executable_name>/" subdir
-        "../../../../src/<executable_name>/data/",     // up 4 in tree, "/src/<executable_name>/data/" subdir
-        "../../../../src/<executable_name>/src/",      // up 4 in tree, "/src/<executable_name>/src/" subdir
-        "../../../../src/<executable_name>/inc/",      // up 4 in tree, "/src/<executable_name>/inc/" subdir
-        "../../../../sandbox/<executable_name>/",      // up 4 in tree, "/sandbox/<executable_name>/" subdir
+        "../../8_Android/<executable_name>/data/", // up 2 in tree, "/8_Android/<executable_name>/" subdir
+        "../../samples/<executable_name>/data/", // up 2 in tree, "/samples/<executable_name>/" subdir
+        "../../../", // up 3 in tree
+        "../../../src/<executable_name>/", // up 3 in tree, "/src/<executable_name>/" subdir
+        "../../../src/<executable_name>/data/", // up 3 in tree, "/src/<executable_name>/data/" subdir
+        "../../../src/<executable_name>/src/", // up 3 in tree, "/src/<executable_name>/src/" subdir
+        "../../../src/<executable_name>/inc/", // up 3 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../sandbox/<executable_name>/", // up 3 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../sandbox/<executable_name>/data/", // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../sandbox/<executable_name>/src/", // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../sandbox/<executable_name>/inc/", // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../0_Simple/<executable_name>/data/", // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../1_Utilities/<executable_name>/data/", // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../2_Graphics/<executable_name>/data/", // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../3_Imaging/<executable_name>/data/", // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../4_Financial/<executable_name>/data/", // up 3 in tree, "/4_Financial/<executable_name>/" subdir
+        "../../../5_Simulations/<executable_name>/data/", // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../6_Advanced/<executable_name>/data/", // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/"
+                                                            // subdir
+        "../../../8_Android/<executable_name>/data/", // up 3 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../samples/<executable_name>/data/", // up 3 in tree, "/samples/<executable_name>/" subdir
+        "../../../common/", // up 3 in tree, "../../../common/" subdir
+        "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir
+        "../../../data/", // up 3 in tree, "../../../data/" subdir
+        "../../../../", // up 4 in tree
+        "../../../../src/<executable_name>/", // up 4 in tree, "/src/<executable_name>/" subdir
+        "../../../../src/<executable_name>/data/", // up 4 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../src/<executable_name>/src/", // up 4 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../src/<executable_name>/inc/", // up 4 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../sandbox/<executable_name>/", // up 4 in tree, "/sandbox/<executable_name>/" subdir
         "../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
-        "../../../../sandbox/<executable_name>/src/",  // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
-        "../../../../sandbox/<executable_name>/inc/",   // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
-        "../../../../0_Simple/<executable_name>/data/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
-        "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
-        "../../../../2_Graphics/<executable_name>/data/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
-        "../../../../3_Imaging/<executable_name>/data/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
-        "../../../../4_Financial/<executable_name>/data/",  // up 4 in tree, "/4_Financial/<executable_name>/" subdir
-        "../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
-        "../../../../6_Advanced/<executable_name>/data/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
-        "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
-        "../../../../8_Android/<executable_name>/data/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
-        "../../../../samples/<executable_name>/data/",      // up 4 in tree, "/samples/<executable_name>/" subdir
-        "../../../../common/",                              // up 4 in tree, "../../../common/" subdir
-        "../../../../common/data/",                         // up 4 in tree, "../../../common/data/" subdir
-        "../../../../data/",                                // up 4 in tree, "../../../data/" subdir
-        "../../../../../",                                // up 5 in tree
-        "../../../../../src/<executable_name>/",          // up 5 in tree, "/src/<executable_name>/" subdir
-        "../../../../../src/<executable_name>/data/",     // up 5 in tree, "/src/<executable_name>/data/" subdir
-        "../../../../../src/<executable_name>/src/",      // up 5 in tree, "/src/<executable_name>/src/" subdir
-        "../../../../../src/<executable_name>/inc/",      // up 5 in tree, "/src/<executable_name>/inc/" subdir
-        "../../../../../sandbox/<executable_name>/",      // up 5 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../../sandbox/<executable_name>/src/", // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../sandbox/<executable_name>/inc/", // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../0_Simple/<executable_name>/data/", // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../1_Utilities/<executable_name>/data/", // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../2_Graphics/<executable_name>/data/", // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../3_Imaging/<executable_name>/data/", // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../4_Financial/<executable_name>/data/", // up 4 in tree, "/4_Financial/<executable_name>/" subdir
+        "../../../../5_Simulations/<executable_name>/data/", // up 4 in tree, "/5_Simulations/<executable_name>/"
+                                                             // subdir
+        "../../../../6_Advanced/<executable_name>/data/", // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/"
+                                                               // subdir
+        "../../../../8_Android/<executable_name>/data/", // up 4 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../samples/<executable_name>/data/", // up 4 in tree, "/samples/<executable_name>/" subdir
+        "../../../../common/", // up 4 in tree, "../../../../common/" subdir
+        "../../../../common/data/", // up 4 in tree, "../../../../common/data/" subdir
+        "../../../../data/", // up 4 in tree, "../../../../data/" subdir
+        "../../../../../", // up 5 in tree
+        "../../../../../src/<executable_name>/", // up 5 in tree, "/src/<executable_name>/" subdir
+        "../../../../../src/<executable_name>/data/", // up 5 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../../src/<executable_name>/src/", // up 5 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../../src/<executable_name>/inc/", // up 5 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../../sandbox/<executable_name>/", // up 5 in tree, "/sandbox/<executable_name>/" subdir
         "../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
-        "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
-        "../../../../../sandbox/<executable_name>/inc/",   // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
-        "../../../../../0_Simple/<executable_name>/data/",     // up 5 in tree, "/0_Simple/<executable_name>/" subdir
-        "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
-        "../../../../../2_Graphics/<executable_name>/data/",   // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
-        "../../../../../3_Imaging/<executable_name>/data/",    // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
-        "../../../../../4_Financial/<executable_name>/data/",  // up 5 in tree, "/4_Financial/<executable_name>/" subdir
-        "../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
-        "../../../../../6_Advanced/<executable_name>/data/",   // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
-        "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
-        "../../../../../8_Android/<executable_name>/data/",    // up 5 in tree, "/8_Android/<executable_name>/" subdir
-        "../../../../../samples/<executable_name>/data/",      // up 5 in tree, "/samples/<executable_name>/" subdir
-        "../../../../../common/",                         // up 5 in tree, "../../../common/" subdir
-        "../../../../../common/data/",                    // up 5 in tree, "../../../common/data/" subdir
+        "../../../../../sandbox/<executable_name>/src/", // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../../sandbox/<executable_name>/inc/", // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../../0_Simple/<executable_name>/data/", // up 5 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../../1_Utilities/<executable_name>/data/", // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../../2_Graphics/<executable_name>/data/", // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../../3_Imaging/<executable_name>/data/", // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../../4_Financial/<executable_name>/data/", // up 5 in tree, "/4_Financial/<executable_name>/" subdir
+        "../../../../../5_Simulations/<executable_name>/data/", // up 5 in tree, "/5_Simulations/<executable_name>/"
+                                                                // subdir
+        "../../../../../6_Advanced/<executable_name>/data/", // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree,
+                                                                  // "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../../8_Android/<executable_name>/data/", // up 5 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../../samples/<executable_name>/data/", // up 5 in tree, "/samples/<executable_name>/" subdir
+        "../../../../../common/", // up 5 in tree, "../../../../../common/" subdir
+        "../../../../../common/data/", // up 5 in tree, "../../../../../common/data/" subdir
     };
 
     // Extract the executable name
     std::string executable_name;
 
-    if (executable_path != 0)
+    if(executable_path != 0)
     {
         executable_name = std::string(executable_path);
 
@@ -432,7 +438,7 @@ inline char *sdkFindFilePath(const char *filename, const char *executable_path)
         size_t delimiter_pos = executable_name.find_last_of('\\');
         executable_name.erase(0, delimiter_pos + 1);
 
-        if (executable_name.rfind(".exe") != std::string::npos)
+        if(executable_name.rfind(".exe") != std::string::npos)
         {
             // we strip .exe, only if the .exe is found
             executable_name.resize(executable_name.size() - 4);
@@ -441,21 +447,21 @@ inline char *sdkFindFilePath(const char *filename, const char *executable_path)
 #else
         // Linux & OSX path delimiter
         size_t delimiter_pos = executable_name.find_last_of('/');
-        executable_name.erase(0,delimiter_pos+1);
+        executable_name.erase(0, delimiter_pos + 1);
 #endif
     }
 
     // Loop over all search paths and return the first hit
-    for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
+    for(unsigned int i = 0; i < sizeof(searchPath) / sizeof(char*); ++i)
     {
         std::string path(searchPath[i]);
         size_t executable_name_pos = path.find("<executable_name>");
 
         // If there is executable_name variable in the searchPath
         // replace it with the value
-        if (executable_name_pos != std::string::npos)
+        if(executable_name_pos != std::string::npos)
         {
-            if (executable_path != 0)
+            if(executable_path != 0)
             {
                 path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
             }
@@ -472,20 +478,20 @@ inline char *sdkFindFilePath(const char *filename, const char *executable_path)
 
         // Test if the file exists
         path.append(filename);
-        FILE *fp;
+        FILE* fp;
         FOPEN(fp, path.c_str(), "rb");
 
-        if (fp != NULL)
+        if(fp != NULL)
         {
             fclose(fp);
             // File found
             // returning an allocated array here for backwards compatibility reasons
-            char *file_path = (char *) malloc(path.length() + 1);
+            char* file_path = (char*) malloc(path.length() + 1);
             STRCPY(file_path, path.length() + 1, path.c_str());
             return file_path;
         }
 
-        if (fp)
+        if(fp)
         {
             fclose(fp);
         }
diff --git a/example/CUDASamples/common/helper_timer.h b/example/CUDASamples/common/helper_timer.h
index 39ddc77f..ceb1f3d5 100644
--- a/example/CUDASamples/common/helper_timer.h
+++ b/example/CUDASamples/common/helper_timer.h
@@ -14,7 +14,7 @@
 #define HELPER_TIMER_H
 
 #ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
+#    define EXIT_WAIVED 2
 #endif
 
 // includes, system
@@ -27,28 +27,28 @@
 // But rather in a self contained class interface
 class StopWatchInterface
 {
-    public:
-        StopWatchInterface() {};
-        virtual ~StopWatchInterface() {};
+public:
+    StopWatchInterface(){};
+    virtual ~StopWatchInterface(){};
 
-    public:
-        //! Start time measurement
-        virtual void start() = 0;
+public:
+    //! Start time measurement
+    virtual void start() = 0;
 
-        //! Stop time measurement
-        virtual void stop() = 0;
+    //! Stop time measurement
+    virtual void stop() = 0;
 
-        //! Reset time counters to zero
-        virtual void reset() = 0;
+    //! Reset time counters to zero
+    virtual void reset() = 0;
 
-        //! Time in msec. after start. If the stop watch is still running (i.e. there
-        //! was no call to stop()) then the elapsed time is returned, otherwise the
-        //! time between the last start() and stop call is returned
-        virtual float getTime() = 0;
+    //! Time in msec. after start. If the stop watch is still running (i.e. there
+    //! was no call to stop()) then the elapsed time is returned, otherwise the
+    //! time between the last start() and stop call is returned
+    virtual float getTime() = 0;
 
-        //! Mean time to date based on the number of times the stopwatch has been
-        //! _stopped_ (ie finished sessions) and the current total time
-        virtual float getAverageTime() = 0;
+    //! Mean time to date based on the number of times the stopwatch has been
+    //! _stopped_ (ie finished sessions) and the current total time
+    virtual float getAverageTime() = 0;
 };
 
 
@@ -57,85 +57,90 @@ class StopWatchInterface
 //////////////////////////////////////////////////////////////////
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 // includes, system
-#define WINDOWS_LEAN_AND_MEAN
-#include <windows.h>
-#undef min
-#undef max
+#    define WINDOWS_LEAN_AND_MEAN
+#    include <windows.h>
+#    undef min
+#    undef max
 
 //! Windows specific implementation of StopWatch
 class StopWatchWin : public StopWatchInterface
 {
-    public:
-        //! Constructor, default
-        StopWatchWin() :
-            start_time(),     end_time(),
-            diff_time(0.0f),  total_time(0.0f),
-            running(false), clock_sessions(0), freq(0), freq_set(false)
+public:
+    //! Constructor, default
+    StopWatchWin()
+        : start_time()
+        , end_time()
+        , diff_time(0.0f)
+        , total_time(0.0f)
+        , running(false)
+        , clock_sessions(0)
+        , freq(0)
+        , freq_set(false)
+    {
+        if(!freq_set)
         {
-            if (! freq_set)
-            {
-                // helper variable
-                LARGE_INTEGER temp;
+            // helper variable
+            LARGE_INTEGER temp;
 
-                // get the tick frequency from the OS
-                QueryPerformanceFrequency((LARGE_INTEGER *) &temp);
+            // get the tick frequency from the OS
+            QueryPerformanceFrequency((LARGE_INTEGER*) &temp);
 
-                // convert to type in which it is needed
-                freq = ((double) temp.QuadPart) / 1000.0;
+            // convert to type in which it is needed
+            freq = ((double) temp.QuadPart) / 1000.0;
 
-                // rememeber query
-                freq_set = true;
-            }
-        };
+            // remember query
+            freq_set = true;
+        }
+    };
 
-        // Destructor
-        ~StopWatchWin() { };
+    // Destructor
+    ~StopWatchWin(){};
 
-    public:
-        //! Start time measurement
-        inline void start();
+public:
+    //! Start time measurement
+    inline void start();
 
-        //! Stop time measurement
-        inline void stop();
+    //! Stop time measurement
+    inline void stop();
 
-        //! Reset time counters to zero
-        inline void reset();
+    //! Reset time counters to zero
+    inline void reset();
 
-        //! Time in msec. after start. If the stop watch is still running (i.e. there
-        //! was no call to stop()) then the elapsed time is returned, otherwise the
-        //! time between the last start() and stop call is returned
-        inline float getTime();
+    //! Time in msec. after start. If the stop watch is still running (i.e. there
+    //! was no call to stop()) then the elapsed time is returned, otherwise the
+    //! time between the last start() and stop call is returned
+    inline float getTime();
 
-        //! Mean time to date based on the number of times the stopwatch has been
-        //! _stopped_ (ie finished sessions) and the current total time
-        inline float getAverageTime();
+    //! Mean time to date based on the number of times the stopwatch has been
+    //! _stopped_ (ie finished sessions) and the current total time
+    inline float getAverageTime();
 
-    private:
-        // member variables
+private:
+    // member variables
 
-        //! Start of measurement
-        LARGE_INTEGER  start_time;
-        //! End of measurement
-        LARGE_INTEGER  end_time;
+    //! Start of measurement
+    LARGE_INTEGER start_time;
+    //! End of measurement
+    LARGE_INTEGER end_time;
 
-        //! Time difference between the last start and stop
-        float  diff_time;
+    //! Time difference between the last start and stop
+    float diff_time;
 
-        //! TOTAL time difference between starts and stops
-        float  total_time;
+    //! TOTAL time difference between starts and stops
+    float total_time;
 
-        //! flag if the stop watch is running
-        bool running;
+    //! flag if the stop watch is running
+    bool running;
 
-        //! Number of times clock has been started
-        //! and stopped to allow averaging
-        int clock_sessions;
+    //! Number of times clock has been started
+    //! and stopped to allow averaging
+    int clock_sessions;
 
-        //! tick frequency
-        double  freq;
+    //! tick frequency
+    double freq;
 
-        //! flag if the frequency has been set
-        bool  freq_set;
+    //! flag if the frequency has been set
+    bool freq_set;
 };
 
 // functions, inlined
@@ -143,10 +148,9 @@ class StopWatchWin : public StopWatchInterface
 ////////////////////////////////////////////////////////////////////////////////
 //! Start time measurement
 ////////////////////////////////////////////////////////////////////////////////
-inline void
-StopWatchWin::start()
+inline void StopWatchWin::start()
 {
-    QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
+    QueryPerformanceCounter((LARGE_INTEGER*) &start_time);
     running = true;
 }
 
@@ -154,12 +158,10 @@ StopWatchWin::start()
 //! Stop time measurement and increment add to the current diff_time summation
 //! variable. Also increment the number of times this clock has been run.
 ////////////////////////////////////////////////////////////////////////////////
-inline void
-StopWatchWin::stop()
+inline void StopWatchWin::stop()
 {
-    QueryPerformanceCounter((LARGE_INTEGER *) &end_time);
-    diff_time = (float)
-                (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq);
+    QueryPerformanceCounter((LARGE_INTEGER*) &end_time);
+    diff_time = (float) (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq);
 
     total_time += diff_time;
     clock_sessions++;
@@ -170,16 +172,15 @@ StopWatchWin::stop()
 //! Reset the timer to 0. Does not change the timer running state but does
 //! recapture this point in time as the current start time if it is running.
 ////////////////////////////////////////////////////////////////////////////////
-inline void
-StopWatchWin::reset()
+inline void StopWatchWin::reset()
 {
     diff_time = 0;
     total_time = 0;
     clock_sessions = 0;
 
-    if (running)
+    if(running)
     {
-        QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
+        QueryPerformanceCounter((LARGE_INTEGER*) &start_time);
     }
 }
 
@@ -190,18 +191,16 @@ StopWatchWin::reset()
 //! current diff_time sum, otherwise the current summed time difference alone
 //! is returned.
 ////////////////////////////////////////////////////////////////////////////////
-inline float
-StopWatchWin::getTime()
+inline float StopWatchWin::getTime()
 {
     // Return the TOTAL time to date
     float retval = total_time;
 
-    if (running)
+    if(running)
     {
         LARGE_INTEGER temp;
-        QueryPerformanceCounter((LARGE_INTEGER *) &temp);
-        retval += (float)
-                  (((double)(temp.QuadPart - start_time.QuadPart)) / freq);
+        QueryPerformanceCounter((LARGE_INTEGER*) &temp);
+        retval += (float) (((double) (temp.QuadPart - start_time.QuadPart)) / freq);
     }
 
     return retval;
@@ -211,76 +210,70 @@ StopWatchWin::getTime()
 //! Time in msec. for a single run based on the total number of COMPLETED runs
 //! and the total time.
 ////////////////////////////////////////////////////////////////////////////////
-inline float
-StopWatchWin::getAverageTime()
+inline float StopWatchWin::getAverageTime()
 {
-    return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
+    return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
 }
 #else
 // Declarations for Stopwatch on Linux and Mac OSX
 // includes, system
-#include <ctime>
-#include <sys/time.h>
+#    include <ctime>
+
+#    include <sys/time.h>
 
 //! Windows specific implementation of StopWatch
 class StopWatchLinux : public StopWatchInterface
 {
-    public:
-        //! Constructor, default
-        StopWatchLinux() :
-            start_time(), diff_time(0.0), total_time(0.0),
-            running(false), clock_sessions(0)
-        { };
-
-        // Destructor
-        virtual ~StopWatchLinux()
-        { };
-
-    public:
-        //! Start time measurement
-        inline void start();
+public:
+    //! Constructor, default
+    StopWatchLinux() : start_time(), diff_time(0.0), total_time(0.0), running(false), clock_sessions(0){};
 
-        //! Stop time measurement
-        inline void stop();
+    // Destructor
+    virtual ~StopWatchLinux(){};
 
-        //! Reset time counters to zero
-        inline void reset();
+public:
+    //! Start time measurement
+    inline void start();
 
-        //! Time in msec. after start. If the stop watch is still running (i.e. there
-        //! was no call to stop()) then the elapsed time is returned, otherwise the
-        //! time between the last start() and stop call is returned
-        inline float getTime();
+    //! Stop time measurement
+    inline void stop();
 
-        //! Mean time to date based on the number of times the stopwatch has been
-        //! _stopped_ (ie finished sessions) and the current total time
-        inline float getAverageTime();
+    //! Reset time counters to zero
+    inline void reset();
 
-    private:
+    //! Time in msec. after start. If the stop watch is still running (i.e. there
+    //! was no call to stop()) then the elapsed time is returned, otherwise the
+    //! time between the last start() and stop call is returned
+    inline float getTime();
 
-        // helper functions
+    //! Mean time to date based on the number of times the stopwatch has been
+    //! _stopped_ (ie finished sessions) and the current total time
+    inline float getAverageTime();
 
-        //! Get difference between start time and current time
-        inline float getDiffTime();
+private:
+    // helper functions
 
-    private:
+    //! Get difference between start time and current time
+    inline float getDiffTime();
 
-        // member variables
+private:
+    // member variables
 
-        //! Start of measurement
-        struct timeval  start_time;
+    //! Start of measurement
+    struct timeval start_time;
 
-        //! Time difference between the last start and stop
-        float  diff_time;
+    //! Time difference between the last start and stop
+    float diff_time;
 
-        //! TOTAL time difference between starts and stops
-        float  total_time;
+    //! TOTAL time difference between starts and stops
+    float total_time;
 
-        //! flag if the stop watch is running
-        bool running;
+    //! flag if the stop watch is running
+    bool running;
 
-        //! Number of times clock has been started
-        //! and stopped to allow averaging
-        int clock_sessions;
+    //! Number of times clock has been started
+    //! and stopped to allow averaging
+    int clock_sessions;
 };
 
 // functions, inlined
@@ -288,8 +281,7 @@ class StopWatchLinux : public StopWatchInterface
 ////////////////////////////////////////////////////////////////////////////////
 //! Start time measurement
 ////////////////////////////////////////////////////////////////////////////////
-inline void
-StopWatchLinux::start()
+inline void StopWatchLinux::start()
 {
     gettimeofday(&start_time, 0);
     running = true;
@@ -299,8 +291,7 @@ StopWatchLinux::start()
 //! Stop time measurement and increment add to the current diff_time summation
 //! variable. Also increment the number of times this clock has been run.
 ////////////////////////////////////////////////////////////////////////////////
-inline void
-StopWatchLinux::stop()
+inline void StopWatchLinux::stop()
 {
     diff_time = getDiffTime();
     total_time += diff_time;
@@ -312,14 +303,13 @@ StopWatchLinux::stop()
 //! Reset the timer to 0. Does not change the timer running state but does
 //! recapture this point in time as the current start time if it is running.
 ////////////////////////////////////////////////////////////////////////////////
-inline void
-StopWatchLinux::reset()
+inline void StopWatchLinux::reset()
 {
     diff_time = 0;
     total_time = 0;
     clock_sessions = 0;
 
-    if (running)
+    if(running)
     {
         gettimeofday(&start_time, 0);
     }
@@ -331,13 +321,12 @@ StopWatchLinux::reset()
 //! current diff_time sum, otherwise the current summed time difference alone
 //! is returned.
 ////////////////////////////////////////////////////////////////////////////////
-inline float
-StopWatchLinux::getTime()
+inline float StopWatchLinux::getTime()
 {
     // Return the TOTAL time to date
     float retval = total_time;
 
-    if (running)
+    if(running)
     {
         retval += getDiffTime();
     }
@@ -349,23 +338,20 @@ StopWatchLinux::getTime()
 //! Time in msec. for a single run based on the total number of COMPLETED runs
 //! and the total time.
 ////////////////////////////////////////////////////////////////////////////////
-inline float
-StopWatchLinux::getAverageTime()
+inline float StopWatchLinux::getAverageTime()
 {
-    return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
+    return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
 }
 ////////////////////////////////////////////////////////////////////////////////
 
 ////////////////////////////////////////////////////////////////////////////////
-inline float
-StopWatchLinux::getDiffTime()
+inline float StopWatchLinux::getDiffTime()
 {
     struct timeval t_time;
     gettimeofday(&t_time, 0);
 
     // time difference in milli-seconds
-    return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec)
-                   + (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+    return (float) (1000.0 * (t_time.tv_sec - start_time.tv_sec) + (0.001 * (t_time.tv_usec - start_time.tv_usec)));
 }
 #endif // WIN32
 
@@ -377,14 +363,13 @@ StopWatchLinux::getDiffTime()
 //! @return true if a time has been created, otherwise false
 //! @param  name of the new timer, 0 if the creation failed
 ////////////////////////////////////////////////////////////////////////////////
-inline bool
-sdkCreateTimer(StopWatchInterface **timer_interface)
+inline bool sdkCreateTimer(StopWatchInterface** timer_interface)
 {
-    //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+    // printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    *timer_interface = (StopWatchInterface *)new StopWatchWin();
+    *timer_interface = (StopWatchInterface*) new StopWatchWin();
 #else
-    *timer_interface = (StopWatchInterface *)new StopWatchLinux();
+    *timer_interface = (StopWatchInterface*) new StopWatchLinux();
 #endif
     return (*timer_interface != NULL) ? true : false;
 }
@@ -395,11 +380,10 @@ sdkCreateTimer(StopWatchInterface **timer_interface)
 //! @return true if a time has been deleted, otherwise false
 //! @param  name of the timer to delete
 ////////////////////////////////////////////////////////////////////////////////
-inline bool
-sdkDeleteTimer(StopWatchInterface **timer_interface)
+inline bool sdkDeleteTimer(StopWatchInterface** timer_interface)
 {
-    //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
-    if (*timer_interface)
+    // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+    if(*timer_interface)
     {
         delete *timer_interface;
         *timer_interface = NULL;
@@ -412,11 +396,10 @@ sdkDeleteTimer(StopWatchInterface **timer_interface)
 //! Start the time with name \a name
 //! @param name  name of the timer to start
 ////////////////////////////////////////////////////////////////////////////////
-inline bool
-sdkStartTimer(StopWatchInterface **timer_interface)
+inline bool sdkStartTimer(StopWatchInterface** timer_interface)
 {
-    //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
-    if (*timer_interface)
+    // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+    if(*timer_interface)
     {
         (*timer_interface)->start();
     }
@@ -428,11 +411,10 @@ sdkStartTimer(StopWatchInterface **timer_interface)
 //! Stop the time with name \a name. Does not reset.
 //! @param name  name of the timer to stop
 ////////////////////////////////////////////////////////////////////////////////
-inline bool
-sdkStopTimer(StopWatchInterface **timer_interface)
+inline bool sdkStopTimer(StopWatchInterface** timer_interface)
 {
     // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
-    if (*timer_interface)
+    if(*timer_interface)
     {
         (*timer_interface)->stop();
     }
@@ -444,11 +426,10 @@ sdkStopTimer(StopWatchInterface **timer_interface)
 //! Resets the timer's counter.
 //! @param name  name of the timer to reset.
 ////////////////////////////////////////////////////////////////////////////////
-inline bool
-sdkResetTimer(StopWatchInterface **timer_interface)
+inline bool sdkResetTimer(StopWatchInterface** timer_interface)
 {
     // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
-    if (*timer_interface)
+    if(*timer_interface)
     {
         (*timer_interface)->reset();
     }
@@ -463,11 +444,10 @@ sdkResetTimer(StopWatchInterface **timer_interface)
 //! Excludes the current running time if the timer is currently running.
 //! @param name  name of the timer to return the time of
 ////////////////////////////////////////////////////////////////////////////////
-inline float
-sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
+inline float sdkGetAverageTimerValue(StopWatchInterface** timer_interface)
 {
     //  printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface);
-    if (*timer_interface)
+    if(*timer_interface)
     {
         return (*timer_interface)->getAverageTime();
     }
@@ -482,11 +462,10 @@ sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
 //! or timer creation.
 //! @param name  name of the timer to obtain the value of.
 ////////////////////////////////////////////////////////////////////////////////
-inline float
-sdkGetTimerValue(StopWatchInterface **timer_interface)
+inline float sdkGetTimerValue(StopWatchInterface** timer_interface)
 {
     // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
-    if (*timer_interface)
+    if(*timer_interface)
     {
         return (*timer_interface)->getTime();
     }
diff --git a/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp b/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp
index 4c91d53f..15644865 100644
--- a/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp
+++ b/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp
@@ -15,40 +15,50 @@
  * of the programming guide with some additions like error checking.
  */
 
-#include <stdio.h>
 #include <iostream> //std:cout
+
+#include <stdio.h>
 // For the CUDA runtime routines (prefixed with "cupla_")
 #include <cupla.hpp>
-//Timer for test purpose
+// Timer for test purpose
 #include <chrono>
-#include <boost/lexical_cast.hpp>
 #include <vector>
+
+#include <boost/lexical_cast.hpp>
 /**
  * CUDA Kernel Device code
  *
  * Computes the vector addition of A and B into C. The 3 vectors have the same
  * number of elements numElements.
  */
-struct vectorAdd {
+struct vectorAdd
+{
     template<typename T_Acc>
-    ALPAKA_FN_HOST_ACC
-    void operator()(T_Acc const &acc, const float *A, const float *B, float *C, const int numElements) const {
-        int begin = cupla::blockDim(acc).x * cupla::blockIdx(acc).x * cupla::threadDim(acc).x + cupla::threadIdx(acc).x * cupla::threadDim(acc).x;
-        if (begin < numElements) {
-            int end = (begin + cupla::threadDim(acc).x < numElements) ? begin+cupla::threadDim(acc).x : numElements;
-            for (int i=begin; i <end; ++i) {
+    ALPAKA_FN_HOST_ACC void operator()(
+        T_Acc const& acc,
+        const float* A,
+        const float* B,
+        float* C,
+        const int numElements) const
+    {
+        int begin = cupla::blockDim(acc).x * cupla::blockIdx(acc).x * cupla::threadDim(acc).x
+            + cupla::threadIdx(acc).x * cupla::threadDim(acc).x;
+        if(begin < numElements)
+        {
+            int end = (begin + cupla::threadDim(acc).x < numElements) ? begin + cupla::threadDim(acc).x : numElements;
+            for(int i = begin; i < end; ++i)
+            {
                 C[i] = A[i] + B[i], cupla::hierarchy::Blocks{};
             }
         }
     }
 };
 
-void benchmarkTest(int first, int last , int stepSize);
+void benchmarkTest(int first, int last, int stepSize);
 /**
  * Host main routine
  */
-int
-main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     // Error code to check return values for CUDA calls
     cuplaError_t err = cuplaSuccess;
@@ -59,53 +69,53 @@ main(int argc, char *argv[])
     printf("[Vector addition of %d elements]\n", numElements);
 
     // Allocate the host input vector A
-    float *h_A = (float *)malloc(size);
+    float* h_A = (float*) malloc(size);
 
     // Allocate the host input vector B
-    float *h_B = (float *)malloc(size);
+    float* h_B = (float*) malloc(size);
 
     // Allocate the host output vector C
-    float *h_C = (float *)malloc(size);
+    float* h_C = (float*) malloc(size);
 
     // Verify that allocations succeeded
-    if (h_A == NULL || h_B == NULL || h_C == NULL)
+    if(h_A == NULL || h_B == NULL || h_C == NULL)
     {
         fprintf(stderr, "Failed to allocate host vectors!\n");
         exit(EXIT_FAILURE);
     }
 
     // Initialize the host input vectors
-    for (int i = 0; i < numElements; ++i)
+    for(int i = 0; i < numElements; ++i)
     {
-        h_A[i] = rand()/(float)RAND_MAX;
-        h_B[i] = rand()/(float)RAND_MAX;
+        h_A[i] = rand() / (float) RAND_MAX;
+        h_B[i] = rand() / (float) RAND_MAX;
     }
 
     // Allocate the device input vector A
-    float *d_A = NULL;
-    err = cuplaMalloc((void **)&d_A, size);
+    float* d_A = NULL;
+    err = cuplaMalloc((void**) &d_A, size);
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
 
     // Allocate the device input vector B
-    float *d_B = NULL;
-    err = cuplaMalloc((void **)&d_B, size);
+    float* d_B = NULL;
+    err = cuplaMalloc((void**) &d_B, size);
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
 
     // Allocate the device output vector C
-    float *d_C = NULL;
-    err = cuplaMalloc((void **)&d_C, size);
+    float* d_C = NULL;
+    err = cuplaMalloc((void**) &d_C, size);
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -116,7 +126,7 @@ main(int argc, char *argv[])
     printf("Copy input data from the host memory to the CUDA device\n");
     err = cuplaMemcpy(d_A, h_A, size, cuplaMemcpyHostToDevice);
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -124,7 +134,7 @@ main(int argc, char *argv[])
 
     err = cuplaMemcpy(d_B, h_B, size, cuplaMemcpyHostToDevice);
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -132,12 +142,12 @@ main(int argc, char *argv[])
 
     // Launch the Vector Add CUDA Kernel
     int threadsPerBlock = 256;
-    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
     printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
-    CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock,0,0)(d_A, d_B, d_C, numElements);
+    CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock, 0, 0)(d_A, d_B, d_C, numElements);
     err = cuplaGetLastError();
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -148,16 +158,16 @@ main(int argc, char *argv[])
     printf("Copy output data from the CUDA device to the host memory\n");
     err = cuplaMemcpy(h_C, d_C, size, cuplaMemcpyDeviceToHost);
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
 
     // Verify that the result vector is correct
-    for (int i = 0; i < numElements; ++i)
+    for(int i = 0; i < numElements; ++i)
     {
-        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
+        if(fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
         {
             fprintf(stderr, "Result verification failed at element %d!\n", i);
             exit(EXIT_FAILURE);
@@ -169,7 +179,7 @@ main(int argc, char *argv[])
     // Free device global memory
     err = cuplaFree(d_A);
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -177,14 +187,14 @@ main(int argc, char *argv[])
 
     err = cuplaFree(d_B);
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
     err = cuplaFree(d_C);
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -203,86 +213,88 @@ main(int argc, char *argv[])
     // flushed before the application exits
     err = cuplaDeviceReset();
 
-    if (err != cuplaSuccess)
+    if(err != cuplaSuccess)
     {
         fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cuplaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
     printf("Done\n");
 
-    using boost::lexical_cast;
     using boost::bad_lexical_cast;
+    using boost::lexical_cast;
     std::vector<int> args;
-    while (*++argv){
-        try{
+    while(*++argv)
+    {
+        try
+        {
             args.push_back(lexical_cast<int>(*argv));
         }
-        catch( const bad_lexical_cast &){
+        catch(const bad_lexical_cast&)
+        {
             args.push_back(0);
         }
     }
-    //run benchmartest
+    // run benchmartest
     int first = 50000;
     int last = 100000;
-    int stepSize= 50000;
-    if (args.size() >1){
-        first=args[0];
-        last=args[1];
+    int stepSize = 50000;
+    if(args.size() > 1)
+    {
+        first = args[0];
+        last = args[1];
     }
-    if (args.size()>2){
-        stepSize=args[2];
+    if(args.size() > 2)
+    {
+        stepSize = args[2];
     }
     benchmarkTest(first, last, stepSize);
     cuplaDeviceReset();
     return 0;
 }
 
-void
-benchmarkTest(int first, int last, int stepSize)
+void benchmarkTest(int first, int last, int stepSize)
 {
-
-    for (int numElements = first; numElements <=last ; numElements+= stepSize) {
-        std::cout <<"N= " <<numElements << "; ";
+    for(int numElements = first; numElements <= last; numElements += stepSize)
+    {
+        std::cout << "N= " << numElements << "; ";
         size_t size = numElements * sizeof(float);
-        //alloc host memory
-        float *h_A = (float *)malloc(size);
-        float *h_B = (float *)malloc(size);
-        //init
-        for (int i = 0; i < numElements; ++i) {
-            h_A[i] = rand()/(float)RAND_MAX;
-            h_B[i] = rand()/(float)RAND_MAX;
+        // alloc host memory
+        float* h_A = (float*) malloc(size);
+        float* h_B = (float*) malloc(size);
+        // init
+        for(int i = 0; i < numElements; ++i)
+        {
+            h_A[i] = rand() / (float) RAND_MAX;
+            h_B[i] = rand() / (float) RAND_MAX;
         }
-        //alloc device memory
-        float *d_A = NULL;
-        cuplaMalloc((void **) &d_A, size);
-        float *d_B = NULL;
-        cuplaMalloc((void **) &d_B, size);
-        float *d_C = NULL;
-        cuplaMalloc((void **) &d_C, size);
+        // alloc device memory
+        float* d_A = NULL;
+        cuplaMalloc((void**) &d_A, size);
+        float* d_B = NULL;
+        cuplaMalloc((void**) &d_B, size);
+        float* d_C = NULL;
+        cuplaMalloc((void**) &d_C, size);
 
         // copy host device
         cuplaMemcpy(d_A, h_A, size, cuplaMemcpyHostToDevice);
         cuplaMemcpy(d_B, h_B, size, cuplaMemcpyHostToDevice);
 
-        int threadsPerBlock=1024;
-        int blocksPerGrid= (numElements+threadsPerBlock-1)/threadsPerBlock;
+        int threadsPerBlock = 1024;
+        int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
 
-        //Run Kernel
-        std::chrono::high_resolution_clock::time_point start =
-            std::chrono::high_resolution_clock::now();
+        // Run Kernel
+        std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
 
         CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock, 0, 0)(d_A, d_B, d_C, numElements);
         cuplaDeviceSynchronize();
 
-        std::chrono::high_resolution_clock::time_point end =
-                std::chrono::high_resolution_clock::now();
+        std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
 
-        std::cout << "Time: "<< std::chrono::duration_cast<std::chrono::milliseconds>
-                                        (end-start).count() <<"ms"<<std::endl;
-        //Free Device memory
+        std::cout << "Time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms"
+                  << std::endl;
+        // Free Device memory
         cuplaFree(d_A);
         cuplaFree(d_B);
         cuplaFree(d_C);
     }
 }
-
diff --git a/example/CUDASamples/matrixMul/src/matrixMul.cpp b/example/CUDASamples/matrixMul/src/matrixMul.cpp
index e27a2db1..85f6662d 100644
--- a/example/CUDASamples/matrixMul/src/matrixMul.cpp
+++ b/example/CUDASamples/matrixMul/src/matrixMul.cpp
@@ -24,8 +24,8 @@
  */
 
 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>
 // CUDA runtime
 #include <cuda_to_cupla.hpp>
 
@@ -36,94 +36,89 @@
  * Matrix multiplication (CUDA Kernel) on the device: C = A * B
  * wA is A's width and wB is B's width
  */
-template <int BLOCK_SIZE>
+template<int BLOCK_SIZE>
 struct matrixMulCUDA
 {
+    template<typename T_Acc>
+    ALPAKA_FN_HOST_ACC void operator()(T_Acc const& acc, float* C, float* A, float* B, int wA, int wB) const
+    {
+        // Block index
+        int bx = blockIdx.x;
+        int by = blockIdx.y;
 
-template<typename T_Acc>
-ALPAKA_FN_HOST_ACC
-void operator()(T_Acc const& acc,float *C, float *A, float *B, int wA, int wB) const
-{
-    // Block index
-    int bx = blockIdx.x;
-    int by = blockIdx.y;
-
-    // Thread index
-    int tx = threadIdx.x;
-    int ty = threadIdx.y;
-
-    // Index of the first sub-matrix of A processed by the block
-    int aBegin = wA * BLOCK_SIZE * by;
-
-    // Index of the last sub-matrix of A processed by the block
-    int aEnd   = aBegin + wA - 1;
-
-    // Step size used to iterate through the sub-matrices of A
-    int aStep  = BLOCK_SIZE;
-
-    // Index of the first sub-matrix of B processed by the block
-    int bBegin = BLOCK_SIZE * bx;
-
-    // Step size used to iterate through the sub-matrices of B
-    int bStep  = BLOCK_SIZE * wB;
+        // Thread index
+        int tx = threadIdx.x;
+        int ty = threadIdx.y;
 
-    // Csub is used to store the element of the block sub-matrix
-    // that is computed by the thread
-    float Csub = 0;
+        // Index of the first sub-matrix of A processed by the block
+        int aBegin = wA * BLOCK_SIZE * by;
 
-   sharedMem(As, cupla::Array<cupla::Array<float,BLOCK_SIZE>,BLOCK_SIZE>);
-   sharedMem(Bs, cupla::Array<cupla::Array<float,BLOCK_SIZE>,BLOCK_SIZE>);
+        // Index of the last sub-matrix of A processed by the block
+        int aEnd = aBegin + wA - 1;
 
-    // Loop over all the sub-matrices of A and B
-    // required to compute the block sub-matrix
-    for (int a = aBegin, b = bBegin;
-         a <= aEnd;
-         a += aStep, b += bStep)
-    {
+        // Step size used to iterate through the sub-matrices of A
+        int aStep = BLOCK_SIZE;
 
-        // Declaration of the shared memory array As used to
-        // store the sub-matrix of A
-        //__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
+        // Index of the first sub-matrix of B processed by the block
+        int bBegin = BLOCK_SIZE * bx;
 
-        // Declaration of the shared memory array Bs used to
-        // store the sub-matrix of B
-        //__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+        // Step size used to iterate through the sub-matrices of B
+        int bStep = BLOCK_SIZE * wB;
 
-        // Load the matrices from device memory
-        // to shared memory; each thread loads
-        // one element of each matrix
-        As[ty][tx] = A[a + wA * ty + tx];
-        Bs[ty][tx] = B[b + wB * ty + tx];
+        // Csub is used to store the element of the block sub-matrix
+        // that is computed by the thread
+        float Csub = 0;
 
-        // Synchronize to make sure the matrices are loaded
-        __syncthreads();
+        sharedMem(As, cupla::Array<cupla::Array<float, BLOCK_SIZE>, BLOCK_SIZE>);
+        sharedMem(Bs, cupla::Array<cupla::Array<float, BLOCK_SIZE>, BLOCK_SIZE>);
 
-        // Multiply the two matrices together;
-        // each thread computes one element
-        // of the block sub-matrix
+        // Loop over all the sub-matrices of A and B
+        // required to compute the block sub-matrix
+        for(int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
+        {
+            // Declaration of the shared memory array As used to
+            // store the sub-matrix of A
+            //__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
+
+            // Declaration of the shared memory array Bs used to
+            // store the sub-matrix of B
+            //__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+
+            // Load the matrices from device memory
+            // to shared memory; each thread loads
+            // one element of each matrix
+            As[ty][tx] = A[a + wA * ty + tx];
+            Bs[ty][tx] = B[b + wB * ty + tx];
+
+            // Synchronize to make sure the matrices are loaded
+            __syncthreads();
+
+            // Multiply the two matrices together;
+            // each thread computes one element
+            // of the block sub-matrix
 #pragma unroll
 
-        for (int k = 0; k < BLOCK_SIZE; ++k)
-        {
-            Csub += As[ty][k] * Bs[k][tx];
+            for(int k = 0; k < BLOCK_SIZE; ++k)
+            {
+                Csub += As[ty][k] * Bs[k][tx];
+            }
+
+            // Synchronize to make sure that the preceding
+            // computation is done before loading two new
+            // sub-matrices of A and B in the next iteration
+            __syncthreads();
         }
 
-        // Synchronize to make sure that the preceding
-        // computation is done before loading two new
-        // sub-matrices of A and B in the next iteration
-        __syncthreads();
+        // Write the block sub-matrix to device memory;
+        // each thread writes one element
+        int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
+        C[c + wB * ty + tx] = Csub;
     }
-
-    // Write the block sub-matrix to device memory;
-    // each thread writes one element
-    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
-    C[c + wB * ty + tx] = Csub;
-}
 };
 
-void constantInit(float *data, int size, float val)
+void constantInit(float* data, int size, float val)
 {
-    for (int i = 0; i < size; ++i)
+    for(int i = 0; i < size; ++i)
     {
         data[i] = val;
     }
@@ -132,15 +127,15 @@ void constantInit(float *data, int size, float val)
 /**
  * Run a simple test of matrix multiplication using CUDA
  */
-int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
+int matrixMultiply(int argc, char** argv, int block_size, dim3& dimsA, dim3& dimsB)
 {
     // Allocate host memory for matrices A and B
     unsigned int size_A = dimsA.x * dimsA.y;
     unsigned int mem_size_A = sizeof(float) * size_A;
-    float *h_A = (float *)malloc(mem_size_A);
+    float* h_A = (float*) malloc(mem_size_A);
     unsigned int size_B = dimsB.x * dimsB.y;
     unsigned int mem_size_B = sizeof(float) * size_B;
-    float *h_B = (float *)malloc(mem_size_B);
+    float* h_B = (float*) malloc(mem_size_B);
 
     // Initialize host memory
     const float valB = 0.01f;
@@ -153,9 +148,9 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     // Allocate host matrix C
     dim3 dimsC(dimsB.x, dimsA.y, 1);
     unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
-    float *h_C = (float *) malloc(mem_size_C);
+    float* h_C = (float*) malloc(mem_size_C);
 
-    if (h_C == NULL)
+    if(h_C == NULL)
     {
         fprintf(stderr, "Failed to allocate host matrix C!\n");
         exit(EXIT_FAILURE);
@@ -163,25 +158,25 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
 
     cudaError_t error;
 
-    error = cudaMalloc((void **) &d_A, mem_size_A);
+    error = cudaMalloc((void**) &d_A, mem_size_A);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
         exit(EXIT_FAILURE);
     }
 
-    error = cudaMalloc((void **) &d_B, mem_size_B);
+    error = cudaMalloc((void**) &d_B, mem_size_B);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
         exit(EXIT_FAILURE);
     }
 
-    error = cudaMalloc((void **) &d_C, mem_size_C);
+    error = cudaMalloc((void**) &d_C, mem_size_C);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
         exit(EXIT_FAILURE);
@@ -190,7 +185,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     // copy host memory to device
     error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__);
         exit(EXIT_FAILURE);
@@ -198,7 +193,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
 
     error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__);
         exit(EXIT_FAILURE);
@@ -212,13 +207,13 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     printf("Computing result using CUDA Kernel...\n");
 
     // Performs warmup operation using matrixMul CUDA kernel
-    if (block_size == 16)
+    if(block_size == 16)
     {
-        CUPLA_KERNEL(matrixMulCUDA<16>)( grid, threads )(d_C, d_A, d_B, dimsA.x, dimsB.x);
+        CUPLA_KERNEL(matrixMulCUDA<16>)(grid, threads)(d_C, d_A, d_B, dimsA.x, dimsB.x);
     }
     else
     {
-        CUPLA_KERNEL(matrixMulCUDA<32>)( grid, threads )(d_C, d_A, d_B, dimsA.x, dimsB.x);
+        CUPLA_KERNEL(matrixMulCUDA<32>)(grid, threads)(d_C, d_A, d_B, dimsA.x, dimsB.x);
     }
 
     printf("done\n");
@@ -229,7 +224,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     cudaEvent_t start;
     error = cudaEventCreate(&start);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
         exit(EXIT_FAILURE);
@@ -238,7 +233,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     cudaEvent_t stop;
     error = cudaEventCreate(&stop);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
         exit(EXIT_FAILURE);
@@ -247,31 +242,31 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     // Record the start event
     error = cudaEventRecord(start, NULL);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
         exit(EXIT_FAILURE);
     }
 
     // Execute the kernel
-    int nIter = 1; //300;
+    int nIter = 1; // 300;
 
-    for (int j = 0; j < nIter; j++)
+    for(int j = 0; j < nIter; j++)
     {
-        if (block_size == 16)
+        if(block_size == 16)
         {
-            CUPLA_KERNEL(matrixMulCUDA<16>)( grid, threads )(d_C, d_A, d_B, dimsA.x, dimsB.x);
+            CUPLA_KERNEL(matrixMulCUDA<16>)(grid, threads)(d_C, d_A, d_B, dimsA.x, dimsB.x);
         }
         else
         {
-            CUPLA_KERNEL(matrixMulCUDA<32>)( grid, threads )(d_C, d_A, d_B, dimsA.x, dimsB.x);
+            CUPLA_KERNEL(matrixMulCUDA<32>)(grid, threads)(d_C, d_A, d_B, dimsA.x, dimsB.x);
         }
     }
 
     // Record the stop event
     error = cudaEventRecord(stop, NULL);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
         exit(EXIT_FAILURE);
@@ -280,7 +275,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     // Wait for the stop event to complete
     error = cudaEventSynchronize(stop);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
         exit(EXIT_FAILURE);
@@ -289,7 +284,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     float msecTotal = 0.0f;
     error = cudaEventElapsedTime(&msecTotal, start, stop);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
         exit(EXIT_FAILURE);
@@ -297,7 +292,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
 
     // Compute and print the performance
     float msecPerMatrixMul = msecTotal / nIter;
-    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
+    double flopsPerMatrixMul = 2.0 * (double) dimsA.x * (double) dimsA.y * (double) dimsB.x;
     double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
     printf(
         "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
@@ -309,7 +304,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     // Copy result from device to host
     error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
         exit(EXIT_FAILURE);
@@ -320,18 +315,18 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
 
     // test relative error by the formula
     //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
-    double eps = 1.e-6 ; // machine zero
+    double eps = 1.e-6; // machine zero
 
-    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
+    for(int i = 0; i < (int) (dimsC.x * dimsC.y); i++)
     {
         double abs_err = fabs(h_C[i] - (dimsA.x * valB));
         double dot_length = dimsA.x;
         double abs_val = fabs(h_C[i]);
-        double rel_err = abs_err/abs_val/dot_length ;
+        double rel_err = abs_err / abs_val / dot_length;
 
-        if (rel_err > eps)
+        if(rel_err > eps)
         {
-            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
             correct = false;
         }
     }
@@ -355,7 +350,7 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
     // flushed before the application exits
     cudaDeviceReset();
 
-    if (correct)
+    if(correct)
     {
         return EXIT_SUCCESS;
     }
@@ -369,12 +364,11 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dim
 /**
  * Program main
  */
-int main(int argc, char **argv)
+int main(int argc, char** argv)
 {
     printf("[Matrix Multiply Using CUDA] - Starting...\n");
 
-    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
-        checkCmdLineFlag(argc, (const char **)argv, "?"))
+    if(checkCmdLineFlag(argc, (const char**) argv, "help") || checkCmdLineFlag(argc, (const char**) argv, "?"))
     {
         printf("Usage -device=n (n >= 0 for deviceID)\n");
         printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
@@ -387,73 +381,73 @@ int main(int argc, char **argv)
     // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
     int devID = 0;
 
-    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
+    if(checkCmdLineFlag(argc, (const char**) argv, "device"))
     {
-        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
+        devID = getCmdLineArgumentInt(argc, (const char**) argv, "device");
         cudaSetDevice(devID);
     }
 
     cudaError_t error;
-//  cudaDeviceProp deviceProp;
+    //  cudaDeviceProp deviceProp;
     error = cudaGetDevice(&devID);
 
-    if (error != cudaSuccess)
+    if(error != cudaSuccess)
     {
         printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
     }
-/*
-    error = cudaGetDeviceProperties(&deviceProp, devID);
+    /*
+        error = cudaGetDeviceProperties(&deviceProp, devID);
 
-    if (deviceProp.computeMode == cudaComputeModeProhibited)
-    {
-        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
-        exit(EXIT_SUCCESS);
-    }
+        if (deviceProp.computeMode == cudaComputeModeProhibited)
+        {
+            fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use
+       ::cudaSetDevice().\n"); exit(EXIT_SUCCESS);
+        }
 
-    if (error != cudaSuccess)
-    {
-        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
-    }
-    else
-    {
-        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
-    }
- */
+        if (error != cudaSuccess)
+        {
+            printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
+        }
+        else
+        {
+            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major,
+       deviceProp.minor);
+        }
+     */
 
     // Use a larger block size for Fermi and above
     int block_size = 16; // (deviceProp.major < 2) ? 16 : 32;
 
-    dim3 dimsA(5*2*block_size, 5*2*block_size, 1);
-    dim3 dimsB(5*4*block_size, 5*2*block_size, 1);
+    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
+    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
 
     // width of Matrix A
-    if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
+    if(checkCmdLineFlag(argc, (const char**) argv, "wA"))
     {
-        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
+        dimsA.x = getCmdLineArgumentInt(argc, (const char**) argv, "wA");
     }
 
     // height of Matrix A
-    if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
+    if(checkCmdLineFlag(argc, (const char**) argv, "hA"))
     {
-        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
+        dimsA.y = getCmdLineArgumentInt(argc, (const char**) argv, "hA");
     }
 
     // width of Matrix B
-    if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
+    if(checkCmdLineFlag(argc, (const char**) argv, "wB"))
     {
-        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
+        dimsB.x = getCmdLineArgumentInt(argc, (const char**) argv, "wB");
     }
 
     // height of Matrix B
-    if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
+    if(checkCmdLineFlag(argc, (const char**) argv, "hB"))
     {
-        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
+        dimsB.y = getCmdLineArgumentInt(argc, (const char**) argv, "hB");
     }
 
-    if (dimsA.x != dimsB.y)
+    if(dimsA.x != dimsB.y)
     {
-        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
-               dimsA.x, dimsB.y);
+        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
         exit(EXIT_FAILURE);
     }
 
diff --git a/example/CUDASamples/vectorAdd/src/vectorAdd.cpp b/example/CUDASamples/vectorAdd/src/vectorAdd.cpp
index 1db118f9..1e37224c 100644
--- a/example/CUDASamples/vectorAdd/src/vectorAdd.cpp
+++ b/example/CUDASamples/vectorAdd/src/vectorAdd.cpp
@@ -15,40 +15,49 @@
  * of the programming guide with some additions like error checking.
  */
 
-#include <stdio.h>
 #include <iostream> //std:cout
+
+#include <stdio.h>
 // For the CUDA runtime routines (prefixed with "cuda_")
 #include <cuda_to_cupla.hpp>
-//Timer for test purpose
+// Timer for test purpose
 #include <chrono>
-#include <boost/lexical_cast.hpp>
 #include <vector>
+
+#include <boost/lexical_cast.hpp>
 /**
  * CUDA Kernel Device code
  *
  * Computes the vector addition of A and B into C. The 3 vectors have the same
  * number of elements numElements.
  */
-struct vectorAdd {
+struct vectorAdd
+{
     template<typename T_Acc>
-    ALPAKA_FN_HOST_ACC
-    void operator()(T_Acc const &acc, const float *A, const float *B, float *C, const int numElements) const {
+    ALPAKA_FN_HOST_ACC void operator()(
+        T_Acc const& acc,
+        const float* A,
+        const float* B,
+        float* C,
+        const int numElements) const
+    {
         int begin = blockDim.x * blockIdx.x * elemDim.x + threadIdx.x * elemDim.x;
-        if (begin < numElements) {
-            int end = (begin + elemDim.x < numElements) ? begin+elemDim.x : numElements;
-            for (int i=begin; i <end; ++i) {
+        if(begin < numElements)
+        {
+            int end = (begin + elemDim.x < numElements) ? begin + elemDim.x : numElements;
+            for(int i = begin; i < end; ++i)
+            {
                 C[i] = A[i] + B[i];
             }
         }
     }
 };
 
-void benchmarkTest(int first, int last , int stepSize);
+void benchmarkTest(int first, int last, int stepSize);
 /**
  * Host main routine
  */
-int
-main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
     // Error code to check return values for CUDA calls
     cudaError_t err = cudaSuccess;
@@ -59,53 +68,53 @@ main(int argc, char *argv[])
     printf("[Vector addition of %d elements]\n", numElements);
 
     // Allocate the host input vector A
-    float *h_A = (float *)malloc(size);
+    float* h_A = (float*) malloc(size);
 
     // Allocate the host input vector B
-    float *h_B = (float *)malloc(size);
+    float* h_B = (float*) malloc(size);
 
     // Allocate the host output vector C
-    float *h_C = (float *)malloc(size);
+    float* h_C = (float*) malloc(size);
 
     // Verify that allocations succeeded
-    if (h_A == NULL || h_B == NULL || h_C == NULL)
+    if(h_A == NULL || h_B == NULL || h_C == NULL)
     {
         fprintf(stderr, "Failed to allocate host vectors!\n");
         exit(EXIT_FAILURE);
     }
 
     // Initialize the host input vectors
-    for (int i = 0; i < numElements; ++i)
+    for(int i = 0; i < numElements; ++i)
     {
-        h_A[i] = rand()/(float)RAND_MAX;
-        h_B[i] = rand()/(float)RAND_MAX;
+        h_A[i] = rand() / (float) RAND_MAX;
+        h_B[i] = rand() / (float) RAND_MAX;
     }
 
     // Allocate the device input vector A
-    float *d_A = NULL;
-    err = cudaMalloc((void **)&d_A, size);
+    float* d_A = NULL;
+    err = cudaMalloc((void**) &d_A, size);
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
 
     // Allocate the device input vector B
-    float *d_B = NULL;
-    err = cudaMalloc((void **)&d_B, size);
+    float* d_B = NULL;
+    err = cudaMalloc((void**) &d_B, size);
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
 
     // Allocate the device output vector C
-    float *d_C = NULL;
-    err = cudaMalloc((void **)&d_C, size);
+    float* d_C = NULL;
+    err = cudaMalloc((void**) &d_C, size);
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -116,7 +125,7 @@ main(int argc, char *argv[])
     printf("Copy input data from the host memory to the CUDA device\n");
     err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -124,7 +133,7 @@ main(int argc, char *argv[])
 
     err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -132,12 +141,12 @@ main(int argc, char *argv[])
 
     // Launch the Vector Add CUDA Kernel
     int threadsPerBlock = 256;
-    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
     printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
-    CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock,0,0)(d_A, d_B, d_C, numElements);
+    CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock, 0, 0)(d_A, d_B, d_C, numElements);
     err = cudaGetLastError();
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -148,16 +157,16 @@ main(int argc, char *argv[])
     printf("Copy output data from the CUDA device to the host memory\n");
     err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
 
     // Verify that the result vector is correct
-    for (int i = 0; i < numElements; ++i)
+    for(int i = 0; i < numElements; ++i)
     {
-        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
+        if(fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
         {
             fprintf(stderr, "Result verification failed at element %d!\n", i);
             exit(EXIT_FAILURE);
@@ -169,7 +178,7 @@ main(int argc, char *argv[])
     // Free device global memory
     err = cudaFree(d_A);
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -177,14 +186,14 @@ main(int argc, char *argv[])
 
     err = cudaFree(d_B);
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
     err = cudaFree(d_C);
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
@@ -203,86 +212,88 @@ main(int argc, char *argv[])
     // flushed before the application exits
     err = cudaDeviceReset();
 
-    if (err != cudaSuccess)
+    if(err != cudaSuccess)
     {
         fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
     printf("Done\n");
 
-    using boost::lexical_cast;
     using boost::bad_lexical_cast;
+    using boost::lexical_cast;
     std::vector<int> args;
-    while (*++argv){
-        try{
+    while(*++argv)
+    {
+        try
+        {
             args.push_back(lexical_cast<int>(*argv));
         }
-        catch( const bad_lexical_cast &){
+        catch(const bad_lexical_cast&)
+        {
             args.push_back(0);
         }
     }
-    //run benchmartest
+    // run benchmark test
     int first = 50000;
     int last = 100000;
-    int stepSize= 50000;
-    if (args.size() >1){
-        first=args[0];
-        last=args[1];
+    int stepSize = 50000;
+    if(args.size() > 1)
+    {
+        first = args[0];
+        last = args[1];
     }
-    if (args.size()>2){
-        stepSize=args[2];
+    if(args.size() > 2)
+    {
+        stepSize = args[2];
     }
     benchmarkTest(first, last, stepSize);
     cudaDeviceReset();
     return 0;
 }
 
-void
-benchmarkTest(int first, int last, int stepSize)
+void benchmarkTest(int first, int last, int stepSize)
 {
-
-    for (int numElements = first; numElements <=last ; numElements+= stepSize) {
-        std::cout <<"N= " <<numElements << "; ";
+    for(int numElements = first; numElements <= last; numElements += stepSize)
+    {
+        std::cout << "N= " << numElements << "; ";
         size_t size = numElements * sizeof(float);
-        //alloc host memory
-        float *h_A = (float *)malloc(size);
-        float *h_B = (float *)malloc(size);
-        //init
-        for (int i = 0; i < numElements; ++i) {
-            h_A[i] = rand()/(float)RAND_MAX;
-            h_B[i] = rand()/(float)RAND_MAX;
+        // alloc host memory
+        float* h_A = (float*) malloc(size);
+        float* h_B = (float*) malloc(size);
+        // init
+        for(int i = 0; i < numElements; ++i)
+        {
+            h_A[i] = rand() / (float) RAND_MAX;
+            h_B[i] = rand() / (float) RAND_MAX;
         }
-        //alloc device memory
-        float *d_A = NULL;
-        cudaMalloc((void **) &d_A, size);
-        float *d_B = NULL;
-        cudaMalloc((void **) &d_B, size);
-        float *d_C = NULL;
-        cudaMalloc((void **) &d_C, size);
+        // alloc device memory
+        float* d_A = NULL;
+        cudaMalloc((void**) &d_A, size);
+        float* d_B = NULL;
+        cudaMalloc((void**) &d_B, size);
+        float* d_C = NULL;
+        cudaMalloc((void**) &d_C, size);
 
         // copy host device
         cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
         cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
 
-        int threadsPerBlock=1024;
-        int blocksPerGrid= (numElements+threadsPerBlock-1)/threadsPerBlock;
+        int threadsPerBlock = 1024;
+        int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
 
-        //Run Kernel
-        std::chrono::high_resolution_clock::time_point start =
-            std::chrono::high_resolution_clock::now();
+        // Run Kernel
+        std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
 
         CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock, 0, 0)(d_A, d_B, d_C, numElements);
         cudaDeviceSynchronize();
 
-        std::chrono::high_resolution_clock::time_point end =
-                std::chrono::high_resolution_clock::now();
+        std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
 
-        std::cout << "Time: "<< std::chrono::duration_cast<std::chrono::milliseconds>
-                                        (end-start).count() <<"ms"<<std::endl;
-        //Free Device memory
+        std::cout << "Time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms"
+                  << std::endl;
+        // Free Device memory
         cudaFree(d_A);
         cudaFree(d_B);
         cudaFree(d_C);
     }
 }
-
diff --git a/include/cuda_to_cupla.hpp b/include/cuda_to_cupla.hpp
index 2070e92c..99bd48ef 100644
--- a/include/cuda_to_cupla.hpp
+++ b/include/cuda_to_cupla.hpp
@@ -22,8 +22,6 @@
 #pragma once
 
 #include "cupla.hpp"
-
-#include "cupla/device_functions.hpp"
-
 #include "cupla/cudaToCupla/driverTypes.hpp"
 #include "cupla/cudaToCupla/runtime.hpp"
+#include "cupla/device_functions.hpp"
diff --git a/include/cupla.hpp b/include/cupla.hpp
index 85284e6d..74701944 100644
--- a/include/cupla.hpp
+++ b/include/cupla.hpp
@@ -21,5 +21,5 @@
 
 #pragma once
 
-#include "cupla_runtime.hpp"
 #include "cupla/device_functions.hpp"
+#include "cupla_runtime.hpp"
diff --git a/include/cupla/api/common.hpp b/include/cupla/api/common.hpp
index fdcaac9f..12c9e7c2 100644
--- a/include/cupla/api/common.hpp
+++ b/include/cupla/api/common.hpp
@@ -21,41 +21,36 @@
 
 #pragma once
 
-#include <alpaka/alpaka.hpp>
-
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
 #include "cupla_driver_types.hpp"
 
+#include <alpaka/alpaka.hpp>
+
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    const char* cuplaGetErrorName(cuplaError_t);
 
-const char *
-cuplaGetErrorName(cuplaError_t);
+    const char* cuplaGetErrorString(cuplaError_t);
 
-const char *
-cuplaGetErrorString(cuplaError_t);
 
-
-/** returns the last error from a runtime call.
- *
- * This call reset the error code to cuplaSuccess
- * @warning If a non CUDA Alpaka backend is used this function will return always cuplaSuccess
- *
- * @return cuplaSuccess if there was no error else the corresponding error type
- */
-cuplaError_t
-cuplaGetLastError();
+    /** returns the last error from a runtime call.
+     *
+     * This call resets the error code to cuplaSuccess
+     * @warning If a non-CUDA Alpaka backend is used, this function will always return cuplaSuccess
+     *
+     * @return cuplaSuccess if there was no error else the corresponding error type
+     */
+    cuplaError_t cuplaGetLastError();
 
 
-/** returns the last error from a runtime call.
- *
- * This call does not reset the error code.
- * @warning If a non CUDA Alpaka backend is used this function will return always cuplaSuccess
- *
- * @return cuplaSuccess if there was no error else the corresponding error type
- */
-cuplaError_t
-cuplaPeekAtLastError();
+    /** returns the last error from a runtime call.
+     *
+     * This call does not reset the error code.
+     * @warning If a non-CUDA Alpaka backend is used, this function will always return cuplaSuccess
+     *
+     * @return cuplaSuccess if there was no error else the corresponding error type
+     */
+    cuplaError_t cuplaPeekAtLastError();
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/include/cupla/api/device.hpp b/include/cupla/api/device.hpp
index c2ae1be6..8a58d2aa 100644
--- a/include/cupla/api/device.hpp
+++ b/include/cupla/api/device.hpp
@@ -21,34 +21,24 @@
 
 #pragma once
 
-#include <alpaka/alpaka.hpp>
-
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
 #include "cupla_driver_types.hpp"
 
+#include <alpaka/alpaka.hpp>
+
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    cuplaError_t cuplaGetDeviceCount(int* count);
 
-cuplaError_t
-cuplaGetDeviceCount( int * count);
-
-cuplaError_t
-cuplaSetDevice( int idx);
+    cuplaError_t cuplaSetDevice(int idx);
 
-cuplaError_t
-cuplaGetDevice( int * deviceId );
+    cuplaError_t cuplaGetDevice(int* deviceId);
 
-cuplaError_t
-cuplaDeviceReset( );
+    cuplaError_t cuplaDeviceReset();
 
-cuplaError_t
-cuplaDeviceSynchronize( );
+    cuplaError_t cuplaDeviceSynchronize();
 
-cuplaError_t
-cuplaMemGetInfo(
-    size_t * free,
-    size_t * total
-);
+    cuplaError_t cuplaMemGetInfo(size_t* free, size_t* total);
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/include/cupla/api/event.hpp b/include/cupla/api/event.hpp
index 6b93421f..b95c1ada 100644
--- a/include/cupla/api/event.hpp
+++ b/include/cupla/api/event.hpp
@@ -21,48 +21,26 @@
 
 #pragma once
 
-#include <alpaka/alpaka.hpp>
-
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
 #include "cupla_driver_types.hpp"
 
+#include <alpaka/alpaka.hpp>
+
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    cuplaError_t cuplaEventCreateWithFlags(cuplaEvent_t* event, unsigned int flags);
 
-cuplaError_t
-cuplaEventCreateWithFlags(
-    cuplaEvent_t * event,
-    unsigned int flags
-);
-
-cuplaError_t
-cuplaEventCreate(
-    cuplaEvent_t * event
-);
+    cuplaError_t cuplaEventCreate(cuplaEvent_t* event);
 
-cuplaError_t
-cuplaEventDestroy( cuplaEvent_t event );
+    cuplaError_t cuplaEventDestroy(cuplaEvent_t event);
 
-cuplaError_t
-cuplaEventRecord(
-    cuplaEvent_t event,
-    cuplaStream_t stream = 0
-);
+    cuplaError_t cuplaEventRecord(cuplaEvent_t event, cuplaStream_t stream = 0);
 
-cuplaError_t
-cuplaEventElapsedTime(
-    float * ms,
-    cuplaEvent_t start,
-    cuplaEvent_t end
-);
+    cuplaError_t cuplaEventElapsedTime(float* ms, cuplaEvent_t start, cuplaEvent_t end);
 
-cuplaError_t
-cuplaEventSynchronize(
-    cuplaEvent_t event
-);
+    cuplaError_t cuplaEventSynchronize(cuplaEvent_t event);
 
-cuplaError_t
-cuplaEventQuery( cuplaEvent_t event );
+    cuplaError_t cuplaEventQuery(cuplaEvent_t event);
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/include/cupla/api/memory.hpp b/include/cupla/api/memory.hpp
index 27f841fc..79ba0e70 100644
--- a/include/cupla/api/memory.hpp
+++ b/include/cupla/api/memory.hpp
@@ -21,140 +21,74 @@
 
 #pragma once
 
-#include <alpaka/alpaka.hpp>
-
-#include "cupla/datatypes/dim3.hpp"
-#include "cupla/datatypes/uint.hpp"
 #include "cupla/c/datatypes/cuplaExtent.hpp"
+#include "cupla/c/datatypes/cuplaMemcpy3DParms.hpp"
 #include "cupla/c/datatypes/cuplaPitchedPtr.hpp"
-
+#include "cupla/c/datatypes/cuplaPos.hpp"
+#include "cupla/datatypes/dim3.hpp"
+#include "cupla/datatypes/uint.hpp"
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
 #include "cupla_driver_types.hpp"
 
+#include <alpaka/alpaka.hpp>
+
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    cuplaError_t cuplaMalloc(void** ptrptr, size_t size);
+
+    cuplaError_t cuplaMallocHost(void** ptrptr, size_t size);
+
+
+    cuplaError_t cuplaMallocPitch(void** devPtr, size_t* pitch, size_t const width, size_t const height);
+
+    cuplaError_t cuplaMalloc3D(cuplaPitchedPtr* pitchedDevPtr, cuplaExtent const extent);
+
+
+    cuplaExtent make_cuplaExtent(size_t const w, size_t const h, size_t const d);
+
+    cuplaPos make_cuplaPos(size_t const x, size_t const y, size_t const z);
+
+    cuplaPitchedPtr make_cuplaPitchedPtr(void* const d, size_t const p, size_t const xsz, size_t const ysz);
+
+    cuplaError_t cuplaFree(void* ptr);
+
+    cuplaError_t cuplaFreeHost(void* ptr);
+
+    cuplaError_t cuplaMemcpy(void* dst, const void* src, size_t count, enum cuplaMemcpyKind kind);
+
+    cuplaError_t cuplaMemcpyAsync(
+        void* dst,
+        const void* src,
+        size_t count,
+        enum cuplaMemcpyKind kind,
+        cuplaStream_t stream = 0);
+
+    cuplaError_t cuplaMemsetAsync(void* devPtr, int value, size_t count, cuplaStream_t stream = 0);
+
+    cuplaError_t cuplaMemset(void* devPtr, int value, size_t count);
+
+    cuplaError_t cuplaMemcpy2D(
+        void* dst,
+        size_t const dPitch,
+        void const* const src,
+        size_t const spitch,
+        size_t const width,
+        size_t const height,
+        enum cuplaMemcpyKind kind);
+
+    cuplaError_t cuplaMemcpy2DAsync(
+        void* dst,
+        size_t const dPitch,
+        void const* const src,
+        size_t const spitch,
+        size_t const width,
+        size_t const height,
+        enum cuplaMemcpyKind kind,
+        cuplaStream_t const stream = 0);
+
+    cuplaError_t cuplaMemcpy3DAsync(const cuplaMemcpy3DParms* const p, cuplaStream_t stream = 0);
+
+    cuplaError_t cuplaMemcpy3D(const cuplaMemcpy3DParms* const p);
 
-cuplaError_t
-cuplaMalloc(
-    void **ptrptr,
-    size_t size
-);
-
-cuplaError_t
-cuplaMallocHost(
-    void **ptrptr,
-    size_t size
-);
-
-
-cuplaError_t
-cuplaMallocPitch(
-    void ** devPtr,
-    size_t * pitch,
-    size_t const width,
-    size_t const height
-);
-
-cuplaError_t
-cuplaMalloc3D(
-    cuplaPitchedPtr * pitchedDevPtr,
-    cuplaExtent const extent
-);
-
-
-cuplaExtent
-make_cuplaExtent(
-    size_t const w,
-    size_t const h,
-    size_t const d
-);
-
-cuplaPos
-make_cuplaPos(
-    size_t const x,
-    size_t const y,
-    size_t const z
-);
-
-cuplaPitchedPtr
-make_cuplaPitchedPtr(
-    void * const d,
-    size_t const p,
-    size_t const xsz,
-    size_t const ysz
-);
-
-cuplaError_t
-cuplaFree(void *ptr);
-
-cuplaError_t
-cuplaFreeHost(void *ptr);
-
-cuplaError_t
-cuplaMemcpy(
-    void *dst,
-    const void *src,
-    size_t count,
-    enum cuplaMemcpyKind kind
-);
-
-cuplaError_t
-cuplaMemcpyAsync(
-    void *dst,
-    const void *src,
-    size_t count,
-    enum cuplaMemcpyKind kind,
-    cuplaStream_t stream = 0
-);
-
-cuplaError_t
-cuplaMemsetAsync(
-    void * devPtr,
-    int value,
-    size_t count,
-    cuplaStream_t stream = 0
-);
-
-cuplaError_t
-cuplaMemset(
-    void * devPtr,
-    int value,
-    size_t count
-);
-
-cuplaError_t
-cuplaMemcpy2D(
-    void * dst,
-    size_t const dPitch,
-    void const * const src,
-    size_t const spitch,
-    size_t const width,
-    size_t const height,
-    enum cuplaMemcpyKind kind
-);
-
-cuplaError_t
-cuplaMemcpy2DAsync(
-    void * dst,
-    size_t const dPitch,
-    void const * const src,
-    size_t const spitch,
-    size_t const width,
-    size_t const height,
-    enum cuplaMemcpyKind kind,
-    cuplaStream_t const stream = 0
-);
-
-cuplaError_t
-cuplaMemcpy3DAsync(
-    const cuplaMemcpy3DParms * const p,
-    cuplaStream_t stream = 0
-);
-
-cuplaError_t
-cuplaMemcpy3D(
-    const cuplaMemcpy3DParms * const p
-);
-
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/include/cupla/api/stream.hpp b/include/cupla/api/stream.hpp
index d68508de..0c99b8f3 100644
--- a/include/cupla/api/stream.hpp
+++ b/include/cupla/api/stream.hpp
@@ -21,36 +21,22 @@
 
 #pragma once
 
-#include <alpaka/alpaka.hpp>
-
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
 #include "cupla_driver_types.hpp"
 
+#include <alpaka/alpaka.hpp>
+
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    cuplaError_t cuplaStreamCreate(cuplaStream_t* stream);
 
-cuplaError_t
-cuplaStreamCreate(
-    cuplaStream_t * stream
-);
-
-cuplaError_t
-cuplaStreamDestroy( cuplaStream_t stream );
+    cuplaError_t cuplaStreamDestroy(cuplaStream_t stream);
 
-cuplaError_t
-cuplaStreamSynchronize(
-    cuplaStream_t stream
-);
+    cuplaError_t cuplaStreamSynchronize(cuplaStream_t stream);
 
-cuplaError_t
-cuplaStreamWaitEvent(
-    cuplaStream_t stream,
-    cuplaEvent_t event,
-    unsigned int flags
-);
+    cuplaError_t cuplaStreamWaitEvent(cuplaStream_t stream, cuplaEvent_t event, unsigned int flags);
 
-cuplaError_t
-cuplaStreamQuery( cuplaStream_t stream );
+    cuplaError_t cuplaStreamQuery(cuplaStream_t stream);
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/include/cupla/c/datatypes/cuplaArray.hpp b/include/cupla/c/datatypes/cuplaArray.hpp
index 34905370..588a4eb8 100644
--- a/include/cupla/c/datatypes/cuplaArray.hpp
+++ b/include/cupla/c/datatypes/cuplaArray.hpp
@@ -21,17 +21,16 @@
 
 #pragma once
 
-#include "cupla/namespace.hpp"
-#include "cupla/types.hpp"
+#include "cupla/c/datatypes/cuplaExtent.hpp"
 #include "cupla/c/datatypes/cuplaPitchedPtr.hpp"
 #include "cupla/c/datatypes/cuplaPos.hpp"
-#include "cupla/c/datatypes/cuplaExtent.hpp"
+#include "cupla/namespace.hpp"
+#include "cupla/types.hpp"
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    struct cuplaArray
+    {
+    };
 
-struct cuplaArray
-{
-};
-
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/include/cupla/c/datatypes/cuplaExtent.hpp b/include/cupla/c/datatypes/cuplaExtent.hpp
index 59721eb1..b5559606 100644
--- a/include/cupla/c/datatypes/cuplaExtent.hpp
+++ b/include/cupla/c/datatypes/cuplaExtent.hpp
@@ -26,199 +26,127 @@
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
-
-struct cuplaExtent{
-    cupla::MemSizeType width, height, depth;
-
-    cuplaExtent() = default;
-
-    ALPAKA_FN_HOST_ACC
-    cuplaExtent(
-        cupla::MemSizeType const w,
-        cupla::MemSizeType const h,
-        cupla::MemSizeType const d
-    ) :
-        width( w ),
-        height( h ),
-        depth( d )
-    {}
-
-    template<
-      typename TDim,
-      typename TSize,
-      typename = typename std::enable_if<
-          (TDim::value == 3u)
-      >::type
-    >
-    ALPAKA_FN_HOST_ACC
-    cuplaExtent(
-        ::alpaka::Vec<
-            TDim,
-            TSize
-        > const &vec
-    )
-    {
-        for( uint32_t i(0); i < 3u; ++i ) {
-            // alpaka vectors are z,y,x.
-            ( &this->width )[ i ] = vec[ ( 3u - 1u ) - i ];
-        }
-    }
-
-    ALPAKA_FN_HOST_ACC
-    operator ::alpaka::Vec<
-        cupla::AlpakaDim< 3u >,
-        cupla::MemSizeType
-    >(void) const
+    struct cuplaExtent
     {
-        ::alpaka::Vec<
-            cupla::AlpakaDim< 3u >,
-            cupla::MemSizeType
-        > vec( depth, height, width );
-        return vec;
-    }
-};
+        cupla::MemSizeType width, height, depth;
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+        cuplaExtent() = default;
 
+        ALPAKA_FN_HOST_ACC
+        cuplaExtent(cupla::MemSizeType const w, cupla::MemSizeType const h, cupla::MemSizeType const d)
+            : width(w)
+            , height(h)
+            , depth(d)
+        {
+        }
 
-namespace alpaka
-{
-namespace traits
-{
+        template<typename TDim, typename TSize, typename = typename std::enable_if<(TDim::value == 3u)>::type>
+        ALPAKA_FN_HOST_ACC cuplaExtent(::alpaka::Vec<TDim, TSize> const& vec)
+        {
+            for(uint32_t i(0); i < 3u; ++i)
+            {
+                // alpaka vectors are z,y,x.
+                (&this->width)[i] = vec[(3u - 1u) - i];
+            }
+        }
 
-    //! dimension get trait specialization
-    template<>
-    struct DimType<
-        cuplaExtent
-    >{
-      using type = ::alpaka::DimInt<3u>;
+        ALPAKA_FN_HOST_ACC
+        operator ::alpaka::Vec<cupla::AlpakaDim<3u>, cupla::MemSizeType>(void) const
+        {
+            ::alpaka::Vec<cupla::AlpakaDim<3u>, cupla::MemSizeType> vec(depth, height, width);
+            return vec;
+        }
     };
 
-} // namespace traits
-
-namespace traits
-{
-
-    //! element type trait specialization
-    template<>
-    struct ElemType<
-        cuplaExtent
-    >{
-        using type = cupla::MemSizeType;
-    };
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
 
-} // namespace traits
 
-namespace extent
-{
-namespace traits
+namespace alpaka
 {
-
-    //! extent get trait specialization
-    template<
-        typename T_Idx
-    >
-    struct GetExtent<
-        T_Idx,
-        cuplaExtent,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-
-        ALPAKA_FN_HOST_ACC
-        static auto
-        getExtent( cuplaExtent const & extents )
-        -> cupla::MemSizeType {
-        return (&extents.width)[(3u - 1u) - T_Idx::value];
-      }
-    };
-
-    //! extent set trait specialization
-    template<
-        typename T_Idx,
-        typename T_Extent
-    >
-    struct SetExtent<
-        T_Idx,
-        cuplaExtent,
-        T_Extent,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-        ALPAKA_FN_HOST_ACC
-        static auto
-        setExtent(
-            cuplaExtent &extents,
-            T_Extent const &extent
-        )
-        -> void
+    namespace traits
+    {
+        //! dimension get trait specialization
+        template<>
+        struct DimType<cuplaExtent>
         {
-            (&extents.width)[(3u - 1u) - T_Idx::value] = extent;
-        }
-    };
-} // namespace traits
-} // namespace extent
+            using type = ::alpaka::DimInt<3u>;
+        };
 
-namespace traits
-{
+    } // namespace traits
 
-    //! offset get trait specialization
-    template<
-        typename T_Idx
-    >
-    struct GetOffset<
-        T_Idx,
-        cuplaExtent,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-        ALPAKA_FN_HOST_ACC
-        static auto
-        getOffset( cuplaExtent const & offsets )
-        -> cupla::MemSizeType{
-            return (&offsets.width)[(3u - 1u) - T_Idx::value];
-        }
-    };
+    namespace traits
+    {
+        //! element type trait specialization
+        template<>
+        struct ElemType<cuplaExtent>
+        {
+            using type = cupla::MemSizeType;
+        };
 
+    } // namespace traits
 
-    //! offset set trait specialization.
-    template<
-        typename T_Idx,
-        typename T_Offset
-    >
-    struct SetOffset<
-        T_Idx,
-        cuplaExtent,
-        T_Offset,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-        ALPAKA_FN_HOST_ACC
-        static auto
-        setOffset(
-            cuplaExtent &offsets,
-            T_Offset const &offset
-        )
-        -> void {
-            offsets[(3u - 1u) - T_Idx::value] = offset;
-        }
-    };
-} // namespace traits
+    namespace extent
+    {
+        namespace traits
+        {
+            //! extent get trait specialization
+            template<typename T_Idx>
+            struct GetExtent<T_Idx, cuplaExtent, typename std::enable_if<(3u > T_Idx::value)>::type>
+            {
+                ALPAKA_FN_HOST_ACC
+                static auto getExtent(cuplaExtent const& extents) -> cupla::MemSizeType
+                {
+                    return (&extents.width)[(3u - 1u) - T_Idx::value];
+                }
+            };
+
+            //! extent set trait specialization
+            template<typename T_Idx, typename T_Extent>
+            struct SetExtent<T_Idx, cuplaExtent, T_Extent, typename std::enable_if<(3u > T_Idx::value)>::type>
+            {
+                ALPAKA_FN_HOST_ACC
+                static auto setExtent(cuplaExtent& extents, T_Extent const& extent) -> void
+                {
+                    (&extents.width)[(3u - 1u) - T_Idx::value] = extent;
+                }
+            };
+        } // namespace traits
+    } // namespace extent
+
+    namespace traits
+    {
+        //! offset get trait specialization
+        template<typename T_Idx>
+        struct GetOffset<T_Idx, cuplaExtent, typename std::enable_if<(3u > T_Idx::value)>::type>
+        {
+            ALPAKA_FN_HOST_ACC
+            static auto getOffset(cuplaExtent const& offsets) -> cupla::MemSizeType
+            {
+                return (&offsets.width)[(3u - 1u) - T_Idx::value];
+            }
+        };
 
-namespace traits
-{
 
-    //! size type trait specialization.
-    template<>
-    struct IdxType<
-        cuplaExtent
-    >{
-        using type = cupla::MemSizeType;
-    };
+        //! offset set trait specialization: alpaka index 0 maps to depth, index 2 to width.
+        template<typename T_Idx, typename T_Offset>
+        struct SetOffset<T_Idx, cuplaExtent, T_Offset, typename std::enable_if<(3u > T_Idx::value)>::type>
+        {
+            ALPAKA_FN_HOST_ACC
+            static auto setOffset(cuplaExtent& offsets, T_Offset const& offset) -> void
+            {
+                (&offsets.width)[(3u - 1u) - T_Idx::value] = offset; // cuplaExtent has no operator[]
+            }
+        };
+    } // namespace traits
+
+    namespace traits
+    {
+        //! size type trait specialization.
+        template<>
+        struct IdxType<cuplaExtent>
+        {
+            using type = cupla::MemSizeType;
+        };
 
-} // namespace traits
-} // namespave alpaka
+    } // namespace traits
+} // namespace alpaka
diff --git a/include/cupla/c/datatypes/cuplaMemcpy3DParms.hpp b/include/cupla/c/datatypes/cuplaMemcpy3DParms.hpp
index f23da699..de5caa2e 100644
--- a/include/cupla/c/datatypes/cuplaMemcpy3DParms.hpp
+++ b/include/cupla/c/datatypes/cuplaMemcpy3DParms.hpp
@@ -21,28 +21,27 @@
 
 #pragma once
 
-#include "cupla/namespace.hpp"
-#include "cupla/types.hpp"
 #include "cupla/c/datatypes/cuplaArray.hpp"
+#include "cupla/c/datatypes/cuplaExtent.hpp"
 #include "cupla/c/datatypes/cuplaPitchedPtr.hpp"
 #include "cupla/c/datatypes/cuplaPos.hpp"
-#include "cupla/c/datatypes/cuplaExtent.hpp"
+#include "cupla/namespace.hpp"
+#include "cupla/types.hpp"
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    struct cuplaMemcpy3DParms
+    {
+        cuplaArray* dstArray;
+        cuplaPos dstPos;
+        cuplaPitchedPtr dstPtr;
+        cuplaExtent extent;
+        cuplaMemcpyKind kind;
+        cuplaArray* srcArray;
+        cuplaPos srcPos;
+        cuplaPitchedPtr srcPtr;
 
-struct cuplaMemcpy3DParms
-{
-    cuplaArray* dstArray;
-    cuplaPos dstPos;
-    cuplaPitchedPtr dstPtr;
-    cuplaExtent extent;
-    cuplaMemcpyKind kind;
-    cuplaArray * srcArray;
-    cuplaPos srcPos;
-    cuplaPitchedPtr srcPtr;
-
-    cuplaMemcpy3DParms() = default;
-};
+        cuplaMemcpy3DParms() = default;
+    };
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/include/cupla/c/datatypes/cuplaPitchedPtr.hpp b/include/cupla/c/datatypes/cuplaPitchedPtr.hpp
index 881349c0..ee139af6 100644
--- a/include/cupla/c/datatypes/cuplaPitchedPtr.hpp
+++ b/include/cupla/c/datatypes/cuplaPitchedPtr.hpp
@@ -21,33 +21,31 @@
 
 #pragma once
 
+#include "cupla/datatypes/uint.hpp"
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
-#include "cupla/datatypes/uint.hpp"
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
-
-struct cuplaPitchedPtr
-{
-    void * ptr;
-    cupla::MemSizeType pitch, xsize, ysize;
-
-    cuplaPitchedPtr() = default;
-
-    ALPAKA_FN_HOST_ACC
-    cuplaPitchedPtr(
-        void * const d,
-        cupla::MemSizeType const p,
-        cupla::MemSizeType const xsz,
-        cupla::MemSizeType const ysz
-    ) :
-        ptr( d ),
-        pitch( p ),
-        xsize( xsz ),
-        ysize( ysz )
-    {}
-
-};
-
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+    struct cuplaPitchedPtr
+    {
+        void* ptr;
+        cupla::MemSizeType pitch, xsize, ysize;
+
+        cuplaPitchedPtr() = default;
+
+        ALPAKA_FN_HOST_ACC
+        cuplaPitchedPtr(
+            void* const d,
+            cupla::MemSizeType const p,
+            cupla::MemSizeType const xsz,
+            cupla::MemSizeType const ysz)
+            : ptr(d)
+            , pitch(p)
+            , xsize(xsz)
+            , ysize(ysz)
+        {
+        }
+    };
+
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/include/cupla/c/datatypes/cuplaPos.hpp b/include/cupla/c/datatypes/cuplaPos.hpp
index b99a26de..1a56f46b 100644
--- a/include/cupla/c/datatypes/cuplaPos.hpp
+++ b/include/cupla/c/datatypes/cuplaPos.hpp
@@ -26,198 +26,123 @@
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
-
-struct cuplaPos{
-    size_t x, y, z;
-
-    cuplaPos() = default;
-
-    ALPAKA_FN_HOST_ACC
-    cuplaPos(
-        size_t const x_in,
-        size_t const y_in,
-        size_t const z_in
-    ) :
-        x( x_in ),
-        y( y_in ),
-        z( z_in )
-    {}
-
-    template<
-      typename TDim,
-      typename TSize,
-      typename = typename std::enable_if<
-          (TDim::value == 3u)
-      >::type
-    >
-    ALPAKA_FN_HOST_ACC
-    cuplaPos(
-        ::alpaka::Vec<
-            TDim,
-            TSize
-        > const &vec
-    )
-    {
-        for( uint32_t i(0); i < 3u; ++i ) {
-            // alpaka vectors are z,y,x.
-            ( &this->x )[ i ] = vec[ ( 3u - 1u ) - i ];
-        }
-    }
-
-    ALPAKA_FN_HOST_ACC
-    operator ::alpaka::Vec<
-        cupla::AlpakaDim< 3u >,
-        cupla::MemSizeType
-    >(void) const
+    struct cuplaPos
     {
-        ::alpaka::Vec<
-            cupla::AlpakaDim< 3u >,
-            cupla::MemSizeType
-        > vec( x, y, z );
-        return vec;
-    }
-};
-
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
-
-namespace alpaka
-{
-namespace traits
-{
-
-    //! dimension get trait specialization
-    template<>
-    struct DimType<
-        cuplaPos
-    >{
-      using type = ::alpaka::DimInt<3u>;
-    };
-
-} // namespace traits
+        size_t x, y, z;
 
-namespace traits
-{
-
-    //! element type trait specialization
-    template<>
-    struct ElemType<
-        cuplaPos
-    >{
-        using type = cupla::MemSizeType;
-    };
-
-} // namespace traits
-
-namespace extent
-{
-namespace traits
-{
-
-    //! extent get trait specialization
-    template<
-        typename T_Idx
-    >
-    struct GetExtent<
-        T_Idx,
-        cuplaPos,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
+        cuplaPos() = default;
 
         ALPAKA_FN_HOST_ACC
-        static auto
-        getExtent( cuplaPos const & extents )
-        -> cupla::MemSizeType {
-        return (&extents.x)[(3u - 1u) - T_Idx::value];
-      }
-    };
+        cuplaPos(size_t const x_in, size_t const y_in, size_t const z_in) : x(x_in), y(y_in), z(z_in)
+        {
+        }
+
+        template<typename TDim, typename TSize, typename = typename std::enable_if<(TDim::value == 3u)>::type>
+        ALPAKA_FN_HOST_ACC cuplaPos(::alpaka::Vec<TDim, TSize> const& vec)
+        {
+            for(uint32_t i(0); i < 3u; ++i)
+            {
+                // alpaka vectors are z,y,x.
+                (&this->x)[i] = vec[(3u - 1u) - i];
+            }
+        }
 
-    //! extent set trait specialization
-    template<
-        typename T_Idx,
-        typename T_Pos
-    >
-    struct SetExtent<
-        T_Idx,
-        cuplaPos,
-        T_Pos,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
         ALPAKA_FN_HOST_ACC
-        static auto
-        setExtent(
-            cuplaPos &extents,
-            T_Pos const &extent
-        )
-        -> void
+        operator ::alpaka::Vec<cupla::AlpakaDim<3u>, cupla::MemSizeType>(void) const
         {
-            (&extents.x)[(3u - 1u) - T_Idx::value] = extent;
+            ::alpaka::Vec<cupla::AlpakaDim<3u>, cupla::MemSizeType> vec(x, y, z);
+            return vec;
         }
     };
-} // namespace traits
-} // namespace extent
 
-namespace traits
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+
+namespace alpaka
 {
+    namespace traits
+    {
+        //! dimension get trait specialization
+        template<>
+        struct DimType<cuplaPos>
+        {
+            using type = ::alpaka::DimInt<3u>;
+        };
 
-    //! offset get trait specialization
-    template<
-        typename T_Idx
-    >
-    struct GetOffset<
-        T_Idx,
-        cuplaPos,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-        ALPAKA_FN_HOST_ACC
-        static auto
-        getOffset( cuplaPos const & offsets )
-        -> cupla::MemSizeType{
-            return (&offsets.x)[(3u - 1u) - T_Idx::value];
-        }
-    };
+    } // namespace traits
 
+    namespace traits
+    {
+        //! element type trait specialization
+        template<>
+        struct ElemType<cuplaPos>
+        {
+            using type = cupla::MemSizeType;
+        };
 
-    //! offset set trait specialization.
-    template<
-        typename T_Idx,
-        typename T_Offset
-    >
-    struct SetOffset<
-        T_Idx,
-        cuplaPos,
-        T_Offset,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-        ALPAKA_FN_HOST_ACC
-        static auto
-        setOffset(
-            cuplaPos &offsets,
-            T_Offset const &offset
-        )
-        -> void {
-            offsets[(3u - 1u) - T_Idx::value] = offset;
-        }
-    };
-} // namespace traits
+    } // namespace traits
 
-namespace traits
-{
+    namespace extent
+    {
+        namespace traits
+        {
+            //! extent get trait specialization
+            template<typename T_Idx>
+            struct GetExtent<T_Idx, cuplaPos, typename std::enable_if<(3u > T_Idx::value)>::type>
+            {
+                ALPAKA_FN_HOST_ACC
+                static auto getExtent(cuplaPos const& extents) -> cupla::MemSizeType
+                {
+                    return (&extents.x)[(3u - 1u) - T_Idx::value];
+                }
+            };
+
+            //! extent set trait specialization
+            template<typename T_Idx, typename T_Pos>
+            struct SetExtent<T_Idx, cuplaPos, T_Pos, typename std::enable_if<(3u > T_Idx::value)>::type>
+            {
+                ALPAKA_FN_HOST_ACC
+                static auto setExtent(cuplaPos& extents, T_Pos const& extent) -> void
+                {
+                    (&extents.x)[(3u - 1u) - T_Idx::value] = extent;
+                }
+            };
+        } // namespace traits
+    } // namespace extent
+
+    namespace traits
+    {
+        //! offset get trait specialization
+        template<typename T_Idx>
+        struct GetOffset<T_Idx, cuplaPos, typename std::enable_if<(3u > T_Idx::value)>::type>
+        {
+            ALPAKA_FN_HOST_ACC
+            static auto getOffset(cuplaPos const& offsets) -> cupla::MemSizeType
+            {
+                return (&offsets.x)[(3u - 1u) - T_Idx::value];
+            }
+        };
 
-    //! size type trait specialization.
-    template<>
-    struct IdxType<
-        cuplaPos
-    >{
-        using type = cupla::MemSizeType;
-    };
 
-} // namespace traits
-} // namespave alpaka
+        //! offset set trait specialization: alpaka index 0 maps to z, index 2 to x.
+        template<typename T_Idx, typename T_Offset>
+        struct SetOffset<T_Idx, cuplaPos, T_Offset, typename std::enable_if<(3u > T_Idx::value)>::type>
+        {
+            ALPAKA_FN_HOST_ACC
+            static auto setOffset(cuplaPos& offsets, T_Offset const& offset) -> void
+            {
+                (&offsets.x)[(3u - 1u) - T_Idx::value] = offset; // cuplaPos has no operator[]
+            }
+        };
+    } // namespace traits
+
+    namespace traits
+    {
+        //! size type trait specialization.
+        template<>
+        struct IdxType<cuplaPos>
+        {
+            using type = cupla::MemSizeType;
+        };
+
+    } // namespace traits
+} // namespace alpaka
diff --git a/include/cupla/config/AnyOacc.hpp b/include/cupla/config/AnyOacc.hpp
index e1602ad6..7000fbd1 100644
--- a/include/cupla/config/AnyOacc.hpp
+++ b/include/cupla/config/AnyOacc.hpp
@@ -24,20 +24,20 @@
 #include <alpaka/standalone/AnyOacc.hpp>
 
 #ifndef CUPLA_HEADER_ONLY
-#   define CUPLA_HEADER_ONLY 1
+#    define CUPLA_HEADER_ONLY 1
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC inline
+#if(CUPLA_HEADER_ONLY == 1)
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC inline
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   include "cupla/../../src/manager/Driver.cpp"
-#   include "cupla/../../src/common.cpp"
-#   include "cupla/../../src/device.cpp"
-#   include "cupla/../../src/event.cpp"
-#   include "cupla/../../src/memory.cpp"
-#   include "cupla/../../src/stream.cpp"
+#if(CUPLA_HEADER_ONLY == 1)
+#    include "cupla/../../src/common.cpp"
+#    include "cupla/../../src/device.cpp"
+#    include "cupla/../../src/event.cpp"
+#    include "cupla/../../src/manager/Driver.cpp"
+#    include "cupla/../../src/memory.cpp"
+#    include "cupla/../../src/stream.cpp"
 #endif
 
 #include "cupla.hpp"
diff --git a/include/cupla/config/AnyOmp5.hpp b/include/cupla/config/AnyOmp5.hpp
index 64f547d8..45089a01 100644
--- a/include/cupla/config/AnyOmp5.hpp
+++ b/include/cupla/config/AnyOmp5.hpp
@@ -24,20 +24,20 @@
 #include <alpaka/standalone/AnyOmp5.hpp>
 
 #ifndef CUPLA_HEADER_ONLY
-#   define CUPLA_HEADER_ONLY 1
+#    define CUPLA_HEADER_ONLY 1
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC inline
+#if(CUPLA_HEADER_ONLY == 1)
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC inline
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   include "cupla/../../src/manager/Driver.cpp"
-#   include "cupla/../../src/common.cpp"
-#   include "cupla/../../src/device.cpp"
-#   include "cupla/../../src/event.cpp"
-#   include "cupla/../../src/memory.cpp"
-#   include "cupla/../../src/stream.cpp"
+#if(CUPLA_HEADER_ONLY == 1)
+#    include "cupla/../../src/common.cpp"
+#    include "cupla/../../src/device.cpp"
+#    include "cupla/../../src/event.cpp"
+#    include "cupla/../../src/manager/Driver.cpp"
+#    include "cupla/../../src/memory.cpp"
+#    include "cupla/../../src/stream.cpp"
 #endif
 
 #include "cupla.hpp"
diff --git a/include/cupla/config/CpuOmp2Blocks.hpp b/include/cupla/config/CpuOmp2Blocks.hpp
index 34881724..75aef43d 100644
--- a/include/cupla/config/CpuOmp2Blocks.hpp
+++ b/include/cupla/config/CpuOmp2Blocks.hpp
@@ -24,20 +24,20 @@
 #include <alpaka/standalone/CpuOmp2Blocks.hpp>
 
 #ifndef CUPLA_HEADER_ONLY
-#   define CUPLA_HEADER_ONLY 1
+#    define CUPLA_HEADER_ONLY 1
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC inline
+#if(CUPLA_HEADER_ONLY == 1)
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC inline
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   include "cupla/../../src/manager/Driver.cpp"
-#   include "cupla/../../src/common.cpp"
-#   include "cupla/../../src/device.cpp"
-#   include "cupla/../../src/event.cpp"
-#   include "cupla/../../src/memory.cpp"
-#   include "cupla/../../src/stream.cpp"
+#if(CUPLA_HEADER_ONLY == 1)
+#    include "cupla/../../src/common.cpp"
+#    include "cupla/../../src/device.cpp"
+#    include "cupla/../../src/event.cpp"
+#    include "cupla/../../src/manager/Driver.cpp"
+#    include "cupla/../../src/memory.cpp"
+#    include "cupla/../../src/stream.cpp"
 #endif
 
 #include "cupla.hpp"
diff --git a/include/cupla/config/CpuOmp2Threads.hpp b/include/cupla/config/CpuOmp2Threads.hpp
index 287bef45..8cd50938 100644
--- a/include/cupla/config/CpuOmp2Threads.hpp
+++ b/include/cupla/config/CpuOmp2Threads.hpp
@@ -24,20 +24,20 @@
 #include <alpaka/standalone/CpuOmp2Threads.hpp>
 
 #ifndef CUPLA_HEADER_ONLY
-#   define CUPLA_HEADER_ONLY 1
+#    define CUPLA_HEADER_ONLY 1
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC inline
+#if(CUPLA_HEADER_ONLY == 1)
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC inline
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   include "cupla/../../src/manager/Driver.cpp"
-#   include "cupla/../../src/common.cpp"
-#   include "cupla/../../src/device.cpp"
-#   include "cupla/../../src/event.cpp"
-#   include "cupla/../../src/memory.cpp"
-#   include "cupla/../../src/stream.cpp"
+#if(CUPLA_HEADER_ONLY == 1)
+#    include "cupla/../../src/common.cpp"
+#    include "cupla/../../src/device.cpp"
+#    include "cupla/../../src/event.cpp"
+#    include "cupla/../../src/manager/Driver.cpp"
+#    include "cupla/../../src/memory.cpp"
+#    include "cupla/../../src/stream.cpp"
 #endif
 
 #include "cupla.hpp"
diff --git a/include/cupla/config/CpuSerial.hpp b/include/cupla/config/CpuSerial.hpp
index a88f3541..09e42f63 100644
--- a/include/cupla/config/CpuSerial.hpp
+++ b/include/cupla/config/CpuSerial.hpp
@@ -24,20 +24,20 @@
 #include <alpaka/standalone/CpuSerial.hpp>
 
 #ifndef CUPLA_HEADER_ONLY
-#   define CUPLA_HEADER_ONLY 1
+#    define CUPLA_HEADER_ONLY 1
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC inline
+#if(CUPLA_HEADER_ONLY == 1)
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC inline
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   include "cupla/../../src/manager/Driver.cpp"
-#   include "cupla/../../src/common.cpp"
-#   include "cupla/../../src/device.cpp"
-#   include "cupla/../../src/event.cpp"
-#   include "cupla/../../src/memory.cpp"
-#   include "cupla/../../src/stream.cpp"
+#if(CUPLA_HEADER_ONLY == 1)
+#    include "cupla/../../src/common.cpp"
+#    include "cupla/../../src/device.cpp"
+#    include "cupla/../../src/event.cpp"
+#    include "cupla/../../src/manager/Driver.cpp"
+#    include "cupla/../../src/memory.cpp"
+#    include "cupla/../../src/stream.cpp"
 #endif
 
 #include "cupla.hpp"
diff --git a/include/cupla/config/CpuTbbBlocks.hpp b/include/cupla/config/CpuTbbBlocks.hpp
index 643c23d5..81af6924 100644
--- a/include/cupla/config/CpuTbbBlocks.hpp
+++ b/include/cupla/config/CpuTbbBlocks.hpp
@@ -24,20 +24,20 @@
 #include <alpaka/standalone/CpuTbbBlocks.hpp>
 
 #ifndef CUPLA_HEADER_ONLY
-#   define CUPLA_HEADER_ONLY 1
+#    define CUPLA_HEADER_ONLY 1
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC inline
+#if(CUPLA_HEADER_ONLY == 1)
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC inline
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   include "cupla/../../src/manager/Driver.cpp"
-#   include "cupla/../../src/common.cpp"
-#   include "cupla/../../src/device.cpp"
-#   include "cupla/../../src/event.cpp"
-#   include "cupla/../../src/memory.cpp"
-#   include "cupla/../../src/stream.cpp"
+#if(CUPLA_HEADER_ONLY == 1)
+#    include "cupla/../../src/common.cpp"
+#    include "cupla/../../src/device.cpp"
+#    include "cupla/../../src/event.cpp"
+#    include "cupla/../../src/manager/Driver.cpp"
+#    include "cupla/../../src/memory.cpp"
+#    include "cupla/../../src/stream.cpp"
 #endif
 
 #include "cupla.hpp"
diff --git a/include/cupla/config/CpuThreads.hpp b/include/cupla/config/CpuThreads.hpp
index 036c963f..9514b949 100644
--- a/include/cupla/config/CpuThreads.hpp
+++ b/include/cupla/config/CpuThreads.hpp
@@ -24,20 +24,20 @@
 #include <alpaka/standalone/CpuThreads.hpp>
 
 #ifndef CUPLA_HEADER_ONLY
-#   define CUPLA_HEADER_ONLY 1
+#    define CUPLA_HEADER_ONLY 1
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC inline
+#if(CUPLA_HEADER_ONLY == 1)
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC inline
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   include "cupla/../../src/manager/Driver.cpp"
-#   include "cupla/../../src/common.cpp"
-#   include "cupla/../../src/device.cpp"
-#   include "cupla/../../src/event.cpp"
-#   include "cupla/../../src/memory.cpp"
-#   include "cupla/../../src/stream.cpp"
+#if(CUPLA_HEADER_ONLY == 1)
+#    include "cupla/../../src/common.cpp"
+#    include "cupla/../../src/device.cpp"
+#    include "cupla/../../src/event.cpp"
+#    include "cupla/../../src/manager/Driver.cpp"
+#    include "cupla/../../src/memory.cpp"
+#    include "cupla/../../src/stream.cpp"
 #endif
 
 #include "cupla.hpp"
diff --git a/include/cupla/config/GpuCudaRt.hpp b/include/cupla/config/GpuCudaRt.hpp
index e6d52ad4..4b71d411 100644
--- a/include/cupla/config/GpuCudaRt.hpp
+++ b/include/cupla/config/GpuCudaRt.hpp
@@ -24,20 +24,20 @@
 #include <alpaka/standalone/GpuCudaRt.hpp>
 
 #ifndef CUPLA_HEADER_ONLY
-#   define CUPLA_HEADER_ONLY 1
+#    define CUPLA_HEADER_ONLY 1
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC inline
+#if(CUPLA_HEADER_ONLY == 1)
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC inline
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   include "cupla/../../src/manager/Driver.cpp"
-#   include "cupla/../../src/common.cpp"
-#   include "cupla/../../src/device.cpp"
-#   include "cupla/../../src/event.cpp"
-#   include "cupla/../../src/memory.cpp"
-#   include "cupla/../../src/stream.cpp"
+#if(CUPLA_HEADER_ONLY == 1)
+#    include "cupla/../../src/common.cpp"
+#    include "cupla/../../src/device.cpp"
+#    include "cupla/../../src/event.cpp"
+#    include "cupla/../../src/manager/Driver.cpp"
+#    include "cupla/../../src/memory.cpp"
+#    include "cupla/../../src/stream.cpp"
 #endif
 
 #include "cupla.hpp"
diff --git a/include/cupla/config/GpuHipRt.hpp b/include/cupla/config/GpuHipRt.hpp
index 6195cdf1..1328442e 100644
--- a/include/cupla/config/GpuHipRt.hpp
+++ b/include/cupla/config/GpuHipRt.hpp
@@ -24,20 +24,20 @@
 #include <alpaka/standalone/GpuHipRt.hpp>
 
 #ifndef CUPLA_HEADER_ONLY
-#   define CUPLA_HEADER_ONLY 1
+#    define CUPLA_HEADER_ONLY 1
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC inline
+#if(CUPLA_HEADER_ONLY == 1)
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC inline
 #endif
 
-#if( CUPLA_HEADER_ONLY == 1 )
-#   include "cupla/../../src/manager/Driver.cpp"
-#   include "cupla/../../src/common.cpp"
-#   include "cupla/../../src/device.cpp"
-#   include "cupla/../../src/event.cpp"
-#   include "cupla/../../src/memory.cpp"
-#   include "cupla/../../src/stream.cpp"
+#if(CUPLA_HEADER_ONLY == 1)
+#    include "cupla/../../src/common.cpp"
+#    include "cupla/../../src/device.cpp"
+#    include "cupla/../../src/event.cpp"
+#    include "cupla/../../src/manager/Driver.cpp"
+#    include "cupla/../../src/memory.cpp"
+#    include "cupla/../../src/stream.cpp"
 #endif
 
 #include "cupla.hpp"
diff --git a/include/cupla/cudaToCupla/driverTypes.hpp b/include/cupla/cudaToCupla/driverTypes.hpp
index 8a798567..76090773 100644
--- a/include/cupla/cudaToCupla/driverTypes.hpp
+++ b/include/cupla/cudaToCupla/driverTypes.hpp
@@ -51,7 +51,7 @@
 #define cudaMemcpy3DParms cuplaMemcpy3DParms
 
 #ifdef cudaEventBlockingSync
-#undef cudaEventBlockingSync
+#    undef cudaEventBlockingSync
 #endif
 /* cudaEventBlockingSync is a define in CUDA, hence we must remove
  * the old definition with the cupla enum
@@ -59,7 +59,7 @@
 #define cudaEventBlockingSync cuplaEventBlockingSync
 
 #ifdef cudaEventDisableTiming
-#undef cudaEventDisableTiming
+#    undef cudaEventDisableTiming
 #endif
 /* cudaEventDisableTiming is a define in CUDA therefore we must remove
  * the old definition with the cupla enum
@@ -135,23 +135,21 @@ ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int3 make_int3(int x, int y, int z)
 // recast functions
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-
-    template< typename A, typename B >
-    ALPAKA_FN_HOST_ACC
-    B A_as_B( A const & x )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        static_assert( sizeof(A) == sizeof(B), "reinterpretation assumes data types of same size!" );
-        return reinterpret_cast< B const & >( x );
-    }
-
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+        template<typename A, typename B>
+        ALPAKA_FN_HOST_ACC B A_as_B(A const& x)
+        {
+            static_assert(sizeof(A) == sizeof(B), "reinterpretation assumes data types of same size!");
+            return reinterpret_cast<B const&>(x);
+        }
+
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
 
 #ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
-#   define __int_as_float(...) cupla::A_as_B< int, float >( __VA_ARGS__ )
-#   define __float_as_int(...) cupla::A_as_B< float, int >( __VA_ARGS__ )
-#   define __longlong_as_double(...) cupla::A_as_B< long long, double >( __VA_ARGS__ )
-#   define __double_as_longlong(...) cupla::A_as_B< double, long long >( __VA_ARGS__ )
+#    define __int_as_float(...) cupla::A_as_B<int, float>(__VA_ARGS__)
+#    define __float_as_int(...) cupla::A_as_B<float, int>(__VA_ARGS__)
+#    define __longlong_as_double(...) cupla::A_as_B<long long, double>(__VA_ARGS__)
+#    define __double_as_longlong(...) cupla::A_as_B<double, long long>(__VA_ARGS__)
 #endif
diff --git a/include/cupla/cudaToCupla/runtime.hpp b/include/cupla/cudaToCupla/runtime.hpp
index cea4d4a9..1955f3f0 100644
--- a/include/cupla/cudaToCupla/runtime.hpp
+++ b/include/cupla/cudaToCupla/runtime.hpp
@@ -85,7 +85,7 @@
  * are disabled in CUDA
  */
 #if CUPLA_DEVICE_COMPILE == 0
-#   define __fdividef(a,b) ((a)/(b))
-#   define __expf(a) cupla::math::exp(a)
-#   define __logf(a) cupla::math::log(a)
+#    define __fdividef(a, b) ((a) / (b))
+#    define __expf(a) cupla::math::exp(a)
+#    define __logf(a) cupla::math::log(a)
 #endif
diff --git a/include/cupla/datatypes/Array.hpp b/include/cupla/datatypes/Array.hpp
index dd0d4d6d..05fcc3f1 100644
--- a/include/cupla/datatypes/Array.hpp
+++ b/include/cupla/datatypes/Array.hpp
@@ -26,38 +26,25 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-
-    template<
-        typename T_Type,
-        size_t T_size
-    >
-    struct Array{
-        T_Type m_data[T_size];
-
-        template<
-            typename T_Idx
-        >
-        ALPAKA_FN_HOST_ACC
-        const T_Type &
-        operator[](
-            const T_Idx idx
-        ) const {
-            return m_data[idx];
-        }
-
-        template<
-            typename T_Idx
-        >
-        ALPAKA_FN_HOST_ACC
-        T_Type &
-        operator[](
-            const T_Idx idx
-        ){
-            return m_data[idx];
-        }
-    };
-
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
-} //namespace cupla
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        template<typename T_Type, size_t T_size>
+        struct Array
+        {
+            T_Type m_data[T_size];
+
+            template<typename T_Idx>
+            ALPAKA_FN_HOST_ACC const T_Type& operator[](const T_Idx idx) const
+            {
+                return m_data[idx];
+            }
+
+            template<typename T_Idx>
+            ALPAKA_FN_HOST_ACC T_Type& operator[](const T_Idx idx)
+            {
+                return m_data[idx];
+            }
+        };
+
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/include/cupla/datatypes/dim3.hpp b/include/cupla/datatypes/dim3.hpp
index 5f42db23..5a24d53d 100644
--- a/include/cupla/datatypes/dim3.hpp
+++ b/include/cupla/datatypes/dim3.hpp
@@ -21,49 +21,38 @@
 
 #pragma once
 
+#include "cupla/datatypes/uint.hpp"
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
-#include "cupla/datatypes/uint.hpp"
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-
-    struct dim3
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        IdxType x, y, z;
-
-        ALPAKA_FN_HOST_ACC
-        dim3(
-            IdxType vx = 1,
-            IdxType vy = 1,
-            IdxType vz = 1
-        ) :
-            x(vx),
-            y(vy),
-            z(vz)
-        {}
-
-        ALPAKA_FN_HOST_ACC
-        dim3(
-            const uint3& v
-        ) :
-            x(v.x),
-            y(v.y),
-            z(v.z)
-        {}
-
-        ALPAKA_FN_HOST_ACC
-        operator uint3(void)
+        struct dim3
         {
-          uint3 t;
-          t.x = x;
-          t.y = y;
-          t.z = z;
-          return t;
-        }
-    };
-
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
-} //namespace cupla
+            IdxType x, y, z;
+
+            ALPAKA_FN_HOST_ACC
+            dim3(IdxType vx = 1, IdxType vy = 1, IdxType vz = 1) : x(vx), y(vy), z(vz)
+            {
+            }
+
+            ALPAKA_FN_HOST_ACC
+            dim3(const uint3& v) : x(v.x), y(v.y), z(v.z)
+            {
+            }
+
+            ALPAKA_FN_HOST_ACC
+            operator uint3(void)
+            {
+                uint3 t;
+                t.x = x;
+                t.y = y;
+                t.z = z;
+                return t;
+            }
+        };
+
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/include/cupla/datatypes/uint.hpp b/include/cupla/datatypes/uint.hpp
index 11c1d509..be77cabd 100644
--- a/include/cupla/datatypes/uint.hpp
+++ b/include/cupla/datatypes/uint.hpp
@@ -26,200 +26,133 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-
-    struct uint3{
-        IdxType x, y, z;
-
-        uint3() = default;
-
-        template<
-          typename TDim,
-          typename TSize,
-          typename = typename std::enable_if<
-              (TDim::value == 3u)
-          >::type
-        >
-        ALPAKA_FN_HOST_ACC
-        uint3(
-          ::alpaka::Vec<
-              TDim,
-              TSize
-          > const &vec
-        ){
-            for (uint32_t i(0); i < 3u; ++i) {
-                // alpaka vectors are z,y,x.
-                (&(this->x))[i] = vec[(3u - 1u) - i];
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        struct uint3
+        {
+            IdxType x, y, z;
+
+            uint3() = default;
+
+            template<typename TDim, typename TSize, typename = typename std::enable_if<(TDim::value == 3u)>::type>
+            ALPAKA_FN_HOST_ACC uint3(::alpaka::Vec<TDim, TSize> const& vec)
+            {
+                for(uint32_t i(0); i < 3u; ++i)
+                {
+                    // alpaka vectors are z,y,x.
+                    (&(this->x))[i] = vec[(3u - 1u) - i];
+                }
             }
-        }
-
-#if( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1 )
-        ALPAKA_FN_HOST_ACC
-        uint3(
-          ::uint3 const & vec
-        ){
-            for (uint32_t i(0); i < 3u; ++i) {
-                (&(this->x))[i] = (&(vec.x))[i];
+
+#if(ALPAKA_ACC_GPU_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+            ALPAKA_FN_HOST_ACC
+            uint3(::uint3 const& vec)
+            {
+                for(uint32_t i(0); i < 3u; ++i)
+                {
+                    (&(this->x))[i] = (&(vec.x))[i];
+                }
             }
-        }
 #endif
 
-        ALPAKA_FN_HOST_ACC
-        operator ::alpaka::Vec<
-            cupla::AlpakaDim< 3u >,
-            IdxType
-        >(void) const
-        {
-            ::alpaka::Vec<
-                cupla::AlpakaDim< 3u >,
-                IdxType
-            > vec(z, y, x);
-            return vec;
-        }
-    };
-
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            ALPAKA_FN_HOST_ACC
+            operator ::alpaka::Vec<cupla::AlpakaDim<3u>, IdxType>(void) const
+            {
+                ::alpaka::Vec<cupla::AlpakaDim<3u>, IdxType> vec(z, y, x);
+                return vec;
+            }
+        };
+
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
 
 
 namespace alpaka
 {
-namespace traits
-{
-
-    //! dimension get trait specialization
-    template<>
-    struct DimType<
-        cupla::uint3
-    >{
-      using type = ::alpaka::DimInt<3u>;
-    };
-
-} // namespace traits
-
-namespace traits
-{
+    namespace traits
+    {
+        //! dimension get trait specialization
+        template<>
+        struct DimType<cupla::uint3>
+        {
+            using type = ::alpaka::DimInt<3u>;
+        };
 
-    //! element type trait specialization
-    template<>
-    struct ElemType<
-        cupla::uint3
-    >{
-        using type = cupla::IdxType;
-    };
+    } // namespace traits
 
-} // namespace traits
+    namespace traits
+    {
+        //! element type trait specialization
+        template<>
+        struct ElemType<cupla::uint3>
+        {
+            using type = cupla::IdxType;
+        };
 
-namespace extent
-{
-namespace traits
-{
+    } // namespace traits
 
-    //! extent get trait specialization
-    template<
-        typename T_Idx
-    >
-    struct GetExtent<
-        T_Idx,
-        cupla::uint3,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-
-        ALPAKA_FN_HOST_ACC
-        static auto
-        getExtent( cupla::uint3 const &extents )
-        -> cupla::IdxType {
-        return (&extents.x)[(3u - 1u) - T_Idx::value];
-      }
-    };
-
-    //! extent set trait specialization
-    template<
-        typename T_Idx,
-        typename T_Extent
-    >
-    struct SetExtent<
-        T_Idx, cupla::uint3,
-        T_Extent,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-        ALPAKA_FN_HOST_ACC
-        static auto
-        setExtent(
-            cupla::uint3 &extents,
-            T_Extent const &extent
-        )
-        -> void
+    namespace extent
+    {
+        namespace traits
         {
-            (&extents.x)[(3u - 1u) - T_Idx::value] = extent;
-        }
-    };
-} // namespace traits
-} // namespace extent
-
-namespace traits
-{
+            //! extent get trait specialization
+            template<typename T_Idx>
+            struct GetExtent<T_Idx, cupla::uint3, typename std::enable_if<(3u > T_Idx::value)>::type>
+            {
+                ALPAKA_FN_HOST_ACC
+                static auto getExtent(cupla::uint3 const& extents) -> cupla::IdxType
+                {
+                    return (&extents.x)[(3u - 1u) - T_Idx::value];
+                }
+            };
+
+            //! extent set trait specialization
+            template<typename T_Idx, typename T_Extent>
+            struct SetExtent<T_Idx, cupla::uint3, T_Extent, typename std::enable_if<(3u > T_Idx::value)>::type>
+            {
+                ALPAKA_FN_HOST_ACC
+                static auto setExtent(cupla::uint3& extents, T_Extent const& extent) -> void
+                {
+                    (&extents.x)[(3u - 1u) - T_Idx::value] = extent;
+                }
+            };
+        } // namespace traits
+    } // namespace extent
+
+    namespace traits
+    {
+        //! offset get trait specialization
+        template<typename T_Idx>
+        struct GetOffset<T_Idx, cupla::uint3, typename std::enable_if<(3u > T_Idx::value)>::type>
+        {
+            ALPAKA_FN_HOST_ACC
+            static auto getOffset(cupla::uint3 const& offsets) -> cupla::IdxType
+            {
+                return (&offsets.x)[(3u - 1u) - T_Idx::value];
+            }
+        };
 
-    //! offset get trait specialization
-    template<
-        typename T_Idx
-    >
-    struct GetOffset<
-        T_Idx,
-        cupla::uint3,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-        ALPAKA_FN_HOST_ACC
-        static auto
-        getOffset( cupla::uint3 const & offsets )
-        -> cupla::IdxType{
-            return (&offsets.x)[(3u - 1u) - T_Idx::value];
-        }
-    };
-
-
-    //! offset set trait specialization.
-    template<
-        typename T_Idx,
-        typename T_Offset
-    >
-    struct SetOffset<
-        T_Idx,
-        cupla::uint3,
-        T_Offset,
-        typename std::enable_if<
-            (3u > T_Idx::value)
-        >::type
-    >{
-        ALPAKA_FN_HOST_ACC
-        static auto
-        setOffset(
-            cupla::uint3 &offsets,
-            T_Offset const &offset
-        )
-        -> void {
-            offsets[(3u - 1u) - T_Idx::value] = offset;
-        }
-    };
-} // namespace traits
-
-namespace traits
-{
 
-    //! size type trait specialization.
-    template<>
-    struct IdxType<
-        cupla::uint3
-    >{
-        using type = cupla::IdxType;
-    };
+        //! offset set trait specialization; alpaka dimension order (z,y,x) is mapped onto the x,y,z members.
+        template<typename T_Idx, typename T_Offset>
+        struct SetOffset<T_Idx, cupla::uint3, T_Offset, typename std::enable_if<(3u > T_Idx::value)>::type>
+        {
+            ALPAKA_FN_HOST_ACC
+            static auto setOffset(cupla::uint3& offsets, T_Offset const& offset) -> void
+            {
+                (&offsets.x)[(3u - 1u) - T_Idx::value] = offset;
+            }
+        };
+    } // namespace traits
+
+    namespace traits
+    {
+        //! index type trait specialization.
+        template<>
+        struct IdxType<cupla::uint3>
+        {
+            using type = cupla::IdxType;
+        };
 
-} // namespace traits
-} // namespave alpaka
+    } // namespace traits
+} // namespace alpaka
diff --git a/include/cupla/defines.hpp b/include/cupla/defines.hpp
index 708d1760..49e64681 100644
--- a/include/cupla/defines.hpp
+++ b/include/cupla/defines.hpp
@@ -20,103 +20,90 @@
 
 #pragma once
 
+#include "cupla/namespace.hpp"
+
 #include <alpaka/alpaka.hpp>
-#include <cstdint>
 
-#include "cupla/namespace.hpp"
+#include <cstdint>
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-#   undef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-#   define ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED 1
+#    undef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED 1
 #endif
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-#   undef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-#   define ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED 1
+#    undef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED 1
 #endif
 
 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-#   undef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-#   define ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED 1
+#    undef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+#    define ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED 1
 #endif
 
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-#   undef ALPAKA_ACC_GPU_CUDA_ENABLED
-#   define ALPAKA_ACC_GPU_CUDA_ENABLED 1
+#    undef ALPAKA_ACC_GPU_CUDA_ENABLED
+#    define ALPAKA_ACC_GPU_CUDA_ENABLED 1
 #endif
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-#   undef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-#   define ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED 1
+#    undef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED 1
 #endif
 
 #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-#   undef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-#   define ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED 1
+#    undef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+#    define ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED 1
 #endif
 
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-#   undef ALPAKA_ACC_GPU_HIP_ENABLED
-#   define ALPAKA_ACC_GPU_HIP_ENABLED 1
+#    undef ALPAKA_ACC_GPU_HIP_ENABLED
+#    define ALPAKA_ACC_GPU_HIP_ENABLED 1
 #endif
 
 #ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
-#   undef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
-#   define ALPAKA_ACC_ANY_BT_OMP5_ENABLED 1
+#    undef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+#    define ALPAKA_ACC_ANY_BT_OMP5_ENABLED 1
 #endif
 
 #ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
-#   undef ALPAKA_ACC_ANY_BT_OACC_ENABLED
-#   define ALPAKA_ACC_ANY_BT_OACC_ENABLED 1
+#    undef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+#    define ALPAKA_ACC_ANY_BT_OACC_ENABLED 1
 #endif
 
-#define CUPLA_NUM_SELECTED_DEVICES (                                           \
-        ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED +                                  \
-        ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED +                               \
-        ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED  +                                 \
-        ALPAKA_ACC_GPU_CUDA_ENABLED +                                          \
-        ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +                                   \
-        ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +                                   \
-        ALPAKA_ACC_GPU_HIP_ENABLED +                                           \
-        ALPAKA_ACC_ANY_BT_OMP5_ENABLED +                                       \
-        ALPAKA_ACC_ANY_BT_OACC_ENABLED                                         \
-)
-
-
-#if( CUPLA_NUM_SELECTED_DEVICES == 0 )
-    #error "there is no accelerator selected, please run `ccmake .` and select one"
+#define CUPLA_NUM_SELECTED_DEVICES                                                                                    \
+    (ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED + ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED                                     \
+     + ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED + ALPAKA_ACC_GPU_CUDA_ENABLED + ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED         \
+     + ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + ALPAKA_ACC_GPU_HIP_ENABLED + ALPAKA_ACC_ANY_BT_OMP5_ENABLED               \
+     + ALPAKA_ACC_ANY_BT_OACC_ENABLED)
+
+
+#if(CUPLA_NUM_SELECTED_DEVICES == 0)
+#    error "there is no accelerator selected, please run `ccmake .` and select one"
 #endif
 
-#if( CUPLA_NUM_SELECTED_DEVICES > 2  )
-    #error "please select at most two accelerators"
+#if(CUPLA_NUM_SELECTED_DEVICES > 2)
+#    error "please select at most two accelerators"
 #endif
 
 // count accelerators where the thread count must be one
-#define CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES (                                \
-        ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED +                                  \
-        ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +                                   \
-        ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED                                     \
-)
-
-#define CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES (                           \
-        ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED +                                  \
-        ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED +                               \
-        ALPAKA_ACC_GPU_CUDA_ENABLED +                                          \
-        ALPAKA_ACC_GPU_HIP_ENABLED +                                           \
-        ALPAKA_ACC_ANY_BT_OMP5_ENABLED +                                       \
-        ALPAKA_ACC_ANY_BT_OACC_ENABLED                                         \
-)
-
-#if( CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES > 1 )
-    #error "it is only alowed to select one thread sequential Alpaka accelerator"
+#define CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES                                                                         \
+    (ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED + ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+
+#define CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES                                                                    \
+    (ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED + ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED + ALPAKA_ACC_GPU_CUDA_ENABLED       \
+     + ALPAKA_ACC_GPU_HIP_ENABLED + ALPAKA_ACC_ANY_BT_OMP5_ENABLED + ALPAKA_ACC_ANY_BT_OACC_ENABLED)
+
+#if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES > 1)
+#    error "it is only allowed to select one thread sequential Alpaka accelerator"
 #endif
 
-#if( CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES > 1 )
-    #error "it is only alowed to select one thread parallelized Alpaka accelerator"
+#if(CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES > 1)
+#    error "it is only allowed to select one thread parallelized Alpaka accelerator"
 #endif
 
 #ifndef CUPLA_HEADER_ONLY_FUNC_SPEC
-#   define CUPLA_HEADER_ONLY_FUNC_SPEC
+#    define CUPLA_HEADER_ONLY_FUNC_SPEC
 #endif
 
 /*! device compile flag
@@ -127,8 +114,8 @@
  *
  * Value is 1 if device path is compiled else 0
  */
-#if defined(__CUDA_ARCH__) || ( defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__== 1 && defined(__HIP__) )
-    #define CUPLA_DEVICE_COMPILE 1
+#if defined(__CUDA_ARCH__) || (defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1 && defined(__HIP__))
+#    define CUPLA_DEVICE_COMPILE 1
 #else
-    #define CUPLA_DEVICE_COMPILE 0
+#    define CUPLA_DEVICE_COMPILE 0
 #endif
diff --git a/include/cupla/device/Atomic.hpp b/include/cupla/device/Atomic.hpp
index 7e9bc60d..66977a9a 100644
--- a/include/cupla/device/Atomic.hpp
+++ b/include/cupla/device/Atomic.hpp
@@ -29,151 +29,98 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-
-#define CUPLA_UNARY_ATOMIC_OP(functionName, alpakaOp)                          \
-        /*!                                                                    \
-         * Compared to their CUDA/HIP counterparts, these functions take an additional last \
-         * parameter to denote atomicity (synchronization) level. This parameter is \
-         * of type cupla::hierarchy::{Grids|Blocks|Threads}. Grids corresponds \
-         * to atomicity between different kernels, Blocks - to different blocks \
-         * in the same grid/kernel, Threads - to threads of the same block.    \
-         * @tparam T_Hierarchy parallelism hierarchy level within the operation is atomic [type cupla::hierarchy::*] \
-         * @tparam T_Acc alpaka accelerator [alpaka::*]                   \
-         * @tparam T_Type type of the value                                    \
-         * @param acc alpaka accelerator                                       \
-         * @param ptr destination pointer                                      \
-         * @param value source value                                           \
-         * @{                                                                  \
-         */                                                                    \
-        template<                                                              \
-            typename T_Hierarchy,                                              \
-            typename T_Acc,                                                    \
-            typename T_Type                                                    \
-        >                                                                      \
-        ALPAKA_FN_ACC ALPAKA_FN_INLINE                                         \
-        T_Type functionName(                                                   \
-            T_Acc const & acc,                                                 \
-            T_Type *ptr,                                                       \
-            T_Type const & value                                               \
-        )                                                                      \
-        {                                                                      \
-            return ::alpaka::atomicOp< alpakaOp >(                     \
-                acc,                                                           \
-                ptr,                                                           \
-                value,                                                         \
-                T_Hierarchy{}                                                  \
-            );                                                                 \
-        }                                                                      \
-                                                                               \
-        /*! @param hierarchy hierarchy level within the operation is atomic    \
-         */                                                                    \
-        template<                                                              \
-            typename T_Acc,                                                    \
-            typename T_Type,                                                   \
-            typename T_Hierarchy = alpaka::hierarchy::Grids                    \
-        >                                                                      \
-        ALPAKA_FN_ACC ALPAKA_FN_INLINE                                         \
-        T_Type functionName(                                                   \
-            T_Acc const & acc,                                                 \
-            T_Type *ptr,                                                       \
-            T_Type const & value,                                              \
-            T_Hierarchy const & hierarchy = T_Hierarchy()                      \
-        )                                                                      \
-        {                                                                      \
-            return functionName< T_Hierarchy >(                                \
-                acc,                                                           \
-                ptr,                                                           \
-                value                                                          \
-            );                                                                 \
-        }                                                                      \
-        /*!@}                                                                  \
-         */
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+#define CUPLA_UNARY_ATOMIC_OP(functionName, alpakaOp)                                                                 \
+    /*!                                                                                                               \
+     * Compared to their CUDA/HIP counterparts, these functions take an additional last                               \
+     * parameter to denote atomicity (synchronization) level. This parameter is                                       \
+     * of type cupla::hierarchy::{Grids|Blocks|Threads}. Grids corresponds                                            \
+     * to atomicity between different kernels, Blocks - to different blocks                                           \
+     * in the same grid/kernel, Threads - to threads of the same block.                                               \
+     * @tparam T_Hierarchy parallelism hierarchy level within the operation is atomic [type cupla::hierarchy::*]      \
+     * @tparam T_Acc alpaka accelerator [alpaka::*]                                                                   \
+     * @tparam T_Type type of the value                                                                               \
+     * @param acc alpaka accelerator                                                                                  \
+     * @param ptr destination pointer                                                                                 \
+     * @param value source value                                                                                      \
+     * @{                                                                                                             \
+     */                                                                                                               \
+    template<typename T_Hierarchy, typename T_Acc, typename T_Type>                                                   \
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE T_Type functionName(T_Acc const& acc, T_Type* ptr, T_Type const& value)            \
+    {                                                                                                                 \
+        return ::alpaka::atomicOp<alpakaOp>(acc, ptr, value, T_Hierarchy{});                                          \
+    }                                                                                                                 \
+                                                                                                                      \
+    /*! @param hierarchy hierarchy level within which the operation is atomic                                         \
+     */                                                                                                               \
+    template<typename T_Acc, typename T_Type, typename T_Hierarchy = alpaka::hierarchy::Grids>                        \
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE T_Type                                                                             \
+    functionName(T_Acc const& acc, T_Type* ptr, T_Type const& value, T_Hierarchy const& hierarchy = T_Hierarchy())    \
+    {                                                                                                                 \
+        return functionName<T_Hierarchy>(acc, ptr, value);                                                            \
+    }                                                                                                                 \
+            /*!@}                                                                                                     \
+             */
 
-        /// atomic addition
-        CUPLA_UNARY_ATOMIC_OP( atomicAdd, ::alpaka::AtomicAdd )
-        /// atomic subtraction
-        CUPLA_UNARY_ATOMIC_OP( atomicSub, ::alpaka::AtomicSub )
-        /// atomic minimum
-        CUPLA_UNARY_ATOMIC_OP( atomicMin, ::alpaka::AtomicMin )
-        /// atomic maximum
-        CUPLA_UNARY_ATOMIC_OP( atomicMax, ::alpaka::AtomicMax )
-        /// atomic increment
-        CUPLA_UNARY_ATOMIC_OP( atomicInc, ::alpaka::AtomicInc )
-        /// atomic decrement
-        CUPLA_UNARY_ATOMIC_OP( atomicDec, ::alpaka::AtomicDec )
-        /// atomic bit-wise and
-        CUPLA_UNARY_ATOMIC_OP( atomicAnd, ::alpaka::AtomicAnd )
-        /// atomic bit-wise or
-        CUPLA_UNARY_ATOMIC_OP( atomicOr, ::alpaka::AtomicOr )
-        /// atomic exchange
-        CUPLA_UNARY_ATOMIC_OP( atomicExch, ::alpaka::AtomicExch )
-        /// atomic bit-wise xor
-        CUPLA_UNARY_ATOMIC_OP( atomicXor, ::alpaka::AtomicXor )
+            /// atomic addition
+            CUPLA_UNARY_ATOMIC_OP(atomicAdd, ::alpaka::AtomicAdd)
+            /// atomic subtraction
+            CUPLA_UNARY_ATOMIC_OP(atomicSub, ::alpaka::AtomicSub)
+            /// atomic minimum
+            CUPLA_UNARY_ATOMIC_OP(atomicMin, ::alpaka::AtomicMin)
+            /// atomic maximum
+            CUPLA_UNARY_ATOMIC_OP(atomicMax, ::alpaka::AtomicMax)
+            /// atomic increment
+            CUPLA_UNARY_ATOMIC_OP(atomicInc, ::alpaka::AtomicInc)
+            /// atomic decrement
+            CUPLA_UNARY_ATOMIC_OP(atomicDec, ::alpaka::AtomicDec)
+            /// atomic bit-wise and
+            CUPLA_UNARY_ATOMIC_OP(atomicAnd, ::alpaka::AtomicAnd)
+            /// atomic bit-wise or
+            CUPLA_UNARY_ATOMIC_OP(atomicOr, ::alpaka::AtomicOr)
+            /// atomic exchange
+            CUPLA_UNARY_ATOMIC_OP(atomicExch, ::alpaka::AtomicExch)
+            /// atomic bit-wise xor
+            CUPLA_UNARY_ATOMIC_OP(atomicXor, ::alpaka::AtomicXor)
 
 #undef CUPLA_UNARY_ATOMIC_OP
 
-        /** atomic compare and swap
-         *
-         * @{
-         * @tparam T_Hierarchy parallelism hierarchy level within the operation is atomic [type cupla::hierarchy::*]
-         * @tparam T_Acc alpaka accelerator [alpaka::*]
-         * @tparam T_Type type of the value
-         * @param acc alpaka accelerator
-         * @param ptr destination pointer
-         * @param value source value
-         */
-        template<
-            typename T_Hierarchy,
-            typename T_Acc,
-            typename T_Type
-        >
-        ALPAKA_FN_ACC ALPAKA_FN_INLINE
-        T_Type atomicCas(
-            T_Acc const & acc,
-            T_Type *ptr,
-            T_Type const & compare,
-            T_Type const & value
-        )
-        {
-            return ::alpaka::atomicOp< ::alpaka::AtomicCas >(
-                acc,
-                ptr,
-                compare,
-                value,
-                T_Hierarchy{}
-            );
-        }
+            /** atomic compare and swap
+             *
+             * @{
+             * @tparam T_Hierarchy parallelism hierarchy level within which the operation is atomic
+             * [type cupla::hierarchy::*]
+             * @tparam T_Acc alpaka accelerator [alpaka::*]
+             * @tparam T_Type type of the value
+             * @param acc alpaka accelerator
+             * @param ptr destination pointer
+             * @param value source value
+             */
+            template<typename T_Hierarchy, typename T_Acc, typename T_Type>
+            ALPAKA_FN_ACC ALPAKA_FN_INLINE T_Type
+            atomicCas(T_Acc const& acc, T_Type* ptr, T_Type const& compare, T_Type const& value)
+            {
+                return ::alpaka::atomicOp<::alpaka::AtomicCas>(acc, ptr, compare, value, T_Hierarchy{});
+            }
 
-        /*! @param hierarchy hierarchy level within the operation is atomic
-         */
-        template<
-            typename T_Acc,
-            typename T_Type,
-            typename T_Hierarchy = hierarchy::Grids
-        >
-        ALPAKA_FN_ACC ALPAKA_FN_INLINE
-        T_Type atomicCas(
-            T_Acc const & acc,
-            T_Type *ptr,
-            T_Type const & compare,
-            T_Type const & value,
-            T_Hierarchy const & hierarchy = T_Hierarchy()
-        )
-        {
-            return atomicCas< T_Hierarchy >(
-                acc,
-                ptr,
-                compare,
-                value
-            );
-        }
-        /*!@}
-         */
+            /*! @param hierarchy hierarchy level within which the operation is atomic
+             */
+            template<typename T_Acc, typename T_Type, typename T_Hierarchy = hierarchy::Grids>
+            ALPAKA_FN_ACC ALPAKA_FN_INLINE T_Type atomicCas(
+                T_Acc const& acc,
+                T_Type* ptr,
+                T_Type const& compare,
+                T_Type const& value,
+                T_Hierarchy const& hierarchy = T_Hierarchy())
+            {
+                return atomicCas<T_Hierarchy>(acc, ptr, compare, value);
+            }
+            /*!@}
+             */
 
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/Hierarchy.hpp b/include/cupla/device/Hierarchy.hpp
index 130faa31..032f89cb 100644
--- a/include/cupla/device/Hierarchy.hpp
+++ b/include/cupla/device/Hierarchy.hpp
@@ -27,17 +27,16 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-namespace hierarchy
-{
-
-    //! hierarchy definitions for atomic operation
-    using namespace ::alpaka::hierarchy;
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            namespace hierarchy
+            {
+                //! hierarchy definitions for atomic operation
+                using namespace ::alpaka::hierarchy;
 
-} // namespace layer
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace hierarchy
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/Index.hpp b/include/cupla/device/Index.hpp
index c7bd2d8a..32b98228 100644
--- a/include/cupla/device/Index.hpp
+++ b/include/cupla/device/Index.hpp
@@ -28,96 +28,65 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-
-    /** number of blocks within the grid layer
-     *
-     * @tparam T_Acc alpaka accelerator [alpaka::*]
-     * @param acc alpaka accelerator
-     */
-    template< typename T_Acc >
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE
-    cupla::uint3 gridDim( T_Acc const & acc )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        return static_cast< uint3 >(
-            ::alpaka::getWorkDiv<
-                ::alpaka::Grid,
-                ::alpaka::Blocks
-            >( acc )
-        );
-    }
+        inline namespace device
+        {
+            /** number of blocks within the grid layer
+             *
+             * @tparam T_Acc alpaka accelerator [alpaka::*]
+             * @param acc alpaka accelerator
+             */
+            template<typename T_Acc>
+            ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 gridDim(T_Acc const& acc)
+            {
+                return static_cast<uint3>(::alpaka::getWorkDiv<::alpaka::Grid, ::alpaka::Blocks>(acc));
+            }
 
-    /** number of threads within the block layer
-     *
-     * @tparam T_Acc alpaka accelerator [alpaka::*]
-     * @param acc alpaka accelerator
-     */
-    template< typename T_Acc >
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE
-    cupla::uint3 blockDim( T_Acc const & acc )
-    {
-        return static_cast< uint3 >(
-            ::alpaka::getWorkDiv<
-                ::alpaka::Block,
-                ::alpaka::Threads
-            >( acc )
-        );
-    }
+            /** number of threads within the block layer
+             *
+             * @tparam T_Acc alpaka accelerator [alpaka::*]
+             * @param acc alpaka accelerator
+             */
+            template<typename T_Acc>
+            ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 blockDim(T_Acc const& acc)
+            {
+                return static_cast<uint3>(::alpaka::getWorkDiv<::alpaka::Block, ::alpaka::Threads>(acc));
+            }
 
-    /** number of elements within the thread layer
-     *
-     * @tparam T_Acc alpaka accelerator [alpaka::*]
-     * @param acc alpaka accelerator
-     */
-    template< typename T_Acc >
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE
-    cupla::uint3 threadDim( T_Acc const & acc )
-    {
-        return static_cast< uint3 >(
-            ::alpaka::getWorkDiv<
-                ::alpaka::Thread,
-                ::alpaka::Elems
-            >( acc )
-        );
-    }
+            /** number of elements within the thread layer
+             *
+             * @tparam T_Acc alpaka accelerator [alpaka::*]
+             * @param acc alpaka accelerator
+             */
+            template<typename T_Acc>
+            ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 threadDim(T_Acc const& acc)
+            {
+                return static_cast<uint3>(::alpaka::getWorkDiv<::alpaka::Thread, ::alpaka::Elems>(acc));
+            }
 
-    /** index of the thread within the block layer
-     *
-     * @tparam T_Acc alpaka accelerator [alpaka::*]
-     * @param acc alpaka accelerator
-     */
-    template< typename T_Acc >
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE
-    cupla::uint3 threadIdx( T_Acc const & acc )
-    {
-        return static_cast< uint3 >(
-            ::alpaka::getIdx<
-                ::alpaka::Block,
-                ::alpaka::Threads
-            >( acc )
-        );
-    }
+            /** index of the thread within the block layer
+             *
+             * @tparam T_Acc alpaka accelerator [alpaka::*]
+             * @param acc alpaka accelerator
+             */
+            template<typename T_Acc>
+            ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 threadIdx(T_Acc const& acc)
+            {
+                return static_cast<uint3>(::alpaka::getIdx<::alpaka::Block, ::alpaka::Threads>(acc));
+            }
 
-    /** index of the block within the grid layer
-     *
-     * @tparam T_Acc alpaka accelerator [alpaka::*]
-     * @param acc alpaka accelerator
-     */
-    template< typename T_Acc >
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE
-    cupla::uint3 blockIdx( T_Acc const & acc )
-    {
-        return static_cast< uint3 >(
-            ::alpaka::getIdx<
-                ::alpaka::Grid,
-                ::alpaka::Blocks
-            >( acc )
-        );
-    }
+            /** index of the block within the grid layer
+             *
+             * @tparam T_Acc alpaka accelerator [alpaka::*]
+             * @param acc alpaka accelerator
+             */
+            template<typename T_Acc>
+            ALPAKA_FN_ACC ALPAKA_FN_INLINE cupla::uint3 blockIdx(T_Acc const& acc)
+            {
+                return static_cast<uint3>(::alpaka::getIdx<::alpaka::Grid, ::alpaka::Blocks>(acc));
+            }
 
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/SharedMemory.hpp b/include/cupla/device/SharedMemory.hpp
index 03d99398..6202f3f8 100644
--- a/include/cupla/device/SharedMemory.hpp
+++ b/include/cupla/device/SharedMemory.hpp
@@ -23,10 +23,6 @@
 
 #include <alpaka/alpaka.hpp>
 
-#define sharedMem(ppName, ...)                                                 \
-     __VA_ARGS__& ppName =                                                     \
-        ::alpaka::declareSharedVar< __VA_ARGS__, __COUNTER__ >( acc )
+#define sharedMem(ppName, ...) __VA_ARGS__& ppName = ::alpaka::declareSharedVar<__VA_ARGS__, __COUNTER__>(acc)
 
-#define sharedMemExtern(ppName, ...)                                           \
-    __VA_ARGS__* ppName =                                                      \
-        ::alpaka::getDynSharedMem< __VA_ARGS__ >( acc )
+#define sharedMemExtern(ppName, ...) __VA_ARGS__* ppName = ::alpaka::getDynSharedMem<__VA_ARGS__>(acc)
diff --git a/include/cupla/device/Synchronization.hpp b/include/cupla/device/Synchronization.hpp
index a095dacb..91918b84 100644
--- a/include/cupla/device/Synchronization.hpp
+++ b/include/cupla/device/Synchronization.hpp
@@ -27,34 +27,31 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-
-    /** synchronize threads within the block
-     *
-     * @tparam T_Acc alpaka accelerator [alpaka::*]
-     * @param acc alpaka accelerator
-     *
-     * @{
-     */
-    template< typename T_Acc >
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE
-    void syncThreads( T_Acc const & acc )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        ::alpaka::syncBlockThreads( acc );
-    }
-
-    template< typename T_Acc >
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE
-    void __syncthreads( T_Acc const & acc )
-    {
-        syncThreads( acc );
-    }
-
-    //!@}
-
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+        inline namespace device
+        {
+            /** synchronize threads within the block
+             *
+             * @tparam T_Acc alpaka accelerator [alpaka::*]
+             * @param acc alpaka accelerator
+             *
+             * @{
+             */
+            template<typename T_Acc>
+            ALPAKA_FN_ACC ALPAKA_FN_INLINE void syncThreads(T_Acc const& acc)
+            {
+                ::alpaka::syncBlockThreads(acc);
+            }
+
+            template<typename T_Acc>
+            ALPAKA_FN_ACC ALPAKA_FN_INLINE void __syncthreads(T_Acc const& acc)
+            {
+                syncThreads(acc);
+            }
+
+            //!@}
+
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Abs.hpp b/include/cupla/device/math/Abs.hpp
index b90f8a48..38fbf1f1 100644
--- a/include/cupla/device/math/Abs.hpp
+++ b/include/cupla/device/math/Abs.hpp
@@ -26,18 +26,17 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Computes the absolute value.
-    CUPLA_UNARY_MATH_FN( abs, alpaka::math::ConceptMathAbs, Abs )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Computes the absolute value.
+                CUPLA_UNARY_MATH_FN(abs, alpaka::math::ConceptMathAbs, Abs)
 
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Common.hpp b/include/cupla/device/math/Common.hpp
index c6a856e0..88f59a45 100644
--- a/include/cupla/device/math/Common.hpp
+++ b/include/cupla/device/math/Common.hpp
@@ -21,8 +21,8 @@
 
 #pragma once
 
-#include "cupla/types.hpp"
 #include "cupla/defines.hpp"
+#include "cupla/types.hpp"
 
 #include <alpaka/core/Concepts.hpp>
 
@@ -30,149 +30,100 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-namespace detail
-{
-    /** Get the concept implementation of the current accelerator
-     *
-     * @tparam T_AccOrMathImpl accelerator or math implementation [type alpaka::* or alpaka::math::MathStdLib]
-     * @tparam T_Concept alpaka concept
-     * @return implementation of the concept
-     */
-    ALPAKA_NO_HOST_ACC_WARNING
-    template< typename T_AccOrMathImpl, typename T_Concept >
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto getConcept()
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        using ResultMathConcept = alpaka::concepts::ImplementationBase<
-            T_Concept,
-            T_AccOrMathImpl
-        >;
+        inline namespace device
+        {
+            inline namespace math
+            {
+                namespace detail
+                {
+                    /** Get the concept implementation of the current accelerator
+                     *
+                     * @tparam T_AccOrMathImpl accelerator or math implementation [type alpaka::* or
+                     * alpaka::math::MathStdLib]
+                     * @tparam T_Concept alpaka concept
+                     * @return implementation of the concept
+                     */
+                    ALPAKA_NO_HOST_ACC_WARNING
+                    template<typename T_AccOrMathImpl, typename T_Concept>
+                    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto getConcept()
+                    {
+                        using ResultMathConcept = alpaka::concepts::ImplementationBase<T_Concept, T_AccOrMathImpl>;
 
-        using AccMathConcept = alpaka::concepts::ImplementationBase<
-            T_Concept,
-            Acc
-        >;
+                        using AccMathConcept = alpaka::concepts::ImplementationBase<T_Concept, Acc>;
 
-        using AccThreadSeqMathConcept = alpaka::concepts::ImplementationBase<
-            T_Concept,
-            AccThreadSeq
-        >;
+                        using AccThreadSeqMathConcept = alpaka::concepts::ImplementationBase<T_Concept, AccThreadSeq>;
 
-        // cupla Acc and AccThreadSeq should use the same math concept implementation
-        static_assert(
-            std::is_same<
-                AccMathConcept,
-                AccThreadSeqMathConcept
-            >::value,
-            "The math concept implementation for the type 'Acc' and 'AccThreadSeq' must be equal"
-        );
+                        // cupla Acc and AccThreadSeq should use the same math concept implementation
+                        static_assert(
+                            std::is_same<AccMathConcept, AccThreadSeqMathConcept>::value,
+                            "The math concept implementation for the type 'Acc' and 'AccThreadSeq' must be equal");
 
-        return ResultMathConcept{};
-    }
-} // namespace detail
+                        return ResultMathConcept{};
+                    }
+                } // namespace detail
 
-#define CUPLA_UNARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait)  \
-    /**                                                                        \
-     * @tparam T_Type argument type                                            \
-     * @param arg input argument                                               \
-     */                                                                        \
-    ALPAKA_NO_HOST_ACC_WARNING                                                 \
-    template< typename T_Type >                                                \
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName(                     \
-        T_Type const & arg                                                     \
-    )                                                                          \
-    /* return type is required for the compiler to detect host, device         \
-     * function qualifier correctly                                            \
-     */                                                                        \
-    ->  decltype(                                                              \
-        alpaka::core::declval<alpaka::math::traits::alpakaMathTrait<                                 \
-            alpaka::concepts::ImplementationBase<                              \
-                alpakaMathConcept,                                             \
-                accOrMathImpl                                                  \
-            >,                                                                 \
-            T_Type                                                             \
-        >>()(                                                       \
-            detail::getConcept< accOrMathImpl, alpakaMathConcept >(),          \
-            arg                                                                \
-        )                                                                      \
-    )                                                                          \
-    {                                                                          \
-        return alpaka::math::traits::alpakaMathTrait<                          \
-            alpaka::concepts::ImplementationBase<                              \
-                alpakaMathConcept,                                             \
-                accOrMathImpl                                                  \
-            >,                                                                 \
-            T_Type                                                             \
-        >{}(                                                       \
-            detail::getConcept< accOrMathImpl, alpakaMathConcept >(),          \
-            arg                                                                \
-        );                                                                     \
+#define CUPLA_UNARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait)                   \
+    /**                                                                                                               \
+     * @tparam T_Type argument type                                                                                   \
+     * @param arg input argument                                                                                      \
+     */                                                                                                               \
+    ALPAKA_NO_HOST_ACC_WARNING                                                                                        \
+    template<typename T_Type>                                                                                         \
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName(                                                            \
+        T_Type const& arg) /* return type is required for the compiler to detect host, device                         \
+                            * function qualifier correctly                                                            \
+                            */                                                                                        \
+        ->decltype(alpaka::core::declval<alpaka::math::traits::alpakaMathTrait<                                       \
+                       alpaka::concepts::ImplementationBase<alpakaMathConcept, accOrMathImpl>,                        \
+                       T_Type>>()(detail::getConcept<accOrMathImpl, alpakaMathConcept>(), arg))                       \
+    {                                                                                                                 \
+        return alpaka::math::traits::alpakaMathTrait<                                                                 \
+            alpaka::concepts::ImplementationBase<alpakaMathConcept, accOrMathImpl>,                                   \
+            T_Type>{}(detail::getConcept<accOrMathImpl, alpakaMathConcept>(), arg);                                   \
     }
 
 /* Using the free alpaka functions `alpaka::math::*` will result into `__host__ __device__`
  * errors, therefore the alpaka math trait must be used.
  */
-#define CUPLA_BINARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait) \
-    /**                                                                        \
-     * @tparam T_Type argument type                                            \
-     * @param arg1 first input argument                                        \
-     * @param arg2 second input argument                                       \
-     */                                                                        \
-    ALPAKA_NO_HOST_ACC_WARNING                                                 \
-    template<                                                                  \
-        typename T_Type1,                                                      \
-        typename T_Type2                                                       \
-    >                                                                          \
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName(                     \
-        T_Type1 const & arg1,                                                  \
-        T_Type2 const & arg2                                                   \
-    )                                                                          \
-    /* return type is required for the compiler to detect host, device         \
-     * function qualifier correctly                                            \
-     */                                                                        \
-    ->  decltype(                                                              \
-        alpaka::core::declval<alpaka::math::traits::alpakaMathTrait<                                 \
-            alpaka::concepts::ImplementationBase<                              \
-                alpakaMathConcept,                                             \
-                accOrMathImpl                                                  \
-            >,                                                                 \
-            T_Type1,                                                           \
-            T_Type2                                                            \
-        >>()(                                                       \
-            detail::getConcept< accOrMathImpl, alpakaMathConcept >(),          \
-            arg1,                                                              \
-            arg2                                                               \
-        )                                                                      \
-    )                                                                          \
-    {                                                                          \
-        return alpaka::math::traits::alpakaMathTrait<                          \
-            alpaka::concepts::ImplementationBase<                              \
-                alpakaMathConcept,                                             \
-                accOrMathImpl                                                  \
-            >,                                                                 \
-            T_Type1,                                                           \
-            T_Type2                                                            \
-        >{}(                                                       \
-            detail::getConcept< accOrMathImpl, alpakaMathConcept >(),          \
-            arg1,                                                              \
-            arg2                                                               \
-        );                                                                     \
+#define CUPLA_BINARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait)                  \
+    /**                                                                                                               \
+     * @tparam T_Type argument type                                                                                   \
+     * @param arg1 first input argument                                                                               \
+     * @param arg2 second input argument                                                                              \
+     */                                                                                                               \
+    ALPAKA_NO_HOST_ACC_WARNING                                                                                        \
+    template<typename T_Type1, typename T_Type2>                                                                      \
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName(                                                            \
+        T_Type1 const& arg1,                                                                                          \
+        T_Type2 const& arg2) /* return type is required for the compiler to detect host, device                       \
+                              * function qualifier correctly                                                          \
+                              */                                                                                      \
+        ->decltype(alpaka::core::declval<alpaka::math::traits::alpakaMathTrait<                                       \
+                       alpaka::concepts::ImplementationBase<alpakaMathConcept, accOrMathImpl>,                        \
+                       T_Type1,                                                                                       \
+                       T_Type2>>()(detail::getConcept<accOrMathImpl, alpakaMathConcept>(), arg1, arg2))               \
+    {                                                                                                                 \
+        return alpaka::math::traits::alpakaMathTrait<                                                                 \
+            alpaka::concepts::ImplementationBase<alpakaMathConcept, accOrMathImpl>,                                   \
+            T_Type1,                                                                                                  \
+            T_Type2>{}(detail::getConcept<accOrMathImpl, alpakaMathConcept>(), arg1, arg2);                           \
     }
 
 #if CUPLA_DEVICE_COMPILE == 0
-    #define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_UNARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait)
-    #define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_BINARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait)
+#    define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait)                                     \
+        CUPLA_UNARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait)
+#    define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait)                                    \
+        CUPLA_BINARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait)
 #else
-    #define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_UNARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait)
-    #define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_BINARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait)
+#    define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait)                                     \
+        CUPLA_UNARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait)
+#    define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait)                                    \
+        CUPLA_BINARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait)
 #endif
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Comparison.hpp b/include/cupla/device/math/Comparison.hpp
index 0cca2fd9..247c7ca9 100644
--- a/include/cupla/device/math/Comparison.hpp
+++ b/include/cupla/device/math/Comparison.hpp
@@ -26,20 +26,19 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Calculates the smaller value of two arguments.
-    CUPLA_BINARY_MATH_FN( min, alpaka::math::ConceptMathMin, Min )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Calculates the smaller value of two arguments.
+                CUPLA_BINARY_MATH_FN(min, alpaka::math::ConceptMathMin, Min)
 
-    //! Calculates the larger value of two arguments.
-    CUPLA_BINARY_MATH_FN( max, alpaka::math::ConceptMathMax, Max )
+                //! Calculates the larger value of two arguments.
+                CUPLA_BINARY_MATH_FN(max, alpaka::math::ConceptMathMax, Max)
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Erf.hpp b/include/cupla/device/math/Erf.hpp
index dd7be4c6..f8347a28 100644
--- a/include/cupla/device/math/Erf.hpp
+++ b/include/cupla/device/math/Erf.hpp
@@ -26,17 +26,16 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Computes the error function.
-    CUPLA_UNARY_MATH_FN( erf, alpaka::math::ConceptMathErf, Erf )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Computes the error function.
+                CUPLA_UNARY_MATH_FN(erf, alpaka::math::ConceptMathErf, Erf)
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Exp.hpp b/include/cupla/device/math/Exp.hpp
index 6f343fb5..47e9f70b 100644
--- a/include/cupla/device/math/Exp.hpp
+++ b/include/cupla/device/math/Exp.hpp
@@ -26,17 +26,16 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Computes e (Euler's number, 2.7182818...) raised to the given power.
-    CUPLA_UNARY_MATH_FN( exp, alpaka::math::ConceptMathExp, Exp )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Computes e (Euler's number, 2.7182818...) raised to the given power.
+                CUPLA_UNARY_MATH_FN(exp, alpaka::math::ConceptMathExp, Exp)
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Log.hpp b/include/cupla/device/math/Log.hpp
index 7b8c621b..d49ed531 100644
--- a/include/cupla/device/math/Log.hpp
+++ b/include/cupla/device/math/Log.hpp
@@ -28,17 +28,16 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Computes the natural (base e) logarithm.
-    CUPLA_UNARY_MATH_FN( log, alpaka::math::ConceptMathLog, Log )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Computes the natural (base e) logarithm.
+                CUPLA_UNARY_MATH_FN(log, alpaka::math::ConceptMathLog, Log)
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Mod.hpp b/include/cupla/device/math/Mod.hpp
index 2ea026a2..d01c5927 100644
--- a/include/cupla/device/math/Mod.hpp
+++ b/include/cupla/device/math/Mod.hpp
@@ -26,20 +26,19 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Computes the floating-point remainder of the division operation x/y.
-    CUPLA_BINARY_MATH_FN( fmod, alpaka::math::ConceptMathFmod, Fmod )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Computes the floating-point remainder of the division operation x/y.
+                CUPLA_BINARY_MATH_FN(fmod, alpaka::math::ConceptMathFmod, Fmod)
 
-    //! Computes the IEEE remainder of the floating point division operation x/y.
-    CUPLA_BINARY_MATH_FN( remainder, alpaka::math::ConceptMathRemainder, Remainder )
+                //! Computes the IEEE remainder of the floating point division operation x/y.
+                CUPLA_BINARY_MATH_FN(remainder, alpaka::math::ConceptMathRemainder, Remainder)
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Pow.hpp b/include/cupla/device/math/Pow.hpp
index 56a4e464..6d166e75 100644
--- a/include/cupla/device/math/Pow.hpp
+++ b/include/cupla/device/math/Pow.hpp
@@ -26,17 +26,16 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Computes the value of base raised to the power exp.
-    CUPLA_BINARY_MATH_FN( pow, alpaka::math::ConceptMathPow, Pow )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Computes the value of base raised to the power exp.
+                CUPLA_BINARY_MATH_FN(pow, alpaka::math::ConceptMathPow, Pow)
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Root.hpp b/include/cupla/device/math/Root.hpp
index eafbb769..d5d66e98 100644
--- a/include/cupla/device/math/Root.hpp
+++ b/include/cupla/device/math/Root.hpp
@@ -26,23 +26,22 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Computes the square root.
-    CUPLA_UNARY_MATH_FN( sqrt, alpaka::math::ConceptMathSqrt, Sqrt )
-
-    //! Computes the inverse square root.
-    CUPLA_UNARY_MATH_FN( rsqrt, alpaka::math::ConceptMathRsqrt, Rsqrt )
-
-    //! Computes the cubic root.
-    CUPLA_UNARY_MATH_FN( cbrt, alpaka::math::ConceptMathCbrt, Cbrt )
-
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Computes the square root.
+                CUPLA_UNARY_MATH_FN(sqrt, alpaka::math::ConceptMathSqrt, Sqrt)
+
+                //! Computes the inverse square root.
+                CUPLA_UNARY_MATH_FN(rsqrt, alpaka::math::ConceptMathRsqrt, Rsqrt)
+
+                //! Computes the cubic root.
+                CUPLA_UNARY_MATH_FN(cbrt, alpaka::math::ConceptMathCbrt, Cbrt)
+
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Round.hpp b/include/cupla/device/math/Round.hpp
index 35bf0834..75359638 100644
--- a/include/cupla/device/math/Round.hpp
+++ b/include/cupla/device/math/Round.hpp
@@ -26,41 +26,40 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Computes the smallest integer value not less than arg.
-    CUPLA_UNARY_MATH_FN( ceil, alpaka::math::ConceptMathCeil, Ceil )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Computes the smallest integer value not less than arg.
+                CUPLA_UNARY_MATH_FN(ceil, alpaka::math::ConceptMathCeil, Ceil)
 
-    //! Computes the largest integer value not greater than arg.
-    CUPLA_UNARY_MATH_FN( floor, alpaka::math::ConceptMathFloor, Floor )
+                //! Computes the largest integer value not greater than arg.
+                CUPLA_UNARY_MATH_FN(floor, alpaka::math::ConceptMathFloor, Floor)
 
-    //! Computes the nearest integer not greater in magnitude than arg.
-    CUPLA_UNARY_MATH_FN( trunc, alpaka::math::ConceptMathTrunc, Trunc )
+                //! Computes the nearest integer not greater in magnitude than arg.
+                CUPLA_UNARY_MATH_FN(trunc, alpaka::math::ConceptMathTrunc, Trunc)
 
-    /** Computes the nearest integer value to arg (in floating-point format).
-     *
-     * Rounding halfway cases away from zero, regardless of the current rounding mode.
-     */
-    CUPLA_UNARY_MATH_FN( round, alpaka::math::ConceptMathRound, Round )
+                /** Computes the nearest integer value to arg (in floating-point format).
+                 *
+                 * Rounding halfway cases away from zero, regardless of the current rounding mode.
+                 */
+                CUPLA_UNARY_MATH_FN(round, alpaka::math::ConceptMathRound, Round)
 
-    /** Computes the nearest integer value to arg (in integer format).
-     *
-     * Rounding halfway cases away from zero, regardless of the current rounding mode.
-     */
-    CUPLA_UNARY_MATH_FN( lround, alpaka::math::ConceptMathRound, Lround )
+                /** Computes the nearest integer value to arg (in integer format).
+                 *
+                 * Rounding halfway cases away from zero, regardless of the current rounding mode.
+                 */
+                CUPLA_UNARY_MATH_FN(lround, alpaka::math::ConceptMathRound, Lround)
 
-    /** Computes the nearest integer value to arg (in integer format).
-     *
-     * Rounding halfway cases away from zero, regardless of the current rounding mode.
-     */
-    CUPLA_UNARY_MATH_FN( llround, alpaka::math::ConceptMathRound, Llround )
+                /** Computes the nearest integer value to arg (in integer format).
+                 *
+                 * Rounding halfway cases away from zero, regardless of the current rounding mode.
+                 */
+                CUPLA_UNARY_MATH_FN(llround, alpaka::math::ConceptMathRound, Llround)
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device/math/Trigo.hpp b/include/cupla/device/math/Trigo.hpp
index 15ce6a36..8197334e 100644
--- a/include/cupla/device/math/Trigo.hpp
+++ b/include/cupla/device/math/Trigo.hpp
@@ -26,35 +26,34 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-inline namespace device
-{
-inline namespace math
-{
-
-    //! Computes the sine (measured in radians).
-    CUPLA_UNARY_MATH_FN( sin, alpaka::math::ConceptMathSin, Sin )
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        inline namespace device
+        {
+            inline namespace math
+            {
+                //! Computes the sine (measured in radians).
+                CUPLA_UNARY_MATH_FN(sin, alpaka::math::ConceptMathSin, Sin)
 
-    //! Computes the cosine (measured in radians).
-    CUPLA_UNARY_MATH_FN( cos, alpaka::math::ConceptMathCos, Cos )
+                //! Computes the cosine (measured in radians).
+                CUPLA_UNARY_MATH_FN(cos, alpaka::math::ConceptMathCos, Cos)
 
-    //! Computes the tangent (measured in radians).
-    CUPLA_UNARY_MATH_FN( tan, alpaka::math::ConceptMathTan, Tan )
+                //! Computes the tangent (measured in radians).
+                CUPLA_UNARY_MATH_FN(tan, alpaka::math::ConceptMathTan, Tan)
 
-    //! Computes the principal value of the arc sine.
-    CUPLA_UNARY_MATH_FN( asin, alpaka::math::ConceptMathAsin, Asin )
+                //! Computes the principal value of the arc sine.
+                CUPLA_UNARY_MATH_FN(asin, alpaka::math::ConceptMathAsin, Asin)
 
-    //! Computes the principal value of the arc cosine.
-    CUPLA_UNARY_MATH_FN( acos, alpaka::math::ConceptMathAcos, Acos )
+                //! Computes the principal value of the arc cosine.
+                CUPLA_UNARY_MATH_FN(acos, alpaka::math::ConceptMathAcos, Acos)
 
-    //! Computes the principal value of the arc tangent.
-    CUPLA_UNARY_MATH_FN( atan, alpaka::math::ConceptMathAtan, Atan )
+                //! Computes the principal value of the arc tangent.
+                CUPLA_UNARY_MATH_FN(atan, alpaka::math::ConceptMathAtan, Atan)
 
-    //! Computes the arc tangent of y/x using the signs of arguments to determine the correct quadrant.
-    CUPLA_BINARY_MATH_FN( atan2, alpaka::math::ConceptMathAtan2, Atan2 )
+                //! Computes the arc tangent of y/x using the signs of arguments to determine the correct quadrant.
+                CUPLA_BINARY_MATH_FN(atan2, alpaka::math::ConceptMathAtan2, Atan2)
 
-} // namespace math
-} // namespace device
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            } // namespace math
+        } // namespace device
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/device_functions.hpp b/include/cupla/device_functions.hpp
index fe164900..11fb5d88 100644
--- a/include/cupla/device_functions.hpp
+++ b/include/cupla/device_functions.hpp
@@ -21,8 +21,8 @@
 
 #pragma once
 
-#include "cupla/device/Synchronization.hpp"
-#include "cupla/device/Index.hpp"
 #include "cupla/device/Atomic.hpp"
+#include "cupla/device/Index.hpp"
 #include "cupla/device/SharedMemory.hpp"
+#include "cupla/device/Synchronization.hpp"
 #include "cupla/device/math.hpp"
diff --git a/include/cupla/kernel.hpp b/include/cupla/kernel.hpp
index 597f8eff..7f745290 100644
--- a/include/cupla/kernel.hpp
+++ b/include/cupla/kernel.hpp
@@ -21,248 +21,193 @@
 
 #pragma once
 
-#include "cupla/namespace.hpp"
-#include "cupla/types.hpp"
-
 #include "cupla/datatypes/dim3.hpp"
 #include "cupla/datatypes/uint.hpp"
-#include "cupla/manager/Stream.hpp"
 #include "cupla/manager/Device.hpp"
+#include "cupla/manager/Stream.hpp"
+#include "cupla/namespace.hpp"
 #include "cupla/traits/IsThreadSeqAcc.hpp"
+#include "cupla/types.hpp"
 
 #include <utility>
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-
-    /** get block and elements extents
-     *
-     * can swap the block and element extents depend on the selected Alpaka
-     * accelerator
-     */
-    template<
-        typename T_Acc,
-        bool T_isThreadSeqAcc = traits::IsThreadSeqAcc< T_Acc >::value
-    >
-    struct GetBlockAndElemExtents
-    {
-        static void get( dim3 const & , dim3 const &  )
-        { }
-    };
-
-    template< typename T_Acc >
-    struct GetBlockAndElemExtents<
-        T_Acc,
-        true
-    >
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        static void get( dim3 & blockSize, dim3 & elemSize )
+        /** get block and element extents
+         *
+         * can swap the block and element extents depending on the selected Alpaka
+         * accelerator
+         */
+        template<typename T_Acc, bool T_isThreadSeqAcc = traits::IsThreadSeqAcc<T_Acc>::value>
+        struct GetBlockAndElemExtents
         {
-            std::swap( blockSize, elemSize );
-        }
-    };
-
-    /** wrapper for kernel types
-     *
-     * This implements the possibility to define dynamic shared memory without
-     * specializing the needed alpaka trait BlockSharedMemDynSizeBytes
-     */
-    template<
-        typename T_Kernel
-    >
-    struct CuplaKernel :
-        public T_Kernel
-    {
-        size_t const  m_dynSharedMemBytes;
-
-        CuplaKernel( size_t const & dynSharedMemBytes ) :
-            m_dynSharedMemBytes( dynSharedMemBytes )
-        { }
-    };
-
-    /** execute a kernel
-     *
-     * @tparam T_KernelType type of the kernel
-     * @tparam T_Acc accelerator used to execute the kernel
-     *
-     */
-    template<
-        typename T_KernelType,
-        typename T_Acc
-    >
-    class KernelExecutor
-    {
-        IdxVec3 const m_gridSize;
-        IdxVec3 const m_blockSize;
-        IdxVec3 const m_elemSize;
-        uint32_t const m_dynSharedMemSize;
-        cuplaStream_t const m_stream;
-
-    public:
-        KernelExecutor(
-            dim3 const & gridSize,
-            dim3 const & blockSize,
-            dim3 const & elemSize,
-            uint32_t const & dynSharedMemSize,
-            cuplaStream_t const & stream
-        ) :
-            m_gridSize( gridSize.z, gridSize.y, gridSize.x ),
-            m_blockSize( blockSize.z, blockSize.y, blockSize.x ),
-            m_elemSize( elemSize.z, elemSize.y, elemSize.x ),
-            m_dynSharedMemSize( dynSharedMemSize ),
-            m_stream( stream )
-        {}
+            static void get(dim3 const&, dim3 const&)
+            {
+            }
+        };
 
-        template< typename... T_Args >
-        void operator()( T_Args && ... args ) const
+        template<typename T_Acc>
+        struct GetBlockAndElemExtents<T_Acc, true>
         {
-            ::alpaka::WorkDivMembers<
-              KernelDim,
-              IdxType
-            > workDiv(
-                m_gridSize,
-                m_blockSize,
-                m_elemSize
-            );
-            auto const exec = ::alpaka::createTaskKernel< T_Acc >(
-                workDiv,
-                CuplaKernel< T_KernelType >{ m_dynSharedMemSize },
-                std::forward< T_Args >( args )...
-            );
-
-            auto & stream = cupla::manager::Stream<
-                cupla::AccDev,
-                cupla::AccStream
-            >::get().stream( m_stream );
-
-            ::alpaka::enqueue(stream, exec);
-        }
-    };
-
-    /** Cuda like configuration interface for a kernel
-     *
-     * Interface is compatible to the argument order of a cuda kernel `T_KernelType<<<...>>>`
-     */
-    template<
-        typename T_KernelType
-    >
-    struct KernelCudaLike
-    {
-        auto operator()(
-            dim3 const & gridSize,
-            dim3 const & blockSize,
-            uint32_t const & dynSharedMemSize = 0,
-            cuplaStream_t const & stream = 0
-        ) const
-        -> KernelExecutor<
-            T_KernelType,
-            cupla::Acc
-        >
+            static void get(dim3& blockSize, dim3& elemSize)
+            {
+                std::swap(blockSize, elemSize);
+            }
+        };
+
+        /** wrapper for kernel types
+         *
+         * This implements the possibility to define dynamic shared memory without
+         * specializing the needed alpaka trait BlockSharedMemDynSizeBytes
+         */
+        template<typename T_Kernel>
+        struct CuplaKernel : public T_Kernel
         {
-            return KernelExecutor<
-                T_KernelType,
-                cupla::Acc
-            >(gridSize, blockSize, dim3(), dynSharedMemSize, stream);
-        }
-    };
-
-    /* Kernel configuration interface with element support
-     *
-     * The kernel must support the alpaka element layer.
-     *
-     * Swap the blockSize and the elemSize depending on the activated accelerator.
-     * This mean that in some devices the blockSize is set to one ( dim3(1,1,1) )
-     * and the elemSize is set to the user defined blockSize
-     */
-    template<
-        typename T_KernelType
-    >
-    struct SwitchableElementLevel
-    {
-        auto operator()(
-            dim3 const & gridSize,
-            dim3 const & blockSize,
-            uint32_t const & dynSharedMemSize = 0,
-            cuplaStream_t const & stream = 0
-        ) const
-        -> KernelExecutor<
-            T_KernelType,
-            cupla::AccThreadSeq
-        >
+            size_t const m_dynSharedMemBytes;
+
+            CuplaKernel(size_t const& dynSharedMemBytes) : m_dynSharedMemBytes(dynSharedMemBytes)
+            {
+            }
+        };
+
+        /** execute a kernel
+         *
+         * @tparam T_KernelType type of the kernel
+         * @tparam T_Acc accelerator used to execute the kernel
+         *
+         */
+        template<typename T_KernelType, typename T_Acc>
+        class KernelExecutor
         {
-            dim3 tmpBlockSize = blockSize;
-            dim3 tmpElemSize;
-            GetBlockAndElemExtents<cupla::AccThreadSeq>::get( tmpBlockSize, tmpElemSize );
-
-            return KernelExecutor<
-                T_KernelType,
-                cupla::AccThreadSeq
-            >(gridSize, tmpBlockSize, tmpElemSize, dynSharedMemSize, stream);
-        }
-    };
-
-    /** Kernel configuration interface with element support
-     *
-     * The kernel must support the alpaka element level
-     */
-    template<
-        typename T_KernelType
-    >
-    struct KernelWithElementLevel
-    {
-        auto operator()(
-            dim3 const & gridSize,
-            dim3 const & blockSize,
-            dim3 const & elemSize,
-            uint32_t const & dynSharedMemSize = 0,
-            cuplaStream_t const & stream = 0
-        )  const
-        -> KernelExecutor<
-            T_KernelType,
-            cupla::Acc
-        >
+            IdxVec3 const m_gridSize;
+            IdxVec3 const m_blockSize;
+            IdxVec3 const m_elemSize;
+            uint32_t const m_dynSharedMemSize;
+            cuplaStream_t const m_stream;
+
+        public:
+            KernelExecutor(
+                dim3 const& gridSize,
+                dim3 const& blockSize,
+                dim3 const& elemSize,
+                uint32_t const& dynSharedMemSize,
+                cuplaStream_t const& stream)
+                : m_gridSize(gridSize.z, gridSize.y, gridSize.x)
+                , m_blockSize(blockSize.z, blockSize.y, blockSize.x)
+                , m_elemSize(elemSize.z, elemSize.y, elemSize.x)
+                , m_dynSharedMemSize(dynSharedMemSize)
+                , m_stream(stream)
+            {
+            }
+
+            template<typename... T_Args>
+            void operator()(T_Args&&... args) const
+            {
+                ::alpaka::WorkDivMembers<KernelDim, IdxType> workDiv(m_gridSize, m_blockSize, m_elemSize);
+                auto const exec = ::alpaka::createTaskKernel<T_Acc>(
+                    workDiv,
+                    CuplaKernel<T_KernelType>{m_dynSharedMemSize},
+                    std::forward<T_Args>(args)...);
+
+                auto& stream = cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(m_stream);
+
+                ::alpaka::enqueue(stream, exec);
+            }
+        };
+
+        /** CUDA-like configuration interface for a kernel
+         *
+         * The interface is compatible with the argument order of a CUDA kernel `T_KernelType<<<...>>>`
+         */
+        template<typename T_KernelType>
+        struct KernelCudaLike
         {
-            return KernelExecutor<
-                T_KernelType,
-                cupla::Acc
-            >(gridSize, blockSize, elemSize, dynSharedMemSize, stream);
-        }
-    };
-
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+            auto operator()(
+                dim3 const& gridSize,
+                dim3 const& blockSize,
+                uint32_t const& dynSharedMemSize = 0,
+                cuplaStream_t const& stream = 0) const -> KernelExecutor<T_KernelType, cupla::Acc>
+            {
+                return KernelExecutor<T_KernelType, cupla::Acc>(gridSize, blockSize, dim3(), dynSharedMemSize, stream);
+            }
+        };
+
+        /** Kernel configuration interface with element support
+         *
+         * The kernel must support the alpaka element layer.
+         *
+         * Swaps the blockSize and the elemSize depending on the activated accelerator.
+         * This means that on some devices the blockSize is set to one ( dim3(1,1,1) )
+         * and the elemSize is set to the user-defined blockSize.
+         */
+        template<typename T_KernelType>
+        struct SwitchableElementLevel
+        {
+            auto operator()(
+                dim3 const& gridSize,
+                dim3 const& blockSize,
+                uint32_t const& dynSharedMemSize = 0,
+                cuplaStream_t const& stream = 0) const -> KernelExecutor<T_KernelType, cupla::AccThreadSeq>
+            {
+                dim3 tmpBlockSize = blockSize;
+                dim3 tmpElemSize;
+                GetBlockAndElemExtents<cupla::AccThreadSeq>::get(tmpBlockSize, tmpElemSize);
+
+                return KernelExecutor<T_KernelType, cupla::AccThreadSeq>(
+                    gridSize,
+                    tmpBlockSize,
+                    tmpElemSize,
+                    dynSharedMemSize,
+                    stream);
+            }
+        };
+
+        /** Kernel configuration interface with element support
+         *
+         * The kernel must support the alpaka element level
+         */
+        template<typename T_KernelType>
+        struct KernelWithElementLevel
+        {
+            auto operator()(
+                dim3 const& gridSize,
+                dim3 const& blockSize,
+                dim3 const& elemSize,
+                uint32_t const& dynSharedMemSize = 0,
+                cuplaStream_t const& stream = 0) const -> KernelExecutor<T_KernelType, cupla::Acc>
+            {
+                return KernelExecutor<T_KernelType, cupla::Acc>(
+                    gridSize,
+                    blockSize,
+                    elemSize,
+                    dynSharedMemSize,
+                    stream);
+            }
+        };
+
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
 
 
 namespace alpaka
 {
-namespace traits
-{
-    //! CuplaKernel has defined the extern shared memory as member
-    template<
-        typename T_UserKernel,
-        typename T_Acc
-    >
-    struct BlockSharedMemDynSizeBytes<
-        ::cupla::CuplaKernel< T_UserKernel >,
-        T_Acc
-    >
+    namespace traits
     {
-        template<
-            typename... TArgs
-        >
-        ALPAKA_FN_HOST_ACC
-        static auto
-        getBlockSharedMemDynSizeBytes(
-            ::cupla::CuplaKernel< T_UserKernel > const & userKernel,
-            TArgs const & ...)
-        -> ::alpaka::Idx<T_Acc>
+        //! CuplaKernel has defined the extern shared memory as member
+        template<typename T_UserKernel, typename T_Acc>
+        struct BlockSharedMemDynSizeBytes<::cupla::CuplaKernel<T_UserKernel>, T_Acc>
         {
-            return userKernel.m_dynSharedMemBytes;
-        }
-    };
-} // namespace traits
+            template<typename... TArgs>
+            ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+                ::cupla::CuplaKernel<T_UserKernel> const& userKernel,
+                TArgs const&...) -> ::alpaka::Idx<T_Acc>
+            {
+                return userKernel.m_dynSharedMemBytes;
+            }
+        };
+    } // namespace traits
 } // namespace alpaka
 
 
@@ -270,7 +215,10 @@ namespace traits
  *
  * The alpaka element level is ignored and always set to dim3(1,1,1)
  */
-#define CUPLA_KERNEL(...) ::cupla::KernelCudaLike<__VA_ARGS__>{}
+#define CUPLA_KERNEL(...)                                                                                             \
+    ::cupla::KernelCudaLike<__VA_ARGS__>                                                                              \
+    {                                                                                                                 \
+    }
 
 /** call the kernel with an hidden element layer
  *
@@ -281,10 +229,16 @@ namespace traits
  * This mean that in some devices the blockSize is set to one ( dim3(1,1,1) )
  * and the elemSize is set to the user defined blockSize
  */
-#define CUPLA_KERNEL_OPTI(...) ::cupla::SwitchableElementLevel<__VA_ARGS__>{}
+#define CUPLA_KERNEL_OPTI(...)                                                                                        \
+    ::cupla::SwitchableElementLevel<__VA_ARGS__>                                                                      \
+    {                                                                                                                 \
+    }
 
 /** cupla kernel call with elements
  *
  * The kernel must support the alpaka element level
  */
-#define CUPLA_KERNEL_ELEM(...) ::cupla::KernelWithElementLevel<__VA_ARGS__>{}
+#define CUPLA_KERNEL_ELEM(...)                                                                                        \
+    ::cupla::KernelWithElementLevel<__VA_ARGS__>                                                                      \
+    {                                                                                                                 \
+    }
diff --git a/include/cupla/manager/Device.hpp b/include/cupla/manager/Device.hpp
index 3476e663..9d1ad5f9 100644
--- a/include/cupla/manager/Device.hpp
+++ b/include/cupla/manager/Device.hpp
@@ -32,148 +32,112 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-namespace manager
-{
-
-    template<
-        typename T_DeviceType
-    >
-    struct Device
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        using DeviceType = T_DeviceType;
-
-        using DeviceMap = std::map<
-            int,
-            std::unique_ptr<
-                DeviceType
-            >
-        >;
-
-        DeviceMap m_map;
-        int m_currentDevice;
-
-        static Device &
-        get()
+        namespace manager
         {
-            static Device device;
-            return device;
-        }
-
-        auto
-        device(
-            int idx = 0
-        )
-        -> DeviceType &
-        {
-            m_currentDevice = idx;
-            auto iter = m_map.find( idx );
-            if( iter != m_map.end() )
-            {
-                return *iter->second;
-            }
-            else
+            template<typename T_DeviceType>
+            struct Device
             {
-                using Pltf = ::alpaka::Pltf< DeviceType >;
+                using DeviceType = T_DeviceType;
 
-                const int numDevices = count();
-                if( idx >= numDevices )
-                {
-                    std::stringstream err;
-                    err << "Unable to return device " << idx << ". There are only " << numDevices << " devices!";
-                    throw std::system_error(
-                        cuplaErrorInvalidDevice,
-                        err.str()
-                    );
-                }
+                using DeviceMap = std::map<int, std::unique_ptr<DeviceType>>;
 
-                std::unique_ptr< DeviceType > dev;
+                DeviceMap m_map;
+                int m_currentDevice;
 
-                try
+                static Device& get()
                 {
-                    /* device id is not in the list
-                     *
-                     * select device with idx
-                     */
-                    dev.reset(
-                        new DeviceType(
-                            alpaka::getDevByIdx<
-                                Pltf
-                            >( idx )
-                        )
-                    );
+                    static Device device;
+                    return device;
                 }
-                catch( const std::runtime_error& e )
+
+                auto device(int idx = 0) -> DeviceType&
                 {
-                    throw std::system_error(
-                        cuplaErrorDeviceAlreadyInUse,
-                        e.what()
-                    );
+                    m_currentDevice = idx;
+                    auto iter = m_map.find(idx);
+                    if(iter != m_map.end())
+                    {
+                        return *iter->second;
+                    }
+                    else
+                    {
+                        using Pltf = ::alpaka::Pltf<DeviceType>;
+
+                        const int numDevices = count();
+                        if(idx >= numDevices)
+                        {
+                            std::stringstream err;
+                            err << "Unable to return device " << idx << ". There are only " << numDevices
+                                << " devices!";
+                            throw std::system_error(cuplaErrorInvalidDevice, err.str());
+                        }
+
+                        std::unique_ptr<DeviceType> dev;
+
+                        try
+                        {
+                            /* device id is not in the list
+                             *
+                             * select device with idx
+                             */
+                            dev.reset(new DeviceType(alpaka::getDevByIdx<Pltf>(idx)));
+                        }
+                        catch(const std::runtime_error& e)
+                        {
+                            throw std::system_error(cuplaErrorDeviceAlreadyInUse, e.what());
+                        }
+                        m_map.insert(std::make_pair(idx, std::move(dev)));
+                        return *m_map[idx];
+                    }
                 }
-                m_map.insert(
-                    std::make_pair( idx, std::move( dev ) )
-                );
-                return *m_map[ idx ];
-            }
-        }
-
-        /**! reset the current device
-         *
-         * streams, memory and events on the current device must be
-         * deleted at first by the user
-         *
-         * @return true in success case else false
-         */
-        bool reset()
-        {
-            ::alpaka::reset( this->current( ) );
-            auto iter = m_map.find( this->id( ) );
 
-            if( iter == m_map.end() )
-            {
-                std::cerr << "device " << this->id( ) <<
-                    " can not destroyed (was never created) " <<
-                    std::endl;
-                return false;
-            }
-            else
-            {
-                m_map.erase( iter );
-                return true;
-            }
-        }
-
-        auto
-        id()
-        -> int
-        {
-            return m_currentDevice;
-        }
-
-        auto
-        current()
-        -> DeviceType &
-        {
-            return this->device( this->id( ) );
-        }
+                /** Reset the current device.
+                 *
+                 * Streams, memory and events on the current device must be
+                 * deleted by the user beforehand.
+                 *
+                 * @return true on success, otherwise false
+                 */
+                bool reset()
+                {
+                    ::alpaka::reset(this->current());
+                    auto iter = m_map.find(this->id());
+
+                    if(iter == m_map.end())
+                    {
+                        std::cerr << "device " << this->id() << " can not destroyed (was never created) " << std::endl;
+                        return false;
+                    }
+                    else
+                    {
+                        m_map.erase(iter);
+                        return true;
+                    }
+                }
 
-        auto
-        count()
-        -> int
-        {
-            using Pltf = ::alpaka::Pltf< DeviceType >;
-            return static_cast< int >( ::alpaka::getDevCount< Pltf >( ) );
-        }
+                auto id() -> int
+                {
+                    return m_currentDevice;
+                }
 
-    protected:
-        Device() : m_currentDevice( 0 )
-        {
+                auto current() -> DeviceType&
+                {
+                    return this->device(this->id());
+                }
 
-        }
+                auto count() -> int
+                {
+                    using Pltf = ::alpaka::Pltf<DeviceType>;
+                    return static_cast<int>(::alpaka::getDevCount<Pltf>());
+                }
 
-    };
+            protected:
+                Device() : m_currentDevice(0)
+                {
+                }
+            };
 
-} //namespace manager
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
-} //namespace cupla
+        } // namespace manager
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/include/cupla/manager/Driver.hpp b/include/cupla/manager/Driver.hpp
index 0e3f0cf1..ff6e28e4 100644
--- a/include/cupla/manager/Driver.hpp
+++ b/include/cupla/manager/Driver.hpp
@@ -25,29 +25,27 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-namespace manager
-{
-
-/** initialize the cupla environment
- *
- * avoid side effects with singletons in the user code
- */
-class Driver
-{
-
-public:
-    static Driver& get()
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        static Driver driver;
-        return driver;
-    }
-private:
-
-    Driver();
-};
-
-} //namespace manager
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
-} //namespace cupla
+        namespace manager
+        {
+            /** initialize the cupla environment
+             *
+             * avoid side effects with singletons in the user code
+             */
+            class Driver
+            {
+            public:
+                static Driver& get()
+                {
+                    static Driver driver;
+                    return driver;
+                }
+
+            private:
+                Driver();
+            };
+
+        } // namespace manager
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/include/cupla/manager/Event.hpp b/include/cupla/manager/Event.hpp
index 8d3ee193..bac0b360 100644
--- a/include/cupla/manager/Event.hpp
+++ b/include/cupla/manager/Event.hpp
@@ -21,223 +21,174 @@
 
 #pragma once
 
+#include "cupla/manager/Device.hpp"
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
-#include "cupla/manager/Device.hpp"
 #include "cupla_driver_types.hpp"
 
-#include <vector>
+#include <chrono>
 #include <map>
 #include <memory>
 #include <utility>
-#include <chrono>
+#include <vector>
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-namespace manager
-{
-
-namespace detail
-{
-    template<
-        typename T_DeviceType,
-        typename T_QueueType
-    >
-    class EmulatedEvent
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-    private:
-        bool hasTimer;
-
-        using TimePoint = std::chrono::time_point<
-            std::chrono::high_resolution_clock
-        >;
-
-        TimePoint time;
-
-    public:
-        using AlpakaEvent = ::alpaka::Event< T_QueueType >;
-        std::unique_ptr< AlpakaEvent > event;
-
-        EmulatedEvent( uint32_t flags ) :
-            hasTimer( !( flags & cuplaEventDisableTiming ) ),
-            event(
-                new AlpakaEvent(
-                    Device< T_DeviceType >::get().current()
-                    // The alpaka interfaces for this constructor are different depending on the backend.
-#if( ALPAKA_ACC_GPU_HIP_ENABLED == 1 || ALPAKA_ACC_GPU_CUDA_ENABLED == 1 )
-                    ,!(flags & cuplaEventBlockingSync)
-#endif
-                )
-            )
-        {
-
-        }
-
-        AlpakaEvent &
-        operator *()
+        namespace manager
         {
-            return *event;
-        }
-
-        void record( T_QueueType & stream )
-        {
-            ::alpaka::enqueue( stream, *event );
-            if( hasTimer )
+            namespace detail
             {
-                ::alpaka::wait( *event );
-                time = std::chrono::high_resolution_clock::now();
-            }
-        }
-
-        TimePoint getTimePoint() const
-        {
-            return time;
-        }
+                template<typename T_DeviceType, typename T_QueueType>
+                class EmulatedEvent
+                {
+                private:
+                    bool hasTimer;
 
-        double elapsedSince( EmulatedEvent const & startEvent )
-        {
-            if( !hasTimer )
-                std::cerr<<"event has no timing enabled"<<std::endl;
-
-            std::chrono::duration<double, std::milli> timeElapsed_ms = time - startEvent.getTimePoint();
-            return timeElapsed_ms.count();
-        }
-
-    };
-}
-    template<
-        typename T_DeviceType,
-        typename T_QueueType
-    >
-    struct Event
-    {
-        using DeviceType = T_DeviceType;
-        using QueueType = T_QueueType;
+                    using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
 
-        using EventType = detail::EmulatedEvent<
-            DeviceType,
-            QueueType
-        >;
+                    TimePoint time;
 
-        using EventMap = std::map<
-            cuplaEvent_t,
-            std::unique_ptr<
-                EventType
-            >
-        >;
+                public:
+                    using AlpakaEvent = ::alpaka::Event<T_QueueType>;
+                    std::unique_ptr<AlpakaEvent> event;
 
-        using MapVector = std::vector< EventMap >;
-
-        MapVector m_mapVector;
-
-        static auto
-        get()
-        -> Event &
-        {
-            static Event event;
-            return event;
-        }
-
-        auto
-        create( uint32_t flags )
-        -> cuplaEvent_t
-        {
-
-            auto& device = Device< DeviceType >::get();
-
-            std::unique_ptr<
-                EventType
-            > eventPtr(
-                new EventType(
-                    flags
-                )
-            );
-
-            cuplaEvent_t eventId = reinterpret_cast< cuplaEvent_t >(
-                m_id++
-            );
-            m_mapVector[ device.id() ].insert(
-                std::make_pair( eventId, std::move( eventPtr ) )
-            );
-            return eventId;
-        }
-
-        auto
-        event( cuplaEvent_t eventId = 0 )
-        -> EventType &
-        {
-            auto& device = Device< DeviceType >::get();
-            const auto deviceId = device.id();
-            auto iter = m_mapVector[ deviceId ].find(
-                eventId
-            );
-
-            if( iter == m_mapVector[ device.id( ) ].end() )
-            {
-                std::cerr << "event " << eventId <<
-                    " not exists on device "<< deviceId << std::endl;
-            }
-            // @todo: check if stream was created
-            return *(iter->second);
-        }
-
-        auto
-        destroy( cuplaEvent_t eventId )
-        -> bool
-        {
-            auto& device = Device< DeviceType >::get();
-            const auto deviceId = device.id();
-
-            auto iter = m_mapVector[ deviceId ].find(
-                eventId
-            );
-
-            if( iter == m_mapVector[ deviceId ].end() )
-            {
-                std::cerr << "stream " << eventId <<
-                    " can not destroyed (was never created) on device " <<
-                    deviceId <<
-                    std::endl;
-                return false;
-            }
-            else
+                    EmulatedEvent(uint32_t flags)
+                        : hasTimer(!(flags & cuplaEventDisableTiming))
+                        , event(new AlpakaEvent(
+                              Device<T_DeviceType>::get().current()
+                    // The alpaka interfaces for this constructor are different depending on the backend.
+#if(ALPAKA_ACC_GPU_HIP_ENABLED == 1 || ALPAKA_ACC_GPU_CUDA_ENABLED == 1)
+                                  ,
+                              !(flags & cuplaEventBlockingSync)
+#endif
+                                  ))
+                    {
+                    }
+
+                    AlpakaEvent& operator*()
+                    {
+                        return *event;
+                    }
+
+                    void record(T_QueueType& stream)
+                    {
+                        ::alpaka::enqueue(stream, *event);
+                        if(hasTimer)
+                        {
+                            ::alpaka::wait(*event);
+                            time = std::chrono::high_resolution_clock::now();
+                        }
+                    }
+
+                    TimePoint getTimePoint() const
+                    {
+                        return time;
+                    }
+
+                    double elapsedSince(EmulatedEvent const& startEvent)
+                    {
+                        if(!hasTimer)
+                            std::cerr << "event has no timing enabled" << std::endl;
+
+                        std::chrono::duration<double, std::milli> timeElapsed_ms = time - startEvent.getTimePoint();
+                        return timeElapsed_ms.count();
+                    }
+                };
+            } // namespace detail
+            template<typename T_DeviceType, typename T_QueueType>
+            struct Event
             {
-                m_mapVector[ deviceId ].erase( iter );
-                return true;
-            }
-        }
-
-        /** delete all events on the current device
-         *
-         * @return true in success case else false
-         */
-        bool
-        reset( )
-        {
-            auto& device = Device< DeviceType >::get();
-            const auto deviceId = device.id();
-
-            m_mapVector[ deviceId ].clear( );
-            // reset id to allow that this instance can be reused
-            m_id = 0u;
-
-            // @todo: check if clear creates errors
-            return true;
-        }
-
-
-    protected:
-        Event() :  m_mapVector( Device< DeviceType >::get().count() )
-        {
-        }
-
-        //! unique if for the next stream
-        size_t m_id = 0u;
-
-    };
-
-} //namespace manager
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
-} //namespace cupla
+                using DeviceType = T_DeviceType;
+                using QueueType = T_QueueType;
+
+                using EventType = detail::EmulatedEvent<DeviceType, QueueType>;
+
+                using EventMap = std::map<cuplaEvent_t, std::unique_ptr<EventType>>;
+
+                using MapVector = std::vector<EventMap>;
+
+                MapVector m_mapVector;
+
+                static auto get() -> Event&
+                {
+                    static Event event;
+                    return event;
+                }
+
+                auto create(uint32_t flags) -> cuplaEvent_t
+                {
+                    auto& device = Device<DeviceType>::get();
+
+                    std::unique_ptr<EventType> eventPtr(new EventType(flags));
+
+                    cuplaEvent_t eventId = reinterpret_cast<cuplaEvent_t>(m_id++);
+                    m_mapVector[device.id()].insert(std::make_pair(eventId, std::move(eventPtr)));
+                    return eventId;
+                }
+
+                auto event(cuplaEvent_t eventId = 0) -> EventType&
+                {
+                    auto& device = Device<DeviceType>::get();
+                    const auto deviceId = device.id();
+                    auto iter = m_mapVector[deviceId].find(eventId);
+
+                    if(iter == m_mapVector[device.id()].end())
+                    {
+                        std::cerr << "event " << eventId << " not exists on device " << deviceId << std::endl;
+                    }
+                    // @todo: check if event was created
+                    return *(iter->second);
+                }
+
+                auto destroy(cuplaEvent_t eventId) -> bool
+                {
+                    auto& device = Device<DeviceType>::get();
+                    const auto deviceId = device.id();
+
+                    auto iter = m_mapVector[deviceId].find(eventId);
+
+                    if(iter == m_mapVector[deviceId].end())
+                    {
+                        std::cerr << "stream " << eventId << " can not destroyed (was never created) on device "
+                                  << deviceId << std::endl;
+                        return false;
+                    }
+                    else
+                    {
+                        m_mapVector[deviceId].erase(iter);
+                        return true;
+                    }
+                }
+
+                /** delete all events on the current device
+                 *
+                 * @return true in success case else false
+                 */
+                bool reset()
+                {
+                    auto& device = Device<DeviceType>::get();
+                    const auto deviceId = device.id();
+
+                    m_mapVector[deviceId].clear();
+                    // reset id to allow that this instance can be reused
+                    m_id = 0u;
+
+                    // @todo: check if clear creates errors
+                    return true;
+                }
+
+
+            protected:
+                Event() : m_mapVector(Device<DeviceType>::get().count())
+                {
+                }
+
+                //! unique id for the next event
+                size_t m_id = 0u;
+            };
+
+        } // namespace manager
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/include/cupla/manager/Memory.hpp b/include/cupla/manager/Memory.hpp
index 1d3edc64..a1de369c 100644
--- a/include/cupla/manager/Memory.hpp
+++ b/include/cupla/manager/Memory.hpp
@@ -21,135 +21,97 @@
 
 #pragma once
 
+#include "cupla/manager/Device.hpp"
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
-#include "cupla/manager/Device.hpp"
 
-#include <vector>
 #include <map>
 #include <memory>
 #include <utility>
+#include <vector>
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-namespace manager
-{
-
-    template<
-        typename T_DeviceType,
-        typename T_Dim
-    >
-    struct Memory
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        using DeviceType = T_DeviceType;
-        static constexpr uint32_t dim = T_Dim::value;
-
-        using BufType = ::alpaka::Buf<
-            DeviceType,
-            uint8_t,
-            T_Dim,
-            MemSizeType
-        >;
-
-        using MemoryMap = std::map<
-            uint8_t*,
-            std::unique_ptr<
-                BufType
-            >
-        >;
-
-        using MapVector = std::vector< MemoryMap >;
-
-        MapVector m_mapVector;
-
-        static auto
-        get()
-        -> Memory &
+        namespace manager
         {
-            static Memory mem;
-            return mem;
-        }
+            template<typename T_DeviceType, typename T_Dim>
+            struct Memory
+            {
+                using DeviceType = T_DeviceType;
+                static constexpr uint32_t dim = T_Dim::value;
 
+                using BufType = ::alpaka::Buf<DeviceType, uint8_t, T_Dim, MemSizeType>;
 
-        auto
-        alloc(
-            MemVec< dim > const & extent
-        )
-        -> BufType &
-        {
+                using MemoryMap = std::map<uint8_t*, std::unique_ptr<BufType>>;
 
-            auto& device = Device< DeviceType >::get();
-
-            std::unique_ptr<
-                BufType
-            > bufPtr(
-                new BufType(
-                    ::alpaka::allocBuf<uint8_t, MemSizeType>(
-                         device.current(),
-                         extent
-                    )
-                )
-            );
-
-
-            uint8_t *nativePtr = ::alpaka::getPtrNative(*bufPtr);
-            m_mapVector[ device.id() ].insert(
-                std::make_pair( nativePtr, std::move( bufPtr ) )
-            );
-            return *m_mapVector[ device.id() ][ nativePtr ];
-        }
-
-        auto
-        free( void * ptr)
-        -> bool
-        {
-            if( ptr == nullptr)
-                return true;
+                using MapVector = std::vector<MemoryMap>;
 
-            auto& device = Device< DeviceType >::get();
-            const auto deviceId = device.id();
+                MapVector m_mapVector;
 
-            auto iter = m_mapVector[ deviceId ].find(
-                static_cast< uint8_t * >( ptr )
-            );
+                static auto get() -> Memory&
+                {
+                    static Memory mem;
+                    return mem;
+                }
 
-            if( iter == m_mapVector[ deviceId ].end() )
-            {
-                return false;
-            }
-            else
-            {
-                m_mapVector[ deviceId ].erase( iter );
-                return true;
-            }
-        }
-
-        /** delete all memory on the current device
-         *
-         * @return true in success case else false
-         */
-        bool
-        reset( )
-        {
-            auto& device = Device< DeviceType >::get();
-            const auto deviceId = device.id();
 
-            m_mapVector[ deviceId ].clear( );
+                auto alloc(MemVec<dim> const& extent) -> BufType&
+                {
+                    auto& device = Device<DeviceType>::get();
 
-            // @todo: check if clear creates errors
-            return true;
-        }
+                    std::unique_ptr<BufType> bufPtr(
+                        new BufType(::alpaka::allocBuf<uint8_t, MemSizeType>(device.current(), extent)));
 
-    protected:
-        Memory() : m_mapVector( Device< DeviceType >::get().count() )
-        {
 
-        }
+                    uint8_t* nativePtr = ::alpaka::getPtrNative(*bufPtr);
+                    m_mapVector[device.id()].insert(std::make_pair(nativePtr, std::move(bufPtr)));
+                    return *m_mapVector[device.id()][nativePtr];
+                }
+
+                auto free(void* ptr) -> bool
+                {
+                    if(ptr == nullptr)
+                        return true;
+
+                    auto& device = Device<DeviceType>::get();
+                    const auto deviceId = device.id();
+
+                    auto iter = m_mapVector[deviceId].find(static_cast<uint8_t*>(ptr));
+
+                    if(iter == m_mapVector[deviceId].end())
+                    {
+                        return false;
+                    }
+                    else
+                    {
+                        m_mapVector[deviceId].erase(iter);
+                        return true;
+                    }
+                }
+
+                /** delete all memory on the current device
+                 *
+                 * @return true in success case else false
+                 */
+                bool reset()
+                {
+                    auto& device = Device<DeviceType>::get();
+                    const auto deviceId = device.id();
+
+                    m_mapVector[deviceId].clear();
+
+                    // @todo: check if clear creates errors
+                    return true;
+                }
 
-    };
+            protected:
+                Memory() : m_mapVector(Device<DeviceType>::get().count())
+                {
+                }
+            };
 
-} //namespace manager
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
-} //namespace cupla
+        } // namespace manager
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/include/cupla/manager/Stream.hpp b/include/cupla/manager/Stream.hpp
index 43fc4d33..aea9362d 100644
--- a/include/cupla/manager/Stream.hpp
+++ b/include/cupla/manager/Stream.hpp
@@ -21,155 +21,124 @@
 
 #pragma once
 
+#include "cupla/manager/Device.hpp"
 #include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
-#include "cupla/manager/Device.hpp"
 #include "cupla_driver_types.hpp"
 
 #include <map>
-#include <vector>
 #include <memory>
+#include <vector>
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-namespace manager
-{
-
-    template<
-        typename T_DeviceType,
-        typename T_QueueType
-    >
-    struct Stream
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        using DeviceType = T_DeviceType;
-        using QueueType = T_QueueType;
-
-
-        using StreamMap = std::map<
-            cuplaStream_t,
-            std::unique_ptr<
-                QueueType
-            >
-        >;
-        using MapVector = std::vector< StreamMap >;
-
-        MapVector m_mapVector;
-
-        static auto
-        get()
-        -> Stream &
+        namespace manager
         {
-            static Stream stream;
-            return stream;
-        }
+            template<typename T_DeviceType, typename T_QueueType>
+            struct Stream
+            {
+                using DeviceType = T_DeviceType;
+                using QueueType = T_QueueType;
 
-        auto
-        create( )
-        -> cuplaStream_t
-        {
-            return createNewStream(reinterpret_cast< cuplaStream_t >(m_id++));
-        }
 
-        auto
-        stream( cuplaStream_t streamId = 0 )
-        -> QueueType &
-        {
-            auto& device = Device< DeviceType >::get();
-            const auto deviceId = device.id();
-            auto iter = m_mapVector[ deviceId ].find(
-                streamId
-            );
+                using StreamMap = std::map<cuplaStream_t, std::unique_ptr<QueueType>>;
+                using MapVector = std::vector<StreamMap>;
 
-            if( iter == m_mapVector[ device.id( ) ].end() )
-            {
-                if( streamId == 0 )
+                MapVector m_mapVector;
+
+                static auto get() -> Stream&
                 {
-                    createNewStream( streamId );
-                    return this->stream( streamId );
+                    static Stream stream;
+                    return stream;
                 }
-                else
+
+                auto create() -> cuplaStream_t
                 {
-                    std::cerr << "stream " << streamId <<
-                        " not exists on device "<< deviceId << std::endl;
+                    return createNewStream(reinterpret_cast<cuplaStream_t>(m_id++));
                 }
-            }
-            // @todo: check if stream was created
-            return *(iter->second);
-        }
-
-        auto
-        destroy( cuplaStream_t streamId)
-        -> bool
-        {
-            auto& device = Device< DeviceType >::get();
-            const auto deviceId = device.id();
-
-            auto iter = m_mapVector[ deviceId ].find(
-                streamId
-            );
 
-            if( iter == m_mapVector[ deviceId ].end() )
-            {
-                std::cerr << "stream " << streamId <<
-                    " can not destroyed (was never created) on device " <<
-                    deviceId <<
-                    std::endl;
-                return false;
-            }
-            else
-            {
-                m_mapVector[ deviceId ].erase( iter );
-                return true;
-            }
-        }
-
-
-        /** delete all streams on the current device
-         *
-         * @return true in success case else false
-         */
-        bool
-        reset( )
-        {
-            auto& device = Device< DeviceType >::get();
-            const auto deviceId = device.id();
+                auto stream(cuplaStream_t streamId = 0) -> QueueType&
+                {
+                    auto& device = Device<DeviceType>::get();
+                    const auto deviceId = device.id();
+                    auto iter = m_mapVector[deviceId].find(streamId);
+
+                    if(iter == m_mapVector[device.id()].end())
+                    {
+                        if(streamId == 0)
+                        {
+                            createNewStream(streamId);
+                            return this->stream(streamId);
+                        }
+                        else
+                        {
+                            std::cerr << "stream " << streamId << " not exists on device " << deviceId << std::endl;
+                        }
+                    }
+                    // @todo: check if stream was created
+                    return *(iter->second);
+                }
 
-            m_mapVector[ deviceId ].clear( );
+                auto destroy(cuplaStream_t streamId) -> bool
+                {
+                    auto& device = Device<DeviceType>::get();
+                    const auto deviceId = device.id();
+
+                    auto iter = m_mapVector[deviceId].find(streamId);
+
+                    if(iter == m_mapVector[deviceId].end())
+                    {
+                        std::cerr << "stream " << streamId << " can not destroyed (was never created) on device "
+                                  << deviceId << std::endl;
+                        return false;
+                    }
+                    else
+                    {
+                        m_mapVector[deviceId].erase(iter);
+                        return true;
+                    }
+                }
 
-            // @todo: check if clear creates errors
-            return true;
-        }
 
-    protected:
-        Stream() :  m_mapVector( Device< DeviceType >::get().count() )
-        {
-        }
+                /** delete all streams on the current device
+                 *
+                 * @return true in success case else false
+                 */
+                bool reset()
+                {
+                    auto& device = Device<DeviceType>::get();
+                    const auto deviceId = device.id();
 
-        auto
-        createNewStream( cuplaStream_t streamId  )
-        -> cuplaStream_t
-        {
+                    m_mapVector[deviceId].clear();
 
-            auto& device = Device< DeviceType >::get();
+                    // @todo: check if clear creates errors
+                    return true;
+                }
 
-            auto streamPtr = std::make_unique< QueueType >( device.current() );
-            m_mapVector[ device.id() ].insert(
-                std::make_pair( streamId, std::move( streamPtr ) )
-            );
-            return streamId;
-        }
+            protected:
+                Stream() : m_mapVector(Device<DeviceType>::get().count())
+                {
+                }
 
-        /** unique id for the next stream
-         *
-         * The enumeration starts with id one. Id zero is reserved
-         * for the default stream.
-         */
-        size_t m_id = 1u;
+                auto createNewStream(cuplaStream_t streamId) -> cuplaStream_t
+                {
+                    auto& device = Device<DeviceType>::get();
 
-    };
+                    auto streamPtr = std::make_unique<QueueType>(device.current());
+                    m_mapVector[device.id()].insert(std::make_pair(streamId, std::move(streamPtr)));
+                    return streamId;
+                }
 
-} //namespace manager
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
-} //namespace cupla
+                /** unique id for the next stream
+                 *
+                 * The enumeration starts with id one. Id zero is reserved
+                 * for the default stream.
+                 */
+                size_t m_id = 1u;
+            };
+
+        } // namespace manager
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/include/cupla/namespace.hpp b/include/cupla/namespace.hpp
index d9030f00..5037fdfc 100644
--- a/include/cupla/namespace.hpp
+++ b/include/cupla/namespace.hpp
@@ -26,76 +26,76 @@
 #if CUPLA_STREAM_ASYNC_ENABLED
 
 // thread parallel and thread sequential accelerator is used together
-#   if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES == 1 && CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES == 1)
-#       define CUPLA_ACCELERATOR_NAMESPACE cupla_mixed_async
-#   else
+#    if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES == 1 && CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES == 1)
+#        define CUPLA_ACCELERATOR_NAMESPACE cupla_mixed_async
+#    else
 
-#       ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_omp2_async
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_omp2_async
+#        endif
 
-#       ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_threads_async
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_threads_async
+#        endif
 
-#       ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_omp2_seq_async
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_omp2_seq_async
+#        endif
 
-#       ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_cuda_async
-#       endif
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_cuda_async
+#        endif
 
-#       ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_seq_async
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_seq_async
+#        endif
 
-#       ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_tbb_seq_async
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_tbb_seq_async
+#        endif
 
-#       ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_omp5_omp5_async
-#       endif
+#        ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_omp5_omp5_async
+#        endif
 
-#   endif // mixed accelerator usage
+#    endif // mixed accelerator usage
 
 #else // CUPLA_STREAM_ASYNC_ENABLED
 
 // thread parallel and thread sequential accelerator is used together
-#   if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES == 1 && CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES == 1)
-#       define CUPLA_ACCELERATOR_NAMESPACE cupla_mixed_sync
-#   else
+#    if(CUPLA_NUM_SELECTED_THREAD_SEQ_DEVICES == 1 && CUPLA_NUM_SELECTED_THREAD_PARALLEL_DEVICES == 1)
+#        define CUPLA_ACCELERATOR_NAMESPACE cupla_mixed_sync
+#    else
 
-#       ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_omp2_sync
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_omp2_sync
+#        endif
 
-#       ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_threads_sync
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_threads_sync
+#        endif
 
-#       ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_omp2_seq_sync
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_omp2_seq_sync
+#        endif
 
-#       ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_cuda_sync
-#       endif
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_cuda_sync
+#        endif
 
-#       ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_seq_sync
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_seq_seq_sync
+#        endif
 
-#       ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_tbb_seq_sync
-#       endif
+#        ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_tbb_seq_sync
+#        endif
 
-#       ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
-#           define CUPLA_ACCELERATOR_NAMESPACE cupla_omp5_omp5_sync
-#       endif
+#        ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+#            define CUPLA_ACCELERATOR_NAMESPACE cupla_omp5_omp5_sync
+#        endif
 
-#   endif // mixed accelerator usage
+#    endif // mixed accelerator usage
 
 #endif // CUPLA_STREAM_ASYNC_ENABLED
 
diff --git a/include/cupla/traits/IsThreadSeqAcc.hpp b/include/cupla/traits/IsThreadSeqAcc.hpp
index 5bced676..50da8d72 100644
--- a/include/cupla/traits/IsThreadSeqAcc.hpp
+++ b/include/cupla/traits/IsThreadSeqAcc.hpp
@@ -26,71 +26,46 @@
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-namespace traits
-{
-
-    /** check if thread level is full sequential
-     *
-     * \return ::value true if no threads where used in the thread level
-     *                  else false
-     */
-    template< typename T_Acc >
-    struct IsThreadSeqAcc
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
     {
-        static constexpr bool value = false;
-    };
+        namespace traits
+        {
+            /** check if thread level is full sequential
+             *
+             * \return ::value true if no threads where used in the thread level
+             *                  else false
+             */
+            template<typename T_Acc>
+            struct IsThreadSeqAcc
+            {
+                static constexpr bool value = false;
+            };
 
 
 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-    template<
-        typename T_KernelDim,
-        typename T_IndexType
-    >
-    struct IsThreadSeqAcc<
-        ::alpaka::AccCpuOmp2Blocks<
-            T_KernelDim,
-            T_IndexType
-        >
-    >
-    {
-        static constexpr bool value = true;
-    };
+            template<typename T_KernelDim, typename T_IndexType>
+            struct IsThreadSeqAcc<::alpaka::AccCpuOmp2Blocks<T_KernelDim, T_IndexType>>
+            {
+                static constexpr bool value = true;
+            };
 #endif
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-    template<
-        typename T_KernelDim,
-        typename T_IndexType
-    >
-    struct IsThreadSeqAcc<
-        ::alpaka::AccCpuSerial<
-            T_KernelDim,
-            T_IndexType
-        >
-    >
-    {
-        static constexpr bool value = true;
-    };
+            template<typename T_KernelDim, typename T_IndexType>
+            struct IsThreadSeqAcc<::alpaka::AccCpuSerial<T_KernelDim, T_IndexType>>
+            {
+                static constexpr bool value = true;
+            };
 #endif
 
 #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-    template<
-        typename T_KernelDim,
-        typename T_IndexType
-    >
-    struct IsThreadSeqAcc<
-        ::alpaka::AccCpuTbbBlocks<
-            T_KernelDim,
-            T_IndexType
-        >
-    >
-    {
-        static constexpr bool value = true;
-    };
+            template<typename T_KernelDim, typename T_IndexType>
+            struct IsThreadSeqAcc<::alpaka::AccCpuTbbBlocks<T_KernelDim, T_IndexType>>
+            {
+                static constexpr bool value = true;
+            };
 #endif
 
-} // namespace traits
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
+        } // namespace traits
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
 } // namespace cupla
diff --git a/include/cupla/types.hpp b/include/cupla/types.hpp
index 755db275..92d3f71b 100644
--- a/include/cupla/types.hpp
+++ b/include/cupla/types.hpp
@@ -20,240 +20,146 @@
 
 #pragma once
 
-#include <alpaka/alpaka.hpp>
-#include <cstdint>
-
 #include "cupla/defines.hpp"
 #include "cupla/namespace.hpp"
 
+#include <alpaka/alpaka.hpp>
+
+#include <cstdint>
+
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        using MemSizeType = size_t;
+        using IdxType = unsigned int;
 
-    using MemSizeType = size_t;
-    using IdxType = unsigned int;
+        static constexpr uint32_t Dimensions = 3u;
 
-    static constexpr uint32_t Dimensions = 3u;
+        template<uint32_t T_dim>
+        using AlpakaDim = ::alpaka::DimInt<T_dim>;
 
-    template<
-        uint32_t T_dim
-    >
-    using AlpakaDim = ::alpaka::DimInt< T_dim >;
+        using KernelDim = AlpakaDim<Dimensions>;
 
-    using KernelDim = AlpakaDim< Dimensions >;
+        using IdxVec3 = ::alpaka::Vec<KernelDim, IdxType>;
 
-    using IdxVec3 = ::alpaka::Vec<
-        KernelDim,
-        IdxType
-    >;
+        template<uint32_t T_dim>
+        using MemVec = ::alpaka::Vec<AlpakaDim<T_dim>, MemSizeType>;
 
-    template<
-        uint32_t T_dim
-    >
-    using MemVec = ::alpaka::Vec<
-        AlpakaDim< T_dim >,
-        MemSizeType
-    >;
+        using AccHost = ::alpaka::DevCpu;
+        using AccHostStream = ::alpaka::QueueCpuBlocking;
 
-    using AccHost = ::alpaka::DevCpu;
-    using AccHostStream = ::alpaka::QueueCpuBlocking;
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)                   \
+    || defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)                    \
+    || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) ||                            \
-    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) ||                         \
-    defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) ||                            \
-    defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) ||                             \
-    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
-
-    using AccDev = ::alpaka::DevCpu;
-#   if (CUPLA_STREAM_ASYNC_ENABLED == 1)
+        using AccDev = ::alpaka::DevCpu;
+#    if(CUPLA_STREAM_ASYNC_ENABLED == 1)
         using AccStream = ::alpaka::QueueCpuNonBlocking;
-#   else
+#    else
         using AccStream = ::alpaka::QueueCpuBlocking;
-#   endif
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-    using Acc = ::alpaka::AccCpuOmp2Threads<
-        KernelDim,
-        IdxType
-    >;
-#endif
-
-#if (ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED == 1)
-    #if (CUPLA_NUM_SELECTED_DEVICES == 1)
-        using Acc = ::alpaka::AccCpuOmp2Blocks<
-            KernelDim,
-            IdxType
-        >;
-    #else
-        using AccThreadSeq = ::alpaka::AccCpuOmp2Blocks<
-            KernelDim,
-            IdxType
-        >;
-    #endif
-#endif
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-    using Acc = ::alpaka::AccCpuThreads<
-        KernelDim,
-        IdxType
-    >;
-#endif
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-    #if (CUPLA_NUM_SELECTED_DEVICES == 1)
-        using Acc = ::alpaka::AccCpuSerial<
-            KernelDim,
-            IdxType
-        >;
-    #else
-        using AccThreadSeq = ::alpaka::AccCpuSerial<
-            KernelDim,
-            IdxType
-        >;
-    #endif
-#endif
-
-#if (ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED == 1)
-    #if (CUPLA_NUM_SELECTED_DEVICES == 1)
-        using Acc = ::alpaka::AccCpuTbbBlocks<
-            KernelDim,
-            IdxType
-        >;
-    #else
-        using AccThreadSeq = ::alpaka::AccCpuTbbBlocks<
-            KernelDim,
-            IdxType
-        >;
-    #endif
-#endif
+#    endif
+
+#    ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+        using Acc = ::alpaka::AccCpuOmp2Threads<KernelDim, IdxType>;
+#    endif
+
+#    if(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED == 1)
+#        if(CUPLA_NUM_SELECTED_DEVICES == 1)
+        using Acc = ::alpaka::AccCpuOmp2Blocks<KernelDim, IdxType>;
+#        else
+        using AccThreadSeq = ::alpaka::AccCpuOmp2Blocks<KernelDim, IdxType>;
+#        endif
+#    endif
+
+#    ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+        using Acc = ::alpaka::AccCpuThreads<KernelDim, IdxType>;
+#    endif
+
+#    ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+#        if(CUPLA_NUM_SELECTED_DEVICES == 1)
+        using Acc = ::alpaka::AccCpuSerial<KernelDim, IdxType>;
+#        else
+        using AccThreadSeq = ::alpaka::AccCpuSerial<KernelDim, IdxType>;
+#        endif
+#    endif
+
+#    if(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED == 1)
+#        if(CUPLA_NUM_SELECTED_DEVICES == 1)
+        using Acc = ::alpaka::AccCpuTbbBlocks<KernelDim, IdxType>;
+#        else
+        using AccThreadSeq = ::alpaka::AccCpuTbbBlocks<KernelDim, IdxType>;
+#        endif
+#    endif
 
 #endif
 
 #ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
-    using AccDev = ::alpaka::DevOmp5;
-#   if (CUPLA_STREAM_ASYNC_ENABLED == 1)
+        using AccDev = ::alpaka::DevOmp5;
+#    if(CUPLA_STREAM_ASYNC_ENABLED == 1)
         using AccStream = ::alpaka::QueueOmp5NonBlocking;
-#   else
+#    else
         using AccStream = ::alpaka::QueueOmp5Blocking;
-#   endif
-    using Acc = ::alpaka::AccOmp5<
-        KernelDim,
-        IdxType
-    >;
+#    endif
+        using Acc = ::alpaka::AccOmp5<KernelDim, IdxType>;
 #endif
 
 #ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
-    using AccDev = ::alpaka::DevOacc;
-#   if (CUPLA_STREAM_ASYNC_ENABLED == 1)
+        using AccDev = ::alpaka::DevOacc;
+#    if(CUPLA_STREAM_ASYNC_ENABLED == 1)
         using AccStream = ::alpaka::QueueOaccNonBlocking;
-#   else
+#    else
         using AccStream = ::alpaka::QueueOaccBlocking;
-#   endif
-    using Acc = ::alpaka::AccOacc<
-        KernelDim,
-        IdxType
-    >;
+#    endif
+        using Acc = ::alpaka::AccOacc<KernelDim, IdxType>;
 #endif
 
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-    using AccDev = ::alpaka::DevCudaRt;
-#   if (CUPLA_STREAM_ASYNC_ENABLED == 1)
+        using AccDev = ::alpaka::DevCudaRt;
+#    if(CUPLA_STREAM_ASYNC_ENABLED == 1)
         using AccStream = ::alpaka::QueueCudaRtNonBlocking;
-#   else
+#    else
         using AccStream = ::alpaka::QueueCudaRtBlocking;
-#   endif
-    using Acc = ::alpaka::AccGpuCudaRt<
-        KernelDim,
-        IdxType
-    >;
+#    endif
+        using Acc = ::alpaka::AccGpuCudaRt<KernelDim, IdxType>;
 #endif
 
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-    using AccDev = ::alpaka::DevHipRt;
-#   if (CUPLA_STREAM_ASYNC_ENABLED == 1)
+        using AccDev = ::alpaka::DevHipRt;
+#    if(CUPLA_STREAM_ASYNC_ENABLED == 1)
         using AccStream = ::alpaka::QueueHipRtNonBlocking;
-#   else
+#    else
         using AccStream = ::alpaka::QueueHipRtBlocking;
-#   endif
-    using Acc = ::alpaka::AccGpuHipRt<
-        KernelDim,
-        IdxType
-    >;
+#    endif
+        using Acc = ::alpaka::AccGpuHipRt<KernelDim, IdxType>;
 #endif
 
-#if (CUPLA_NUM_SELECTED_DEVICES == 1)
-    /** is an Alpaka accelerator which limits the thread count per block to one
-     *
-     * if only one accelerator is selected than it can be a accelerator without
-     * thread restrictions
-     */
-    using AccThreadSeq = Acc;
+#if(CUPLA_NUM_SELECTED_DEVICES == 1)
+        /** is an Alpaka accelerator which limits the thread count per block to one
+         *
+         * if only one accelerator is selected than it can be a accelerator without
+         * thread restrictions
+         */
+        using AccThreadSeq = Acc;
 #endif
 
-    template<
-        uint32_t T_dim
-    >
-    using AccBuf = ::alpaka::Buf<
-        AccDev,
-        uint8_t,
-        AlpakaDim< T_dim >,
-        MemSizeType
-    >;
+        template<uint32_t T_dim>
+        using AccBuf = ::alpaka::Buf<AccDev, uint8_t, AlpakaDim<T_dim>, MemSizeType>;
 
-    template<
-        uint32_t T_dim
-    >
-    using HostBuf = ::alpaka::Buf<
-        AccHost,
-        uint8_t,
-        AlpakaDim< T_dim >,
-        MemSizeType
-    >;
+        template<uint32_t T_dim>
+        using HostBuf = ::alpaka::Buf<AccHost, uint8_t, AlpakaDim<T_dim>, MemSizeType>;
 
-    template<
-        unsigned T_dim
-    >
-    using HostBufWrapper =
-        ::alpaka::ViewPlainPtr<
-            AccHost,
-            uint8_t,
-            AlpakaDim< T_dim >,
-            MemSizeType
-        >;
+        template<unsigned T_dim>
+        using HostBufWrapper = ::alpaka::ViewPlainPtr<AccHost, uint8_t, AlpakaDim<T_dim>, MemSizeType>;
 
-    template<
-        unsigned T_dim
-    >
-    using HostViewWrapper =
-        ::alpaka::ViewSubView<
-            AccHost,
-            uint8_t,
-            AlpakaDim< T_dim >,
-            MemSizeType
-        >;
+        template<unsigned T_dim>
+        using HostViewWrapper = ::alpaka::ViewSubView<AccHost, uint8_t, AlpakaDim<T_dim>, MemSizeType>;
 
-    template<
-        unsigned T_dim
-    >
-    using DeviceBufWrapper =
-        ::alpaka::ViewPlainPtr<
-            AccDev,
-            uint8_t,
-            AlpakaDim< T_dim >,
-            MemSizeType
-        >;
+        template<unsigned T_dim>
+        using DeviceBufWrapper = ::alpaka::ViewPlainPtr<AccDev, uint8_t, AlpakaDim<T_dim>, MemSizeType>;
 
-    template<
-        unsigned T_dim
-    >
-    using DeviceViewWrapper =
-        ::alpaka::ViewSubView<
-            AccDev,
-            uint8_t,
-            AlpakaDim< T_dim >,
-            MemSizeType
-        >;
+        template<unsigned T_dim>
+        using DeviceViewWrapper = ::alpaka::ViewSubView<AccDev, uint8_t, AlpakaDim<T_dim>, MemSizeType>;
 
-} // namespace CUPLA_ACCELERATOR_NAMESPACE
-} // namepsace cupla
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/include/cupla_driver_types.hpp b/include/cupla_driver_types.hpp
index d0373b05..548ba62a 100644
--- a/include/cupla_driver_types.hpp
+++ b/include/cupla_driver_types.hpp
@@ -25,15 +25,15 @@
 
 // emulated that cuda runtime is loaded
 #ifndef __DRIVER_TYPES_H__
-# define __DRIVER_TYPES_H__
+#    define __DRIVER_TYPES_H__
 #endif
 
 enum cuplaMemcpyKind
 {
-  cuplaMemcpyHostToHost,
-  cuplaMemcpyHostToDevice,
-  cuplaMemcpyDeviceToHost,
-  cuplaMemcpyDeviceToDevice
+    cuplaMemcpyHostToHost,
+    cuplaMemcpyHostToDevice,
+    cuplaMemcpyDeviceToHost,
+    cuplaMemcpyDeviceToDevice
 };
 
 enum cuplaError
@@ -63,42 +63,44 @@ using cuplaEvent_t = void*;
 /** error category for `cuplaError` */
 struct CuplaErrorCode : public std::error_category
 {
-    char const * name() const noexcept override { return "cuplaError"; }
+    char const* name() const noexcept override
+    {
+        return "cuplaError";
+    }
     std::string message(int ev) const override
-	{
-        return message_cstr( ev );
+    {
+        return message_cstr(ev);
     }
-	static char const * message_cstr(int ev)
-	{
+    static char const* message_cstr(int ev)
+    {
         switch(ev)
         {
-            case cuplaSuccess:
-                return "cuplaSuccess";
-            case cuplaErrorMemoryAllocation:
-                return "cuplaErrorMemoryAllocation";
-            case cuplaErrorInitializationError:
-                return "cuplaErrorInitializationError";
-            case cuplaErrorNotReady:
-                return "cuplaErrorNotReady";
-            case cuplaErrorDeviceAlreadyInUse:
-                return "cuplaErrorDeviceAlreadyInUse";
-            default:
-                return "not defined cuplaError";
+        case cuplaSuccess:
+            return "cuplaSuccess";
+        case cuplaErrorMemoryAllocation:
+            return "cuplaErrorMemoryAllocation";
+        case cuplaErrorInitializationError:
+            return "cuplaErrorInitializationError";
+        case cuplaErrorNotReady:
+            return "cuplaErrorNotReady";
+        case cuplaErrorDeviceAlreadyInUse:
+            return "cuplaErrorDeviceAlreadyInUse";
+        default:
+            return "not defined cuplaError";
         };
     }
 };
 
 namespace std
 {
-
-    template< >
-    struct is_error_code_enum< cuplaError > : public true_type{};
+    template<>
+    struct is_error_code_enum<cuplaError> : public true_type
+    {
+    };
 
 } // namespace std
 
-inline std::error_code make_error_code( const cuplaError result )
+inline std::error_code make_error_code(const cuplaError result)
 {
-    return std::error_code( static_cast<int>(result), CuplaErrorCode() );
+    return std::error_code(static_cast<int>(result), CuplaErrorCode());
 }
-
-
diff --git a/include/cupla_runtime.hpp b/include/cupla_runtime.hpp
index 07e9be20..8e869ba7 100644
--- a/include/cupla_runtime.hpp
+++ b/include/cupla_runtime.hpp
@@ -21,34 +21,31 @@
 
 #pragma once
 
-#include <alpaka/alpaka.hpp>
-
-#include "cupla/namespace.hpp"
-#include "cupla/kernel.hpp"
-
+#include "cupla/api/common.hpp"
+#include "cupla/api/device.hpp"
+#include "cupla/api/event.hpp"
+#include "cupla/api/memory.hpp"
+#include "cupla/api/stream.hpp"
 #include "cupla/c/datatypes/cuplaArray.hpp"
-#include "cupla/datatypes/dim3.hpp"
-#include "cupla/datatypes/uint.hpp"
 #include "cupla/c/datatypes/cuplaExtent.hpp"
-#include "cupla/c/datatypes/cuplaPos.hpp"
 #include "cupla/c/datatypes/cuplaMemcpy3DParms.hpp"
 #include "cupla/c/datatypes/cuplaPitchedPtr.hpp"
-
+#include "cupla/c/datatypes/cuplaPos.hpp"
+#include "cupla/datatypes/dim3.hpp"
+#include "cupla/datatypes/uint.hpp"
+#include "cupla/kernel.hpp"
+#include "cupla/manager/Driver.hpp"
+#include "cupla/namespace.hpp"
 #include "cupla/types.hpp"
 #include "cupla_driver_types.hpp"
 
-#include "cupla/api/common.hpp"
-#include "cupla/api/device.hpp"
-#include "cupla/api/stream.hpp"
-#include "cupla/api/event.hpp"
-#include "cupla/api/memory.hpp"
-#include "cupla/manager/Driver.hpp"
+#include <alpaka/alpaka.hpp>
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-    const auto driver = manager::Driver::get();
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        const auto driver = manager::Driver::get();
 
-} //namespace cupla
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/src/common.cpp b/src/common.cpp
index 48ef4399..ed08ea01 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -19,56 +19,52 @@
  */
 
 
-#include "cupla/namespace.hpp"
-#include "cupla_runtime.hpp"
-#include "cupla/manager/Memory.hpp"
+#include "cupla/api/common.hpp"
+
 #include "cupla/manager/Device.hpp"
-#include "cupla/manager/Stream.hpp"
 #include "cupla/manager/Event.hpp"
-#include "cupla/api/common.hpp"
+#include "cupla/manager/Memory.hpp"
+#include "cupla/manager/Stream.hpp"
+#include "cupla/namespace.hpp"
+#include "cupla_runtime.hpp"
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    const char* cuplaGetErrorName(cuplaError_t e)
+    {
+        return CuplaErrorCode::message_cstr(e);
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-const char *
-cuplaGetErrorName(cuplaError_t e)
-{
-    return CuplaErrorCode::message_cstr(e);
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-const char *
-cuplaGetErrorString(cuplaError_t e)
-{
-    return CuplaErrorCode::message_cstr(e);
-}
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    const char* cuplaGetErrorString(cuplaError_t e)
+    {
+        return CuplaErrorCode::message_cstr(e);
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaGetLastError()
-{
-#if( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 )
-    // reset the last cuda error
-    return (cuplaError_t)cudaGetLastError();
-#elif( ALPAKA_ACC_GPU_HIP_ENABLED == 1 )
-    return (cuplaError_t)hipGetLastError();
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaGetLastError()
+    {
+#if(ALPAKA_ACC_GPU_CUDA_ENABLED == 1)
+        // reset the last cuda error
+        return (cuplaError_t) cudaGetLastError();
+#elif(ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+        return (cuplaError_t) hipGetLastError();
 #else
-    return cuplaSuccess;
+        return cuplaSuccess;
 #endif
-}
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaPeekAtLastError()
-{
-#if( ALPAKA_ACC_GPU_CUDA_ENABLED == 1 )
-    return (cuplaError_t)cudaPeekAtLastError();
-#elif( ALPAKA_ACC_GPU_HIP_ENABLED == 1 )
-    return (cuplaError_t)hipPeekAtLastError();
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaPeekAtLastError()
+    {
+#if(ALPAKA_ACC_GPU_CUDA_ENABLED == 1)
+        return (cuplaError_t) cudaPeekAtLastError();
+#elif(ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+        return (cuplaError_t) hipPeekAtLastError();
 #else
-    return cuplaSuccess;
+        return cuplaSuccess;
 #endif
-}
+    }
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/src/device.cpp b/src/device.cpp
index e2127784..81826821 100644
--- a/src/device.cpp
+++ b/src/device.cpp
@@ -19,113 +19,84 @@
  */
 
 
-#include "cupla/namespace.hpp"
-#include "cupla_runtime.hpp"
-#include "cupla/manager/Memory.hpp"
 #include "cupla/manager/Device.hpp"
-#include "cupla/manager/Stream.hpp"
-#include "cupla/manager/Event.hpp"
+
 #include "cupla/api/device.hpp"
+#include "cupla/manager/Event.hpp"
+#include "cupla/manager/Memory.hpp"
+#include "cupla/manager/Stream.hpp"
+#include "cupla/namespace.hpp"
+#include "cupla_runtime.hpp"
+
 #include <stdexcept>
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaGetDeviceCount(int* count)
+    {
+        *count = cupla::manager::Device<cupla::AccDev>::get().count();
+        return cuplaSuccess;
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaGetDeviceCount( int * count)
-{
-    *count = cupla::manager::Device< cupla::AccDev >::get().count();
-    return cuplaSuccess;
-}
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaSetDevice(int idx)
+    {
+        try
+        {
+            cupla::manager::Device<cupla::AccDev>::get().device(idx);
+        }
+        catch(const std::system_error& e)
+        {
+            return static_cast<cuplaError_t>(e.code().value());
+        }
+        return cuplaSuccess;
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaSetDevice( int idx)
-{
-    try
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaGetDevice(int* deviceId)
     {
-      cupla::manager::Device< cupla::AccDev >::get().device( idx );
+        *deviceId = cupla::manager::Device<cupla::AccDev>::get().id();
+        return cuplaSuccess;
     }
-    catch(const std::system_error& e)
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaDeviceReset()
     {
-      return static_cast<cuplaError_t>( e.code().value() );
+        // wait until all work on the device is finished
+        cuplaDeviceSynchronize();
+
+        // delete all events on the current device
+        cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().reset();
+
+        // delete all memory on the current device
+        cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<1u>>::get().reset();
+
+        cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<2u>>::get().reset();
+
+        cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<3u>>::get().reset();
+
+        // delete all streams on the current device
+        cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().reset();
+
+        cupla::manager::Device<cupla::AccDev>::get().reset();
+        return cuplaSuccess;
     }
-    return cuplaSuccess;
-}
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaGetDevice( int * deviceId )
-{
-    *deviceId = cupla::manager::Device< cupla::AccDev >::get().id();
-    return cuplaSuccess;
-}
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaDeviceSynchronize()
+    {
+        ::alpaka::wait(cupla::manager::Device<cupla::AccDev>::get().current());
+        return cuplaSuccess;
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaDeviceReset( )
-{
-    // wait that all work on the device is finished
-    cuplaDeviceSynchronize( );
-
-    // delete all events on the current device
-    cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().reset( );
-
-    // delete all memory on the current device
-    cupla::manager::Memory<
-        cupla::AccDev,
-        cupla::AlpakaDim<1u>
-    >::get().reset( );
-
-    cupla::manager::Memory<
-        cupla::AccDev,
-        cupla::AlpakaDim<2u>
-    >::get().reset( );
-
-    cupla::manager::Memory<
-        cupla::AccDev,
-        cupla::AlpakaDim<3u>
-    >::get().reset( );
-
-    // delete all streams on the current device
-    cupla::manager::Stream<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().reset( );
-
-    cupla::manager::Device< cupla::AccDev >::get( ).reset( );
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaDeviceSynchronize( )
-{
-    ::alpaka::wait(
-        cupla::manager::Device< cupla::AccDev >::get( ).current( )
-    );
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMemGetInfo(
-    size_t * free,
-    size_t * total
-)
-{
-    auto& device(
-        cupla::manager::Device<
-            cupla::AccDev
-        >::get().current()
-    );
-    *total = ::alpaka::getMemBytes( device );
-    *free = ::alpaka::getFreeMemBytes( device );
-    return cuplaSuccess;
-}
-
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMemGetInfo(size_t* free, size_t* total)
+    {
+        auto& device(cupla::manager::Device<cupla::AccDev>::get().current());
+        *total = ::alpaka::getMemBytes(device);
+        *free = ::alpaka::getFreeMemBytes(device);
+        return cuplaSuccess;
+    }
+
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/src/event.cpp b/src/event.cpp
index 9cbf779c..037506c8 100644
--- a/src/event.cpp
+++ b/src/event.cpp
@@ -19,133 +19,83 @@
  */
 
 
-#include "cupla/namespace.hpp"
-#include "cupla_runtime.hpp"
-#include "cupla/manager/Memory.hpp"
-#include "cupla/manager/Device.hpp"
-#include "cupla/manager/Stream.hpp"
 #include "cupla/manager/Event.hpp"
+
 #include "cupla/api/event.hpp"
+#include "cupla/manager/Device.hpp"
+#include "cupla/manager/Memory.hpp"
+#include "cupla/manager/Stream.hpp"
+#include "cupla/namespace.hpp"
+#include "cupla_runtime.hpp"
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaEventCreateWithFlags(cuplaEvent_t* event, unsigned int flags)
+    {
+        *event = cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().create(flags);
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaEventCreateWithFlags(
-    cuplaEvent_t * event,
-    unsigned int flags
-)
-{
-    *event = cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().create( flags );
+        return cuplaSuccess;
+    }
 
-    return cuplaSuccess;
-}
 
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaEventCreate(cuplaEvent_t* event)
+    {
+        *event = cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().create(0);
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaEventCreate(
-    cuplaEvent_t * event
-)
-{
-    *event = cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().create( 0 );
+        return cuplaSuccess;
+    }
 
-    return cuplaSuccess;
-}
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaEventDestroy(cuplaEvent_t event)
+    {
+        if(cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().destroy(event))
+            return cuplaSuccess;
+        else
+            return cuplaErrorInitializationError;
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaEventDestroy( cuplaEvent_t event )
-{
-    if(
-        cupla::manager::Event<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().destroy( event )
-    )
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaEventRecord(cuplaEvent_t event, cuplaStream_t stream)
+    {
+        auto& streamObject = cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(stream);
+        auto& eventObject = cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().event(event);
+
+        eventObject.record(streamObject);
         return cuplaSuccess;
-    else
-        return cuplaErrorInitializationError;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaEventRecord(
-    cuplaEvent_t event,
-    cuplaStream_t stream
-)
-{
-    auto& streamObject = cupla::manager::Stream<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().stream( stream );
-    auto& eventObject = cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().event( event );
-
-    eventObject.record( streamObject );
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaEventElapsedTime(
-    float * ms,
-    cuplaEvent_t start,
-    cuplaEvent_t end
-)
-{
-    auto& eventStart = cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().event( start );
-    auto& eventEnd = cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().event( end );
-    *ms = static_cast< float >( eventEnd.elapsedSince( eventStart ) );
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaEventSynchronize(
-    cuplaEvent_t event
-)
-{
-    auto& eventObject = cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().event( event );
-    ::alpaka::wait( *eventObject );
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaEventQuery( cuplaEvent_t event )
-{
-    auto& eventObject = cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().event( event );
+    }
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaEventElapsedTime(float* ms, cuplaEvent_t start, cuplaEvent_t end)
+    {
+        auto& eventStart = cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().event(start);
+        auto& eventEnd = cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().event(end);
+        *ms = static_cast<float>(eventEnd.elapsedSince(eventStart));
+        return cuplaSuccess;
+    }
 
-    if( ::alpaka::isComplete( *eventObject ) )
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaEventSynchronize(cuplaEvent_t event)
     {
+        auto& eventObject = cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().event(event);
+        ::alpaka::wait(*eventObject);
         return cuplaSuccess;
     }
-    else
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaEventQuery(cuplaEvent_t event)
     {
-        return cuplaErrorNotReady;
+        auto& eventObject = cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().event(event);
+
+        if(::alpaka::isComplete(*eventObject))
+        {
+            return cuplaSuccess;
+        }
+        else
+        {
+            return cuplaErrorNotReady;
+        }
     }
-}
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/src/manager/Driver.cpp b/src/manager/Driver.cpp
index 0486588e..a2f4a0dc 100644
--- a/src/manager/Driver.cpp
+++ b/src/manager/Driver.cpp
@@ -18,53 +18,38 @@
  *
  */
 
-#include "cupla/namespace.hpp"
-#include "cupla/types.hpp"
-#include "cupla_runtime.hpp"
 #include "cupla/manager/Driver.hpp"
-#include "cupla/manager/Memory.hpp"
+
 #include "cupla/manager/Device.hpp"
-#include "cupla/manager/Stream.hpp"
 #include "cupla/manager/Event.hpp"
+#include "cupla/manager/Memory.hpp"
+#include "cupla/manager/Stream.hpp"
+#include "cupla/namespace.hpp"
+#include "cupla/types.hpp"
+#include "cupla_runtime.hpp"
 
 namespace cupla
 {
-inline namespace CUPLA_ACCELERATOR_NAMESPACE
-{
-namespace manager
-{
-
-CUPLA_HEADER_ONLY_FUNC_SPEC Driver::Driver()
-{
-    cupla::manager::Device< cupla::AccDev >::get( );
+    inline namespace CUPLA_ACCELERATOR_NAMESPACE
+    {
+        namespace manager
+        {
+            CUPLA_HEADER_ONLY_FUNC_SPEC Driver::Driver()
+            {
+                cupla::manager::Device<cupla::AccDev>::get();
 
-    cupla::manager::Stream<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get();
+                cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get();
 
-    cupla::manager::Memory<
-        cupla::AccDev,
-        cupla::AlpakaDim<3u>
-    >::get();
+                cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<3u>>::get();
 
-    cupla::manager::Memory<
-        cupla::AccDev,
-        cupla::AlpakaDim<2u>
-    >::get();
+                cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<2u>>::get();
 
-    cupla::manager::Memory<
-        cupla::AccDev,
-        cupla::AlpakaDim<1u>
-    >::get();
+                cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<1u>>::get();
 
-    cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get();
-}
+                cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get();
+            }
 
 
-} //namespace manager
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
-} //namespace cupla
+        } // namespace manager
+    } // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/src/memory.cpp b/src/memory.cpp
index 77a8fc56..72bb3567 100644
--- a/src/memory.cpp
+++ b/src/memory.cpp
@@ -19,951 +19,458 @@
  */
 
 
-#include "cupla/namespace.hpp"
-#include "cupla_runtime.hpp"
 #include "cupla/manager/Memory.hpp"
+
+#include "cupla/api/memory.hpp"
 #include "cupla/manager/Device.hpp"
-#include "cupla/manager/Stream.hpp"
 #include "cupla/manager/Event.hpp"
-#include "cupla/api/memory.hpp"
+#include "cupla/manager/Stream.hpp"
+#include "cupla/namespace.hpp"
+#include "cupla_runtime.hpp"
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMalloc(void** ptrptr, size_t size)
+    {
+        const ::alpaka::Vec<cupla::AlpakaDim<1u>, cupla::MemSizeType> extent(size);
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMalloc(
-    void **ptrptr,
-    size_t size
-)
-{
+        auto& buf = cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<1u>>::get().alloc(extent);
 
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<1u>,
-        cupla::MemSizeType
-    > extent( size );
-
-    auto& buf = cupla::manager::Memory<
-        cupla::AccDev,
-        cupla::AlpakaDim<1u>
-    >::get().alloc( extent );
-
-    // @toto catch errors
-    *ptrptr = ::alpaka::getPtrNative(buf);
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMallocPitch(
-    void ** devPtr,
-    size_t * pitch,
-    size_t const width,
-    size_t const height
-)
-{
-    const ::alpaka::Vec<
-        cupla::AlpakaDim< 2u >,
-        cupla::MemSizeType
-    > extent( height, width );
-
-    auto& buf = cupla::manager::Memory<
-        cupla::AccDev,
-        cupla::AlpakaDim< 2u >
-    >::get().alloc( extent );
-
-    // @toto catch errors
-    *devPtr = ::alpaka::getPtrNative(buf);
-    *pitch = ::alpaka::getPitchBytes< 1u >( buf );
-
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMalloc3D(
-    cuplaPitchedPtr * const pitchedDevPtr,
-    cuplaExtent const extent
-)
-{
+        // @todo catch errors
+        *ptrptr = ::alpaka::getPtrNative(buf);
+        return cuplaSuccess;
+    }
 
-    auto& buf = cupla::manager::Memory<
-        cupla::AccDev,
-        cupla::AlpakaDim< 3u >
-    >::get().alloc( extent );
-
-    // @toto catch errors
-    *pitchedDevPtr = make_cuplaPitchedPtr(
-        ::alpaka::getPtrNative(buf),
-        ::alpaka::getPitchBytes< 2u >( buf ),
-        extent.width,
-        extent.height
-    );
-
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaExtent
-make_cuplaExtent(
-    size_t const w,
-    size_t const h,
-    size_t const d
-)
-{
-    return cuplaExtent( w, h, d );
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaPos
-make_cuplaPos(
-    size_t const x,
-    size_t const y,
-    size_t const z
-)
-{
-    return cuplaPos( x, y, z );
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaPitchedPtr
-make_cuplaPitchedPtr(
-    void * const d,
-    size_t const p,
-    size_t const xsz,
-    size_t const ysz
-)
-{
-    return cuplaPitchedPtr( d, p, xsz, ysz );
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMallocHost(
-    void **ptrptr,
-    size_t size
-)
-{
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<1u>,
-        cupla::MemSizeType
-    > extent( size );
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMallocPitch(void** devPtr, size_t* pitch, size_t const width, size_t const height)
+    {
+        const ::alpaka::Vec<cupla::AlpakaDim<2u>, cupla::MemSizeType> extent(height, width);
 
-    auto& buf = cupla::manager::Memory<
-        cupla::AccHost,
-        cupla::AlpakaDim<1u>
-    >::get().alloc( extent );
+        auto& buf = cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<2u>>::get().alloc(extent);
 
-    prepareForAsyncCopy( buf );
+        // @todo catch errors
+        *devPtr = ::alpaka::getPtrNative(buf);
+        *pitch = ::alpaka::getPitchBytes<1u>(buf);
+
+        return cuplaSuccess;
+    }
 
-    // @toto catch errors
-    *ptrptr = ::alpaka::getPtrNative(buf);
-    return cuplaSuccess;
-}
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMalloc3D(cuplaPitchedPtr* const pitchedDevPtr, cuplaExtent const extent)
+    {
+        auto& buf = cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<3u>>::get().alloc(extent);
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t cuplaFree(void *ptr)
-{
+        // @todo catch errors
+        *pitchedDevPtr = make_cuplaPitchedPtr(
+            ::alpaka::getPtrNative(buf),
+            ::alpaka::getPitchBytes<2u>(buf),
+            extent.width,
+            extent.height);
 
-    if(
-        cupla::manager::Memory<
-            cupla::AccDev,
-            cupla::AlpakaDim<1u>
-        >::get().free( ptr )
-    )
         return cuplaSuccess;
-    else if(
-        cupla::manager::Memory<
-            cupla::AccDev,
-            cupla::AlpakaDim<2u>
-        >::get().free( ptr )
-    )
-        return cuplaSuccess;
-    else if(
-        cupla::manager::Memory<
-            cupla::AccDev,
-            cupla::AlpakaDim<3u>
-        >::get().free( ptr )
-    )
-        return cuplaSuccess;
-    else
-        return cuplaErrorMemoryAllocation;
+    }
 
-}
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaExtent make_cuplaExtent(size_t const w, size_t const h, size_t const d)
+    {
+        return cuplaExtent(w, h, d);
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t cuplaFreeHost(void *ptr)
-{
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaPos make_cuplaPos(size_t const x, size_t const y, size_t const z)
+    {
+        return cuplaPos(x, y, z);
+    }
 
-    if(
-        cupla::manager::Memory<
-            cupla::AccHost,
-            cupla::AlpakaDim<1u>
-        >::get().free( ptr )
-    )
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaPitchedPtr make_cuplaPitchedPtr(void* const d, size_t const p, size_t const xsz, size_t const ysz)
+    {
+        return cuplaPitchedPtr(d, p, xsz, ysz);
+    }
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMallocHost(void** ptrptr, size_t size)
+    {
+        const ::alpaka::Vec<cupla::AlpakaDim<1u>, cupla::MemSizeType> extent(size);
+
+        auto& buf = cupla::manager::Memory<cupla::AccHost, cupla::AlpakaDim<1u>>::get().alloc(extent);
+
+        prepareForAsyncCopy(buf);
+
+        // @todo catch errors
+        *ptrptr = ::alpaka::getPtrNative(buf);
         return cuplaSuccess;
-    else
-        return cuplaErrorMemoryAllocation;
-
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t cuplaMemcpyAsync(
-    void *dst,
-    const void *src,
-    size_t count,
-    enum cuplaMemcpyKind kind,
-    cuplaStream_t stream
-)
-{
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<1u>,
-        cupla::MemSizeType
-    > numBytes(count);
-
-    auto& device(
-        cupla::manager::Device<
-            cupla::AccDev
-        >::get().current()
-    );
-
-    auto& streamObject(
-        cupla::manager::Stream<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().stream( stream )
-    );
-
-    switch(kind)
+    }
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaFree(void* ptr)
     {
+        if(cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<1u>>::get().free(ptr))
+            return cuplaSuccess;
+        else if(cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<2u>>::get().free(ptr))
+            return cuplaSuccess;
+        else if(cupla::manager::Memory<cupla::AccDev, cupla::AlpakaDim<3u>>::get().free(ptr))
+            return cuplaSuccess;
+        else
+            return cuplaErrorMemoryAllocation;
+    }
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaFreeHost(void* ptr)
+    {
+        if(cupla::manager::Memory<cupla::AccHost, cupla::AlpakaDim<1u>>::get().free(ptr))
+            return cuplaSuccess;
+        else
+            return cuplaErrorMemoryAllocation;
+    }
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMemcpyAsync(
+        void* dst,
+        const void* src,
+        size_t count,
+        enum cuplaMemcpyKind kind,
+        cuplaStream_t stream)
+    {
+        const ::alpaka::Vec<cupla::AlpakaDim<1u>, cupla::MemSizeType> numBytes(count);
+
+        auto& device(cupla::manager::Device<cupla::AccDev>::get().current());
+
+        auto& streamObject(cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(stream));
+
+        switch(kind)
+        {
         case cuplaMemcpyHostToDevice:
         {
-            auto& host(
-                cupla::manager::Device<
-                    cupla::AccHost
-                >::get().current()
-            );
-
-            const cupla::HostBufWrapper< 1u > hBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(src)
-                ),
-                host,
-                numBytes
-            );
-            cupla::DeviceBufWrapper< 1u > dBuf(
-                static_cast<uint8_t *>(
-                    dst
-                ),
-                device,
-                numBytes
-            );
+            auto& host(cupla::manager::Device<cupla::AccHost>::get().current());
 
-            ::alpaka::memcpy(
-                streamObject,
-                dBuf,
-                hBuf,
-                numBytes
-            );
+            const cupla::HostBufWrapper<1u> hBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(src)),
+                host,
+                numBytes);
+            cupla::DeviceBufWrapper<1u> dBuf(static_cast<uint8_t*>(dst), device, numBytes);
 
+            ::alpaka::memcpy(streamObject, dBuf, hBuf, numBytes);
         }
-            break;
+        break;
         case cuplaMemcpyDeviceToHost:
         {
-            auto& host(
-                cupla::manager::Device<
-                    cupla::AccHost
-                >::get().current()
-            );
-            const cupla::DeviceBufWrapper< 1u > dBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(src)
-                ),
+            auto& host(cupla::manager::Device<cupla::AccHost>::get().current());
+            const cupla::DeviceBufWrapper<1u> dBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(src)),
                 device,
-                numBytes
-            );
-            cupla::HostBufWrapper< 1u > hBuf(
-                static_cast<uint8_t *>(
-                    dst
-                ),
-                host,
-                numBytes
-            );
-
-            ::alpaka::memcpy(
-                streamObject,
-                hBuf,
-                dBuf,
-                numBytes
-            );
+                numBytes);
+            cupla::HostBufWrapper<1u> hBuf(static_cast<uint8_t*>(dst), host, numBytes);
 
+            ::alpaka::memcpy(streamObject, hBuf, dBuf, numBytes);
         }
-            break;
+        break;
         case cuplaMemcpyDeviceToDevice:
         {
-            const cupla::DeviceBufWrapper< 1u > dSrcBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(src)
-                ),
+            const cupla::DeviceBufWrapper<1u> dSrcBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(src)),
                 device,
-                numBytes
-            );
-            cupla::DeviceBufWrapper< 1u > dDestBuf(
-                static_cast<uint8_t *>(
-                    dst
-                ),
-                device,
-                numBytes
-            );
-
-            ::alpaka::memcpy(
-                streamObject,
-                dDestBuf,
-                dSrcBuf,
-                numBytes
-            );
+                numBytes);
+            cupla::DeviceBufWrapper<1u> dDestBuf(static_cast<uint8_t*>(dst), device, numBytes);
 
+            ::alpaka::memcpy(streamObject, dDestBuf, dSrcBuf, numBytes);
         }
-            break;
+        break;
         case cuplaMemcpyHostToHost:
         {
-            auto& hostStreamObject(
-                cupla::manager::Stream<
-                    cupla::AccHost,
-                    cupla::AccHostStream
-                >::get().stream( stream )
-            );
-            auto& host(
-                cupla::manager::Device<
-                    cupla::AccHost
-                >::get().current()
-            );
-            const cupla::HostBufWrapper< 1u > hSrcBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(src)
-                ),
+            auto& hostStreamObject(cupla::manager::Stream<cupla::AccHost, cupla::AccHostStream>::get().stream(stream));
+            auto& host(cupla::manager::Device<cupla::AccHost>::get().current());
+            const cupla::HostBufWrapper<1u> hSrcBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(src)),
                 host,
-                numBytes
-            );
-            cupla::HostBufWrapper< 1u > hDestBuf(
-                static_cast<uint8_t *>(
-                    dst
-                ),
-                host,
-                numBytes
-            );
-
-            ::alpaka::memcpy(
-                hostStreamObject,
-                hDestBuf,
-                hSrcBuf,
-                numBytes
-            );
+                numBytes);
+            cupla::HostBufWrapper<1u> hDestBuf(static_cast<uint8_t*>(dst), host, numBytes);
 
+            ::alpaka::memcpy(hostStreamObject, hDestBuf, hSrcBuf, numBytes);
         }
         break;
+        }
+        return cuplaSuccess;
     }
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMemcpy(
-    void *dst,
-    const void *src,
-    size_t count,
-    enum cuplaMemcpyKind kind
-)
-{
-    cuplaDeviceSynchronize();
-
-    cuplaMemcpyAsync(
-        dst,
-        src,
-        count,
-        kind,
-        0
-    );
-
-    auto& streamObject(
-        cupla::manager::Stream<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().stream( 0 )
-    );
-    ::alpaka::wait( streamObject );
-
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMemsetAsync(
-    void * devPtr,
-    int value,
-    size_t count,
-    cuplaStream_t stream
-)
-{
-    auto& device(
-        cupla::manager::Device<
-            cupla::AccDev
-        >::get().current()
-    );
-
-    auto& streamObject(
-        cupla::manager::Stream<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().stream( stream )
-    );
-
-    ::alpaka::Vec<
-        cupla::AlpakaDim<1u>,
-        cupla::MemSizeType
-    > const
-    numBytes(count);
-
-    cupla::DeviceBufWrapper< 1u >
-    dBuf(
-        static_cast< uint8_t * >( devPtr ),
-        device,
-        numBytes
-    );
-
-    ::alpaka::memset(
-        streamObject,
-        dBuf,
-        value,
-        numBytes
-    );
-
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMemset(
-    void * devPtr,
-    int value,
-    size_t count
-)
-{
-    cuplaDeviceSynchronize();
-
-    cuplaMemsetAsync(
-        devPtr,
-        value,
-        count,
-        0
-    );
-
-    auto& streamObject(
-        cupla::manager::Stream<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().stream( 0 )
-    );
-    ::alpaka::wait( streamObject );
-
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMemcpy2DAsync(
-    void * dst,
-    size_t const dPitch,
-    void const * const src,
-    size_t const sPitch,
-    size_t const width,
-    size_t const height,
-    enum cuplaMemcpyKind kind,
-    cuplaStream_t const stream
-)
-{
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<2u>,
-        cupla::MemSizeType
-    > numBytes( height, width );
-
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<2u>,
-        cupla::MemSizeType
-    > dstPitch( dPitch * height , dPitch );
-
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<2u>,
-        cupla::MemSizeType
-    > srcPitch( sPitch * height , sPitch );
-
-    auto& device(
-        cupla::manager::Device<
-            cupla::AccDev
-        >::get().current()
-    );
-
-    auto& streamObject(
-        cupla::manager::Stream<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().stream( stream )
-    );
-
-    switch(kind)
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMemcpy(void* dst, const void* src, size_t count, enum cuplaMemcpyKind kind)
+    {
+        cuplaDeviceSynchronize();
+
+        cuplaMemcpyAsync(dst, src, count, kind, 0);
+
+        auto& streamObject(cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(0));
+        ::alpaka::wait(streamObject);
+
+        return cuplaSuccess;
+    }
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMemsetAsync(void* devPtr, int value, size_t count, cuplaStream_t stream)
     {
+        auto& device(cupla::manager::Device<cupla::AccDev>::get().current());
+
+        auto& streamObject(cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(stream));
+
+        ::alpaka::Vec<cupla::AlpakaDim<1u>, cupla::MemSizeType> const numBytes(count);
+
+        cupla::DeviceBufWrapper<1u> dBuf(static_cast<uint8_t*>(devPtr), device, numBytes);
+
+        ::alpaka::memset(streamObject, dBuf, value, numBytes);
+
+        return cuplaSuccess;
+    }
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMemset(void* devPtr, int value, size_t count)
+    {
+        cuplaDeviceSynchronize();
+
+        cuplaMemsetAsync(devPtr, value, count, 0);
+
+        auto& streamObject(cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(0));
+        ::alpaka::wait(streamObject);
+
+        return cuplaSuccess;
+    }
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMemcpy2DAsync(
+        void* dst,
+        size_t const dPitch,
+        void const* const src,
+        size_t const sPitch,
+        size_t const width,
+        size_t const height,
+        enum cuplaMemcpyKind kind,
+        cuplaStream_t const stream)
+    {
+        const ::alpaka::Vec<cupla::AlpakaDim<2u>, cupla::MemSizeType> numBytes(height, width);
+
+        const ::alpaka::Vec<cupla::AlpakaDim<2u>, cupla::MemSizeType> dstPitch(dPitch * height, dPitch);
+
+        const ::alpaka::Vec<cupla::AlpakaDim<2u>, cupla::MemSizeType> srcPitch(sPitch * height, sPitch);
+
+        auto& device(cupla::manager::Device<cupla::AccDev>::get().current());
+
+        auto& streamObject(cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(stream));
+
+        switch(kind)
+        {
         case cuplaMemcpyHostToDevice:
         {
-            auto& host(
-                cupla::manager::Device<
-                    cupla::AccHost
-                >::get().current()
-            );
-
-            const cupla::HostBufWrapper< 2u > hBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(src)
-                ),
+            auto& host(cupla::manager::Device<cupla::AccHost>::get().current());
+
+            const cupla::HostBufWrapper<2u> hBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(src)),
                 host,
                 numBytes,
-                srcPitch
-            );
-            cupla::DeviceBufWrapper< 2u > dBuf(
-                static_cast<uint8_t *>(
-                    dst
-                ),
-                device,
-                numBytes,
-                dstPitch
-            );
-
-            ::alpaka::memcpy(
-                streamObject,
-                dBuf,
-                hBuf,
-                numBytes
-            );
+                srcPitch);
+            cupla::DeviceBufWrapper<2u> dBuf(static_cast<uint8_t*>(dst), device, numBytes, dstPitch);
 
+            ::alpaka::memcpy(streamObject, dBuf, hBuf, numBytes);
         }
-            break;
+        break;
         case cuplaMemcpyDeviceToHost:
         {
-            auto& host(
-                cupla::manager::Device<
-                    cupla::AccHost
-                >::get().current()
-            );
-            const cupla::DeviceBufWrapper< 2u > dBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(src)
-                ),
+            auto& host(cupla::manager::Device<cupla::AccHost>::get().current());
+            const cupla::DeviceBufWrapper<2u> dBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(src)),
                 device,
                 numBytes,
-                srcPitch
-            );
-            cupla::HostBufWrapper< 2u > hBuf(
-                static_cast<uint8_t *>(
-                    dst
-                ),
-                host,
-                numBytes,
-                dstPitch
-            );
-
-            ::alpaka::memcpy(
-                streamObject,
-                hBuf,
-                dBuf,
-                numBytes
-            );
+                srcPitch);
+            cupla::HostBufWrapper<2u> hBuf(static_cast<uint8_t*>(dst), host, numBytes, dstPitch);
 
+            ::alpaka::memcpy(streamObject, hBuf, dBuf, numBytes);
         }
-            break;
+        break;
         case cuplaMemcpyDeviceToDevice:
         {
-            const cupla::DeviceBufWrapper< 2u > dSrcBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(src)
-                ),
-                device,
-                numBytes,
-                srcPitch
-            );
-            cupla::DeviceBufWrapper< 2u > dDestBuf(
-                static_cast<uint8_t *>(
-                    dst
-                ),
+            const cupla::DeviceBufWrapper<2u> dSrcBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(src)),
                 device,
                 numBytes,
-                dstPitch
-            );
-
-            ::alpaka::memcpy(
-                streamObject,
-                dDestBuf,
-                dSrcBuf,
-                numBytes
-            );
+                srcPitch);
+            cupla::DeviceBufWrapper<2u> dDestBuf(static_cast<uint8_t*>(dst), device, numBytes, dstPitch);
 
+            ::alpaka::memcpy(streamObject, dDestBuf, dSrcBuf, numBytes);
         }
         break;
         case cuplaMemcpyHostToHost:
         {
-             auto& hostStreamObject(
-                cupla::manager::Stream<
-                    cupla::AccHost,
-                    cupla::AccHostStream
-                >::get().stream( stream )
-            );
-            auto& host(
-                cupla::manager::Device<
-                    cupla::AccHost
-                >::get().current()
-            );
-            const cupla::HostBufWrapper< 2u > hSrcBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(src)
-                ),
-                host,
-                numBytes,
-                srcPitch
-            );
-            cupla::HostBufWrapper< 2u > hDestBuf(
-                static_cast<uint8_t *>(
-                    dst
-                ),
+            auto& hostStreamObject(cupla::manager::Stream<cupla::AccHost, cupla::AccHostStream>::get().stream(stream));
+            auto& host(cupla::manager::Device<cupla::AccHost>::get().current());
+            const cupla::HostBufWrapper<2u> hSrcBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(src)),
                 host,
                 numBytes,
-                dstPitch
-            );
-
-            ::alpaka::memcpy(
-                hostStreamObject,
-                hDestBuf,
-                hSrcBuf,
-                numBytes
-            );
+                srcPitch);
+            cupla::HostBufWrapper<2u> hDestBuf(static_cast<uint8_t*>(dst), host, numBytes, dstPitch);
 
+            ::alpaka::memcpy(hostStreamObject, hDestBuf, hSrcBuf, numBytes);
         }
         break;
+        }
+        return cuplaSuccess;
     }
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMemcpy2D(
-    void * dst,
-    size_t const dPitch,
-    void const * const src,
-    size_t const sPitch,
-    size_t const width,
-    size_t const height,
-    enum cuplaMemcpyKind kind
-)
-{
-    cuplaDeviceSynchronize();
-
-    cuplaMemcpy2DAsync(
-        dst,
-        dPitch,
-        src,
-        sPitch,
-        width,
-        height,
-        kind,
-        0
-    );
-
-    auto& streamObject(
-        cupla::manager::Stream<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().stream( 0 )
-    );
-    ::alpaka::wait( streamObject );
-
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMemcpy3DAsync(
-    const cuplaMemcpy3DParms * const p,
-    cuplaStream_t stream
-)
-{
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<3u>,
-        cupla::MemSizeType
-    > numBytes( p->extent );
-
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<3u>,
-        cupla::MemSizeType
-    > extentSrc(
-        p->srcPtr.xsize * p->srcPtr.ysize * ( p->extent.depth + p->srcPos.z ),
-        p->srcPtr.xsize * p->srcPtr.ysize,
-        p->srcPtr.xsize
-    );
-
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<3u>,
-        cupla::MemSizeType
-    > extentDst(
-        p->dstPtr.xsize * p->dstPtr.ysize * ( p->extent.depth + p->dstPos.z ),
-        p->dstPtr.xsize * p->dstPtr.ysize,
-        p->dstPtr.xsize
-    );
-
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<3u>,
-        cupla::MemSizeType
-    > offsetSrc(
-        p->srcPos.z,
-        p->srcPos.y,
-        p->srcPos.x
-    );
-
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<3u>,
-        cupla::MemSizeType
-    > offsetDst(
-        p->dstPos.z,
-        p->dstPos.y,
-        p->dstPos.x
-    );
-
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<3u>,
-        cupla::MemSizeType
-    > dstPitch(
-        p->dstPtr.pitch * p->dstPtr.ysize * ( p->extent.depth + p->dstPos.z ), // @todo: can't create z pitch,  but is not needed by alpaka
-        p->dstPtr.pitch * p->dstPtr.ysize,
-        p->dstPtr.pitch
-    );
-
-    const ::alpaka::Vec<
-        cupla::AlpakaDim<3u>,
-        cupla::MemSizeType
-    > srcPitch(
-        p->srcPtr.pitch * p->srcPtr.ysize * ( p->extent.depth + p->srcPos.z ), // @todo: can't create z pitch, but is not needed by alpaka
-        p->srcPtr.pitch * p->srcPtr.ysize,
-        p->srcPtr.pitch
-    );
-
-    auto& device(
-        cupla::manager::Device<
-            cupla::AccDev
-        >::get().current()
-    );
-
-    auto& streamObject(
-        cupla::manager::Stream<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().stream( stream )
-    );
-
-    switch(p->kind)
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMemcpy2D(
+        void* dst,
+        size_t const dPitch,
+        void const* const src,
+        size_t const sPitch,
+        size_t const width,
+        size_t const height,
+        enum cuplaMemcpyKind kind)
+    {
+        cuplaDeviceSynchronize();
+
+        cuplaMemcpy2DAsync(dst, dPitch, src, sPitch, width, height, kind, 0);
+
+        auto& streamObject(cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(0));
+        ::alpaka::wait(streamObject);
+
+        return cuplaSuccess;
+    }
+
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMemcpy3DAsync(const cuplaMemcpy3DParms* const p, cuplaStream_t stream)
     {
+        const ::alpaka::Vec<cupla::AlpakaDim<3u>, cupla::MemSizeType> numBytes(p->extent);
+
+        const ::alpaka::Vec<cupla::AlpakaDim<3u>, cupla::MemSizeType> extentSrc(
+            p->srcPtr.xsize * p->srcPtr.ysize * (p->extent.depth + p->srcPos.z),
+            p->srcPtr.xsize * p->srcPtr.ysize,
+            p->srcPtr.xsize);
+
+        const ::alpaka::Vec<cupla::AlpakaDim<3u>, cupla::MemSizeType> extentDst(
+            p->dstPtr.xsize * p->dstPtr.ysize * (p->extent.depth + p->dstPos.z),
+            p->dstPtr.xsize * p->dstPtr.ysize,
+            p->dstPtr.xsize);
+
+        const ::alpaka::Vec<cupla::AlpakaDim<3u>, cupla::MemSizeType> offsetSrc(p->srcPos.z, p->srcPos.y, p->srcPos.x);
+
+        const ::alpaka::Vec<cupla::AlpakaDim<3u>, cupla::MemSizeType> offsetDst(p->dstPos.z, p->dstPos.y, p->dstPos.x);
+
+        const ::alpaka::Vec<
+            cupla::AlpakaDim<3u>,
+            cupla::MemSizeType>
+            dstPitch(
+                p->dstPtr.pitch * p->dstPtr.ysize
+                    * (p->extent.depth + p->dstPos.z), // @todo: can't create z pitch,  but is not needed by alpaka
+                p->dstPtr.pitch * p->dstPtr.ysize,
+                p->dstPtr.pitch);
+
+        const ::alpaka::Vec<
+            cupla::AlpakaDim<3u>,
+            cupla::MemSizeType>
+            srcPitch(
+                p->srcPtr.pitch * p->srcPtr.ysize
+                    * (p->extent.depth + p->srcPos.z), // @todo: can't create z pitch, but is not needed by alpaka
+                p->srcPtr.pitch * p->srcPtr.ysize,
+                p->srcPtr.pitch);
+
+        auto& device(cupla::manager::Device<cupla::AccDev>::get().current());
+
+        auto& streamObject(cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(stream));
+
+        switch(p->kind)
+        {
         case cuplaMemcpyHostToDevice:
         {
-            auto& host(
-                cupla::manager::Device<
-                    cupla::AccHost
-                >::get().current()
-            );
-
-            cupla::HostBufWrapper< 3u > hBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(p->srcPtr.ptr)
-                ),
+            auto& host(cupla::manager::Device<cupla::AccHost>::get().current());
+
+            cupla::HostBufWrapper<3u> hBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(p->srcPtr.ptr)),
                 host,
                 extentSrc,
-                srcPitch
-            );
-            cupla::DeviceBufWrapper< 3u > dBuf(
-                static_cast<uint8_t *>(
-                    p->dstPtr.ptr
-                ),
-                device,
-                extentDst,
-                dstPitch
-            );
+                srcPitch);
+            cupla::DeviceBufWrapper<3u> dBuf(static_cast<uint8_t*>(p->dstPtr.ptr), device, extentDst, dstPitch);
 
-            cupla::DeviceViewWrapper< 3u > dView(
-                dBuf,
-                extentDst - offsetDst,
-                offsetDst
-            );
+            cupla::DeviceViewWrapper<3u> dView(dBuf, extentDst - offsetDst, offsetDst);
 
             ::alpaka::memcpy(
                 streamObject,
                 dView,
-                cupla::HostViewWrapper< 3u >(
-                    hBuf,
-                    extentSrc - offsetSrc,
-                    offsetSrc
-                ),
-                numBytes
-            );
-
+                cupla::HostViewWrapper<3u>(hBuf, extentSrc - offsetSrc, offsetSrc),
+                numBytes);
         }
-            break;
+        break;
         case cuplaMemcpyDeviceToHost:
         {
-            auto& host(
-                cupla::manager::Device<
-                    cupla::AccHost
-                >::get().current()
-            );
-            cupla::DeviceBufWrapper< 3u > dBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(p->srcPtr.ptr)
-                ),
+            auto& host(cupla::manager::Device<cupla::AccHost>::get().current());
+            cupla::DeviceBufWrapper<3u> dBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(p->srcPtr.ptr)),
                 device,
                 extentSrc,
-                srcPitch
-            );
-            cupla::HostBufWrapper< 3u > hBuf(
-                static_cast<uint8_t *>(
-                    p->dstPtr.ptr
-                ),
-                host,
-                extentDst,
-                dstPitch
-            );
+                srcPitch);
+            cupla::HostBufWrapper<3u> hBuf(static_cast<uint8_t*>(p->dstPtr.ptr), host, extentDst, dstPitch);
 
-            cupla::HostViewWrapper< 3u > hView(
-                hBuf,
-                extentDst - offsetDst,
-                offsetDst
-            );
+            cupla::HostViewWrapper<3u> hView(hBuf, extentDst - offsetDst, offsetDst);
 
             ::alpaka::memcpy(
                 streamObject,
                 hView,
-                cupla::DeviceViewWrapper< 3u >(
-                    dBuf,
-                    extentSrc - offsetSrc,
-                    offsetSrc
-                ),
-                numBytes
-            );
-
+                cupla::DeviceViewWrapper<3u>(dBuf, extentSrc - offsetSrc, offsetSrc),
+                numBytes);
         }
-            break;
+        break;
         case cuplaMemcpyDeviceToDevice:
         {
-            cupla::DeviceBufWrapper< 3u > dSrcBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(p->srcPtr.ptr)
-                ),
+            cupla::DeviceBufWrapper<3u> dSrcBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(p->srcPtr.ptr)),
                 device,
                 extentSrc,
-                srcPitch
-            );
-            cupla::DeviceBufWrapper< 3u > dDestBuf(
-                static_cast<uint8_t *>(
-                    p->dstPtr.ptr
-                ),
-                device,
-                extentDst,
-                dstPitch
-            );
+                srcPitch);
+            cupla::DeviceBufWrapper<3u> dDestBuf(static_cast<uint8_t*>(p->dstPtr.ptr), device, extentDst, dstPitch);
 
-            cupla::DeviceViewWrapper< 3u > dView(
-                dDestBuf,
-                extentDst - offsetDst,
-                offsetDst
-            );
+            cupla::DeviceViewWrapper<3u> dView(dDestBuf, extentDst - offsetDst, offsetDst);
 
             ::alpaka::memcpy(
                 streamObject,
                 dView,
-                cupla::DeviceViewWrapper< 3u >(
-                    dSrcBuf,
-                    extentSrc - offsetSrc,
-                    offsetSrc
-                ),
-                numBytes
-            );
-
+                cupla::DeviceViewWrapper<3u>(dSrcBuf, extentSrc - offsetSrc, offsetSrc),
+                numBytes);
         }
         break;
         case cuplaMemcpyHostToHost:
         {
-            auto& hostStreamObject(
-                cupla::manager::Stream<
-                    cupla::AccHost,
-                    cupla::AccHostStream
-                >::get().stream( stream )
-            );
-
-            auto& host(
-                cupla::manager::Device<
-                    cupla::AccHost
-                >::get().current()
-            );
-            cupla::HostBufWrapper< 3u > hSrcBuf(
-                const_cast<uint8_t *>(
-                    static_cast<const uint8_t *>(p->srcPtr.ptr)
-                ),
+            auto& hostStreamObject(cupla::manager::Stream<cupla::AccHost, cupla::AccHostStream>::get().stream(stream));
+
+            auto& host(cupla::manager::Device<cupla::AccHost>::get().current());
+            cupla::HostBufWrapper<3u> hSrcBuf(
+                const_cast<uint8_t*>(static_cast<const uint8_t*>(p->srcPtr.ptr)),
                 host,
                 extentSrc,
-                srcPitch
-            );
-            cupla::HostBufWrapper< 3u > hDestBuf(
-                static_cast<uint8_t *>(
-                    p->dstPtr.ptr
-                ),
-                host,
-                extentDst,
-                dstPitch
-            );
-
-            cupla::HostViewWrapper< 3u > hView(
-                hDestBuf,
-                extentDst - offsetDst,
-                offsetDst
-            );
+                srcPitch);
+            cupla::HostBufWrapper<3u> hDestBuf(static_cast<uint8_t*>(p->dstPtr.ptr), host, extentDst, dstPitch);
+
+            cupla::HostViewWrapper<3u> hView(hDestBuf, extentDst - offsetDst, offsetDst);
             ::alpaka::memcpy(
                 hostStreamObject,
                 hView,
-                cupla::HostViewWrapper< 3u >(
-                    hSrcBuf,
-                    extentSrc - offsetSrc,
-                    offsetSrc
-                ),
-                numBytes
-            );
-
+                cupla::HostViewWrapper<3u>(hSrcBuf, extentSrc - offsetSrc, offsetSrc),
+                numBytes);
         }
         break;
+        }
+        return cuplaSuccess;
     }
-    return cuplaSuccess;
-}
-
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaMemcpy3D(
-    const cuplaMemcpy3DParms * const p
-)
-{
-    cuplaDeviceSynchronize();
 
-    cuplaMemcpy3DAsync( p, 0 );
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaMemcpy3D(const cuplaMemcpy3DParms* const p)
+    {
+        cuplaDeviceSynchronize();
+
+        cuplaMemcpy3DAsync(p, 0);
 
-    auto& streamObject(
-        cupla::manager::Stream<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().stream( 0 )
-    );
-    ::alpaka::wait( streamObject );
+        auto& streamObject(cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(0));
+        ::alpaka::wait(streamObject);
 
-    return cuplaSuccess;
-}
+        return cuplaSuccess;
+    }
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/src/stream.cpp b/src/stream.cpp
index e936286c..9e59c5d7 100644
--- a/src/stream.cpp
+++ b/src/stream.cpp
@@ -19,96 +19,62 @@
  */
 
 
-#include "cupla/namespace.hpp"
-#include "cupla_runtime.hpp"
-#include "cupla/manager/Memory.hpp"
-#include "cupla/manager/Device.hpp"
 #include "cupla/manager/Stream.hpp"
-#include "cupla/manager/Event.hpp"
 
 #include "cupla/api/stream.hpp"
+#include "cupla/manager/Device.hpp"
+#include "cupla/manager/Event.hpp"
+#include "cupla/manager/Memory.hpp"
+#include "cupla/namespace.hpp"
+#include "cupla_runtime.hpp"
 
 inline namespace CUPLA_ACCELERATOR_NAMESPACE
 {
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaStreamCreate(cuplaStream_t* stream)
+    {
+        *stream = cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().create();
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaStreamCreate(
-    cuplaStream_t * stream
-)
-{
-    *stream = cupla::manager::Stream<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().create();
+        return cuplaSuccess;
+    }
 
-    return cuplaSuccess;
-}
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaStreamDestroy(cuplaStream_t stream)
+    {
+        if(cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().destroy(stream))
+            return cuplaSuccess;
+        else
+            return cuplaErrorInitializationError;
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaStreamDestroy( cuplaStream_t stream )
-{
-    if(
-        cupla::manager::Stream<
-            cupla::AccDev,
-            cupla::AccStream
-        >::get().destroy( stream )
-    )
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaStreamSynchronize(cuplaStream_t stream)
+    {
+        auto& streamObject = cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(stream);
+        ::alpaka::wait(streamObject);
         return cuplaSuccess;
-    else
-        return cuplaErrorInitializationError;
-}
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaStreamSynchronize(
-    cuplaStream_t stream
-)
-{
-    auto& streamObject = cupla::manager::Stream<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().stream( stream );
-    ::alpaka::wait( streamObject );
-    return cuplaSuccess;
-}
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaStreamWaitEvent(cuplaStream_t stream, cuplaEvent_t event, unsigned int)
+    {
+        auto& streamObject = cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(stream);
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaStreamWaitEvent(
-    cuplaStream_t stream,
-    cuplaEvent_t event,
-    unsigned int
-)
-{
-    auto& streamObject = cupla::manager::Stream<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().stream( stream );
+        auto& eventObject = *cupla::manager::Event<cupla::AccDev, cupla::AccStream>::get().event(event);
 
-    auto& eventObject = *cupla::manager::Event<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().event( event );
-
-    ::alpaka::wait(streamObject,eventObject);
-    return cuplaSuccess;
-}
+        ::alpaka::wait(streamObject, eventObject);
+        return cuplaSuccess;
+    }
 
-CUPLA_HEADER_ONLY_FUNC_SPEC
-cuplaError_t
-cuplaStreamQuery( cuplaStream_t stream )
-{
-    auto& streamObject = cupla::manager::Stream<
-        cupla::AccDev,
-        cupla::AccStream
-    >::get().stream( stream );
+    CUPLA_HEADER_ONLY_FUNC_SPEC
+    cuplaError_t cuplaStreamQuery(cuplaStream_t stream)
+    {
+        auto& streamObject = cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(stream);
 
-    if( alpaka::empty( streamObject ) )
-        return cuplaSuccess;
-    else
-        return cuplaErrorNotReady;
-}
+        if(alpaka::empty(streamObject))
+            return cuplaSuccess;
+        else
+            return cuplaErrorNotReady;
+    }
 
-} //namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/test/system/config/kernel.cpp b/test/system/config/kernel.cpp
index 2768e0aa..e2bd8ca3 100644
--- a/test/system/config/kernel.cpp
+++ b/test/system/config/kernel.cpp
@@ -19,20 +19,20 @@
  */
 
 
-#if defined( CUPLA_ACC_CpuOmp2Blocks  )
-#   include <cupla/config/CpuOmp2Blocks.hpp>
-#elif defined( CUPLA_ACC_CpuOmp2Threads  )
-#   include <cupla/config/CpuOmp2Threads.hpp>
-#elif defined( CUPLA_ACC_CpuSerial )
-#   include <cupla/config/CpuSerial.hpp>
-#elif defined( CUPLA_ACC_CpuTbbBlocks  )
-#   include <cupla/config/CpuTbbBlocks.hpp>
-#elif defined( CUPLA_ACC_CpuThreads  )
-#   include <cupla/config/CpuThreads.hpp>
-#elif defined( CUPLA_ACC_GpuCudaRt  )
-#   include <cupla/config/GpuCudaRt.hpp>
-#elif defined( CUPLA_ACC_GpuHipRt  )
-#   include <cupla/config/GpuHipRt.hpp>
+#if defined(CUPLA_ACC_CpuOmp2Blocks)
+#    include <cupla/config/CpuOmp2Blocks.hpp>
+#elif defined(CUPLA_ACC_CpuOmp2Threads)
+#    include <cupla/config/CpuOmp2Threads.hpp>
+#elif defined(CUPLA_ACC_CpuSerial)
+#    include <cupla/config/CpuSerial.hpp>
+#elif defined(CUPLA_ACC_CpuTbbBlocks)
+#    include <cupla/config/CpuTbbBlocks.hpp>
+#elif defined(CUPLA_ACC_CpuThreads)
+#    include <cupla/config/CpuThreads.hpp>
+#elif defined(CUPLA_ACC_GpuCudaRt)
+#    include <cupla/config/GpuCudaRt.hpp>
+#elif defined(CUPLA_ACC_GpuHipRt)
+#    include <cupla/config/GpuHipRt.hpp>
 #endif
 
 #include "cuda_to_cupla.hpp"
@@ -40,9 +40,9 @@
 struct IncrementKernel
 {
     template<typename T_Acc>
-    ALPAKA_FN_ACC void operator()( T_Acc const & acc, int * ptr) const
+    ALPAKA_FN_ACC void operator()(T_Acc const& acc, int* ptr) const
     {
-        for( int i = 0; i < elemDim.x; ++i )
+        for(int i = 0; i < elemDim.x; ++i)
             atomicAdd(ptr, 1);
     }
 };
@@ -51,7 +51,5 @@ struct IncrementKernel
 void callIncrementKernel(int* pr_d)
 {
     // increment 42 times
-    CUPLA_KERNEL_OPTI(
-        IncrementKernel
-    )(7, 6)(pr_d);
+    CUPLA_KERNEL_OPTI(IncrementKernel)(7, 6)(pr_d);
 }
diff --git a/test/system/config/main.cpp b/test/system/config/main.cpp
index e7d09039..91fe30a5 100644
--- a/test/system/config/main.cpp
+++ b/test/system/config/main.cpp
@@ -19,20 +19,20 @@
  */
 
 
-#if defined( CUPLA_ACC_CpuOmp2Blocks  )
-#   include <cupla/config/CpuOmp2Blocks.hpp>
-#elif defined( CUPLA_ACC_CpuOmp2Threads  )
-#   include <cupla/config/CpuOmp2Threads.hpp>
-#elif defined( CUPLA_ACC_CpuSerial )
-#   include <cupla/config/CpuSerial.hpp>
-#elif defined( CUPLA_ACC_CpuTbbBlocks  )
-#   include <cupla/config/CpuTbbBlocks.hpp>
-#elif defined( CUPLA_ACC_CpuThreads  )
-#   include <cupla/config/CpuThreads.hpp>
-#elif defined( CUPLA_ACC_GpuCudaRt  )
-#   include <cupla/config/GpuCudaRt.hpp>
-#elif defined( CUPLA_ACC_GpuHipRt  )
-#   include <cupla/config/GpuHipRt.hpp>
+#if defined(CUPLA_ACC_CpuOmp2Blocks)
+#    include <cupla/config/CpuOmp2Blocks.hpp>
+#elif defined(CUPLA_ACC_CpuOmp2Threads)
+#    include <cupla/config/CpuOmp2Threads.hpp>
+#elif defined(CUPLA_ACC_CpuSerial)
+#    include <cupla/config/CpuSerial.hpp>
+#elif defined(CUPLA_ACC_CpuTbbBlocks)
+#    include <cupla/config/CpuTbbBlocks.hpp>
+#elif defined(CUPLA_ACC_CpuThreads)
+#    include <cupla/config/CpuThreads.hpp>
+#elif defined(CUPLA_ACC_GpuCudaRt)
+#    include <cupla/config/GpuCudaRt.hpp>
+#elif defined(CUPLA_ACC_GpuHipRt)
+#    include <cupla/config/GpuHipRt.hpp>
 #endif
 
 #include "cuda_to_cupla.hpp"
@@ -42,16 +42,16 @@ extern void callIncrementKernel(int* pr_d);
 int main()
 {
     int res_h = 0;
-    int *res_ptr_d = nullptr;
-    cudaMalloc( (void**)&res_ptr_d, sizeof( int ) );
+    int* res_ptr_d = nullptr;
+    cudaMalloc((void**) &res_ptr_d, sizeof(int));
 
     // reset result to zero
-    cuplaMemset( res_ptr_d, 0, sizeof( int ) );
+    cuplaMemset(res_ptr_d, 0, sizeof(int));
 
     // increment 42 times
     callIncrementKernel(res_ptr_d);
 
-    cudaMemcpy(&res_h, res_ptr_d, sizeof( int ), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&res_h, res_ptr_d, sizeof(int), cudaMemcpyDeviceToHost);
 
     return res_h != 42;
 }

From 3ac491dac631be133e639a2a8aee549ef4914d92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Widera?= <r.widera@hzdr.de>
Date: Wed, 1 Sep 2021 11:11:01 +0200
Subject: [PATCH 3/3] CI: test code formatting

Test code formatting with clang-format-11.
---
 .gitlab-ci.yml                 | 40 ++++++++++++++++++++++++++++++++++
 script/check_cpp_code_style.sh | 11 ++++++++++
 2 files changed, 51 insertions(+)
 create mode 100755 script/check_cpp_code_style.sh

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9d246505..bc61e886 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,7 +8,32 @@
 include:
   - local: '/script/compiler_base.yml'
 
+stages:
+  - validate
+  - compile-and-run
+
+################################################################################
+# Check code formatting with clang-format
+# pull request validation:
+#   - check C++ code style
+pull-request-validation:
+  stage: validate
+  image: ubuntu:focal
+  script:
+    - apt-get update
+    # install clang-format-11 from the apt.llvm.org repository (apt-get: stable CLI for scripts)
+    - DEBIAN_FRONTEND=noninteractive apt-get install -y -q gnupg2 wget
+    - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
+    - echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-11 main" | tee -a /etc/apt/sources.list
+    - apt-get update
+    - DEBIAN_FRONTEND=noninteractive apt-get install -y clang-format-11
+    # Check C++ code style; the script is sourced, so its `set -e` aborts this job on failure
+    - source "$CI_PROJECT_DIR/script/check_cpp_code_style.sh"
+  tags:
+    - x86_64
+
 cuda92:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda92-gcc:1.4
   variables:
     CUPLA_CXX: "g++-6"
@@ -21,48 +46,56 @@ cuda92:
   extends: .base_cuda
 
 cuda100:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda100-gcc:1.4
   variables:
     CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
   extends: .base_cuda
 
 cuda101:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda101-gcc:1.4
   variables:
     CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
   extends: .base_cuda
 
 cuda102:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda102-gcc:1.4
   variables:
     CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
   extends: .base_cuda
 
 gcc1:
+  stage: compile-and-run
   variables:
     CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9"
     CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0"
   extends: .base_gcc
 
 gcc2:
+  stage: compile-and-run
   variables:
     CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9"
     CUPLA_BOOST_VERSIONS: "1.68.0 1.69.0 1.70.0"
   extends: .base_gcc
 
 gcc3:
+  stage: compile-and-run
   variables:
     CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9"
     CUPLA_BOOST_VERSIONS: "1.71.0 1.72.0 1.73.0"
   extends: .base_gcc
 
 clang:
+  stage: compile-and-run
   variables:
     CUPLA_CXX: "clang++-5.0 clang++-6.0 clang++-7 clang++-8 clang++-9 clang++-10 clang++-11"
     CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
   extends: .base_clang
 
 cudaClang92:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda92-clang:1.4
   variables:
     CUPLA_CXX: "clang++-8 clang++-10 clang++-11"
@@ -70,6 +103,7 @@ cudaClang92:
   extends: .base_cuda_clang
 
 cudaClang100:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda100-clang:1.4
   variables:
     CUPLA_CXX: "clang++-8 clang++-9 clang++-10 clang++-11"
@@ -77,6 +111,7 @@ cudaClang100:
   extends: .base_cuda_clang
 
 cudaClang101:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda101-clang:1.4
   variables:
     CUPLA_CXX: "clang++-9 clang++-10 clang++-11"
@@ -84,6 +119,7 @@ cudaClang101:
   extends: .base_cuda_clang
 
 hip42:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-rocm4.2:1.4
   variables:
     CMAKE_MODULE_PATH: "/opt/rocm-4.2.0/hip/cmake"
@@ -97,6 +133,7 @@ hip42:
 # build external project and use cupla via cmake add_subdirectory()
 # use internal alpaka
 addSubdirectoryInternal:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc:1.4
   variables:
     GIT_SUBMODULE_STRATEGY: normal
@@ -110,6 +147,7 @@ addSubdirectoryInternal:
 # build external project and use cupla via cmake add_subdirectory()
 # use installed alpaka
 addSubdirectoryExternal:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc:1.4
   variables:
     GIT_SUBMODULE_STRATEGY: normal
@@ -124,6 +162,7 @@ addSubdirectoryExternal:
 # build external project and use cupla via cmake find_package()
 # cupla was installed with disabled examples
 findPackageWithoutExample:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc:1.4
   variables:
     GIT_SUBMODULE_STRATEGY: normal
@@ -139,6 +178,7 @@ findPackageWithoutExample:
 # build external project and use cupla via cmake find_package()
 # cupla was installed with enabled examples
 findPackageWithExample:
+  stage: compile-and-run
   image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc:1.4
   variables:
     GIT_SUBMODULE_STRATEGY: normal
diff --git a/script/check_cpp_code_style.sh b/script/check_cpp_code_style.sh
new file mode 100755
index 00000000..37c32c29
--- /dev/null
+++ b/script/check_cpp_code_style.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Fail if any C++ file under src, example, include or test violates the clang-format style.
+
+set -eo pipefail
+
+cd "$CI_PROJECT_DIR"
+
+# NUL-delimited paths keep names with whitespace intact; -r skips the run if nothing matches
+find src example include test \( -iname "*.def" \
+  -o -iname "*.h" -o -iname "*.cpp" -o -iname "*.hpp" \) -print0 \
+  | xargs -0 -r clang-format-11 --dry-run --Werror