From 22941ce426b992434b0e45859abb074a28c8c2b2 Mon Sep 17 00:00:00 2001
From: doe300
Date: Sat, 18 Dec 2021 09:29:15 +0100
Subject: [PATCH] Add support for work-group merge factor and some smaller
 fixes

- Adds the host-side support for a WIP compiler optimization
- Modifies behavior of clSetKernelArg according to updated specification
---
 src/Kernel.cpp              | 51 +++++++++++++++++++++++--------------
 src/PerformanceCounter.cpp  |  6 ++---
 src/PerformanceCounter.h    |  4 +--
 src/executor.cpp            | 47 +++++++++++++++++++---------------
 src/shared/BinaryHeader.cpp | 35 +++++++++++++++++++++++++
 src/shared/BinaryHeader.h   | 47 +++++++++++++++++++++++++++++++++-
 test/TestExecutions.cpp     |  3 ++-
 7 files changed, 147 insertions(+), 46 deletions(-)

diff --git a/src/Kernel.cpp b/src/Kernel.cpp
index 5c0cec7..be42eb6 100644
--- a/src/Kernel.cpp
+++ b/src/Kernel.cpp
@@ -126,9 +126,6 @@ cl_int Kernel::setArg(cl_uint arg_index, size_t arg_size, const void* arg_value)
             buildString("Invalid arg index: %d of %d", arg_index, info.parameters.size()));
     }
 
-    // clear previous set parameter value
-    args[arg_index].reset();
-
     const auto& paramInfo = info.parameters[arg_index];
     if(!paramInfo.getPointer() || paramInfo.getByValue())
     {
@@ -171,8 +168,7 @@ cl_int Kernel::setArg(cl_uint arg_index, size_t arg_size, const void* arg_value)
             // elements) is passed in
             elementSize = arg_size / 4;
         }
-        ScalarArgument* scalarArg = new ScalarArgument(paramInfo.getVectorElements());
-        args[arg_index].reset(scalarArg);
+        auto scalarArg = std::make_unique<ScalarArgument>(paramInfo.getVectorElements());
         for(cl_uchar i = 0; i < paramInfo.getVectorElements(); ++i)
         {
             // arguments are all 32-bit, since UNIFORMS are always 32-bit
@@ -219,6 +215,7 @@ cl_int Kernel::setArg(cl_uint arg_index, size_t arg_size, const void* arg_value)
                 scalarArg->addScalar(static_cast(arg_value)[i]);
             }
         }
+        args[arg_index] = std::move(scalarArg);
         DEBUG_LOG(DebugLevel::KERNEL_EXECUTION,
             std::cout << "Setting kernel-argument " << arg_index << " to scalar "
                       << args[arg_index]->to_string() << std::endl)
@@ -346,8 +343,12 @@ cl_int Kernel::getWorkGroupInfo(
         // not a built-in kernel."
         return CL_INVALID_VALUE;
     case CL_KERNEL_WORK_GROUP_SIZE:
+    {
         //"[...] query the maximum work-group size that can be used to execute a kernel on a specific device [...]"
-        return returnValue(system()->getNumQPUs(), param_value_size, param_value, param_value_size_ret);
+        auto mergeFactor = std::max(info.workItemMergeFactor, uint8_t{1});
+        return returnValue(
+            system()->getNumQPUs() * mergeFactor, param_value_size, param_value, param_value_size_ret);
+    }
     case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
     {
         std::array tmp{
@@ -355,13 +356,20 @@ cl_int Kernel::getWorkGroupInfo(
         return returnValue(tmp.data(), sizeof(size_t), 3, param_value_size, param_value, param_value_size_ret);
     }
     case CL_KERNEL_LOCAL_MEM_SIZE:
-        // XXX can we get this somehow? Need to distinguish in global data block what is __global/__local/__private
-        // section
+        if(auto entry = findMetaData(info.metaData))
+            // TODO should also include the size of local parameters, as far as already set!
+            return returnValue(entry->getValue(), param_value_size,
+                param_value, param_value_size_ret);
         return returnValue(0, param_value_size, param_value, param_value_size_ret);
     case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
-        return returnValue(1, param_value_size, param_value, param_value_size_ret);
+        // TODO this has little effect (and is in fact wrong according to the OpenCL standard), if clients check the
+        // device's max work-group size (which is fixed to 12)...
+        return returnValue(info.workItemMergeFactor ? info.workItemMergeFactor : 1u, param_value_size,
+            param_value, param_value_size_ret);
     case CL_KERNEL_PRIVATE_MEM_SIZE:
-        // XXX same for local memory, could determine if type of global data section is known
+        if(auto entry = findMetaData(info.metaData))
+            return returnValue(entry->getValue(), param_value_size,
+                param_value, param_value_size_ret);
         return returnValue(0, param_value_size, param_value, param_value_size_ret);
     }
 
@@ -418,12 +426,12 @@ cl_int Kernel::getArgInfo(cl_uint arg_index, cl_kernel_arg_info param_name, size
  */
 static bool split_compile_work_size(const std::array& compile_group_sizes,
     const std::array& global_sizes,
-    std::array& local_sizes)
+    std::array& local_sizes, uint8_t mergeFactor)
 {
     if(compile_group_sizes[0] == 0 && compile_group_sizes[1] == 0 && compile_group_sizes[2] == 0)
         // no compile-time sizes set
         return false;
-    const cl_uint max_group_size = system()->getNumQPUs();
+    const cl_uint max_group_size = system()->getNumQPUs() * mergeFactor;
 
     if((global_sizes[0] % compile_group_sizes[0]) != 0 || (global_sizes[1] % compile_group_sizes[1]) != 0 ||
         (global_sizes[2] % compile_group_sizes[2]) != 0)
@@ -447,10 +455,10 @@ static bool split_compile_work_size(const std::array
 static cl_int split_global_work_size(const std::array& global_sizes,
-    std::array& local_sizes, cl_uint num_dimensions)
+    std::array& local_sizes, cl_uint num_dimensions, uint8_t mergeFactor)
 {
     const size_t total_sizes = global_sizes[0] * global_sizes[1] * global_sizes[2];
-    const cl_uint max_group_size = system()->getNumQPUs();
+    const cl_uint max_group_size = system()->getNumQPUs() * mergeFactor;
     if(total_sizes <= max_group_size)
     {
         // can be executed in a single work-group
@@ -464,7 +472,7 @@ static cl_int split_global_work_size(const std::array
- *   -> produces x³ work-groups
+ *   -> produces x^3 work-groups
  * - only works, if global[0,1,2] are all divisible by the same number
  */
 /*
@@ -544,6 +552,7 @@ cl_int Kernel::setWorkGroupSizes(CommandQueue* commandQueue, cl_uint work_dim, c
     else
         memcpy(work_offsets.data(), global_work_offset, work_dim * sizeof(size_t));
     memcpy(work_sizes.data(), global_work_size, work_dim * sizeof(size_t));
+    auto mergeFactor = std::max(info.workItemMergeFactor, uint8_t{1});
     // fill to 3 dimensions
     for(size_t i = work_dim; i < kernel_config::NUM_DIMENSIONS; ++i)
     {
@@ -556,9 +565,9 @@ cl_int Kernel::setWorkGroupSizes(CommandQueue* commandQueue, cl_uint work_dim, c
         //"local_work_size can also be a NULL value in which case the OpenCL implementation
         // will determine how to be break the global work-items into appropriate work-group instances."
        cl_int state = CL_SUCCESS;
-        if(!split_compile_work_size(info.workGroupSize, work_sizes, local_sizes))
+        if(!split_compile_work_size(info.workGroupSize, work_sizes, local_sizes, mergeFactor))
         {
-            state = split_global_work_size(work_sizes, local_sizes, work_dim);
+            state = split_global_work_size(work_sizes, local_sizes, work_dim, mergeFactor);
         }
 
         if(state != CL_SUCCESS)
@@ -599,10 +608,10 @@ cl_int Kernel::setWorkGroupSizes(CommandQueue* commandQueue, cl_uint work_dim, c
                 work_sizes[1] + work_offsets[1], kernel_config::MAX_WORK_ITEM_DIMENSIONS[1],
                 work_sizes[2] + work_offsets[2], kernel_config::MAX_WORK_ITEM_DIMENSIONS[2]));
     }
-    if(exceedsLimits(local_sizes[0] * local_sizes[1] * local_sizes[2], 0, system()->getNumQPUs()))
+    if(exceedsLimits(local_sizes[0] * local_sizes[1] * local_sizes[2], 0, system()->getNumQPUs() * mergeFactor))
         return returnError(CL_INVALID_WORK_GROUP_SIZE, __FILE__, __LINE__,
             buildString("Local work-sizes exceed maximum: %u * %u * %u > %u", local_sizes[0], local_sizes[1],
-                local_sizes[2], system()->getNumQPUs()));
+                local_sizes[2], system()->getNumQPUs() * mergeFactor));
 
     // check divisibility of local_sizes[i] by work_sizes[i]
     for(cl_uint i = 0; i < kernel_config::NUM_DIMENSIONS; ++i)
@@ -1541,6 +1550,7 @@ cl_kernel VC4CL_FUNC(clCloneKernel)(cl_kernel source_kernel, cl_int* errcode_ret
  * - CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size plus the corresponding value in
  *   global_work_offset for dimension exceeds the maximum value representable by size_t on the device associated with
  *   command_queue.
+ * - CL_INVALID_VALUE if suggested_local_work_size is NULL.
  * - CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL implementation on the
  *   device.
  * - CL_OUT_OF_HOST_RESOURCES if there is a failure to allocate resources required by the OpenCL implementation on the
@@ -1558,6 +1568,9 @@ cl_int VC4CL_FUNC(clGetKernelSuggestedLocalWorkSizeKHR)(cl_command_queue command
     CHECK_COMMAND_QUEUE(toType(command_queue))
     CHECK_KERNEL(toType(kernel))
 
+    if(suggested_local_work_size == nullptr)
+        return returnError(CL_INVALID_VALUE, __FILE__, __LINE__, "Local work size output parameter is not set!");
+
     std::array work_offsets{};
     std::array work_sizes{};
     std::array local_sizes{};
diff --git a/src/PerformanceCounter.cpp b/src/PerformanceCounter.cpp
index d90e9a3..e58d300 100644
--- a/src/PerformanceCounter.cpp
+++ b/src/PerformanceCounter.cpp
@@ -47,7 +47,7 @@ void PerformanceCounters::dumpCounters() const
     DEBUG_LOG(DebugLevel::PERFORMANCE_COUNTERS, std::cout << "Instruction count: " << numInstructions << std::endl)
     DEBUG_LOG(
         DebugLevel::PERFORMANCE_COUNTERS, std::cout << "Explicit uniform count: " << numExplicitUniforms << std::endl)
-    DEBUG_LOG(DebugLevel::PERFORMANCE_COUNTERS, std::cout << "QPUs used: " << workGroupSize << std::endl)
+    DEBUG_LOG(DebugLevel::PERFORMANCE_COUNTERS, std::cout << "QPUs used: " << numQPUs << std::endl)
     DEBUG_LOG(DebugLevel::PERFORMANCE_COUNTERS, std::cout << "Kernel repetition count: " << numWorkGroups << std::endl)
     for(const auto& counter : PERFORMANCE_COUNTERS)
     {
@@ -102,7 +102,7 @@ cl_int PerformanceCounters::getCounterValue(
 }
 
 PerformanceCollector::PerformanceCollector(
-    PerformanceCounters& counters, const KernelHeader& kernel, size_t localWorkSize, size_t numGroups) :
+    PerformanceCounters& counters, const KernelHeader& kernel, size_t numQPUs, size_t numGroups) :
     counters(counters)
 {
     // set-up and clear the performance counters
     counters.numInstructions = kernel.getLength();
     counters.numExplicitUniforms = static_cast(kernel.getExplicitUniformCount());
     counters.numWorkGroups = numGroups;
-    counters.workGroupSize = localWorkSize;
+    counters.numQPUs = numQPUs;
     for(uint8_t i = 0; i < PERFORMANCE_COUNTERS.size(); ++i)
     {
         if(!v3d->setCounter(i, PERFORMANCE_COUNTERS[i].first))
diff --git a/src/PerformanceCounter.h b/src/PerformanceCounter.h
index 5e39303..0ac78b6 100644
--- a/src/PerformanceCounter.h
+++ b/src/PerformanceCounter.h
@@ -32,7 +32,7 @@ namespace vc4cl
         size_t numInstructions;
         size_t numExplicitUniforms;
         size_t numWorkGroups;
-        size_t workGroupSize;
+        size_t numQPUs;
         std::chrono::microseconds elapsedTime;
 
         void dumpCounters() const;
@@ -48,7 +48,7 @@ namespace vc4cl
     {
     public:
         PerformanceCollector(
-            PerformanceCounters& counters, const KernelHeader& kernel, size_t localWorkSize, size_t numGroups);
+            PerformanceCounters& counters, const KernelHeader& kernel, size_t numQPUs, size_t numGroups);
         PerformanceCollector(const PerformanceCollector&) = delete;
         PerformanceCollector(PerformanceCollector&&) noexcept = delete;
         ~PerformanceCollector() noexcept;
diff --git a/src/executor.cpp b/src/executor.cpp
index 6f368eb..53ec76c 100644
--- a/src/executor.cpp
+++ b/src/executor.cpp
@@ -59,7 +59,7 @@ static unsigned* set_work_item_info(unsigned* ptr, cl_uint num_dimensions,
     const std::array& local_sizes, const std::array& group_indices,
     const std::array& local_indices, unsigned global_data,
-    unsigned uniformAddress, const KernelUniforms& uniformsUsed)
+    unsigned uniformAddress, const KernelUniforms& uniformsUsed, uint8_t workItemMergeFactor)
 {
     DEBUG_LOG(DebugLevel::KERNEL_EXECUTION, {
         std::cout << "Setting work-item infos:" << std::endl;
                   << global_sizes[0] << "), " << group_indices[1] * local_sizes[1] + local_indices[1] << "("
                   << global_sizes[1] << "), " << group_indices[2] * local_sizes[2] + local_indices[2] << "("
                   << global_sizes[2] << ")" << std::endl;
-        std::cout << "\tLocal IDs (sizes): " << local_indices[0] << "(" << local_sizes[0] << "), " << local_indices[1]
-                  << "(" << local_sizes[1] << "), " << local_indices[2] << "(" << local_sizes[2] << ")" << std::endl;
+        if (workItemMergeFactor > 1)
+            std::cout << "\tLocal IDs (sizes): " << (local_indices[0] * workItemMergeFactor) << "-"
+                      << std::min((local_indices[0] + 1) * workItemMergeFactor, local_sizes[0]) << "(" << local_sizes[0] << "), "
+                      << local_indices[1] << "(" << local_sizes[1] << "), " << local_indices[2] << "(" << local_sizes[2] << ")" << std::endl;
+        else
+            std::cout << "\tLocal IDs (sizes): " << local_indices[0] << "(" << local_sizes[0] << "), " << local_indices[1]
+                      << "(" << local_sizes[1] << "), " << local_indices[2] << "(" << local_sizes[2] << ")" << std::endl;
         std::cout << "\tGroup IDs (sizes): " << group_indices[0] << "(" << (global_sizes[0] / local_sizes[0]) << "), "
                   << group_indices[1] << "(" << (global_sizes[1] / local_sizes[1]) << "), " << group_indices[2] << "("
                   << (global_sizes[2] / local_sizes[2]) << ")" << std::endl;
@@ -85,7 +90,7 @@ static unsigned* set_work_item_info(unsigned* ptr, cl_uint num_dimensions,
             local_sizes[2] << 16 | local_sizes[1] << 8 | local_sizes[0]); /* get_local_size(dim) */
     if(uniformsUsed.getLocalIDsUsed())
         *ptr++ = static_cast(
-            local_indices[2] << 16 | local_indices[1] << 8 | local_indices[0]); /* get_local_id(dim) */
+            local_indices[2] << 16 | local_indices[1] << 8 | (local_indices[0] * workItemMergeFactor)); /* get_local_id(dim) */
     if(uniformsUsed.getNumGroupsXUsed())
         *ptr++ = static_cast(global_sizes[0] / local_sizes[0]); /* get_num_groups(0) */
     if(uniformsUsed.getNumGroupsYUsed())
@@ -225,11 +230,13 @@ cl_int executeKernel(KernelExecution& args)
     CHECK_KERNEL(kernel)
 
     // the number of QPUs is the product of all local sizes
-    const size_t num_qpus = args.localSizes[0] * args.localSizes[1] * args.localSizes[2];
-    if(num_qpus > args.system->getNumQPUs())
+    auto mergeFactor = std::max(kernel->info.workItemMergeFactor, uint8_t{1});
+    size_t localSize = args.localSizes[0] * args.localSizes[1] * args.localSizes[2];
+    size_t numQPUs = (localSize / mergeFactor) + (localSize % mergeFactor != 0);
+    if(numQPUs > args.system->getNumQPUs())
         return CL_INVALID_GLOBAL_WORK_SIZE;
 
-    if(num_qpus == 0)
+    if(numQPUs == 0)
         // OpenCL 3.0 requires that we allow to enqueue a kernel without any executions for some reason
         return CL_COMPLETE;
 
@@ -250,9 +257,9 @@ cl_int executeKernel(KernelExecution& args)
         std::cout << "Running kernel '" << kernel->info.name << "' with " << kernel->info.getLength()
                   << " instructions..." << std::endl;
         std::cout << "Local sizes: " << args.localSizes[0] << " " << args.localSizes[1] << " " << args.localSizes[2]
-                  << " -> " << num_qpus << " QPUs" << std::endl;
+                  << " and merge-factor " << static_cast(mergeFactor) << " -> " << numQPUs << " QPUs" << std::endl;
         std::cout << "Global sizes: " << args.globalSizes[0] << " " << args.globalSizes[1] << " " << args.globalSizes[2]
-                  << " -> " << (args.globalSizes[0] * args.globalSizes[1] * args.globalSizes[2]) / num_qpus
+                  << " -> " << (args.globalSizes[0] * args.globalSizes[1] * args.globalSizes[2]) / localSize
                   << " work-groups (" << (isWorkGroupLoopEnabled ? "all at once" : "separate") << ")" << std::endl;
     })
 
@@ -260,7 +267,7 @@ cl_int executeKernel(KernelExecution& args)
     // ALLOCATE BUFFER //
     size_t buffer_size = get_size(args.system->getNumQPUs(), kernel->info.getLength() * sizeof(uint64_t),
-        num_qpus * (MAX_HIDDEN_PARAMETERS + kernel->info.getExplicitUniformCount()),
+        numQPUs * (MAX_HIDDEN_PARAMETERS + kernel->info.getExplicitUniformCount()),
         kernel->program->globalData.size() * sizeof(uint64_t), kernel->program->moduleInfo.getStackFrameSize());
 
     std::unique_ptr buffer(
@@ -323,11 +330,11 @@ cl_int executeKernel(KernelExecution& args)
     std::array, 2> uniformPointers;
     // Build Uniforms
     const unsigned* qpu_uniform_0 = p;
-    for(unsigned i = 0; i < num_qpus; ++i)
+    for(unsigned i = 0; i < numQPUs; ++i)
     {
         uniformPointers[0][i] = p;
         p = set_work_item_info(p, args.numDimensions, args.globalOffsets, args.globalSizes, args.localSizes,
-            group_indices, local_indices, global_data, AS_GPU_ADDRESS(p, buffer.get()), kernel->info.uniformsUsed);
+            group_indices, local_indices, global_data, AS_GPU_ADDRESS(p, buffer.get()), kernel->info.uniformsUsed, mergeFactor);
         for(unsigned u = 0; u < kernel->info.parameters.size(); ++u)
         {
             auto tmpBufferIt = args.tmpBuffers.find(u);
@@ -410,20 +417,20 @@ cl_int executeKernel(KernelExecution& args)
         // the UNIFORMs of the second block are exactly the size of the first block after the corresponding UNIFORMs
         // of the first block
-        for(unsigned i = 0; i < num_qpus; ++i)
+        for(unsigned i = 0; i < numQPUs; ++i)
             uniformPointers[1][i] = uniformPointers[0][i] + uniformSize;
     }
 
     /* Build QPU Launch messages */
     auto uniformsPerQPU = kernel->info.uniformsUsed.countUniforms() + kernel->info.getExplicitUniformCount();
     unsigned* qpu_msg_0 = p;
-    for(unsigned i = 0; i < num_qpus; ++i)
+    for(unsigned i = 0; i < numQPUs; ++i)
     {
         *p++ = AS_GPU_ADDRESS(qpu_uniform_0 + i * uniformsPerQPU, buffer.get());
         *p++ = AS_GPU_ADDRESS(qpu_code, buffer.get());
     }
     unsigned* qpu_msg_1 = p;
-    for(unsigned i = 0; i < num_qpus; ++i)
+    for(unsigned i = 0; i < numQPUs; ++i)
     {
         *p++ = AS_GPU_ADDRESS(qpu_uniform_1 + i * uniformsPerQPU, buffer.get());
         *p++ = AS_GPU_ADDRESS(qpu_code, buffer.get());
     }
@@ -461,11 +468,11 @@ cl_int executeKernel(KernelExecution& args)
     // object lifetime
     std::unique_ptr perfCollector;
     if(args.performanceCounters)
-        perfCollector.reset(new PerformanceCollector(*args.performanceCounters, args.kernel->info, num_qpus,
+        perfCollector.reset(new PerformanceCollector(*args.performanceCounters, args.kernel->info, numQPUs,
             group_limits[0] * group_limits[1] * group_limits[2]));
     // on first execution, flush code cache
     auto start = std::chrono::high_resolution_clock::now();
-    auto result = args.system->executeQPU(static_cast(num_qpus),
+    auto result = args.system->executeQPU(static_cast(numQPUs),
         std::make_pair(qpu_msg_current, AS_GPU_ADDRESS(qpu_msg_current, buffer.get())), true, timeout);
     DEBUG_LOG(DebugLevel::KERNEL_EXECUTION, {
         // NOTE: This disables background-execution!
@@ -482,11 +489,11 @@ cl_int executeKernel(KernelExecution& args)
         std::swap(uniformPointers_current, uniformPointers_next);
         local_indices[0] = local_indices[1] = local_indices[2] = 0;
         // re-set indices and offsets for all QPUs
-        for(cl_uint i = 0; i < num_qpus; ++i)
+        for(cl_uint i = 0; i < numQPUs; ++i)
         {
             set_work_item_info((*uniformPointers_current)[i], args.numDimensions, args.globalOffsets,
                 args.globalSizes, args.localSizes, group_indices, local_indices, global_data,
-                AS_GPU_ADDRESS((*uniformPointers_current)[i], buffer.get()), kernel->info.uniformsUsed);
+                AS_GPU_ADDRESS((*uniformPointers_current)[i], buffer.get()), kernel->info.uniformsUsed, mergeFactor);
             increment_index(local_indices, args.localSizes, 1);
         }
 
@@ -498,7 +505,7 @@ cl_int executeKernel(KernelExecution& args)
             std::cout << "Running work-group " << group_indices[0] << ", " << group_indices[1] << ", "
                       << group_indices[2] << std::endl)
         // all following executions, don't flush cache
-        result = args.system->executeQPU(static_cast(num_qpus),
+        result = args.system->executeQPU(static_cast(numQPUs),
             std::make_pair(qpu_msg_current, AS_GPU_ADDRESS(qpu_msg_current, buffer.get())), false, timeout);
         // NOTE: This disables background-execution!
         DEBUG_LOG(DebugLevel::KERNEL_EXECUTION,
diff --git a/src/shared/BinaryHeader.cpp b/src/shared/BinaryHeader.cpp
index 48e7fe2..c6d6299 100644
--- a/src/shared/BinaryHeader.cpp
+++ b/src/shared/BinaryHeader.cpp
@@ -10,6 +10,7 @@
 
 #include
 #include
+#include
 
 #ifdef VC4CL_BITFIELD
 #include "../Program.h"
@@ -104,6 +105,12 @@ std::string MetaData::to_string(bool withQuotes) const
             std::to_string(sizes[2]) + ")";
         break;
     }
+    case Type::KERNEL_LOCAL_MEMORY_SIZE:
+        tmp = "local_memory_size(" + std::to_string(getInt()) + ")";
+        break;
+    case Type::KERNEL_PRIVATE_MEMORY_SIZE:
+        tmp = "private_memory_size(" + std::to_string(getInt()) + ")";
+        break;
     }
     return withQuotes ? "\"" + tmp + "\"" : tmp;
 }
@@ -115,6 +122,8 @@ void MetaData::toBinaryData(std::vector& data) const
 
 MetaData MetaData::fromBinaryData(const std::vector& data, std::size_t& dataIndex)
 {
+    if(dataIndex >= data.size())
+        throw std::invalid_argument{"Binary data is too short, does not contain (further) metadata information!"};
     MetaData metaData;
     auto numBytes = data[dataIndex] & 0xFFFFU;
     metaData.payload = readByteContainer>(data, dataIndex, static_cast(numBytes));
@@ -175,6 +184,25 @@ void MetaData::setSizes(Type type, const std::array& sizes)
     payload[15] = static_cast((sizes[2] >> 24u) & 0xFF);
 }
 
+uint32_t MetaData::getInt() const
+{
+    return static_cast(payload[4]) | (static_cast(payload[5]) << 8u) |
+        (static_cast(payload[6]) << 16u) | (static_cast(payload[7]) << 24u);
+}
+
+void MetaData::setInt(Type type, uint32_t val)
+{
+    payload.resize(8);
+    payload[0] = 8; // lower size
+    payload[1] = 0; // upper size
+    payload[2] = static_cast(type);
+    payload[3] = 0; // padding
+    payload[4] = static_cast(val & 0xFF);
+    payload[5] = static_cast((val >> 8u) & 0xFF);
+    payload[6] = static_cast((val >> 16u) & 0xFF);
+    payload[7] = static_cast((val >> 24u) & 0xFF);
+}
+
 LCOV_EXCL_START
 std::string ParamHeader::to_string() const
 {
@@ -204,6 +232,9 @@ void ParamHeader::toBinaryData(std::vector& data) const
 
 ParamHeader ParamHeader::fromBinaryData(const std::vector& data, std::size_t& dataIndex)
 {
+    if(dataIndex >= data.size() || (data.size() - dataIndex) < 2)
+        throw std::invalid_argument{"Binary data is too short, does not contain (further) parameter information!"};
+
     ParamHeader param{data[dataIndex]};
     ++dataIndex;
     param.name = readString(data, dataIndex, param.getNameLength());
@@ -301,6 +332,8 @@ void KernelHeader::toBinaryData(std::vector& data) const
 
 KernelHeader KernelHeader::fromBinaryData(const std::vector& data, std::size_t& dataIndex)
 {
+    if(dataIndex >= data.size() || (data.size() - dataIndex) < 4)
+        throw std::invalid_argument{"Binary data is too short, does not contain (further) kernel information!"};
     KernelHeader kernel{4};
     kernel.value = data[dataIndex];
     ++dataIndex;
@@ -361,6 +394,8 @@ std::vector ModuleHeader::toBinaryData(const std::vector& gl
 
 ModuleHeader ModuleHeader::fromBinaryData(const std::vector& data)
 {
+    if(data.size() < 2)
+        throw std::invalid_argument{"Binary data is too short, does not contain module information!"};
     std::size_t dataIndex = 1; // skip magic number
     ModuleHeader module{data[dataIndex]};
     ++dataIndex;
diff --git a/src/shared/BinaryHeader.h b/src/shared/BinaryHeader.h
index 8173e3b..eb12def 100644
--- a/src/shared/BinaryHeader.h
+++ b/src/shared/BinaryHeader.h
@@ -26,6 +26,8 @@ namespace vc4c
     /**
      * Additional metadata to be stored (e.g. OpenCL C attributes).
      *
+     * NOTE: Metadata must not have any semantic meaning affecting the kernel execution itself!
+     *
      * Binary layout (64-bit rows, item lengths not to scale):
      *
      * | number of bytes | type | data bytes ...                                                                  |
      */
     class MetaData
     {
     public:
-        enum class Type : uint8_t
+        enum Type : uint8_t
         {
             KERNEL_WORK_GROUP_SIZE,
             KERNEL_WORK_GROUP_SIZE_HINT,
             KERNEL_VECTOR_TYPE_HINT,
+            KERNEL_LOCAL_MEMORY_SIZE,
+            KERNEL_PRIVATE_MEMORY_SIZE
         };
 
         template
@@ -48,6 +52,14 @@ namespace vc4c
             setSizes(T, value);
         }
 
+        template
+        std::enable_if_t>
+        getValue() const
+        {
+            return getSizes();
+        }
+
         template
         void setValue(const std::string& value)
         {
@@ -55,6 +67,26 @@ namespace vc4c
             setString(T, value);
         }
 
+        template
+        std::enable_if_t getValue() const
+        {
+            return getString();
+        }
+
+        template
+        void setValue(uint32_t value)
+        {
+            static_assert(T == Type::KERNEL_LOCAL_MEMORY_SIZE || T == Type::KERNEL_PRIVATE_MEMORY_SIZE, "");
+            setInt(T, value);
+        }
+
+        template
+        std::enable_if_t
+        getValue() const
+        {
+            return getInt();
+        }
+
         Type getType() const;
 
         std::string to_string(bool withQuotes = true) const;
@@ -69,6 +101,8 @@ namespace vc4c
         void setString(Type type, const std::string& text);
         std::array getSizes() const;
         void setSizes(Type type, const std::array& sizes);
+        uint32_t getInt() const;
+        void setInt(Type type, uint32_t val);
     };
 
     /*
@@ -397,6 +431,17 @@ namespace vc4c
 
         static ModuleHeader fromBinaryData(const std::vector& data);
     };
 
+    template
+    const MetaData* findMetaData(const std::vector& list)
+    {
+        for(const auto& entry : list)
+        {
+            if(entry.getType() == T)
+                return &entry;
+        }
+        return nullptr;
+    }
+
 } // namespace vc4c
 
 #endif /* VC4CL_SHARED_BINARY_HEADER_H */
diff --git a/test/TestExecutions.cpp b/test/TestExecutions.cpp
index 07c85f1..2be215d 100644
--- a/test/TestExecutions.cpp
+++ b/test/TestExecutions.cpp
@@ -150,7 +150,8 @@ struct ExecutionRunner final : public test_data::TestRunner
     }
     ~ExecutionRunner() noexcept override;
 
-    test_data::Result compile(const std::string& sourceCode, const std::string& options) override
+    test_data::Result compile(
+        const std::string& sourceCode, const std::string& options, const std::string& name) override
     {
         auto it = compilationCache.find(sourceCode + options);
         if(it != compilationCache.end())
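
Notes following the patch. These are minimal host-side sketches, not part of the diff above; function and variable names, the 12-QPU count of the VideoCore IV and the merge factor of 4 used in the comments are illustrative assumptions.

The merge factor becomes visible to applications through the standard work-group queries: with the Kernel.cpp changes, CL_KERNEL_WORK_GROUP_SIZE reports numQPUs * mergeFactor and CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE reports the merge factor itself (or 1 for unmerged kernels):

    // Hedged illustration only: queries a kernel's work-group limits via the standard OpenCL 1.2 API.
    #include <CL/cl.h>
    #include <cstdio>

    void printKernelWorkGroupLimits(cl_kernel kernel, cl_device_id device)
    {
        size_t maxSize = 0;
        size_t preferredMultiple = 0;
        clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxSize), &maxSize, nullptr);
        clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
            sizeof(preferredMultiple), &preferredMultiple, nullptr);
        // With a merge factor of 4: maxSize == 48 (12 QPUs * 4) and preferredMultiple == 4.
        // Without merging (factor 0 or 1): maxSize == 12 and preferredMultiple == 1.
        std::printf("max work-group size: %zu, preferred multiple: %zu\n", maxSize, preferredMultiple);
    }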
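
Another minimal sketch, assuming the 12 QPUs reported by getNumQPUs(): it restates the arithmetic the splitting helpers and the executor use, namely that the local-size limit scales with the merge factor and that the QPU count is the local size divided by the factor, rounded up.

    #include <cstddef>
    #include <cstdint>

    constexpr std::size_t NUM_QPUS = 12; // assumption: value of System::getNumQPUs() on the VideoCore IV

    constexpr std::size_t maxLocalSize(std::uint8_t mergeFactor)
    {
        // a factor of 0 marks "not merged" and behaves like a factor of 1
        return NUM_QPUS * (mergeFactor ? mergeFactor : 1u);
    }

    constexpr std::size_t requiredQPUs(std::size_t localSize, std::uint8_t mergeFactor)
    {
        // rounding-up division, as in executor.cpp: localSize / factor, plus one if there is a remainder
        return (localSize + (mergeFactor ? mergeFactor : 1u) - 1u) / (mergeFactor ? mergeFactor : 1u);
    }

    static_assert(maxLocalSize(4) == 48, "12 QPUs * merge factor 4");
    static_assert(requiredQPUs(48, 4) == 12, "a fully merged 48-item group still fits on 12 QPUs");
    static_assert(requiredQPUs(13, 4) == 4, "13 work-items need 4 QPUs when 4 items share one QPU");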
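
A standalone sketch of the metadata payload layout written by MetaData::setInt() and read by MetaData::getInt() in BinaryHeader.cpp (8 bytes: 16-bit entry size, 8-bit type such as KERNEL_LOCAL_MEMORY_SIZE, one padding byte, then the 32-bit value in little-endian order). The free functions below are illustrative, not the class from the header.

    #include <array>
    #include <cstdint>

    std::array<uint8_t, 8> encodeIntMetaData(uint8_t type, uint32_t value)
    {
        std::array<uint8_t, 8> payload{};
        payload[0] = 8;                                   // lower byte of the entry size
        payload[1] = 0;                                   // upper byte of the entry size
        payload[2] = type;                                // e.g. KERNEL_LOCAL_MEMORY_SIZE
        payload[3] = 0;                                   // padding
        payload[4] = static_cast<uint8_t>(value & 0xFF);  // 32-bit value, little endian
        payload[5] = static_cast<uint8_t>((value >> 8u) & 0xFF);
        payload[6] = static_cast<uint8_t>((value >> 16u) & 0xFF);
        payload[7] = static_cast<uint8_t>((value >> 24u) & 0xFF);
        return payload;
    }

    uint32_t decodeIntMetaData(const std::array<uint8_t, 8>& payload)
    {
        // mirrors MetaData::getInt()
        return static_cast<uint32_t>(payload[4]) | (static_cast<uint32_t>(payload[5]) << 8u) |
            (static_cast<uint32_t>(payload[6]) << 16u) | (static_cast<uint32_t>(payload[7]) << 24u);
    }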
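
Finally, a hedged usage sketch for the cl_khr_suggested_local_work_size entry point whose error handling the patch tightens: passing a null output pointer is now rejected with CL_INVALID_VALUE. The function-pointer type below is written out by hand following the extension specification; whether the extension is exposed at all depends on the platform.

    #include <CL/cl.h>

    typedef cl_int(CL_API_CALL* GetSuggestedLocalWorkSizeFn)(
        cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, size_t*);

    cl_int querySuggestedLocalSize(cl_platform_id platform, cl_command_queue queue, cl_kernel kernel)
    {
        auto func = reinterpret_cast<GetSuggestedLocalWorkSizeFn>(
            clGetExtensionFunctionAddressForPlatform(platform, "clGetKernelSuggestedLocalWorkSizeKHR"));
        if(!func)
            return CL_INVALID_OPERATION; // extension entry point not exposed by this platform

        const size_t globalSize[3] = {4096, 1, 1};
        size_t suggestedLocal[3] = {0, 0, 0};
        // passing nullptr as the last argument would now return CL_INVALID_VALUE
        return func(queue, kernel, 1, nullptr, globalSize, suggestedLocal);
    }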