From 22941ce426b992434b0e45859abb074a28c8c2b2 Mon Sep 17 00:00:00 2001
From: doe300
Date: Sat, 18 Dec 2021 09:29:15 +0100
Subject: [PATCH] Add support for work-group merge factor and some smaller
 fixes

- Adds the host-side support for a WIP compiler optimization
- Modifies behavior of clSetKernelArg according to updated specification
---
 src/Kernel.cpp              | 51 +++++++++++++++++++++++--------------
 src/PerformanceCounter.cpp  |  6 ++---
 src/PerformanceCounter.h    |  4 +--
 src/executor.cpp            | 47 +++++++++++++++++++---------------
 src/shared/BinaryHeader.cpp | 35 +++++++++++++++++++++++++
 src/shared/BinaryHeader.h   | 47 +++++++++++++++++++++++++++++++++-
 test/TestExecutions.cpp     |  3 ++-
 7 files changed, 147 insertions(+), 46 deletions(-)

diff --git a/src/Kernel.cpp b/src/Kernel.cpp
index 5c0cec7..be42eb6 100644
--- a/src/Kernel.cpp
+++ b/src/Kernel.cpp
@@ -126,9 +126,6 @@ cl_int Kernel::setArg(cl_uint arg_index, size_t arg_size, const void* arg_value)
             buildString("Invalid arg index: %d of %d", arg_index, info.parameters.size()));
     }
 
-    // clear previous set parameter value
-    args[arg_index].reset();
-
     const auto& paramInfo = info.parameters[arg_index];
     if(!paramInfo.getPointer() || paramInfo.getByValue())
     {
@@ -171,8 +168,7 @@ cl_int Kernel::setArg(cl_uint arg_index, size_t arg_size, const void* arg_value)
             // elements) is passed in
             elementSize = arg_size / 4;
         }
-        ScalarArgument* scalarArg = new ScalarArgument(paramInfo.getVectorElements());
-        args[arg_index].reset(scalarArg);
+        auto scalarArg = std::make_unique<ScalarArgument>(paramInfo.getVectorElements());
         for(cl_uchar i = 0; i < paramInfo.getVectorElements(); ++i)
         {
             // arguments are all 32-bit, since UNIFORMS are always 32-bit
@@ -219,6 +215,7 @@ cl_int Kernel::setArg(cl_uint arg_index, size_t arg_size, const void* arg_value)
                 scalarArg->addScalar(static_cast(arg_value)[i]);
             }
         }
+        args[arg_index] = std::move(scalarArg);
         DEBUG_LOG(DebugLevel::KERNEL_EXECUTION,
             std::cout << "Setting kernel-argument " << arg_index << " to scalar "
                       << args[arg_index]->to_string() << std::endl)
@@ -346,8 +343,12 @@ cl_int Kernel::getWorkGroupInfo(
         // not a built-in kernel."
         return CL_INVALID_VALUE;
     case CL_KERNEL_WORK_GROUP_SIZE:
+    {
         //"[...] query the maximum work-group size that can be used to execute a kernel on a specific device [...]"
-        return returnValue(system()->getNumQPUs(), param_value_size, param_value, param_value_size_ret);
+        auto mergeFactor = std::max(info.workItemMergeFactor, uint8_t{1});
+        return returnValue(
+            system()->getNumQPUs() * mergeFactor, param_value_size, param_value, param_value_size_ret);
+    }
     case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
     {
         std::array tmp{
@@ -355,13 +356,20 @@ cl_int Kernel::getWorkGroupInfo(
         return returnValue(tmp.data(), sizeof(size_t), 3, param_value_size, param_value, param_value_size_ret);
     }
     case CL_KERNEL_LOCAL_MEM_SIZE:
-        // XXX can we get this somehow? Need to distinguish in global data block what is __global/__local/__private
-        // section
+        if(auto entry = findMetaData(info.metaData))
+            // TODO should also include the size of local parameters, as far as already set!
+            return returnValue(entry->getValue(), param_value_size,
+                param_value, param_value_size_ret);
         return returnValue(0, param_value_size, param_value, param_value_size_ret);
     case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
-        return returnValue(1, param_value_size, param_value, param_value_size_ret);
+        // TODO this has little effect (and is in fact wrong according to the OpenCL standard), if clients check the
+        // device's max work-group size (which is fixed to 12)...
+        return returnValue(info.workItemMergeFactor ? info.workItemMergeFactor : 1u, param_value_size,
+            param_value, param_value_size_ret);
     case CL_KERNEL_PRIVATE_MEM_SIZE:
-        // XXX same for local memory, could determine if type of global data section is known
+        if(auto entry = findMetaData(info.metaData))
+            return returnValue(entry->getValue(), param_value_size,
+                param_value, param_value_size_ret);
         return returnValue(0, param_value_size, param_value, param_value_size_ret);
     }
 
@@ -418,12 +426,12 @@ cl_int Kernel::getArgInfo(cl_uint arg_index, cl_kernel_arg_info param_name, size
  */
 static bool split_compile_work_size(const std::array& compile_group_sizes,
     const std::array& global_sizes,
-    std::array& local_sizes)
+    std::array& local_sizes, uint8_t mergeFactor)
 {
     if(compile_group_sizes[0] == 0 && compile_group_sizes[1] == 0 && compile_group_sizes[2] == 0)
         // no compile-time sizes set
         return false;
-    const cl_uint max_group_size = system()->getNumQPUs();
+    const cl_uint max_group_size = system()->getNumQPUs() * mergeFactor;
 
     if((global_sizes[0] % compile_group_sizes[0]) != 0 || (global_sizes[1] % compile_group_sizes[1]) != 0 ||
         (global_sizes[2] % compile_group_sizes[2]) != 0)
@@ -447,10 +455,10 @@ static bool split_compile_work_size(const std::array
 static cl_int split_global_work_size(const std::array& global_sizes,
-    std::array& local_sizes, cl_uint num_dimensions)
+    std::array& local_sizes, cl_uint num_dimensions, uint8_t mergeFactor)
 {
     const size_t total_sizes = global_sizes[0] * global_sizes[1] * global_sizes[2];
-    const cl_uint max_group_size = system()->getNumQPUs();
+    const cl_uint max_group_size = system()->getNumQPUs() * mergeFactor;
     if(total_sizes <= max_group_size)
     {
         // can be executed in a single work-group
@@ -464,7 +472,7 @@ static cl_int split_global_work_size(const std::array
- *   -> produces x³ work-groups
+ *   -> produces x^3 work-groups
  * - only works, if global[0,1,2] are all divisible by the same number
  */
 /*
@@ -544,6 +552,7 @@ cl_int Kernel::setWorkGroupSizes(CommandQueue* commandQueue, cl_uint work_dim, c
     else
         memcpy(work_offsets.data(), global_work_offset, work_dim * sizeof(size_t));
     memcpy(work_sizes.data(), global_work_size, work_dim * sizeof(size_t));
+    auto mergeFactor = std::max(info.workItemMergeFactor, uint8_t{1});
     // fill to 3 dimensions
     for(size_t i = work_dim; i < kernel_config::NUM_DIMENSIONS; ++i)
     {
@@ -556,9 +565,9 @@ cl_int Kernel::setWorkGroupSizes(CommandQueue* commandQueue, cl_uint work_dim, c
         //"local_work_size can also be a NULL value in which case the OpenCL implementation
         // will determine how to be break the global work-items into appropriate work-group instances."
        cl_int state = CL_SUCCESS;
-        if(!split_compile_work_size(info.workGroupSize, work_sizes, local_sizes))
+        if(!split_compile_work_size(info.workGroupSize, work_sizes, local_sizes, mergeFactor))
         {
-            state = split_global_work_size(work_sizes, local_sizes, work_dim);
+            state = split_global_work_size(work_sizes, local_sizes, work_dim, mergeFactor);
         }
 
         if(state != CL_SUCCESS)
@@ -599,10 +608,10 @@ cl_int Kernel::setWorkGroupSizes(CommandQueue* commandQueue, cl_uint work_dim, c
                 work_sizes[1] + work_offsets[1], kernel_config::MAX_WORK_ITEM_DIMENSIONS[1],
                 work_sizes[2] + work_offsets[2], kernel_config::MAX_WORK_ITEM_DIMENSIONS[2]));
     }
-    if(exceedsLimits(local_sizes[0] * local_sizes[1] * local_sizes[2], 0, system()->getNumQPUs()))
+    if(exceedsLimits(local_sizes[0] * local_sizes[1] * local_sizes[2], 0, system()->getNumQPUs() * mergeFactor))
         return returnError(CL_INVALID_WORK_GROUP_SIZE, __FILE__, __LINE__,
             buildString("Local work-sizes exceed maximum: %u * %u * %u > %u", local_sizes[0], local_sizes[1],
-                local_sizes[2], system()->getNumQPUs()));
+                local_sizes[2], system()->getNumQPUs() * mergeFactor));
 
     // check divisibility of local_sizes[i] by work_sizes[i]
     for(cl_uint i = 0; i < kernel_config::NUM_DIMENSIONS; ++i)
@@ -1541,6 +1550,7 @@ cl_kernel VC4CL_FUNC(clCloneKernel)(cl_kernel source_kernel, cl_int* errcode_ret
  * - CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size plus the corresponding value in
  *   global_work_offset for dimension exceeds the maximum value representable by size_t on the device associated with
  *   command_queue.
+ * - CL_INVALID_VALUE if suggested_local_work_size is NULL.
  * - CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL implementation on the
  *   device.
  * - CL_OUT_OF_HOST_RESOURCES if there is a failure to allocate resources required by the OpenCL implementation on the
@@ -1558,6 +1568,9 @@ cl_int VC4CL_FUNC(clGetKernelSuggestedLocalWorkSizeKHR)(cl_command_queue command
     CHECK_COMMAND_QUEUE(toType(command_queue))
     CHECK_KERNEL(toType(kernel))
 
+    if(suggested_local_work_size == nullptr)
+        return returnError(CL_INVALID_VALUE, __FILE__, __LINE__, "Local work size output parameter is not set!");
+
     std::array work_offsets{};
     std::array work_sizes{};
     std::array local_sizes{};
diff --git a/src/PerformanceCounter.cpp b/src/PerformanceCounter.cpp
index d90e9a3..e58d300 100644
--- a/src/PerformanceCounter.cpp
+++ b/src/PerformanceCounter.cpp
@@ -47,7 +47,7 @@ void PerformanceCounters::dumpCounters() const
     DEBUG_LOG(DebugLevel::PERFORMANCE_COUNTERS, std::cout << "Instruction count: " << numInstructions << std::endl)
     DEBUG_LOG(
         DebugLevel::PERFORMANCE_COUNTERS, std::cout << "Explicit uniform count: " << numExplicitUniforms << std::endl)
-    DEBUG_LOG(DebugLevel::PERFORMANCE_COUNTERS, std::cout << "QPUs used: " << workGroupSize << std::endl)
+    DEBUG_LOG(DebugLevel::PERFORMANCE_COUNTERS, std::cout << "QPUs used: " << numQPUs << std::endl)
     DEBUG_LOG(DebugLevel::PERFORMANCE_COUNTERS, std::cout << "Kernel repetition count: " << numWorkGroups << std::endl)
     for(const auto& counter : PERFORMANCE_COUNTERS)
     {
@@ -102,7 +102,7 @@ cl_int PerformanceCounters::getCounterValue(
 }
 
 PerformanceCollector::PerformanceCollector(
-    PerformanceCounters& counters, const KernelHeader& kernel, size_t localWorkSize, size_t numGroups) :
+    PerformanceCounters& counters, const KernelHeader& kernel, size_t numQPUs, size_t numGroups) :
     counters(counters)
 {
     // set-up and clear the performance counters
     counters.numInstructions = kernel.getLength();
     counters.numExplicitUniforms = static_cast(kernel.getExplicitUniformCount());
     counters.numWorkGroups = numGroups;
-    counters.workGroupSize = localWorkSize;
+    counters.numQPUs = numQPUs;
     for(uint8_t i = 0; i < PERFORMANCE_COUNTERS.size(); ++i)
     {
         if(!v3d->setCounter(i, PERFORMANCE_COUNTERS[i].first))
diff --git a/src/PerformanceCounter.h b/src/PerformanceCounter.h
index 5e39303..0ac78b6 100644
--- a/src/PerformanceCounter.h
+++ b/src/PerformanceCounter.h
@@ -32,7 +32,7 @@ namespace vc4cl
         size_t numInstructions;
         size_t numExplicitUniforms;
         size_t numWorkGroups;
-        size_t workGroupSize;
+        size_t numQPUs;
         std::chrono::microseconds elapsedTime;
 
         void dumpCounters() const;
@@ -48,7 +48,7 @@ namespace vc4cl
     {
     public:
         PerformanceCollector(
-            PerformanceCounters& counters, const KernelHeader& kernel, size_t localWorkSize, size_t numGroups);
+            PerformanceCounters& counters, const KernelHeader& kernel, size_t numQPUs, size_t numGroups);
         PerformanceCollector(const PerformanceCollector&) = delete;
         PerformanceCollector(PerformanceCollector&&) noexcept = delete;
         ~PerformanceCollector() noexcept;
diff --git a/src/executor.cpp b/src/executor.cpp
index 6f368eb..53ec76c 100644
--- a/src/executor.cpp
+++ b/src/executor.cpp
@@ -59,7 +59,7 @@ static unsigned* set_work_item_info(unsigned* ptr, cl_uint num_dimensions,
     const std::array& local_sizes, const std::array& group_indices,
     const std::array& local_indices, unsigned global_data,
-    unsigned uniformAddress, const KernelUniforms& uniformsUsed)
+    unsigned uniformAddress, const KernelUniforms& uniformsUsed, uint8_t workItemMergeFactor)
 {
     DEBUG_LOG(DebugLevel::KERNEL_EXECUTION, {
         std::cout << "Setting work-item infos:" << std::endl;
                   << global_sizes[0] << "), " << group_indices[1] * local_sizes[1] + local_indices[1] << "("
                   << global_sizes[1] << "), " << group_indices[2] * local_sizes[2] + local_indices[2] << "("
                   << global_sizes[2] << ")" << std::endl;
-        std::cout << "\tLocal IDs (sizes): " << local_indices[0] << "(" << local_sizes[0] << "), " << local_indices[1]
-                  << "(" << local_sizes[1] << "), " << local_indices[2] << "(" << local_sizes[2] << ")" << std::endl;
+        if (workItemMergeFactor > 1)
+            std::cout << "\tLocal IDs (sizes): " << (local_indices[0] * workItemMergeFactor) << "-"
+                      << std::min((local_indices[0] + 1) * workItemMergeFactor, local_sizes[0]) << "(" << local_sizes[0] << "), "
+                      << local_indices[1] << "(" << local_sizes[1] << "), " << local_indices[2] << "(" << local_sizes[2] << ")" << std::endl;
+        else
+            std::cout << "\tLocal IDs (sizes): " << local_indices[0] << "(" << local_sizes[0] << "), " << local_indices[1]
+                      << "(" << local_sizes[1] << "), " << local_indices[2] << "(" << local_sizes[2] << ")" << std::endl;
         std::cout << "\tGroup IDs (sizes): " << group_indices[0] << "(" << (global_sizes[0] / local_sizes[0]) << "), "
                   << group_indices[1] << "(" << (global_sizes[1] / local_sizes[1]) << "), " << group_indices[2] << "("
                   << (global_sizes[2] / local_sizes[2]) << ")" << std::endl;
@@ -85,7 +90,7 @@ static unsigned* set_work_item_info(unsigned* ptr, cl_uint num_dimensions,
             local_sizes[2] << 16 | local_sizes[1] << 8 | local_sizes[0]); /* get_local_size(dim) */
     if(uniformsUsed.getLocalIDsUsed())
         *ptr++ = static_cast(
-            local_indices[2] << 16 | local_indices[1] << 8 | local_indices[0]); /* get_local_id(dim) */
+            local_indices[2] << 16 | local_indices[1] << 8 | (local_indices[0] * workItemMergeFactor)); /* get_local_id(dim) */
     if(uniformsUsed.getNumGroupsXUsed())
         *ptr++ = static_cast(global_sizes[0] / local_sizes[0]); /* get_num_groups(0) */
     if(uniformsUsed.getNumGroupsYUsed())
@@ -225,11 +230,13 @@ cl_int executeKernel(KernelExecution& args)
     CHECK_KERNEL(kernel)
 
     // the number of QPUs is the product of all local sizes
-    const size_t num_qpus = args.localSizes[0] * args.localSizes[1] * args.localSizes[2];
-    if(num_qpus > args.system->getNumQPUs())
+    auto mergeFactor = std::max(kernel->info.workItemMergeFactor, uint8_t{1});
+    size_t localSize = args.localSizes[0] * args.localSizes[1] * args.localSizes[2];
+    size_t numQPUs = (localSize / mergeFactor) + (localSize % mergeFactor != 0);
+    if(numQPUs > args.system->getNumQPUs())
         return CL_INVALID_GLOBAL_WORK_SIZE;
 
-    if(num_qpus == 0)
+    if(numQPUs == 0)
         // OpenCL 3.0 requires that we allow to enqueue a kernel without any executions for some reason
         return CL_COMPLETE;
 
@@ -250,9 +257,9 @@ cl_int executeKernel(KernelExecution& args)
         std::cout << "Running kernel '" << kernel->info.name << "' with " << kernel->info.getLength()
                   << " instructions..." << std::endl;
         std::cout << "Local sizes: " << args.localSizes[0] << " " << args.localSizes[1] << " " << args.localSizes[2]
-                  << " -> " << num_qpus << " QPUs" << std::endl;
+                  << " and merge-factor " << static_cast(mergeFactor) << " -> " << numQPUs << " QPUs" << std::endl;
         std::cout << "Global sizes: " << args.globalSizes[0] << " " << args.globalSizes[1] << " " << args.globalSizes[2]
-                  << " -> " << (args.globalSizes[0] * args.globalSizes[1] * args.globalSizes[2]) / num_qpus
+                  << " -> " << (args.globalSizes[0] * args.globalSizes[1] * args.globalSizes[2]) / localSize
                   << " work-groups (" << (isWorkGroupLoopEnabled ? "all at once" : "separate") << ")" << std::endl;
     })
 
@@ -260,7 +267,7 @@ cl_int executeKernel(KernelExecution& args)
     // ALLOCATE BUFFER //
     size_t buffer_size = get_size(args.system->getNumQPUs(), kernel->info.getLength() * sizeof(uint64_t),
-        num_qpus * (MAX_HIDDEN_PARAMETERS + kernel->info.getExplicitUniformCount()),
+        numQPUs * (MAX_HIDDEN_PARAMETERS + kernel->info.getExplicitUniformCount()),
         kernel->program->globalData.size() * sizeof(uint64_t), kernel->program->moduleInfo.getStackFrameSize());
 
     std::unique_ptr buffer(
@@ -323,11 +330,11 @@ cl_int executeKernel(KernelExecution& args)
     std::array, 2> uniformPointers;
     // Build Uniforms
     const unsigned* qpu_uniform_0 = p;
-    for(unsigned i = 0; i < num_qpus; ++i)
+    for(unsigned i = 0; i < numQPUs; ++i)
     {
         uniformPointers[0][i] = p;
         p = set_work_item_info(p, args.numDimensions, args.globalOffsets, args.globalSizes, args.localSizes,
-            group_indices, local_indices, global_data, AS_GPU_ADDRESS(p, buffer.get()), kernel->info.uniformsUsed);
+            group_indices, local_indices, global_data, AS_GPU_ADDRESS(p, buffer.get()), kernel->info.uniformsUsed, mergeFactor);
         for(unsigned u = 0; u < kernel->info.parameters.size(); ++u)
         {
             auto tmpBufferIt = args.tmpBuffers.find(u);
@@ -410,20 +417,20 @@ cl_int executeKernel(KernelExecution& args)
         // the UNIFORMs of the second block are exactly the size of the first block after the corresponding UNIFORMs
         // of the first block
-        for(unsigned i = 0; i < num_qpus; ++i)
+        for(unsigned i = 0; i < numQPUs; ++i)
             uniformPointers[1][i] = uniformPointers[0][i] + uniformSize;
     }
 
     /* Build QPU Launch messages */
     auto uniformsPerQPU = kernel->info.uniformsUsed.countUniforms() + kernel->info.getExplicitUniformCount();
     unsigned* qpu_msg_0 = p;
-    for(unsigned i = 0; i < num_qpus; ++i)
+    for(unsigned i = 0; i < numQPUs; ++i)
     {
         *p++ = AS_GPU_ADDRESS(qpu_uniform_0 + i * uniformsPerQPU, buffer.get());
         *p++ = AS_GPU_ADDRESS(qpu_code, buffer.get());
     }
     unsigned* qpu_msg_1 = p;
-    for(unsigned i = 0; i < num_qpus; ++i)
+    for(unsigned i = 0; i < numQPUs; ++i)
     {
         *p++ = AS_GPU_ADDRESS(qpu_uniform_1 + i * uniformsPerQPU, buffer.get());
         *p++ = AS_GPU_ADDRESS(qpu_code, buffer.get());
     }
@@ -461,11 +468,11 @@ cl_int executeKernel(KernelExecution& args)
     // object lifetime
     std::unique_ptr perfCollector;
     if(args.performanceCounters)
-        perfCollector.reset(new PerformanceCollector(*args.performanceCounters, args.kernel->info, num_qpus,
+        perfCollector.reset(new PerformanceCollector(*args.performanceCounters, args.kernel->info, numQPUs,
             group_limits[0] * group_limits[1] * group_limits[2]));
     // on first execution, flush code cache
     auto start = std::chrono::high_resolution_clock::now();
-    auto result = args.system->executeQPU(static_cast(num_qpus),
+    auto result = args.system->executeQPU(static_cast(numQPUs),
         std::make_pair(qpu_msg_current, AS_GPU_ADDRESS(qpu_msg_current, buffer.get())), true, timeout);
     DEBUG_LOG(DebugLevel::KERNEL_EXECUTION, {
         // NOTE: This disables background-execution!
@@ -482,11 +489,11 @@ cl_int executeKernel(KernelExecution& args)
         std::swap(uniformPointers_current, uniformPointers_next);
         local_indices[0] = local_indices[1] = local_indices[2] = 0;
         // re-set indices and offsets for all QPUs
-        for(cl_uint i = 0; i < num_qpus; ++i)
+        for(cl_uint i = 0; i < numQPUs; ++i)
         {
             set_work_item_info((*uniformPointers_current)[i], args.numDimensions, args.globalOffsets,
                 args.globalSizes, args.localSizes, group_indices, local_indices, global_data,
-                AS_GPU_ADDRESS((*uniformPointers_current)[i], buffer.get()), kernel->info.uniformsUsed);
+                AS_GPU_ADDRESS((*uniformPointers_current)[i], buffer.get()), kernel->info.uniformsUsed, mergeFactor);
             increment_index(local_indices, args.localSizes, 1);
         }
 
@@ -498,7 +505,7 @@ cl_int executeKernel(KernelExecution& args)
             std::cout << "Running work-group " << group_indices[0] << ", " << group_indices[1] << ", "
                       << group_indices[2] << std::endl)
         // all following executions, don't flush cache
-        result = args.system->executeQPU(static_cast(num_qpus),
+        result = args.system->executeQPU(static_cast(numQPUs),
             std::make_pair(qpu_msg_current, AS_GPU_ADDRESS(qpu_msg_current, buffer.get())), false, timeout);
         // NOTE: This disables background-execution!
         DEBUG_LOG(DebugLevel::KERNEL_EXECUTION,
diff --git a/src/shared/BinaryHeader.cpp b/src/shared/BinaryHeader.cpp
index 48e7fe2..c6d6299 100644
--- a/src/shared/BinaryHeader.cpp
+++ b/src/shared/BinaryHeader.cpp
@@ -10,6 +10,7 @@
 
 #include
 #include
+#include
 
 #ifdef VC4CL_BITFIELD
 #include "../Program.h"
@@ -104,6 +105,12 @@ std::string MetaData::to_string(bool withQuotes) const
             std::to_string(sizes[2]) + ")";
         break;
     }
+    case Type::KERNEL_LOCAL_MEMORY_SIZE:
+        tmp = "local_memory_size(" + std::to_string(getInt()) + ")";
+        break;
+    case Type::KERNEL_PRIVATE_MEMORY_SIZE:
+        tmp = "private_memory_size(" + std::to_string(getInt()) + ")";
+        break;
     }
     return withQuotes ? "\"" + tmp + "\"" : tmp;
 }
@@ -115,6 +122,8 @@ void MetaData::toBinaryData(std::vector& data) const
 
 MetaData MetaData::fromBinaryData(const std::vector& data, std::size_t& dataIndex)
 {
+    if(dataIndex >= data.size())
+        throw std::invalid_argument{"Binary data is too short, does not contain (further) metadata information!"};
     MetaData metaData;
     auto numBytes = data[dataIndex] & 0xFFFFU;
     metaData.payload = readByteContainer>(data, dataIndex, static_cast(numBytes));
@@ -175,6 +184,25 @@ void MetaData::setSizes(Type type, const std::array& sizes)
     payload[15] = static_cast((sizes[2] >> 24u) & 0xFF);
 }
 
+uint32_t MetaData::getInt() const
+{
+    return static_cast(payload[4]) | (static_cast(payload[5]) << 8u) |
+        (static_cast(payload[6]) << 16u) | (static_cast(payload[7]) << 24u);
+}
+
+void MetaData::setInt(Type type, uint32_t val)
+{
+    payload.resize(8);
+    payload[0] = 8; // lower size
+    payload[1] = 0; // upper size
+    payload[2] = static_cast(type);
+    payload[3] = 0; // padding
+    payload[4] = static_cast(val & 0xFF);
+    payload[5] = static_cast((val >> 8u) & 0xFF);
+    payload[6] = static_cast((val >> 16u) & 0xFF);
+    payload[7] = static_cast((val >> 24u) & 0xFF);
+}
+
 LCOV_EXCL_START
 std::string ParamHeader::to_string() const
 {
@@ -204,6 +232,9 @@ void ParamHeader::toBinaryData(std::vector& data) const
 
 ParamHeader ParamHeader::fromBinaryData(const std::vector& data, std::size_t& dataIndex)
 {
+    if(dataIndex >= data.size() || (data.size() - dataIndex) < 2)
+        throw std::invalid_argument{"Binary data is too short, does not contain (further) parameter information!"};
+
     ParamHeader param{data[dataIndex]};
     ++dataIndex;
     param.name = readString(data, dataIndex, param.getNameLength());
@@ -301,6 +332,8 @@ void KernelHeader::toBinaryData(std::vector& data) const
 
 KernelHeader KernelHeader::fromBinaryData(const std::vector& data, std::size_t& dataIndex)
 {
+    if(dataIndex >= data.size() || (data.size() - dataIndex) < 4)
+        throw std::invalid_argument{"Binary data is too short, does not contain (further) kernel information!"};
     KernelHeader kernel{4};
     kernel.value = data[dataIndex];
     ++dataIndex;
@@ -361,6 +394,8 @@ std::vector ModuleHeader::toBinaryData(const std::vector& gl
 
 ModuleHeader ModuleHeader::fromBinaryData(const std::vector& data)
 {
+    if(data.size() < 2)
+        throw std::invalid_argument{"Binary data is too short, does not contain module information!"};
     std::size_t dataIndex = 1; // skip magic number
     ModuleHeader module{data[dataIndex]};
     ++dataIndex;
diff --git a/src/shared/BinaryHeader.h b/src/shared/BinaryHeader.h
index 8173e3b..eb12def 100644
--- a/src/shared/BinaryHeader.h
+++ b/src/shared/BinaryHeader.h
@@ -26,6 +26,8 @@ namespace vc4c
     /**
      * Additional metadata to be stored (e.g. OpenCL C attributes).
      *
+     * NOTE: Metadata must not have any semantic meaning affecting the kernel execution itself!
+     *
      * Binary layout (64-bit rows, item lengths not to scale):
      *
      * | number of bytes | type | data bytes ...                                                                  |
      */
     class MetaData
     {
     public:
-        enum class Type : uint8_t
+        enum Type : uint8_t
         {
             KERNEL_WORK_GROUP_SIZE,
             KERNEL_WORK_GROUP_SIZE_HINT,
             KERNEL_VECTOR_TYPE_HINT,
+            KERNEL_LOCAL_MEMORY_SIZE,
+            KERNEL_PRIVATE_MEMORY_SIZE
         };
 
         template
@@ -48,6 +52,14 @@ namespace vc4c
             setSizes(T, value);
         }
 
+        template
+        std::enable_if_t>
+        getValue() const
+        {
+            return getSizes();
+        }
+
         template
         void setValue(const std::string& value)
         {
@@ -55,6 +67,26 @@ namespace vc4c
             setString(T, value);
         }
 
+        template
+        std::enable_if_t getValue() const
+        {
+            return getString();
+        }
+
+        template
+        void setValue(uint32_t value)
+        {
+            static_assert(T == Type::KERNEL_LOCAL_MEMORY_SIZE || T == Type::KERNEL_PRIVATE_MEMORY_SIZE, "");
+            setInt(T, value);
+        }
+
+        template
+        std::enable_if_t
+        getValue() const
+        {
+            return getInt();
+        }
+
         Type getType() const;
 
         std::string to_string(bool withQuotes = true) const;
@@ -69,6 +101,8 @@ namespace vc4c
         void setString(Type type, const std::string& text);
         std::array getSizes() const;
         void setSizes(Type type, const std::array& sizes);
+        uint32_t getInt() const;
+        void setInt(Type type, uint32_t val);
     };
 
     /*
@@ -397,6 +431,17 @@ namespace vc4c
 
         static ModuleHeader fromBinaryData(const std::vector& data);
     };
 
+    template
+    const MetaData* findMetaData(const std::vector& list)
+    {
+        for(const auto& entry : list)
+        {
+            if(entry.getType() == T)
+                return &entry;
+        }
+        return nullptr;
+    }
+
 } // namespace vc4c
 
 #endif /* VC4CL_SHARED_BINARY_HEADER_H */
diff --git a/test/TestExecutions.cpp b/test/TestExecutions.cpp
index 07c85f1..2be215d 100644
--- a/test/TestExecutions.cpp
+++ b/test/TestExecutions.cpp
@@ -150,7 +150,8 @@ struct ExecutionRunner final : public test_data::TestRunner
     }
     ~ExecutionRunner() noexcept override;
 
-    test_data::Result compile(const std::string& sourceCode, const std::string& options) override
+    test_data::Result compile(
+        const std::string& sourceCode, const std::string& options, const std::string& name) override
     {
         auto it = compilationCache.find(sourceCode + options);
         if(it != compilationCache.end())
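
Notes following the patch. These are minimal host-side sketches, not part of the diff above; function and variable names, the 12-QPU count of the VideoCore IV and the merge factor of 4 used in the comments are illustrative assumptions.

The merge factor becomes visible to applications through the standard work-group queries: with the Kernel.cpp changes, CL_KERNEL_WORK_GROUP_SIZE reports numQPUs * mergeFactor and CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE reports the merge factor itself (or 1 for unmerged kernels):

    // Hedged illustration only: queries a kernel's work-group limits via the standard OpenCL 1.2 API.
    #include <CL/cl.h>
    #include <cstdio>

    void printKernelWorkGroupLimits(cl_kernel kernel, cl_device_id device)
    {
        size_t maxSize = 0;
        size_t preferredMultiple = 0;
        clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxSize), &maxSize, nullptr);
        clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
            sizeof(preferredMultiple), &preferredMultiple, nullptr);
        // With a merge factor of 4: maxSize == 48 (12 QPUs * 4) and preferredMultiple == 4.
        // Without merging (factor 0 or 1): maxSize == 12 and preferredMultiple == 1.
        std::printf("max work-group size: %zu, preferred multiple: %zu\n", maxSize, preferredMultiple);
    }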
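
Another minimal sketch, assuming the 12 QPUs reported by getNumQPUs(): it restates the arithmetic the splitting helpers and the executor use, namely that the local-size limit scales with the merge factor and that the QPU count is the local size divided by the factor, rounded up.

    #include <cstddef>
    #include <cstdint>

    constexpr std::size_t NUM_QPUS = 12; // assumption: value of System::getNumQPUs() on the VideoCore IV

    constexpr std::size_t maxLocalSize(std::uint8_t mergeFactor)
    {
        // a factor of 0 marks "not merged" and behaves like a factor of 1
        return NUM_QPUS * (mergeFactor ? mergeFactor : 1u);
    }

    constexpr std::size_t requiredQPUs(std::size_t localSize, std::uint8_t mergeFactor)
    {
        // rounding-up division, as in executor.cpp: localSize / factor, plus one if there is a remainder
        return (localSize + (mergeFactor ? mergeFactor : 1u) - 1u) / (mergeFactor ? mergeFactor : 1u);
    }

    static_assert(maxLocalSize(4) == 48, "12 QPUs * merge factor 4");
    static_assert(requiredQPUs(48, 4) == 12, "a fully merged 48-item group still fits on 12 QPUs");
    static_assert(requiredQPUs(13, 4) == 4, "13 work-items need 4 QPUs when 4 items share one QPU");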
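
A standalone sketch of the metadata payload layout written by MetaData::setInt() and read by MetaData::getInt() in BinaryHeader.cpp (8 bytes: 16-bit entry size, 8-bit type such as KERNEL_LOCAL_MEMORY_SIZE, one padding byte, then the 32-bit value in little-endian order). The free functions below are illustrative, not the class from the header.

    #include <array>
    #include <cstdint>

    std::array<uint8_t, 8> encodeIntMetaData(uint8_t type, uint32_t value)
    {
        std::array<uint8_t, 8> payload{};
        payload[0] = 8;                                   // lower byte of the entry size
        payload[1] = 0;                                   // upper byte of the entry size
        payload[2] = type;                                // e.g. KERNEL_LOCAL_MEMORY_SIZE
        payload[3] = 0;                                   // padding
        payload[4] = static_cast<uint8_t>(value & 0xFF);  // 32-bit value, little endian
        payload[5] = static_cast<uint8_t>((value >> 8u) & 0xFF);
        payload[6] = static_cast<uint8_t>((value >> 16u) & 0xFF);
        payload[7] = static_cast<uint8_t>((value >> 24u) & 0xFF);
        return payload;
    }

    uint32_t decodeIntMetaData(const std::array<uint8_t, 8>& payload)
    {
        // mirrors MetaData::getInt()
        return static_cast<uint32_t>(payload[4]) | (static_cast<uint32_t>(payload[5]) << 8u) |
            (static_cast<uint32_t>(payload[6]) << 16u) | (static_cast<uint32_t>(payload[7]) << 24u);
    }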
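
Finally, a hedged usage sketch for the cl_khr_suggested_local_work_size entry point whose error handling the patch tightens: passing a null output pointer is now rejected with CL_INVALID_VALUE. The function-pointer type below is written out by hand following the extension specification; whether the extension is exposed at all depends on the platform.

    #include <CL/cl.h>

    typedef cl_int(CL_API_CALL* GetSuggestedLocalWorkSizeFn)(
        cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, size_t*);

    cl_int querySuggestedLocalSize(cl_platform_id platform, cl_command_queue queue, cl_kernel kernel)
    {
        auto func = reinterpret_cast<GetSuggestedLocalWorkSizeFn>(
            clGetExtensionFunctionAddressForPlatform(platform, "clGetKernelSuggestedLocalWorkSizeKHR"));
        if(!func)
            return CL_INVALID_OPERATION; // extension entry point not exposed by this platform

        const size_t globalSize[3] = {4096, 1, 1};
        size_t suggestedLocal[3] = {0, 0, 0};
        // passing nullptr as the last argument would now return CL_INVALID_VALUE
        return func(queue, kernel, 1, nullptr, globalSize, suggestedLocal);
    }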