diff --git a/test/TestArithmetic.cpp b/test/TestArithmetic.cpp
new file mode 100644
index 00000000..4c2ecc0e
--- /dev/null
+++ b/test/TestArithmetic.cpp
@@ -0,0 +1,890 @@
+/*
+ * Author: doe300
+ *
+ * See the file "LICENSE" for the full license governing this code.
+ */
+
+#include "TestArithmetic.h"
+
+#include "Compiler.h"
+#include "VC4C.h"
+#include "tools.h"
+
+#include <limits>
+#include <random>
+#include <sstream>
+
+template <typename T, std::size_t N>
+static std::array<T, N> generateInput(bool allowNull)
+{
+    std::array<T, N> arr;
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<T> dis(std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
+
+    for(std::size_t i = 0; i < N; ++i)
+    {
+        T tmp;
+        do
+        {
+            // to prevent division by zero
+            tmp = dis(gen);
+        } while(!allowNull && tmp == 0);
+        arr[i] = tmp;
+    }
+
+    return arr;
+}
+
+template <std::size_t N>
+static std::array<float, N> generateInput(bool allowNull)
+{
+    std::array<float, N> arr;
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(std::numeric_limits<float>::min(), std::numeric_limits<float>::max());
+
+    for(std::size_t i = 0; i < N; ++i)
+    {
+        float tmp;
+        do
+        {
+            // to prevent division by zero
+            tmp = dis(gen);
+        } while(!allowNull && tmp == 0.0f);
+        arr[i] = tmp;
+    }
+    return arr;
+}
+
+static const std::string BINARY_OPERATION = R"(
+__kernel void test(__global TYPE* out, const __global TYPE* in0, const __global TYPE* in1) {
+  size_t gid = get_global_id(0);
+  out[gid] = in0[gid] OP in1[gid];
+}
+)";
+
+static const std::string RELATIONAL_OPERATION = R"(
+__kernel void test(__global int16* out, const __global TYPE* in0, const __global TYPE* in1) {
+  size_t gid = get_global_id(0);
+  out[gid] = convert_int16(in0[gid] OP in1[gid]);
+}
+)";
+
+static const std::string UNARY_OPERATION = R"(
+__kernel void test(__global int16* out, const __global TYPE* in) {
+  size_t gid = get_global_id(0);
+  out[gid] = convert_int16(OP in[gid]);
+}
+)";
+
+static const std::string CONVERSION_OPERATION = R"(
+// trick to allow concatenating macro (content!!) to symbol
+#define CONCAT(A, B) CONCAT_(A, B)
+#define CONCAT_(A, B) A##B
+
+__kernel void test(__global OUT* out, const __global IN* in) {
+  size_t gid = get_global_id(0);
+  out[gid] = CONCAT(convert_,OUT)(in[gid]);
+}
+)";
+
+static const std::string CAST_OPERATION = R"(
+__kernel void test(__global OUT* out, const __global IN* in) {
+  size_t gid = get_global_id(0);
+  out[gid] = as_##OUT(in[gid]);
+}
+)";
+
+template <typename T, typename I, std::size_t N>
+static void checkBinaryResults(const std::array<I, N>& input0, const std::array<I, N>& input1,
+    const std::array<T, N>& output, const std::function<T(I, I)>& op, const std::string& opName,
+    const std::function<void(const std::string&, const std::string&)>& onError)
+{
+    for(std::size_t i = 0; i < N; ++i)
+    {
+        if(output[i] != op(input0[i], input1[i]))
+        {
+            auto result = std::to_string(output[i]);
+            auto expected = std::to_string(input0[i]) + " " + opName + " " + std::to_string(input1[i]) + " = " +
+                std::to_string(op(input0[i], input1[i]));
+            onError(expected, result);
+        }
+    }
+}
+
+template <typename T, typename I, std::size_t N>
+static void checkUnaryResults(const std::array<I, N>& input, const std::array<T, N>& output,
+    const std::function<T(I)>& op, const std::string& opName,
+    const std::function<void(const std::string&, const std::string&)>& onError)
+{
+    for(std::size_t i = 0; i < N; ++i)
+    {
+        if(output[i] != op(input[i]))
+        {
+            auto result = std::to_string(output[i]);
+            auto expected = opName + " " + std::to_string(input[i]) + " = " + std::to_string(op(input[i]));
+            onError(expected, result);
+        }
+    }
+}
+
+TestArithmetic::TestArithmetic(const vc4c::Configuration& config) : config(config)
+{
+    TEST_ADD(TestArithmetic::testSignedIntMultiplication);
+    TEST_ADD(TestArithmetic::testSignedShortMultiplication);
+    TEST_ADD(TestArithmetic::testSignedIntMultiplication);
+    TEST_ADD(TestArithmetic::testSignedShortMultiplication);
+    TEST_ADD(TestArithmetic::testSignedCharMultiplication);
+    TEST_ADD(TestArithmetic::testUnsignedIntMultiplication);
+    TEST_ADD(TestArithmetic::testUnsignedShortMultiplication);
+    TEST_ADD(TestArithmetic::testUnsignedCharMultiplication);
+    TEST_ADD(TestArithmetic::testSignedIntDivision);
+    TEST_ADD(TestArithmetic::testSignedShortDivision);
+    TEST_ADD(TestArithmetic::testSignedCharDivision);
+    TEST_ADD(TestArithmetic::testUnsignedIntDivision);
+    TEST_ADD(TestArithmetic::testUnsignedShortDivision);
+    TEST_ADD(TestArithmetic::testUnsignedCharDivision);
+    TEST_ADD(TestArithmetic::testSignedIntModulo);
+    TEST_ADD(TestArithmetic::testSignedShortModulo);
+    TEST_ADD(TestArithmetic::testSignedCharModulo);
+    TEST_ADD(TestArithmetic::testUnsignedIntModulo);
+    TEST_ADD(TestArithmetic::testUnsignedShortModulo);
+    TEST_ADD(TestArithmetic::testUnsignedCharModulo);
+    TEST_ADD(TestArithmetic::testFloatingPointDivision);
+
+    TEST_ADD(TestArithmetic::testIntegerEquality);
+    TEST_ADD(TestArithmetic::testShortEquality);
+    TEST_ADD(TestArithmetic::testCharEquality);
+    TEST_ADD(TestArithmetic::testFloatEquality);
+    TEST_ADD(TestArithmetic::testIntegerInequality);
+    TEST_ADD(TestArithmetic::testShortInequality);
+    TEST_ADD(TestArithmetic::testCharInequality);
+    TEST_ADD(TestArithmetic::testFloatInequality);
+    TEST_ADD(TestArithmetic::testSignedIntGreater);
+    TEST_ADD(TestArithmetic::testSignedShortGreater);
+    TEST_ADD(TestArithmetic::testSignedCharGreater);
+    TEST_ADD(TestArithmetic::testUnsignedIntGreater);
+    TEST_ADD(TestArithmetic::testUnsignedShortGreater);
+    TEST_ADD(TestArithmetic::testUnsignedCharGreater);
+    TEST_ADD(TestArithmetic::testFloatGreater);
+    TEST_ADD(TestArithmetic::testSignedIntLess);
+    TEST_ADD(TestArithmetic::testSignedShortLess);
+    TEST_ADD(TestArithmetic::testSignedCharLess);
+    TEST_ADD(TestArithmetic::testUnsignedIntLess);
+    TEST_ADD(TestArithmetic::testUnsignedShortLess);
+    TEST_ADD(TestArithmetic::testUnsignedCharLess);
+    TEST_ADD(TestArithmetic::testFloatLess);
+    TEST_ADD(TestArithmetic::testSignedIntGreaterEquals);
+    TEST_ADD(TestArithmetic::testSignedShortGreaterEquals);
+    TEST_ADD(TestArithmetic::testSignedCharGreaterEquals);
+    TEST_ADD(TestArithmetic::testUnsignedIntGreaterEquals);
+    TEST_ADD(TestArithmetic::testUnsignedShortGreaterEquals);
+    TEST_ADD(TestArithmetic::testUnsignedCharGreaterEquals);
+    TEST_ADD(TestArithmetic::testFloatGreaterEquals);
+    TEST_ADD(TestArithmetic::testSignedIntLessEquals);
+    TEST_ADD(TestArithmetic::testSignedShortLessEquals);
+    TEST_ADD(TestArithmetic::testSignedCharLessEquals);
+    TEST_ADD(TestArithmetic::testUnsignedIntLessEquals);
+    TEST_ADD(TestArithmetic::testUnsignedShortLessEquals);
+    TEST_ADD(TestArithmetic::testUnsignedCharLessEquals);
+    TEST_ADD(TestArithmetic::testFloatLessEquals);
+
+    TEST_ADD(TestArithmetic::testSignedIntAnd);
+    TEST_ADD(TestArithmetic::testSignedShortAnd);
+    TEST_ADD(TestArithmetic::testSignedCharAnd);
+    TEST_ADD(TestArithmetic::testUnsignedIntAnd);
+    TEST_ADD(TestArithmetic::testUnsignedShortAnd);
+    TEST_ADD(TestArithmetic::testUnsignedCharAnd);
+    TEST_ADD(TestArithmetic::testFloatAnd);
+    TEST_ADD(TestArithmetic::testSignedIntOr);
+    TEST_ADD(TestArithmetic::testSignedShortOr);
+    TEST_ADD(TestArithmetic::testSignedCharOr);
+    TEST_ADD(TestArithmetic::testUnsignedIntOr);
+    TEST_ADD(TestArithmetic::testUnsignedShortOr);
+    TEST_ADD(TestArithmetic::testUnsignedCharOr);
+    TEST_ADD(TestArithmetic::testFloatOr);
+
+    TEST_ADD(TestArithmetic::testSignedTruncation);
+    TEST_ADD(TestArithmetic::testUnsignedTruncation);
+    TEST_ADD(TestArithmetic::testSignExtension);
+    TEST_ADD(TestArithmetic::testZeroExtension);
+    TEST_ADD(TestArithmetic::testSignedToFloat);
+    TEST_ADD(TestArithmetic::testUnsignedToFloat);
+    TEST_ADD(TestArithmetic::testFloatToSigned);
+    TEST_ADD(TestArithmetic::testFloatToUnsigned);
+}
+
+void TestArithmetic::onMismatch(const std::string& expected, const std::string& result)
+{
+    TEST_ASSERT_EQUALS(expected, result);
+}
+
+static void compileBuffer(
+    vc4c::Configuration& config, std::stringstream& buffer, const std::string& source, const std::string& options)
+{
+    config.outputMode = vc4c::OutputMode::BINARY;
+    config.writeKernelInfo = true;
+    std::istringstream input(source);
+    vc4c::Compiler::compile(input, buffer, config, options);
+}
+
+template <std::size_t N, typename I, typename O>
+static void copyConvert(const I& in, O& out)
+{
+    if(out.size() < N)
+        throw vc4c::CompilationError(vc4c::CompilationStep::GENERAL, "Invalid container size for copy");
+    auto base = reinterpret_cast<const typename O::value_type*>(in.data());
+    std::copy(base, base + N, out.data());
+}
+
+template <typename T, typename R, std::size_t V, std::size_t L, std::size_t G = 1>
+static std::array<R, V * L * G> runEmulation(
+    std::stringstream& codeBuffer, const std::vector<std::array<T, V * L * G>>& inputs)
+{
+    using namespace vc4c::tools;
+
+    std::vector<std::pair<uint32_t, vc4c::Optional<std::vector<uint32_t>>>> parameter;
+    parameter.emplace_back(std::make_pair(0, std::vector<uint32_t>(V * L * G * sizeof(R) / sizeof(uint32_t))));
+    for(const auto& input : inputs)
+    {
+        parameter.emplace_back(std::make_pair(0, std::vector<uint32_t>(V * L * G * sizeof(T) / sizeof(uint32_t))));
+        copyConvert<V * L * G * sizeof(T) / sizeof(uint32_t)>(input, parameter.back().second.value());
+    }
+
+    WorkGroupConfig workGroups;
+    workGroups.dimensions = 1;
+    workGroups.localSizes[0] = L;
+    workGroups.numGroups[0] = G;
+
+    EmulationData data(codeBuffer, "test", parameter, workGroups);
+
+    auto result = emulate(data);
+
+    if(!result.executionSuccessful)
+        throw vc4c::CompilationError(vc4c::CompilationStep::GENERAL, "Kernel execution failed");
+
+    std::array<R, V * L * G> output;
+    copyConvert<V * L * G>(result.results[0].second.value(), output);
+    return output;
+}
+
+template <typename T>
+static void testBinaryOperation(vc4c::Configuration& config, const std::string& options,
+    const std::function<T(T, T)>& op, const std::function<void(const std::string&, const std::string&)>& onError)
+{
+    std::stringstream code;
+    compileBuffer(config, code, BINARY_OPERATION, options);
+
+    auto in0 = generateInput<T, 16 * 12>(false);
+    auto in1 = generateInput<T, 16 * 12>(false);
+
+    auto out = runEmulation<T, T, 16, 12>(code, {in0, in1});
+    auto pos = options.find("-DOP=") + std::string("-DOP=").size();
+    checkBinaryResults<T>(in0, in1, out, op, options.substr(pos, options.find(' ', pos) - pos), onError);
+}
+
+template <>
+void testBinaryOperation<float>(vc4c::Configuration& config, const std::string& options,
+    const std::function<float(float, float)>& op,
+    const std::function<void(const std::string&, const std::string&)>& onError)
+{
+    std::stringstream code;
+    compileBuffer(config, code, BINARY_OPERATION, options);
+
+    auto in0 = generateInput<16 * 12>(false);
+    auto in1 = generateInput<16 * 12>(false);
+
+    auto out = runEmulation<float, float, 16, 12>(code, {in0, in1});
+    auto pos = options.find("-DOP=") + std::string("-DOP=").size();
+    checkBinaryResults<float>(in0, in1, out, op, options.substr(pos, options.find(' ', pos) - pos), onError);
+}
+
+template <typename T>
+static void testRelationalOperation(vc4c::Configuration& config, const std::string& options,
+    const std::function<int(T, T)>& op, const std::function<void(const std::string&, const std::string&)>& onError)
+{
+    std::stringstream code;
+    compileBuffer(config, code, RELATIONAL_OPERATION, options);
+
+    auto in0 = generateInput<T, 16 * 12>(true);
+    auto in1 = generateInput<T, 16 * 12>(true);
+
+    auto out = runEmulation<T, int, 16, 12>(code, {in0, in1});
+    auto pos = options.find("-DOP=") + std::string("-DOP=").size();
+    checkBinaryResults<int, T, 16 * 12>(in0, in1, out, op, options.substr(pos, options.find(' ', pos) - pos), onError);
+}
+
+template <>
+void testRelationalOperation<float>(vc4c::Configuration& config, const std::string& options,
+    const std::function<int(float, float)>& op,
+    const std::function<void(const std::string&, const std::string&)>& onError)
+{
+    std::stringstream code;
+    compileBuffer(config, code, RELATIONAL_OPERATION, options);
+
+    auto in0 = generateInput<16 * 12>(true);
+    auto in1 = generateInput<16 * 12>(true);
+
+    auto out = runEmulation<float, int, 16, 12>(code, {in0, in1});
+    auto pos = options.find("-DOP=") + std::string("-DOP=").size();
+    checkBinaryResults<int, float, 16 * 12>(
+        in0, in1, out, op, options.substr(pos, options.find(' ', pos) - pos), onError);
+}
+
+template <typename I, typename O>
+static void testConversionOperation(vc4c::Configuration& config, const std::string& options,
+    const std::function<O(I)>& op, const std::function<void(const std::string&, const std::string&)>& onError)
+{
+    std::stringstream code;
+    compileBuffer(config, code, CONVERSION_OPERATION, options);
+
+    auto in = generateInput<I, 16 * 12>(true);
+
+    auto out = runEmulation<I, O, 16, 12>(code, {in});
+    checkUnaryResults<O, I, 16 * 12>(in, out, op, "convert", onError);
+}
+
+template <typename O>
+static void testConversionOperation(vc4c::Configuration& config, const std::string& options,
+    const std::function<O(float)>& op, const std::function<void(const std::string&, const std::string&)>& onError)
+{
+    std::stringstream code;
+    compileBuffer(config, code, CONVERSION_OPERATION, options);
+
+    auto in = generateInput<16 * 12>(true);
+
+    auto out = runEmulation<float, O, 16, 12>(code, {in});
+    checkUnaryResults<O, float, 16 * 12>(in, out, op, "convert", onError);
+}
+
+template <typename C, typename T = typename C::first_argument_type>
+static int checkRelation(T arg1, T arg2)
+{
+    C c{};
+    return c(arg1, arg2) ? -1 : 0;
+}
+
+template <typename T>
+static int checkAnd(T arg1, T arg2)
+{
+    return (arg1 != 0 && arg2 != 0) ? -1 : 0;
+}
+
+template <typename T>
+static int checkOr(T arg1, T arg2)
+{
+    return (arg1 != 0 || arg2 != 0) ? -1 : 0;
+}
+
+template <typename T, typename S>
+static T convert(S s)
+{
+    return static_cast<T>(s);
+}
+
+void TestArithmetic::testSignedIntMultiplication()
+{
+    testBinaryOperation<int>(config, "-DTYPE=int16 -DOP=*", std::multiplies<int>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedShortMultiplication()
+{
+    testBinaryOperation<short>(config, "-DTYPE=short16 -DOP=*", std::multiplies<short>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedCharMultiplication()
+{
+    testBinaryOperation<char>(config, "-DTYPE=char16 -DOP=*", std::multiplies<char>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedIntMultiplication()
+{
+    testBinaryOperation<unsigned int>(config, "-DTYPE=uint16 -DOP=*", std::multiplies<unsigned int>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedShortMultiplication()
+{
+    testBinaryOperation<unsigned short>(config, "-DTYPE=ushort16 -DOP=*", std::multiplies<unsigned short>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedCharMultiplication()
+{
+    testBinaryOperation<unsigned char>(config, "-DTYPE=uchar16 -DOP=*", std::multiplies<unsigned char>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedIntDivision()
+{
+    testBinaryOperation<int>(config, "-DTYPE=int16 -DOP=/", std::divides<int>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedShortDivision()
+{
+    testBinaryOperation<short>(config, "-DTYPE=short16 -DOP=/", std::divides<short>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedCharDivision()
+{
+    testBinaryOperation<char>(config, "-DTYPE=char16 -DOP=/", std::divides<char>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedIntDivision()
+{
+    testBinaryOperation<unsigned int>(config, "-DTYPE=uint16 -DOP=/", std::divides<unsigned int>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedShortDivision()
+{
+    testBinaryOperation<unsigned short>(config, "-DTYPE=ushort16 -DOP=/", std::divides<unsigned short>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedCharDivision()
+{
+    testBinaryOperation<unsigned char>(config, "-DTYPE=uchar16 -DOP=/", std::divides<unsigned char>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedIntModulo()
+{
+    testBinaryOperation<int>(config, "-DTYPE=int16 -DOP=%", std::modulus<int>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedShortModulo()
+{
+    testBinaryOperation<short>(config, "-DTYPE=short16 -DOP=%", std::modulus<short>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedCharModulo()
+{
+    testBinaryOperation<char>(config, "-DTYPE=char16 -DOP=%", std::modulus<char>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedIntModulo()
+{
+    testBinaryOperation<unsigned int>(config, "-DTYPE=uint16 -DOP=%", std::modulus<unsigned int>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedShortModulo()
+{
+    testBinaryOperation<unsigned short>(config, "-DTYPE=ushort16 -DOP=%", std::modulus<unsigned short>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedCharModulo()
+{
+    testBinaryOperation<unsigned char>(config, "-DTYPE=uchar16 -DOP=%", std::modulus<unsigned char>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatingPointDivision()
+{
+    testBinaryOperation<float>(config, "-DTYPE=float16 -DOP=/", std::divides<float>{},
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testIntegerEquality()
+{
+    testRelationalOperation<int>(config, "-DTYPE=int16 -DOP===", checkRelation<std::equal_to<int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testShortEquality()
+{
+    testRelationalOperation<short>(config, "-DTYPE=short16 -DOP===", checkRelation<std::equal_to<short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testCharEquality()
+{
+    testRelationalOperation<char>(config, "-DTYPE=char16 -DOP===", checkRelation<std::equal_to<char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatEquality()
+{
+    testRelationalOperation<float>(config, "-DTYPE=float16 -DOP===", checkRelation<std::equal_to<float>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testIntegerInequality()
+{
+    testRelationalOperation<int>(config, "-DTYPE=int16 -DOP=!=", checkRelation<std::not_equal_to<int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testShortInequality()
+{
+    testRelationalOperation<short>(config, "-DTYPE=short16 -DOP=!=", checkRelation<std::not_equal_to<short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testCharInequality()
+{
+    testRelationalOperation<char>(config, "-DTYPE=char16 -DOP=!=", checkRelation<std::not_equal_to<char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatInequality()
+{
+    testRelationalOperation<float>(config, "-DTYPE=float16 -DOP=!=", checkRelation<std::not_equal_to<float>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedIntGreater()
+{
+    testRelationalOperation<int>(config, "-DTYPE=int16 -DOP=>", checkRelation<std::greater<int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedShortGreater()
+{
+    testRelationalOperation<short>(config, "-DTYPE=short16 -DOP=>", checkRelation<std::greater<short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedCharGreater()
+{
+    testRelationalOperation<char>(config, "-DTYPE=char16 -DOP=>", checkRelation<std::greater<char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedIntGreater()
+{
+    testRelationalOperation<unsigned int>(config, "-DTYPE=uint16 -DOP=>", checkRelation<std::greater<unsigned int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedShortGreater()
+{
+    testRelationalOperation<unsigned short>(config, "-DTYPE=ushort16 -DOP=>",
+        checkRelation<std::greater<unsigned short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedCharGreater()
+{
+    testRelationalOperation<unsigned char>(config, "-DTYPE=uchar16 -DOP=>", checkRelation<std::greater<unsigned char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatGreater()
+{
+    testRelationalOperation<float>(config, "-DTYPE=float16 -DOP=>", checkRelation<std::greater<float>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedIntLess()
+{
+    testRelationalOperation<int>(config, "-DTYPE=int16 -DOP=<", checkRelation<std::less<int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedShortLess()
+{
+    testRelationalOperation<short>(config, "-DTYPE=short16 -DOP=<", checkRelation<std::less<short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedCharLess()
+{
+    testRelationalOperation<char>(config, "-DTYPE=char16 -DOP=<", checkRelation<std::less<char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedIntLess()
+{
+    testRelationalOperation<unsigned int>(config, "-DTYPE=uint16 -DOP=<", checkRelation<std::less<unsigned int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedShortLess()
+{
+    testRelationalOperation<unsigned short>(config, "-DTYPE=ushort16 -DOP=<", checkRelation<std::less<unsigned short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedCharLess()
+{
+    testRelationalOperation<unsigned char>(config, "-DTYPE=uchar16 -DOP=<", checkRelation<std::less<unsigned char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatLess()
+{
+    testRelationalOperation<float>(config, "-DTYPE=float16 -DOP=<", checkRelation<std::less<float>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedIntGreaterEquals()
+{
+    testRelationalOperation<int>(config, "-DTYPE=int16 -DOP=>=", checkRelation<std::greater_equal<int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedShortGreaterEquals()
+{
+    testRelationalOperation<short>(config, "-DTYPE=short16 -DOP=>=", checkRelation<std::greater_equal<short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedCharGreaterEquals()
+{
+    testRelationalOperation<char>(config, "-DTYPE=char16 -DOP=>=", checkRelation<std::greater_equal<char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedIntGreaterEquals()
+{
+    testRelationalOperation<unsigned int>(config,
+        "-DTYPE=uint16 -DOP=>=", checkRelation<std::greater_equal<unsigned int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedShortGreaterEquals()
+{
+    testRelationalOperation<unsigned short>(config,
+        "-DTYPE=ushort16 -DOP=>=", checkRelation<std::greater_equal<unsigned short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedCharGreaterEquals()
+{
+    testRelationalOperation<unsigned char>(config,
+        "-DTYPE=uchar16 -DOP=>=", checkRelation<std::greater_equal<unsigned char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatGreaterEquals()
+{
+    testRelationalOperation<float>(config, "-DTYPE=float16 -DOP=>=", checkRelation<std::greater_equal<float>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedIntLessEquals()
+{
+    testRelationalOperation<int>(config, "-DTYPE=int16 -DOP=<=", checkRelation<std::less_equal<int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedShortLessEquals()
+{
+    testRelationalOperation<short>(config, "-DTYPE=short16 -DOP=<=", checkRelation<std::less_equal<short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedCharLessEquals()
+{
+    testRelationalOperation<char>(config, "-DTYPE=char16 -DOP=<=", checkRelation<std::less_equal<char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedIntLessEquals()
+{
+    testRelationalOperation<unsigned int>(config, "-DTYPE=uint16 -DOP=<=", checkRelation<std::less_equal<unsigned int>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedShortLessEquals()
+{
+    testRelationalOperation<unsigned short>(config,
+        "-DTYPE=ushort16 -DOP=<=", checkRelation<std::less_equal<unsigned short>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedCharLessEquals()
+{
+    testRelationalOperation<unsigned char>(config,
+        "-DTYPE=uchar16 -DOP=<=", checkRelation<std::less_equal<unsigned char>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatLessEquals()
+{
+    testRelationalOperation<float>(config, "-DTYPE=float16 -DOP=<=", checkRelation<std::less_equal<float>>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedIntAnd()
+{
+    testRelationalOperation<int>(config, "-DTYPE=int16 -DOP=&&", checkAnd<int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedShortAnd()
+{
+    testRelationalOperation<short>(config, "-DTYPE=short16 -DOP=&&", checkAnd<short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedCharAnd()
+{
+    testRelationalOperation<char>(config, "-DTYPE=char16 -DOP=&&", checkAnd<char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedIntAnd()
+{
+    testRelationalOperation<unsigned int>(config, "-DTYPE=uint16 -DOP=&&", checkAnd<unsigned int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedShortAnd()
+{
+    testRelationalOperation<unsigned short>(config, "-DTYPE=ushort16 -DOP=&&", checkAnd<unsigned short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedCharAnd()
+{
+    testRelationalOperation<unsigned char>(config, "-DTYPE=uchar16 -DOP=&&", checkAnd<unsigned char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatAnd()
+{
+    testRelationalOperation<float>(config, "-DTYPE=float16 -DOP=&&", checkAnd<float>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedIntOr()
+{
+    testRelationalOperation<int>(config, "-DTYPE=int16 -DOP=||", checkOr<int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedShortOr()
+{
+    testRelationalOperation<short>(config, "-DTYPE=short16 -DOP=||", checkOr<short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedCharOr()
+{
+    testRelationalOperation<char>(config, "-DTYPE=char16 -DOP=||", checkOr<char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedIntOr()
+{
+    testRelationalOperation<unsigned int>(config, "-DTYPE=uint16 -DOP=||", checkOr<unsigned int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedShortOr()
+{
+    testRelationalOperation<unsigned short>(config, "-DTYPE=ushort16 -DOP=||", checkOr<unsigned short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedCharOr()
+{
+    testRelationalOperation<unsigned char>(config, "-DTYPE=uchar16 -DOP=||", checkOr<unsigned char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatOr()
+{
+    testRelationalOperation<float>(config, "-DTYPE=float16 -DOP=||", checkOr<float>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedTruncation()
+{
+    testConversionOperation<int, short>(config, "-DIN=int16 -DOUT=short16", convert<short, int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<int, char>(config, "-DIN=int16 -DOUT=char16", convert<char, int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<short, char>(config, "-DIN=short16 -DOUT=char16", convert<char, short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedTruncation()
+{
+    testConversionOperation<unsigned int, unsigned short>(config, "-DIN=uint16 -DOUT=ushort16",
+        convert<unsigned short, unsigned int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<unsigned int, unsigned char>(config, "-DIN=uint16 -DOUT=uchar16",
+        convert<unsigned char, unsigned int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<unsigned short, unsigned char>(config, "-DIN=ushort16 -DOUT=uchar16",
+        convert<unsigned char, unsigned short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignExtension()
+{
+    testConversionOperation<short, int>(config, "-DIN=short16 -DOUT=int16", convert<int, short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<char, int>(config, "-DIN=char16 -DOUT=int16", convert<int, char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<char, short>(config, "-DIN=char16 -DOUT=short16", convert<short, char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testZeroExtension()
+{
+    testConversionOperation<unsigned short, unsigned int>(config, "-DIN=ushort16 -DOUT=uint16",
+        convert<unsigned int, unsigned short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<unsigned char, unsigned int>(config, "-DIN=uchar16 -DOUT=uint16",
+        convert<unsigned int, unsigned char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<unsigned char, unsigned short>(config, "-DIN=uchar16 -DOUT=ushort16",
+        convert<unsigned short, unsigned char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testSignedToFloat()
+{
+    testConversionOperation<int, float>(config, "-DIN=int16 -DOUT=float16", convert<int, float>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<short, float>(config, "-DIN=short16 -DOUT=float16", convert<short, float>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<char, float>(config, "-DIN=char16 -DOUT=float16", convert<char, float>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testUnsignedToFloat()
+{
+    testConversionOperation<unsigned int, float>(config, "-DIN=uint16 -DOUT=float16", convert<unsigned int, float>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<unsigned short, float>(config, "-DIN=ushort16 -DOUT=float16",
+        convert<unsigned short, float>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<unsigned char, float>(config, "-DIN=uchar16 -DOUT=float16", convert<unsigned char, float>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+
+void TestArithmetic::testFloatToSigned()
+{
+    testConversionOperation<int>(config, "-DIN=float16 -DOUT=int16", convert<float, int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<short>(config, "-DIN=float16 -DOUT=short16", convert<float, short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<char>(config, "-DIN=float16 -DOUT=char16", convert<float, char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
+void TestArithmetic::testFloatToUnsigned()
+{
+    testConversionOperation<unsigned int>(config, "-DIN=float16 -DOUT=uint16", convert<float, unsigned int>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<unsigned short>(config, "-DIN=float16 -DOUT=ushort16", convert<float, unsigned short>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+    testConversionOperation<unsigned char>(config, "-DIN=float16 -DOUT=uchar16", convert<float, unsigned char>,
+        std::bind(&TestArithmetic::onMismatch, this, std::placeholders::_1, std::placeholders::_2));
+}
\ No newline at end of file
diff --git a/test/TestArithmetic.h b/test/TestArithmetic.h
new file mode 100644
index 00000000..f6390861
--- /dev/null
+++ b/test/TestArithmetic.h
@@ -0,0 +1,127 @@
+/*
+ * Author: doe300
+ *
+ * See the file "LICENSE" for the full license governing this code.
+ */
+
+#ifndef VC4C_TEST_ARITHMETIC_H
+#define VC4C_TEST_ARITHMETIC_H
+
+#include "cpptest.h"
+
+#include "config.h"
+
+class TestArithmetic : public Test::Suite
+{
+    // TODO write tests for all relevant (e.g. not directly mapped to machine code) instructions from LLVM IR
+    // (https://llvm.org/docs/LangRef.html)
+public:
+    TestArithmetic(const vc4c::Configuration& config = {});
+
+    // arithmetic operators
+    void testSignedIntMultiplication();
+    void testSignedShortMultiplication();
+    void testSignedCharMultiplication();
+    void testUnsignedIntMultiplication();
+    void testUnsignedShortMultiplication();
+    void testUnsignedCharMultiplication();
+    void testSignedIntDivision();
+    void testSignedShortDivision();
+    void testSignedCharDivision();
+    void testUnsignedIntDivision();
+    void testUnsignedShortDivision();
+    void testUnsignedCharDivision();
+    void testSignedIntModulo();
+    void testSignedShortModulo();
+    void testSignedCharModulo();
+    void testUnsignedIntModulo();
+    void testUnsignedShortModulo();
+    void testUnsignedCharModulo();
+    void testFloatingPointDivision();
+
+    // relational operators
+    void testIntegerEquality();
+    void testShortEquality();
+    void testCharEquality();
+    void testFloatEquality();
+    void testIntegerInequality();
+    void testShortInequality();
+    void testCharInequality();
+    void testFloatInequality();
+    void testSignedIntGreater();
+    void testSignedShortGreater();
+    void testSignedCharGreater();
+    void testUnsignedIntGreater();
+    void testUnsignedShortGreater();
+    void testUnsignedCharGreater();
+    void testFloatGreater();
+    void testSignedIntLess();
+    void testSignedShortLess();
+    void testSignedCharLess();
+    void testUnsignedIntLess();
+    void testUnsignedShortLess();
+    void testUnsignedCharLess();
+    void testFloatLess();
+    void testSignedIntGreaterEquals();
+    void testSignedShortGreaterEquals();
+    void testSignedCharGreaterEquals();
+    void testUnsignedIntGreaterEquals();
+    void testUnsignedShortGreaterEquals();
+    void testUnsignedCharGreaterEquals();
+    void testFloatGreaterEquals();
+    void testSignedIntLessEquals();
+    void testSignedShortLessEquals();
+    void testSignedCharLessEquals();
+    void testUnsignedIntLessEquals();
+    void testUnsignedShortLessEquals();
+    void testUnsignedCharLessEquals();
+    void testFloatLessEquals();
+
+    // logical operators
+    void testSignedIntAnd();
+    void testSignedShortAnd();
+    void testSignedCharAnd();
+    void testUnsignedIntAnd();
+    void testUnsignedShortAnd();
+    void testUnsignedCharAnd();
+    void testFloatAnd();
+    void testSignedIntOr();
+    void testSignedShortOr();
+    void testSignedCharOr();
+    void testUnsignedIntOr();
+    void testUnsignedShortOr();
+    void testUnsignedCharOr();
+    void testFloatOr();
+    //XXX
+    void testSignedNot();
+    void testUnsignedNot();
+    void testFloatNot();
+    void testSelection();
+    
+    //XXX vector instructions
+    void testExtractElement();
+    void testInsertElement();
+    void testShuffleVector();
+    
+    //conversion instructions
+    void testSignedTruncation();
+    void testUnsignedTruncation();
+    void testSignExtension();
+    void testZeroExtension();
+    void testSignedToFloat();
+    void testUnsignedToFloat();
+    void testFloatToSigned();
+    void testFloatToUnsigned();
+    //XXX
+    void testVectorBitcastTruncation4To1();
+    void testVectorBitcastTruncation2To1();
+    void testVectorBitcastExtension1To4();
+    void testVectorBitcastExtension1To2();
+
+private:
+    vc4c::Configuration config;
+    
+    void onMismatch(const std::string& expected, const std::string& result);
+};
+
+#endif /* VC4C_TEST_ARITHMETIC_H */
\ No newline at end of file
diff --git a/test/sources.list b/test/sources.list
index 4aa3df3e..40f67d1a 100644
--- a/test/sources.list
+++ b/test/sources.list
@@ -4,6 +4,8 @@ target_sources(TestVC4C
     RegressionTest.h
     test_cases.h
     test.cpp
+    TestArithmetic.cpp
+    TestArithmetic.h
     TestEmulator.cpp
     TestEmulator.h
     TestGraph.cpp
diff --git a/test/test.cpp b/test/test.cpp
index d487f5b9..1266141e 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -9,6 +9,7 @@
 
 #include "cpptest.h"
 #include "cpptest-main.h"
+#include "TestArithmetic.h"
 #include "TestEmulator.h"
 #include "TestGraph.h"
 #include "TestInstructions.h"
@@ -78,6 +79,7 @@ int main(int argc, char** argv)
     Test::registerSuite(newEmulatorTest, "test-emulator", "Runs selected code-samples through the emulator");
     Test::registerSuite(newStdLibTest, "test-stdlib", "Runs most of the VC4CL std-lib functions in emulator");
     Test::registerSuite(Test::newInstance<TestGraph>, "test-graph", "Runs basic test for the graph data structure");
+    Test::registerSuite(Test::newInstance<TestArithmetic>, "test-arithmetic", "Runs emulation tests for various kind of operations");
     
     for(auto i = 1; i < argc; ++i)
     { 
diff --git a/testing/OpenCV/color_lab.cl b/testing/OpenCV/color_lab.cl
new file mode 100644
index 00000000..16a96d25
--- /dev/null
+++ b/testing/OpenCV/color_lab.cl
@@ -0,0 +1,735 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if depth == 0
+    #define DATA_TYPE uchar
+    #define MAX_NUM  255
+    #define HALF_MAX_NUM 128
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_uchar_sat(num)
+    #define DEPTH_0
+#elif depth == 2
+    #define DATA_TYPE ushort
+    #define MAX_NUM  65535
+    #define HALF_MAX_NUM 32768
+    #define COEFF_TYPE int
+    #define SAT_CAST(num) convert_ushort_sat(num)
+    #define DEPTH_2
+#elif depth == 5
+    #define DATA_TYPE float
+    #define MAX_NUM  1.0f
+    #define HALF_MAX_NUM 0.5f
+    #define COEFF_TYPE float
+    #define SAT_CAST(num) (num)
+    #define DEPTH_5
+#else
+    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
+#endif
+
+#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
+
+enum
+{
+    xyz_shift  = 12,
+};
+
+#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
+
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
+
+#define DATA_TYPE_4 CAT(DATA_TYPE, 4)
+#define DATA_TYPE_3 CAT(DATA_TYPE, 3)
+
+///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
+
+__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset,
+                      int rows, int cols, __constant COEFF_TYPE * coeffs)
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (dx < cols)
+    {
+        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
+        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (dy < rows)
+            {
+                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
+                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
+
+                DATA_TYPE_4 src_pix = vload4(0, src);
+                DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;
+
+#ifdef DEPTH_5
+                float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));
+                float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));
+                float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));
+#else
+                int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);
+                int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);
+                int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);
+#endif
+                dst[0] = SAT_CAST(x);
+                dst[1] = SAT_CAST(y);
+                dst[2] = SAT_CAST(z);
+
+                ++dy;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset,
+                      int rows, int cols, __constant COEFF_TYPE * coeffs)
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (dx < cols)
+    {
+        int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));
+        int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (dy < rows)
+            {
+                __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
+                __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
+
+                DATA_TYPE_4 src_pix = vload4(0, src);
+                DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;
+
+#ifdef DEPTH_5
+                float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));
+                float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));
+                float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));
+#else
+                int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);
+                int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);
+                int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);
+#endif
+
+                DATA_TYPE dst0 = SAT_CAST(b);
+                DATA_TYPE dst1 = SAT_CAST(g);
+                DATA_TYPE dst2 = SAT_CAST(r);
+#if dcn == 3 || defined DEPTH_5
+                dst[0] = dst0;
+                dst[1] = dst1;
+                dst[2] = dst2;
+#if dcn == 4
+                dst[3] = MAX_NUM;
+#endif
+#else
+                *(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);
+#endif
+
+                ++dy;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+/////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////
+
+#define lab_shift xyz_shift
+#define gamma_shift 3
+#define lab_shift2 (lab_shift + gamma_shift)
+#define GAMMA_TAB_SIZE 1024
+#define GammaTabScale (float)GAMMA_TAB_SIZE
+
+inline float splineInterpolate(float x, __global const float * tab, int n)
+{
+    int ix = clamp(convert_int_sat_rtn(x), 0, n-1);
+    x -= ix;
+    tab += ix << 2;
+    return fma(fma(fma(tab[3], x, tab[2]), x, tab[1]), x, tab[0]);
+}
+
+#ifdef DEPTH_0
+
+__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+                      __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
+                      __constant int * coeffs, int Lscale, int Lshift)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const uchar* src_ptr = src + src_index;
+                __global uchar* dst_ptr = dst + dst_index;
+                uchar4 src_pix = vload4(0, src_ptr);
+
+                int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+                    C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+                    C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+                int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];
+                int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];
+                int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];
+                int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];
+
+                int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
+                int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );
+                int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );
+
+                dst_ptr[0] = SAT_CAST(L);
+                dst_ptr[1] = SAT_CAST(a);
+                dst_ptr[2] = SAT_CAST(b);
+
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+#elif defined DEPTH_5
+
+__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float _1_3, float _a)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const float * src = (__global const float *)(srcptr + src_index);
+                __global float * dst = (__global float *)(dstptr + dst_index);
+                float4 src_pix = vload4(0, src);
+
+                float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+                      C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+                      C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+                float R = clamp(src_pix.x, 0.0f, 1.0f);
+                float G = clamp(src_pix.y, 0.0f, 1.0f);
+                float B = clamp(src_pix.z, 0.0f, 1.0f);
+
+#ifdef SRGB
+                R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+                // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3
+                float X = fma(R, C0, fma(G, C1, B*C2));
+                float Y = fma(R, C3, fma(G, C4, B*C5));
+                float Z = fma(R, C6, fma(G, C7, B*C8));
+
+                float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);
+                float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);
+                float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);
+
+                float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);
+                float a = 500.f * (FX - FY);
+                float b = 200.f * (FY - FZ);
+
+                dst[0] = L;
+                dst[1] = a;
+                dst[2] = b;
+
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+#endif
+
+inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float lThresh, float fThresh)
+{
+    float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];
+
+    float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+          C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+          C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+    float y, fy;
+    // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4)
+    if (li <= lThresh)
+    {
+        y = li / 903.3f;
+        fy = fma(7.787f, y, 16.0f / 116.0f);
+    }
+    else
+    {
+        fy = (li + 16.0f) / 116.0f;
+        y = fy * fy * fy;
+    }
+
+    float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
+
+    #pragma unroll
+    for (int j = 0; j < 2; j++)
+        if (fxz[j] <= fThresh)
+            fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
+        else
+            fxz[j] = fxz[j] * fxz[j] * fxz[j];
+
+    float x = fxz[0], z = fxz[1];
+    float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);
+    float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);
+    float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);
+
+#ifdef SRGB
+    ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+    go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+    bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+    dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;
+}
+
+#ifdef DEPTH_0
+
+__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float lThresh, float fThresh)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const uchar* src_ptr = src + src_index;
+                __global uchar * dst_ptr = dst + dst_index;
+                uchar4 src_pix = vload4(0, src_ptr);
+
+                float srcbuf[3], dstbuf[3];
+                srcbuf[0] = src_pix.x*(100.f/255.f);
+                srcbuf[1] = convert_float(src_pix.y - 128);
+                srcbuf[2] = convert_float(src_pix.z - 128);
+
+                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+#ifdef SRGB
+                    gammaTab,
+#endif
+                    coeffs, lThresh, fThresh);
+
+#if dcn == 3
+                dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
+                dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
+                dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
+#else
+                *(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),
+                    SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);
+#endif
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+#elif defined DEPTH_5
+
+__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float lThresh, float fThresh)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+        {
+            if (y < rows)
+            {
+                __global const float * src = (__global const float *)(srcptr + src_index);
+                __global float * dst = (__global float *)(dstptr + dst_index);
+                float4 src_pix = vload4(0, src);
+
+                float srcbuf[3], dstbuf[3];
+                srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;
+
+                Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+#ifdef SRGB
+                    gammaTab,
+#endif
+                    coeffs, lThresh, fThresh);
+
+                dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
+#if dcn == 4
+                dst[3] = MAX_NUM;
+#endif
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+        }
+    }
+}
+
+#endif
+
+/////////////////////////////////// [l|s]RGB <-> Luv ///////////////////////////
+
+#define LAB_CBRT_TAB_SIZE 1024
+#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
+
+__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
+
+#ifdef DEPTH_5
+
+__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+            if (y < rows)
+            {
+                __global const float * src = (__global const float *)(srcptr + src_index);
+                __global float * dst = (__global float *)(dstptr + dst_index);
+
+                float R = src[0], G = src[1], B = src[2];
+
+                R = clamp(R, 0.f, 1.f);
+                G = clamp(G, 0.f, 1.f);
+                B = clamp(B, 0.f, 1.f);
+
+#ifdef SRGB
+                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
+                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
+                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
+
+                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                L = fma(116.f, L, -16.f);
+
+                float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
+                float u = L*fma(X, d, -_un);
+                float v = L*fma(2.25f, Y*d, -_vn);
+
+                dst[0] = L;
+                dst[1] = u;
+                dst[2] = v;
+
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+    }
+}
+
+#elif defined DEPTH_0
+
+__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+            if (y < rows)
+            {
+                float scale = 1.0f / 255.0f;
+                float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;
+
+#ifdef SRGB
+                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+                float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));
+                float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));
+                float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));
+
+                float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
+                L = 116.f*L - 16.f;
+
+                float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);
+                float u = L*(X*d - _un);
+                float v = L*fma(2.25f, Y*d, -_vn);
+
+                dst[0] = SAT_CAST(L * 2.55f);
+                //0.72033 = 255/(220+134), 96.525 = 134*255/(220+134)
+                dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));
+                //0.9732 = 255/(140+122), 136.259 = 140*255/(140+122)
+                dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));
+
+                ++y;
+                dst += dst_step;
+                src += src_step;
+            }
+    }
+}
+
+#endif
+
+#ifdef DEPTH_5
+
+__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float _un, float _vn)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+            if (y < rows)
+            {
+                __global const float * src = (__global const float *)(srcptr + src_index);
+                __global float * dst = (__global float *)(dstptr + dst_index);
+
+                float L = src[0], u = src[1], v = src[2], X, Y, Z;
+                if(L >= 8)
+                {
+                    Y = fma(L, 1.f/116.f, 16.f/116.f);
+                    Y = Y*Y*Y;
+                }
+                else
+                {
+                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
+                }
+                float up = 3.f*fma(L, _un, u);
+                float vp = 0.25f/fma(L, _vn, v);
+                vp = clamp(vp, -0.25f, 0.25f);
+                X = 3.f*Y*up*vp;
+                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);
+
+                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
+                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
+                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
+
+                R = clamp(R, 0.f, 1.f);
+                G = clamp(G, 0.f, 1.f);
+                B = clamp(B, 0.f, 1.f);
+
+#ifdef SRGB
+                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+                dst[0] = R;
+                dst[1] = G;
+                dst[2] = B;
+#if dcn == 4
+                dst[3] = MAX_NUM;
+#endif
+                ++y;
+                dst_index += dst_step;
+                src_index += src_step;
+            }
+    }
+}
+
+#elif defined DEPTH_0
+
+__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float _un, float _vn)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * PIX_PER_WI_Y;
+
+    if (x < cols)
+    {
+        src += mad24(y, src_step, mad24(x, scnbytes, src_offset));
+        dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));
+
+        #pragma unroll
+        for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
+            if (y < rows)
+            {
+                float d, X, Y, Z;
+                float L = src[0]*(100.f/255.f);
+                // 1.388235294117647 = (220+134)/255
+                float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);
+                // 1.027450980392157 = (140+122)/255
+                float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f);
+                if(L >= 8)
+                {
+                    Y = fma(L, 1.f/116.f, 16.f/116.f);
+                    Y = Y*Y*Y;
+                }
+                else
+                {
+                    Y = L * (1.0f/903.3f); // L*(3./29.)^3
+                }
+                float up = 3.f*fma(L, _un, u);
+                float vp = 0.25f/fma(L, _vn, v);
+                vp = clamp(vp, -0.25f, 0.25f);
+                X = 3.f*Y*up*vp;
+                Z = Y*fma(fma(12.f*13.f, L, -up), vp, -5.f);
+
+                //limit X, Y, Z to [0, 2] to fit white point
+                X = clamp(X, 0.f, 2.f); Z = clamp(Z, 0.f, 2.f);
+
+                float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));
+                float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));
+                float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));
+
+                R = clamp(R, 0.f, 1.f);
+                G = clamp(G, 0.f, 1.f);
+                B = clamp(B, 0.f, 1.f);
+
+#ifdef SRGB
+                R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+                B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+                uchar dst0 = SAT_CAST(R * 255.0f);
+                uchar dst1 = SAT_CAST(G * 255.0f);
+                uchar dst2 = SAT_CAST(B * 255.0f);
+
+#if dcn == 4
+                *(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);
+#else
+                dst[0] = dst0;
+                dst[1] = dst1;
+                dst[2] = dst2;
+#endif
+
+                ++y;
+                dst += dst_step;
+                src += src_step;
+            }
+    }
+}
+
+#endif
diff --git a/testing/bugs/vc4cl_41_hang.cl b/testing/bugs/vc4cl_41_hang.cl
new file mode 100644
index 00000000..9e9790fb
--- /dev/null
+++ b/testing/bugs/vc4cl_41_hang.cl
@@ -0,0 +1,18 @@
+__kernel void TestMagic(const int total, const int is_deeper_magic, const float alpha_s, const float fore_th, __global const float* gradx, __global const float* grady,
+    //in/out
+    __global float* BSx,  __global float* BSy, __global int* mapRes)
+{
+    private const size_t i        = get_global_id(0);
+    private const size_t gpu_used = get_global_size(0);
+
+    private const size_t elements_count = total / (gpu_used * 16);
+    private const size_t offset = i * total / gpu_used;
+ 
+    for (size_t k = 0; k < elements_count; ++k)
+    {
+        int16 mr = vload16( k , mapRes + offset);
+        const int16 twos   = 2;
+        mr += twos;
+        vstore16(mr, k, mapRes + offset);
+   }
+}