From 707e55baf2ad9cc4a502ed98417773758fe8c79e Mon Sep 17 00:00:00 2001 From: doe300 Date: Sat, 31 Aug 2019 08:55:30 +0200 Subject: [PATCH] Cleans up CMake and fixes some warnings/bugs - Also fixes a few bugs - Fixes some TSAN findings --- CMakeLists.txt | 4 +++- cmake/sanitizers.cmake | 23 ++++++++++++++-------- src/Bitfield.h | 9 ++++++++- src/Compiler.cpp | 3 ++- src/Types.cpp | 7 ++++++- src/Units.h | 2 ++ src/Values.cpp | 4 ++-- src/asm/OpCodes.cpp | 4 ++-- src/helper.h | 2 +- src/intermediate/VectorHelper.cpp | 7 ++++--- src/intermediate/operators.h | 14 +++++++++++++ src/intrinsics/Images.cpp | 1 - src/intrinsics/Intrinsics.cpp | 19 +++++++++++------- src/llvm/BitcodeReader.cpp | 4 ++-- src/main.cpp | 2 +- src/normalization/AddressCalculation.h | 2 +- src/normalization/MemoryAccess.cpp | 15 +++++++------- src/normalization/MemoryMapChecks.cpp | 2 +- src/normalization/Normalizer.cpp | 8 +++++--- src/optimization/ControlFlow.cpp | 15 +++++++------- src/optimization/InstructionScheduler.cpp | 2 +- src/optimization/Optimizer.cpp | 3 ++- src/periphery/VPM.h | 6 ++++++ src/tools/Emulator.cpp | 24 +++++++++++++---------- test/TestMemoryAccess.cpp | 4 ++-- test/emulation_helper.h | 5 +++-- 26 files changed, 125 insertions(+), 66 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6663ea9e..fa2ffa52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,8 @@ option(LLVMLIB_FRONTEND "Enables the front-end using the LLVM library to read LL option(ENABLE_COVERAGE "Enables collection of code coverage via gcov" OFF) # Option whether to enable use of the CLang library (EXPERIMENTAL) option(CLANG_LIBRARY "Uses the libclang library for compilation, uses the clang executable otherwise" OFF) +# Option whether to enable more compile-time checks +option(ADVANCED_CHECKS "Enable advanced compile-time checks" OFF) # Path to the VC4CL standard library # NOTE: Resolving ~ (for home directory) is currently not supported @@ -78,7 +80,7 @@ endif() # clang-tidy find_program(CLANG_TIDY NAMES clang-tidy clang-tidy-5.0 clang-tidy-6.0 clang-tidy-7 clang-tidy-8) -if(FALSE AND BUILD_DEBUG AND CLANG_TIDY) +if(ADVANCED_CHECKS AND CLANG_TIDY) message(STATUS "Enabling clang-tidy compile time checks: ${CLANG_TIDY}") set(CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY}") endif() diff --git a/cmake/sanitizers.cmake b/cmake/sanitizers.cmake index 761e3bec..0539f318 100644 --- a/cmake/sanitizers.cmake +++ b/cmake/sanitizers.cmake @@ -14,17 +14,24 @@ elseif("${CMAKE_BUILD_TYPE}" STREQUAL "ubsan") message(STATUS "Build will have UBSAN enabled!") endif() +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(SANITIZERS_ADDITIONAL_FLAGS "-Og") +else() + # -Og is not supported by CLang + set(SANITIZERS_ADDITIONAL_FLAGS "") +endif() + # AddressSanitize -set(CMAKE_C_FLAGS_ASAN "-fsanitize=address -fno-omit-frame-pointer -g -Og") -set(CMAKE_CXX_FLAGS_ASAN "-fsanitize=address -fno-omit-frame-pointer -g -Og") +set(CMAKE_C_FLAGS_ASAN "-fsanitize=address -fno-omit-frame-pointer -g ${SANITIZERS_ADDITIONAL_FLAGS}") +set(CMAKE_CXX_FLAGS_ASAN "-fsanitize=address -fno-omit-frame-pointer -g ${SANITIZERS_ADDITIONAL_FLAGS}") # ThreadSanitizer -set(CMAKE_C_FLAGS_TSAN "-fsanitize=thread -fno-omit-frame-pointer -g -Og") -set(CMAKE_CXX_FLAGS_TSAN "-fsanitize=thread -fno-omit-frame-pointer -g -Og") +set(CMAKE_C_FLAGS_TSAN "-fsanitize=thread -fno-omit-frame-pointer -g ${SANITIZERS_ADDITIONAL_FLAGS}") +set(CMAKE_CXX_FLAGS_TSAN "-fsanitize=thread -fno-omit-frame-pointer -g ${SANITIZERS_ADDITIONAL_FLAGS}") # LeakSanitizer -set(CMAKE_C_FLAGS_LSAN 
"-fsanitize=leak -fno-omit-frame-pointer -g -Og") -set(CMAKE_CXX_FLAGS_LSAN "-fsanitize=leak -fno-omit-frame-pointer -g -Og") +set(CMAKE_C_FLAGS_LSAN "-fsanitize=leak -fno-omit-frame-pointer -g ${SANITIZERS_ADDITIONAL_FLAGS}") +set(CMAKE_CXX_FLAGS_LSAN "-fsanitize=leak -fno-omit-frame-pointer -g ${SANITIZERS_ADDITIONAL_FLAGS}") # MemorySanitizer (Clang only!) set(CMAKE_C_FLAGS_MSAN "-fsanitize=memory -fno-optimize-sibling-calls -fsanitize-memory-track-origins=2 -fno-omit-frame-pointer -g -O2") @@ -32,5 +39,5 @@ set(CMAKE_CXX_FLAGS_MSAN "-fsanitize=memory -fno-optimize-sibling-calls -fsaniti # UndefinedBehaviour # Run with: UBSAN_OPTIONS=print_stacktrace=1 environment variable -set(CMAKE_C_FLAGS_UBSAN "-fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-omit-frame-pointer -g -Og") -set(CMAKE_CXX_FLAGS_UBSAN "-fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-omit-frame-pointer -g -Og") +set(CMAKE_C_FLAGS_UBSAN "-fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-omit-frame-pointer -g ${SANITIZERS_ADDITIONAL_FLAGS}") +set(CMAKE_CXX_FLAGS_UBSAN "-fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-omit-frame-pointer -g ${SANITIZERS_ADDITIONAL_FLAGS}") diff --git a/src/Bitfield.h b/src/Bitfield.h index 8b933d2e..f34d3a9c 100644 --- a/src/Bitfield.h +++ b/src/Bitfield.h @@ -19,6 +19,11 @@ namespace vc4c { + template + using underlying_type = typename std::conditional::value, std::underlying_type, + typename std::conditional::value || std::is_same::value, std::remove_reference, + T>::type>::type; + /* * A bit-field is an integral type with a specific binary layout where different parts of the binary code have * different meanings. @@ -102,7 +107,8 @@ namespace vc4c template constexpr inline T getEntry(UnderlyingType pos, UnderlyingType mask) const noexcept { - return static_cast(mask & getValue(pos)); + auto tmp = static_cast::type>(mask & getValue(pos)); + return static_cast(tmp); } template @@ -193,6 +199,7 @@ namespace vc4c */ struct InstructionPart { + using type = unsigned char; unsigned char value; explicit constexpr InstructionPart(unsigned char val) noexcept : value(val) {} diff --git a/src/Compiler.cpp b/src/Compiler.cpp index 845932ae..a56a84be 100644 --- a/src/Compiler.cpp +++ b/src/Compiler.cpp @@ -108,8 +108,9 @@ std::size_t Compiler::convert() norm.adjust(module); PROFILE_END(SecondNormalizer); + auto kernels = module.getKernels(); const auto f = [&codeGen](Method* kernelFunc) -> void { codeGen.toMachineCode(*kernelFunc); }; - ThreadPool{"CodeGenerator"}.scheduleAll(module.getKernels(), f); + ThreadPool{"CodeGenerator"}.scheduleAll(kernels, f); // TODO could discard unused globals // since they are exported, they are still in the intermediate code, even if not used (e.g. optimized away) diff --git a/src/Types.cpp b/src/Types.cpp index 5e537c71..bb539717 100644 --- a/src/Types.cpp +++ b/src/Types.cpp @@ -449,7 +449,12 @@ bool StructType::operator==(const ComplexType& other) const const StructType* right = dynamic_cast(&other); if(right == nullptr) return false; - return isPacked == right->isPacked && elementTypes == right->elementTypes; + // NOTE: comparing by name is technically not necessary, since we don't actually care whether two struct types with + // the same layout are the exact same types or not. 
It is required, however, to work around a quirk in + // BitcodeReader#toDataType: struct types are first inserted into the TypeHolder without their elements, and the + // element types are only resolved afterwards. If the first element type is itself a newly created struct type, both + // types have no elements yet and are therefore considered equal. This occurs e.g. in testing/bullet/jointSolver.cl. + return name == right->name && isPacked == right->isPacked && elementTypes == right->elementTypes; } unsigned int StructType::getStructSize(const int index) const diff --git a/src/Units.h b/src/Units.h index c30b2d2c..74dd4aa4 100644 --- a/src/Units.h +++ b/src/Units.h @@ -17,6 +17,7 @@ namespace vc4c struct Byte { public: + using type = uint64_t; constexpr explicit Byte(uint64_t val) : value(val) {} constexpr explicit operator uint64_t() const @@ -65,6 +66,7 @@ namespace vc4c struct Word { public: + using type = uint64_t; constexpr explicit Word(uint64_t val) : value(val) {} constexpr explicit Word(Byte bytes) : value(bytes.value / sizeof(uint64_t)) {} diff --git a/src/Values.cpp b/src/Values.cpp index 40abcde4..f52662ac 100644 --- a/src/Values.cpp +++ b/src/Values.cpp @@ -540,7 +540,7 @@ SIMDVector SIMDVector::rotate(uint8_t offset) const & SIMDVector copy(*this); //"Rotates the order of the elements in the range [first,last), in such a way that the element pointed by middle // becomes the new first element." - offset = (NATIVE_VECTOR_SIZE - offset); + offset = static_cast(NATIVE_VECTOR_SIZE - offset); std::rotate(copy.begin(), copy.begin() + offset, copy.end()); return copy; } @@ -549,7 +549,7 @@ SIMDVector SIMDVector::rotate(uint8_t offset) && { //"Rotates the order of the elements in the range [first,last), in such a way that the element pointed by middle // becomes the new first element." 
- offset = (NATIVE_VECTOR_SIZE - offset); + offset = static_cast(NATIVE_VECTOR_SIZE - offset); std::rotate(begin(), begin() + offset, end()); return std::move(*this); } diff --git a/src/asm/OpCodes.cpp b/src/asm/OpCodes.cpp index 5ca6298e..0674cca1 100644 --- a/src/asm/OpCodes.cpp +++ b/src/asm/OpCodes.cpp @@ -754,7 +754,7 @@ static PrecalculatedLiteral calcLiteral(const OpCode& code, Literal firstLit, Li // Tests have shown that on VC4 all shifts (asr, shr, shl) only take the last 5 bits of the offset (modulo 32) auto offset = secondLit.unsignedInt() & 0x1F; // carry is set if bits set are shifted out of the register: val & (2^shift-offset-1) != 0 - auto shiftLoss = firstLit.unsignedInt() & ((1 << offset) - 1); + auto shiftLoss = firstLit.unsignedInt() & ((1u << offset) - 1u); return setFlags(intermediate::asr(firstLit, secondLit), shiftLoss != 0, false); } if(code == OP_CLZ) @@ -851,7 +851,7 @@ static PrecalculatedLiteral calcLiteral(const OpCode& code, Literal firstLit, Li // Tests have shown that on VC4 all shifts (asr, shr, shl) only take the last 5 bits of the offset (modulo 32) auto offset = secondLit.unsignedInt() & 0x1F; // carry is set if bits set are shifted out of the register: val & (2^shift-offset-1) != 0 - auto shiftLoss = firstLit.unsignedInt() & ((1 << offset) - 1); + auto shiftLoss = firstLit.unsignedInt() & ((1u << offset) - 1u); return setFlags(Literal(firstLit.unsignedInt() >> offset), shiftLoss != 0); } if(code == OP_SUB) diff --git a/src/helper.h b/src/helper.h index dff93bbd..c9a03442 100644 --- a/src/helper.h +++ b/src/helper.h @@ -21,7 +21,7 @@ #if __cplusplus > 201402L #define FALL_THROUGH [[fallthrough]]; -#elif defined(__clang_major__) && __clang_major__ >= 4 +#elif defined(__clang_major__) && (__clang_major__ >= 4 || (__clang_major__ == 3 && __clang_minor__ == 9)) #define FALL_THROUGH [[clang::fallthrough]]; #elif defined(__GNUC__) && __GNUC__ >= 7 #define FALL_THROUGH __attribute__((fallthrough)); diff --git a/src/intermediate/VectorHelper.cpp b/src/intermediate/VectorHelper.cpp index de9ff4d4..aa39489e 100644 --- a/src/intermediate/VectorHelper.cpp +++ b/src/intermediate/VectorHelper.cpp @@ -172,7 +172,7 @@ InstructionWalker intermediate::insertVectorInsertion( // other unchanged // we use the mask version of loads to set the elements we want to insert to and then use flags to insert only // those - unsigned maskLit = (1 << value.type.getVectorWidth()) - 1; + unsigned maskLit = (1u << value.type.getVectorWidth()) - 1u; auto shiftedMask = method.addNewLocal(TYPE_INT32, "%vector_mask"); if(auto lit = index.getLiteralValue()) { @@ -392,7 +392,8 @@ InstructionWalker intermediate::insertVectorShuffle(InstructionWalker it, Method else { it = insertVectorConcatenation(it, method, source0, source1, destination); - numCorrespondingIndices = source0.type.getVectorWidth() + source1.type.getVectorWidth(); + numCorrespondingIndices = + static_cast(source0.type.getVectorWidth() + source1.type.getVectorWidth()); } } @@ -413,7 +414,7 @@ InstructionWalker intermediate::insertVectorShuffle(InstructionWalker it, Method std::make_pair(rotateElementNumberDown(maskContainer[i].unsignedInt() - firstVectorSize, i), 1) : // source is from first vector std::make_pair(rotateElementNumberDown(maskContainer[i].unsignedInt(), i), 0); - relativeSources[source] |= (1 << i); + relativeSources[source] |= (1u << i); } for(const auto& sources : relativeSources) diff --git a/src/intermediate/operators.h b/src/intermediate/operators.h index 91d3a1f5..27a4940d 100644 --- 
a/src/intermediate/operators.h +++ b/src/intermediate/operators.h @@ -432,6 +432,12 @@ namespace vc4c it.nextInBlock(); } + void operator=(Value&& src) && + { + it.emplace(new intermediate::MoveOperation(Value(result), std::move(src))); + it.nextInBlock(); + } + NODISCARD ConditionCode operator=(ComparisonWrapper&& op) && { return op(it, result, op.arg0, op.arg1); @@ -483,6 +489,14 @@ namespace vc4c it.nextInBlock(); return result; } + + NODISCARD Value operator=(Value&& src) && + { + auto result = method.addNewLocal(type, name); + it.emplace(new intermediate::MoveOperation(Value(result), std::move(src))); + it.nextInBlock(); + return result; + } }; /** diff --git a/src/intrinsics/Images.cpp b/src/intrinsics/Images.cpp index e7719e55..3f33073b 100644 --- a/src/intrinsics/Images.cpp +++ b/src/intrinsics/Images.cpp @@ -320,5 +320,4 @@ InstructionWalker intermediate::insertQueryMeasurements( else throw CompilationError(CompilationStep::GENERAL, "Unsupported image dimensions", std::to_string(static_cast(imageType->dimensions))); - throw CompilationError(CompilationStep::GENERAL, "Unimplemented image-query function", it->to_string()); } diff --git a/src/intrinsics/Intrinsics.cpp b/src/intrinsics/Intrinsics.cpp index 0b02e601..451d51f7 100644 --- a/src/intrinsics/Intrinsics.cpp +++ b/src/intrinsics/Intrinsics.cpp @@ -644,7 +644,8 @@ static NODISCARD InstructionWalker intrinsifyArithmetic(Method& method, Instruct op->conditional, op->setFlags)) ->copyExtrasFrom(it.get())); } - else if(arg0.getLiteralValue() && isPowerTwo(arg0.getLiteralValue()->signedInt())) + else if(arg0.getLiteralValue() && arg0.getLiteralValue()->signedInt() > 0 && + isPowerTwo(arg0.getLiteralValue()->unsignedInt())) { // a * 2^n = a << n CPPLOG_LAZY(logging::Level::DEBUG, @@ -655,7 +656,8 @@ static NODISCARD InstructionWalker intrinsifyArithmetic(Method& method, Instruct op->conditional, op->setFlags)) ->copyExtrasFrom(it.get())); } - else if(arg1.getLiteralValue() && isPowerTwo(arg1.getLiteralValue()->signedInt())) + else if(arg1.getLiteralValue() && arg1.getLiteralValue()->signedInt() > 0 && + isPowerTwo(arg1.getLiteralValue()->unsignedInt())) { // a * 2^n = a << n CPPLOG_LAZY(logging::Level::DEBUG, @@ -674,7 +676,8 @@ static NODISCARD InstructionWalker intrinsifyArithmetic(Method& method, Instruct op->conditional, op->setFlags)) ->copyExtrasFrom(it.get())); } - else if(arg0.getLiteralValue() && isPowerTwo(arg0.getLiteralValue()->signedInt() + 1)) + else if(arg0.getLiteralValue() && arg0.getLiteralValue()->signedInt() > 0 && + isPowerTwo(arg0.getLiteralValue()->unsignedInt() + 1)) { // x * (2^k - 1) = x * 2^k - x = x << k - x CPPLOG_LAZY(logging::Level::DEBUG, @@ -686,7 +689,8 @@ static NODISCARD InstructionWalker intrinsifyArithmetic(Method& method, Instruct it.reset((new Operation(OP_SUB, op->getOutput().value(), tmp, arg1, op->conditional, op->setFlags)) ->copyExtrasFrom(it.get())); } - else if(arg1.getLiteralValue() && isPowerTwo(arg1.getLiteralValue()->signedInt() + 1)) + else if(arg1.getLiteralValue() && arg1.getLiteralValue()->signedInt() > 0 && + isPowerTwo(arg1.getLiteralValue()->unsignedInt() + 1)) { // x * (2^k - 1) = x * 2^k - x = x << k - x CPPLOG_LAZY(logging::Level::DEBUG, @@ -734,7 +738,8 @@ static NODISCARD InstructionWalker intrinsifyArithmetic(Method& method, Instruct it->addDecorations(InstructionDecorations::UNSIGNED_RESULT); } // a / 2^n = a >> n - else if(arg1.getLiteralValue() && isPowerTwo(arg1.getLiteralValue()->signedInt())) + else if(arg1.getLiteralValue() && arg1.getLiteralValue()->signedInt() 
> 0 && + isPowerTwo(arg1.getLiteralValue()->unsignedInt())) { CPPLOG_LAZY(logging::Level::DEBUG, log << "Intrinsifying division with right-shift: " << op->to_string() << logging::endl); @@ -772,7 +777,7 @@ static NODISCARD InstructionWalker intrinsifyArithmetic(Method& method, Instruct } // a / 2^n = (abs(a) >> n) * sign(a) else if(arg1.isLiteralValue() && arg1.getLiteralValue()->signedInt() > 0 && - isPowerTwo(arg1.getLiteralValue()->signedInt())) + isPowerTwo(arg1.getLiteralValue()->unsignedInt())) { CPPLOG_LAZY(logging::Level::DEBUG, log << "Intrinsifying signed division with right-shift and sign-copy: " << op->to_string() @@ -874,7 +879,7 @@ static NODISCARD InstructionWalker intrinsifyArithmetic(Method& method, Instruct } // a % 2^n = (abs(a) & 2^n-1) * sign(a) else if(arg1.isLiteralValue() && arg1.getLiteralValue()->signedInt() > 0 && - isPowerTwo(arg1.getLiteralValue()->signedInt())) + isPowerTwo(arg1.getLiteralValue()->unsignedInt())) { CPPLOG_LAZY(logging::Level::DEBUG, log << "Intrinsifying signed modulo by power of two: " << op->to_string() << logging::endl); diff --git a/src/llvm/BitcodeReader.cpp b/src/llvm/BitcodeReader.cpp index cbf2be25..995cb73d 100644 --- a/src/llvm/BitcodeReader.cpp +++ b/src/llvm/BitcodeReader.cpp @@ -690,10 +690,10 @@ Method& BitcodeReader::parseFunction(Module& module, const llvm::Function& func) toParameterDecorations(arg, type, func.getCallingConv() == llvm::CallingConv::SPIR_KERNEL))); auto& param = method->parameters.back(); if(arg.getDereferenceableBytes() != 0) - param.maxByteOffset = arg.getDereferenceableBytes(); + param.maxByteOffset = static_cast(arg.getDereferenceableBytes()); #if LLVM_LIBRARY_VERSION >= 39 if(arg.getDereferenceableOrNullBytes() != 0) - param.maxByteOffset = arg.getDereferenceableOrNullBytes(); + param.maxByteOffset = static_cast(arg.getDereferenceableOrNullBytes()); #endif CPPLOG_LAZY(logging::Level::DEBUG, log << "Reading parameter " << param.to_string(true) << logging::endl); if(param.type.getImageType() && func.getCallingConv() == llvm::CallingConv::SPIR_KERNEL) diff --git a/src/main.cpp b/src/main.cpp index 84d217c7..6b536a14 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -100,7 +100,7 @@ static void printHelp() static std::string toVersionString(unsigned version) { std::stringstream s; - s << (version / 10.0f); + s << (static_cast(version) / 10.0f); return s.str(); } diff --git a/src/normalization/AddressCalculation.h b/src/normalization/AddressCalculation.h index b8560c9b..dd7a69d1 100644 --- a/src/normalization/AddressCalculation.h +++ b/src/normalization/AddressCalculation.h @@ -122,4 +122,4 @@ namespace vc4c }; } /* namespace normalization */ } /* namespace vc4c */ -#endif /* VC4C_NORMALIZATION_ADDRESS_CALCULATION_H */ \ No newline at end of file +#endif /* VC4C_NORMALIZATION_ADDRESS_CALCULATION_H */ diff --git a/src/normalization/MemoryAccess.cpp b/src/normalization/MemoryAccess.cpp index 6386e7ba..30a6fb50 100644 --- a/src/normalization/MemoryAccess.cpp +++ b/src/normalization/MemoryAccess.cpp @@ -87,11 +87,12 @@ static BaseAndOffset findBaseAndOffset(const Value& val) ->local() ->getBase(false) ->createReference(), - static_cast((*std::find_if(args.begin(), args.end(), - [](const Value& arg) -> bool { return arg.getLiteralValue().has_value(); })) - .getLiteralValue() - ->signedInt() / - val.type.getElementType().getInMemoryWidth())); + (*std::find_if( + args.begin(), args.end(), [](const Value& arg) -> bool { return arg.getLiteralValue().has_value(); })) + .getLiteralValue() + ->signedInt() / + /* in-memory 
width can't be > 2^30 anyway */ + static_cast(val.type.getElementType().getInMemoryWidth())); } // 3. an addition with two locals -> one is the base, the other the calculation of the literal @@ -103,10 +104,10 @@ static BaseAndOffset findBaseAndOffset(const Value& val) const auto offset1 = findOffset(args[1]); if(offset0.offset && args[1].checkLocal()) return BaseAndOffset(args[1].local()->getBase(false)->createReference(), - static_cast(offset0.offset.value() / val.type.getElementType().getInMemoryWidth())); + offset0.offset.value() / static_cast(val.type.getElementType().getInMemoryWidth())); if(offset1.offset && args[0].checkLocal()) return BaseAndOffset(args[0].local()->getBase(false)->createReference(), - static_cast(offset1.offset.value() / val.type.getElementType().getInMemoryWidth())); + offset1.offset.value() / static_cast(val.type.getElementType().getInMemoryWidth())); } /* if(writers.size() == 1) diff --git a/src/normalization/MemoryMapChecks.cpp b/src/normalization/MemoryMapChecks.cpp index 3a48e5a3..ec5dd209 100644 --- a/src/normalization/MemoryMapChecks.cpp +++ b/src/normalization/MemoryMapChecks.cpp @@ -60,7 +60,7 @@ Optional normalization::getConstantValue(const Value& source) return globalContainer->at(0).toValue(); if(globalContainer && source.local()->reference.second >= 0) // fixed index - return globalContainer->at(source.local()->reference.second).toValue(); + return globalContainer->at(static_cast(source.local()->reference.second)).toValue(); if(auto val = global->initialValue.toValue()) { if(auto vector = val->checkVector()) diff --git a/src/normalization/Normalizer.cpp b/src/normalization/Normalizer.cpp index ea9bab47..78725660 100644 --- a/src/normalization/Normalizer.cpp +++ b/src/normalization/Normalizer.cpp @@ -169,8 +169,9 @@ void Normalizer::normalize(Module& module) const PROFILE_COUNTER_WITH_PREV(vc4c::profiler::COUNTER_NORMALIZATION + 2, "Eliminate Phi-nodes (after)", method->countInstructions(), vc4c::profiler::COUNTER_NORMALIZATION + 1); } + auto kernels = module.getKernels(); // 2. inline kernel-functions - for(Method* kernelFunc : module.getKernels()) + for(Method* kernelFunc : kernels) { Method& kernel = *kernelFunc; @@ -183,14 +184,15 @@ void Normalizer::normalize(Module& module) const } // 3. 
run other normalization steps on kernel functions const auto f = [&module, this](Method* kernelFunc) -> void { normalizeMethod(module, *kernelFunc); }; - ThreadPool{"Normalization"}.scheduleAll(module.getKernels(), f); + ThreadPool{"Normalization"}.scheduleAll(kernels, f); } void Normalizer::adjust(Module& module) const { // run adjustment steps on kernel functions + auto kernels = module.getKernels(); const auto f = [&module, this](Method* kernelFunc) -> void { adjustMethod(module, *kernelFunc); }; - ThreadPool{"Adjustment"}.scheduleAll(module.getKernels(), f); + ThreadPool{"Adjustment"}.scheduleAll(kernels, f); } void Normalizer::normalizeMethod(Module& module, Method& method) const diff --git a/src/optimization/ControlFlow.cpp b/src/optimization/ControlFlow.cpp index 19534c50..74c0867b 100644 --- a/src/optimization/ControlFlow.cpp +++ b/src/optimization/ControlFlow.cpp @@ -458,9 +458,9 @@ static std::size_t fixVPMSetups(ControlFlowLoop& loop, unsigned vectorizationFac { periphery::VPWSetupWrapper vpwSetup(static_cast(nullptr)); if(auto load = it.get()) - vpwSetup = periphery::VPWSetupWrapper(it.get()); + vpwSetup = periphery::VPWSetupWrapper(load); else if(auto move = it.get()) - vpwSetup = periphery::VPWSetupWrapper(it.get()); + vpwSetup = periphery::VPWSetupWrapper(move); else throw CompilationError(CompilationStep::OPTIMIZER, "Unsupported instruction to write VPM for vectorized value", it->to_string()); @@ -479,9 +479,9 @@ static std::size_t fixVPMSetups(ControlFlowLoop& loop, unsigned vectorizationFac { periphery::VPRSetupWrapper vprSetup(static_cast(nullptr)); if(auto load = it.get()) - vprSetup = periphery::VPRSetupWrapper(it.get()); + vprSetup = periphery::VPRSetupWrapper(load); else if(auto move = it.get()) - vprSetup = periphery::VPRSetupWrapper(it.get()); + vprSetup = periphery::VPRSetupWrapper(move); else throw CompilationError(CompilationStep::OPTIMIZER, "Unsupported instruction to write VPM for vectorized value", it->to_string()); @@ -611,7 +611,7 @@ static void vectorize(ControlFlowLoop& loop, InductionVariable& inductionVariabl FastSet openInstructions; const_cast(inductionVariable.local->type) = inductionVariable.local->type.toVectorType( - inductionVariable.local->type.getVectorWidth() * static_cast(vectorizationFactor)); + static_cast(inductionVariable.local->type.getVectorWidth() * vectorizationFactor)); scheduleForVectorization(inductionVariable.local, openInstructions, loop); std::size_t numVectorized = 0; @@ -649,7 +649,8 @@ static void vectorize(ControlFlowLoop& loop, InductionVariable& inductionVariabl // since we are in the process of vectorization, the local type is already updated, but the argument // type is pending. For the next steps, we need to update the argument type. arg.type = arg.local()->type; - auto origVectorType = arg.type.toVectorType(arg.type.getVectorWidth() / vectorizationFactor); + auto origVectorType = + arg.type.toVectorType(static_cast(arg.type.getVectorWidth() / vectorizationFactor)); Optional accumulationOp{}; if(auto writer = arg.getSingleWriter()) @@ -1622,7 +1623,7 @@ static FastAccessList findIfElseBlocks(ControlFlowGraph& graph) if(candidateBlock.successor != nullptr && candidateBlock.conditionalBlocks.size() > 1) blocks.emplace_back(std::move(candidateBlock)); - else if(false) // TODO needs testing! + else if(/* DISABLES CODE */ (false)) // TODO needs testing! { // TODO also needs extension in rewriting, special handling for successor which is conditional! 
// we failed with simple version above, recheck for more complex version: diff --git a/src/optimization/InstructionScheduler.cpp b/src/optimization/InstructionScheduler.cpp index 927e8fe2..ea019b60 100644 --- a/src/optimization/InstructionScheduler.cpp +++ b/src/optimization/InstructionScheduler.cpp @@ -223,7 +223,7 @@ static OpenSet::const_iterator selectInstruction(OpenSet& openNodes, DependencyG // select first entry (with highest priority) for which all dependencies are fulfilled (with latency) int priority = checkDependenciesMet(graph.assertNode(*it), block, openNodes); if(priority < MIN_PRIORITY) - priority -= successiveMandatoryDelays.at(*it); + priority -= static_cast(successiveMandatoryDelays.at(*it)); // TODO use preferred delays? // TODO remove adding/removing priorities in calculateSchedulingPriority? // TODO remove extra cases here? diff --git a/src/optimization/Optimizer.cpp b/src/optimization/Optimizer.cpp index 6233ed91..dd11da03 100644 --- a/src/optimization/Optimizer.cpp +++ b/src/optimization/Optimizer.cpp @@ -227,10 +227,11 @@ static void runOptimizationPasses(const Module& module, Method& method, const Co void Optimizer::optimize(Module& module) const { + auto kernels = module.getKernels(); const auto f = [&](Method* kernelFunc) { runOptimizationPasses(module, *kernelFunc, config, initialPasses, repeatingPasses, finalPasses); }; - ThreadPool{"Optimizer"}.scheduleAll(module.getKernels(), f); + ThreadPool{"Optimizer"}.scheduleAll(kernels, f); } const std::vector Optimizer::ALL_PASSES = { diff --git a/src/periphery/VPM.h b/src/periphery/VPM.h index 2b5274e4..bbac7a7e 100644 --- a/src/periphery/VPM.h +++ b/src/periphery/VPM.h @@ -573,6 +573,9 @@ namespace vc4c Base::value = move->getSource().getLiteralValue().value().toImmediate(); } + SetupWrapper(const SetupWrapper&) = default; + SetupWrapper(SetupWrapper&&) noexcept = default; + ~SetupWrapper() { if(load != nullptr) @@ -581,6 +584,9 @@ namespace vc4c move->setSource(Value(Literal(Base::value), TYPE_INT32)); } + SetupWrapper& operator=(const SetupWrapper&) = default; + SetupWrapper& operator=(SetupWrapper&&) noexcept = default; + inline void resetSetup() const { if(load != nullptr) diff --git a/src/tools/Emulator.cpp b/src/tools/Emulator.cpp index 58d04bee..b07afa8b 100644 --- a/src/tools/Emulator.cpp +++ b/src/tools/Emulator.cpp @@ -153,7 +153,8 @@ void Memory::setUniforms(const std::vector& uniforms, MemoryAddress addres { PROFILE_COUNTER(vc4c::profiler::COUNTER_EMULATOR + 10, "setUniforms", 1); std::size_t offset = address / sizeof(Word); - std::copy_n(uniforms.begin(), uniforms.size(), VariantNamespace::get(data).begin() + offset); + std::copy_n(uniforms.begin(), uniforms.size(), + VariantNamespace::get(data).begin() + static_cast::difference_type>(offset)); } bool Mutex::isLocked() const @@ -1140,12 +1141,12 @@ bool QPU::execute(std::vector::const_iterator firstInstruc { ++instrumentation[inst].numBranchTaken; int32_t offset = 4 /* Branch starts at PC + 4 */ + - static_cast(br->getImmediate() / sizeof(uint64_t)) /* immediate offset is in bytes */; + (br->getImmediate() / static_cast(sizeof(uint64_t))) /* immediate offset is in bytes */; if(br->getAddRegister() == BranchReg::BRANCH_REG || br->getBranchRelative() == BranchRel::BRANCH_ABSOLUTE) throw CompilationError( CompilationStep::GENERAL, "This kind of branch is not yet implemented", br->toASMString()); - nextPC += offset; + nextPC += static_cast(offset); // see Broadcom specification, page 34 registers.writeRegister(toRegister(br->getAddOut(), br->getWriteSwap() 
== WriteSwap::SWAP), @@ -1290,10 +1291,10 @@ static std::pair applyVectorRotation(std::pair(tmp[0].unsignedInt() & 0xF); + distance = static_cast(16 - (tmp[0].unsignedInt() & 0xF)); } else - distance = 16 - offset.getRotationOffset().value(); + distance = static_cast(16 - offset.getRotationOffset().value()); SIMDVector result(std::move(input.first)); @@ -1882,7 +1883,7 @@ static void dumpMemory(const Memory& memory, const std::string& fileName, Memory f << " " << std::hex << std::setfill('0') << std::setw(8) << memory.readWord(addr); if(addr % (sizeof(tools::Word) * 8) == (sizeof(tools::Word) * 7)) f << std::endl; - addr += sizeof(tools::Word); + addr += static_cast(sizeof(tools::Word)); } f << std::endl; CPPLOG_LAZY(logging::Level::DEBUG, @@ -1971,9 +1972,10 @@ EmulationResult tools::emulate(const EmulationData& data) dumpMemory(mem, data.memoryDump, uniformAddress, true); InstrumentationResults instrumentation; - bool status = - emulate(instructions.begin() + (kernelInfo->getOffset() - module.kernelInfos.front().getOffset()).getValue(), - mem, uniformAddresses, instrumentation, data.maxEmulationCycles); + bool status = emulate(instructions.begin() + + static_cast::difference_type>( + (kernelInfo->getOffset() - module.kernelInfos.front().getOffset()).getValue()), + mem, uniformAddresses, instrumentation, data.maxEmulationCycles); if(!data.memoryDump.empty()) dumpMemory(mem, data.memoryDump, uniformAddress, false); @@ -1998,7 +2000,9 @@ EmulationResult tools::emulate(const EmulationData& data) std::unique_ptr dumpInstrumentation; if(!data.instrumentationDump.empty()) dumpInstrumentation.reset(new std::ofstream(data.instrumentationDump)); - auto it = instructions.begin() + (kernelInfo->getOffset() - module.kernelInfos.front().getOffset()).getValue(); + auto it = instructions.begin() + + static_cast::difference_type>( + (kernelInfo->getOffset() - module.kernelInfos.front().getOffset()).getValue()); result.instrumentation.reserve(kernelInfo->getLength().getValue()); while(true) { diff --git a/test/TestMemoryAccess.cpp b/test/TestMemoryAccess.cpp index 92ad400e..d302c7c2 100644 --- a/test/TestMemoryAccess.cpp +++ b/test/TestMemoryAccess.cpp @@ -229,9 +229,9 @@ void TestMemoryAccess::testLocalStorage() // actual results are indeterministic, since the order of the loads/stores across work-items is not guaranteed for(auto res : result.results.at(1).second.value()) { - if(res % 7 != 0) + if(res % 7u != 0) { - TEST_ASSERT_EQUALS(7, res) + TEST_ASSERT_EQUALS(7u, res) } } } diff --git a/test/emulation_helper.h b/test/emulation_helper.h index 9284dfff..4b2ec362 100644 --- a/test/emulation_helper.h +++ b/test/emulation_helper.h @@ -69,7 +69,7 @@ struct NormalDistribution : public std::normal_distribution }; template -struct InfNaNWrapper : private Distribution +struct InfNaNWrapper : public Distribution { explicit InfNaNWrapper(double min, double max) : Distribution(min, max), infNaNDistribution(0.0, 1.0) {} @@ -111,7 +111,8 @@ std::array generateInput(bool allowNull, std::mt19937 gen(rd()); // NOTE: double here allows for float::lowest() and float::max() to be used, see also // https://stackoverflow.com/a/36826730/8720655 - Distribution dis(min, max); + Distribution dis( + static_cast(min), static_cast(max)); for(std::size_t i = 0; i < N; ++i) {