From 8b672d6f5abe10c2c3fe99ba69502fce7a16c2b2 Mon Sep 17 00:00:00 2001 From: doe300 Date: Sat, 7 Sep 2019 13:01:51 +0200 Subject: [PATCH] Implements copying of dynamically sized memory Also adds helper function to insert while-loop into generated code. See https://github.com/doe300/VC4CL/issues/81 Fixes: TestVC4C/emulator/test_async_copy --- src/BasicBlock.cpp | 2 ++ src/intermediate/Helper.cpp | 27 ++++++++++++++++++++++ src/intermediate/Helper.h | 15 ++++++++++++ src/intrinsics/Intrinsics.cpp | 11 +++++---- src/normalization/MemoryMappings.cpp | 22 ++++++++++-------- src/periphery/VPM.cpp | 34 ++++++++++++++++++++++++++++ src/periphery/VPM.h | 4 ++++ src/tools/Emulator.cpp | 6 ++--- 8 files changed, 104 insertions(+), 17 deletions(-) diff --git a/src/BasicBlock.cpp b/src/BasicBlock.cpp index fc9b9998..0d69d30c 100644 --- a/src/BasicBlock.cpp +++ b/src/BasicBlock.cpp @@ -286,6 +286,8 @@ void BasicBlock::dumpInstructions() const [](const std::unique_ptr& instr) { if(instr) logging::debug() << instr->to_string() << logging::endl; + else + logging::debug() << "(null)" << logging::endl; }); logging::debug() << "Block end ----" << logging::endl; }); diff --git a/src/intermediate/Helper.cpp b/src/intermediate/Helper.cpp index f1e237f2..8478b8a8 100644 --- a/src/intermediate/Helper.cpp +++ b/src/intermediate/Helper.cpp @@ -363,3 +363,30 @@ FastSet intermediate::getEquivalenceClass(const Local* local) return clazz; } + +BasicBlock& intermediate::insertLoop(Method& method, InstructionWalker& it, const Value& conditionValue, + ConditionCode repeatCondition, const std::string& label) +{ + auto loopLabel = method.addNewLocal(TYPE_LABEL, label); + auto preheaderLabel = method.addNewLocal(TYPE_LABEL, loopLabel.local()->name, "preheader"); + auto afterLoopLabel = method.addNewLocal(TYPE_LABEL, loopLabel.local()->name, "after"); + + auto preheaderIt = method.emplaceLabel(it, new BranchLabel(*preheaderLabel.local())); + preheaderIt.nextInBlock(); + + // in the preheader, jump over 
the loop only when the condition becomes false, otherwise fall through into the loop content block + preheaderIt.emplace(new Branch(loopLabel.local(), repeatCondition, conditionValue)); + preheaderIt.nextInBlock(); + preheaderIt.emplace(new Branch(afterLoopLabel.local(), repeatCondition.invert(), conditionValue)); + preheaderIt.nextInBlock(); + + auto inLoopIt = method.emplaceLabel(preheaderIt, new BranchLabel(*loopLabel.local())); + inLoopIt.nextInBlock(); + + // in loop content block, unconditionally jump back to preheader + inLoopIt.emplace(new Branch(preheaderLabel.local(), COND_ALWAYS, BOOL_TRUE)); + inLoopIt.nextInBlock(); + + it = method.emplaceLabel(inLoopIt, new BranchLabel(*afterLoopLabel.local())); + return *inLoopIt.getBasicBlock(); +} diff --git a/src/intermediate/Helper.h b/src/intermediate/Helper.h index 89576ea5..0b27809c 100644 --- a/src/intermediate/Helper.h +++ b/src/intermediate/Helper.h @@ -71,6 +71,21 @@ namespace vc4c * See https://en.wikipedia.org/wiki/Equivalence_class */ FastSet getEquivalenceClass(const Local* local); + + /** + * Inserts a tight loop into the given method at the given position + * + * The input instruction walker will be set to the first instruction (the label) in the block FOLLOWING the + * loop. The output basic block is the inserted block and can be used to insert code into the loop itself. + * + * NOTE: The inserted loop will be a while(conditionValue) loop, so the condition variable needs to be + * initialized before the loop starts. + * + * NOTE: The loop is repeated as long as the conditionValue matches the repeatCondition. Normal branch condition + * behavior applies, so only the first element of the conditionValue is actually checked! 
+ */ + NODISCARD BasicBlock& insertLoop(Method& method, InstructionWalker& it, const Value& conditionValue, + ConditionCode repeatCondition, const std::string& label = ""); } // namespace intermediate } // namespace vc4c diff --git a/src/intrinsics/Intrinsics.cpp b/src/intrinsics/Intrinsics.cpp index 451d51f7..aa457f2f 100644 --- a/src/intrinsics/Intrinsics.cpp +++ b/src/intrinsics/Intrinsics.cpp @@ -209,11 +209,12 @@ static IntrinsicFunction intrinsifyDMAAccess(DMAAccess access, bool setMutex) << logging::endl); const DataType type = callSite->assertArgument(0).type.getElementType(); if(!callSite->getArgument(2) || !callSite->assertArgument(2).getLiteralValue()) - throw CompilationError(CompilationStep::OPTIMIZER, - "Memory copy with non-constant size is not yet supported", callSite->to_string()); - it = method.vpm->insertCopyRAM(method, it, callSite->assertArgument(0), callSite->assertArgument(1), - callSite->assertArgument(2).getLiteralValue()->unsignedInt() * type.getInMemoryWidth(), nullptr, - setMutex); + it = method.vpm->insertCopyRAMDynamic(method, it, callSite->assertArgument(0), + callSite->assertArgument(1), callSite->assertArgument(2), nullptr, setMutex); + else + it = method.vpm->insertCopyRAM(method, it, callSite->assertArgument(0), callSite->assertArgument(1), + callSite->assertArgument(2).getLiteralValue()->unsignedInt() * type.getInMemoryWidth(), nullptr, + setMutex); break; } case DMAAccess::PREFETCH: diff --git a/src/normalization/MemoryMappings.cpp b/src/normalization/MemoryMappings.cpp index 1ed19304..83f89f2f 100644 --- a/src/normalization/MemoryMappings.cpp +++ b/src/normalization/MemoryMappings.cpp @@ -642,17 +642,21 @@ static InstructionWalker mapMemoryCopy( else if(srcInRAM && destInRAM) { // copy from RAM into RAM -> DMA read + DMA write - if(!numEntries.isLiteralValue()) - throw CompilationError(CompilationStep::OPTIMIZER, - "Copying dynamically sized memory within RAM is not yet implemented", mem->to_string()); - uint64_t numBytes = 
numEntries.getLiteralValue()->unsignedInt() * - (mem->getSourceElementType().getScalarBitCount() * mem->getSourceElementType().getVectorWidth()) / 8; - if(numBytes > std::numeric_limits::max()) - throw CompilationError(CompilationStep::OPTIMIZER, "Cannot copy more than 4GB of data", mem->to_string()); CPPLOG_LAZY(logging::Level::DEBUG, log << "Mapping copy from RAM into RAM to DMA read and DMA write: " << mem->to_string() << logging::endl); - it = method.vpm->insertCopyRAM( - method, it, mem->getDestination(), mem->getSource(), static_cast(numBytes), nullptr); + if(!numEntries.isLiteralValue()) + it = method.vpm->insertCopyRAMDynamic(method, it, mem->getDestination(), mem->getSource(), numEntries); + else + { + uint64_t numBytes = numEntries.getLiteralValue()->unsignedInt() * + (mem->getSourceElementType().getScalarBitCount() * mem->getSourceElementType().getVectorWidth()) / 8; + if(numBytes > std::numeric_limits::max()) + throw CompilationError( + CompilationStep::OPTIMIZER, "Cannot copy more than 4GB of data", mem->to_string()); + + it = method.vpm->insertCopyRAM( + method, it, mem->getDestination(), mem->getSource(), static_cast(numBytes), nullptr); + } return it.erase(); } else if(destInRegister && destInfo.convertedRegisterType) diff --git a/src/periphery/VPM.cpp b/src/periphery/VPM.cpp index 5999cf14..a9d5906b 100644 --- a/src/periphery/VPM.cpp +++ b/src/periphery/VPM.cpp @@ -7,6 +7,7 @@ #include "VPM.h" #include "../Profiler.h" +#include "../intermediate/Helper.h" #include "../intermediate/VectorHelper.h" #include "../intermediate/operators.h" #include "log.h" @@ -720,6 +721,39 @@ InstructionWalker VPM::insertCopyRAM(Method& method, InstructionWalker it, const return it; } +InstructionWalker VPM::insertCopyRAMDynamic(Method& method, InstructionWalker it, const Value& destAddress, + const Value& srcAddress, const Value& numEntries, const VPMArea* area, bool useMutex) +{ + it = insertLockMutex(it, useMutex); + + // count from maximum to 0 (exclusive) + auto 
counter = assign(it, numEntries.type, "%remaining_iterations") = numEntries; + auto& block = intermediate::insertLoop(method, it, counter, COND_ZERO_CLEAR, "dynamic_dma_copy"); + { + // inside the loop, a single iteration + auto inLoopIt = block.walk().nextInBlock(); + auto elementType = destAddress.type.getElementType(); + auto index = assign(inLoopIt, counter.type) = numEntries - counter; + // XXX does not support more than 2^23 elements + auto offset = assign(inLoopIt, counter.type) = + mul24(index, Value(Literal(elementType.getInMemoryWidth()), TYPE_INT32)); + + // increment offset from base address + Value tmpSource = assign(inLoopIt, srcAddress.type, "%mem_copy_addr") = srcAddress + offset; + Value tmpDest = assign(inLoopIt, destAddress.type, "%mem_copy_addr") = destAddress + offset; + + inLoopIt = insertReadRAM(method, inLoopIt, tmpSource, elementType, area, false); + inLoopIt = insertWriteRAM(method, inLoopIt, tmpDest, elementType, area, false); + + // decrement remaining iterations counter + assign(inLoopIt, counter) = counter - INT_ONE; + } + + it.nextInBlock(); + it = insertUnlockMutex(it, useMutex); + return it; +} + InstructionWalker VPM::insertFillRAM(Method& method, InstructionWalker it, const Value& memoryAddress, DataType type, const unsigned numCopies, const VPMArea* area, bool useMutex) { diff --git a/src/periphery/VPM.h b/src/periphery/VPM.h index bbac7a7e..93f59249 100644 --- a/src/periphery/VPM.h +++ b/src/periphery/VPM.h @@ -840,6 +840,10 @@ namespace vc4c */ NODISCARD InstructionWalker insertCopyRAM(Method& method, InstructionWalker it, const Value& destAddress, const Value& srcAddress, unsigned numBytes, const VPMArea* area = nullptr, bool useMutex = true); + + NODISCARD InstructionWalker insertCopyRAMDynamic(Method& method, InstructionWalker it, + const Value& destAddress, const Value& srcAddress, const Value& numEntries, + const VPMArea* area = nullptr, bool useMutex = true); /* * Inserts a filling of a memory-area with a single value 
from VPM */ diff --git a/src/tools/Emulator.cpp b/src/tools/Emulator.cpp index b07afa8b..a9ae46c8 100644 --- a/src/tools/Emulator.cpp +++ b/src/tools/Emulator.cpp @@ -1703,9 +1703,9 @@ static void emulateStep(std::vector::const_iterator firstI } catch(const std::exception&) { - logging::error() << "Emulation threw exception execution in following instruction on QPU " << qpus[i].ID - << ": " << qpus[i].getCurrentInstruction(firstInstruction)->toHexString(true) - << logging::endl; + logging::error() << "Emulation threw exception execution in following instruction on QPU " + << static_cast(qpus[i].ID) << ": " + << qpus[i].getCurrentInstruction(firstInstruction)->toHexString(true) << logging::endl; // re-throw error throw; }