diff --git a/README.md b/README.md index 98e6dca..e5a6c4a 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ And follow one of our tutorials [here](docs/tutorials). This setup assumes that you have built LLVM and MLIR in `$BUILD_DIR` and installed it to `$PREFIX`. The current version of this project was tested with `llvm-project` commit: -`339a7687e1c036a5f91c9d5391523b93e2e76cd3`. +`08d094a0e457360ad8b94b017d2dc277e697ca76`. Make sure you have the correct commit checked-out. **Note**: Make sure to pass `-DLLVM_INSTALL_UTILS=ON` when building LLVM/MLIR diff --git a/include/soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h b/include/soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h new file mode 100644 index 0000000..d909547 --- /dev/null +++ b/include/soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h @@ -0,0 +1,58 @@ +//===- AccelToAXI4MLIR.h - Convert Accel to AXI4MLIR calls ----*- C++ -*-===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SODA_CONVERSION_ACCELTORUNTIME_ACCELTOAXI4MLIR_H_ +#define SODA_CONVERSION_ACCELTORUNTIME_ACCELTOAXI4MLIR_H_ + +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +class MLIRContext; +class Pass; +class RewritePatternSet; +class ModuleOp; +template +class OperationPass; + +struct AccelToAXI4MLIROptions { + /// Accelerator Tile Size information + unsigned tileSize = 1; + + /// DMA Information + unsigned dmaAddress = 0; + unsigned dmaInputAddress = 0; + unsigned dmaInputBufferSize = 100000; + unsigned dmaOutputAddress = 100000; + unsigned dmaOutputBufferSize = 100000; + + /// Flow information + bool flowCpuAcc = false; + unsigned numberOfCaches = false; + ArrayRef cacheSizes; + ArrayRef tileSizes; + unsigned elementSize = false; +}; + +/// Populate the given list with patterns that convert from Accel to AXI4MLIR +/// runtime calls. +void populateAccelToAXI4MLIRConversionPatterns(RewritePatternSet &patterns); + +/// Populate the given list with patterns that convert from Accel to AXI4MLIR +/// runtime calls. +void populateAccelToAXI4MLIRConversionPatternsWithOptions( + RewritePatternSet &patterns, + const AccelToAXI4MLIROptions &options = AccelToAXI4MLIROptions()); + +/// Create the pass to convert accel operations to axi4mlir calls +std::unique_ptr> createConvertAccelToAXI4MLIRPass(); + +std::unique_ptr> +createConvertAccelToAXI4MLIRPass(const AccelToAXI4MLIROptions &options); + +} // namespace mlir + +#endif // SODA_CONVERSION_ACCELTORUNTIME_ACCELTOAXI4MLIR_H_ diff --git a/include/soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h b/include/soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h new file mode 100644 index 0000000..3c95a60 --- /dev/null +++ b/include/soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h @@ -0,0 +1,73 @@ +//===- Utils.h - Function and method used by axi4mlir passes ----*- C++ -*-===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SODA_CONVERSION_LINALGTOAXI4MLIR_UTILS_H_ +#define SODA_CONVERSION_LINALGTOAXI4MLIR_UTILS_H_ + +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +class MLIRContext; +class Pass; +class RewritePatternSet; +class PatternRewriter; +class ModuleOp; +namespace func { +class FuncOp; +} // namespace func + +struct AccelTransformationOptions { + /// Accelerator Tile Size information + unsigned accelSize = 1; + ArrayRef accelSizes; + + /// DMA Information + unsigned dmaAddress = 0; + unsigned dmaInputAddress = 0; + unsigned dmaInputBufferSize = 100000; + unsigned dmaOutputAddress = 100000; + unsigned dmaOutputBufferSize = 100000; + + /// Flow information + + /// IDs of opcodes that should be accumulated on the CPU + ArrayRef accOnCpu; + bool flowCpuAcc = false; + unsigned numberOfCaches = false; + ArrayRef cacheSizes; + ArrayRef tileSizes; + unsigned elementSize = false; + ArrayRef loopPermutation; + + /// Anchor + std::string anchorFuncName; + std::string anchorOpName; + std::string anchorFilterName; + + /// Opcode information + std::string opcodeMap; + std::string initFlow; + std::string opcodeFlow; + +public: + /// Utility to print members of the struct + void dump() const; +}; + +/// Apply tiling patterns to matmul operations with the correct attribute +void applyPatterns(func::FuncOp funcOp, const AccelTransformationOptions &options); + +/// Populates patterns that implement a FSM of modifications. +/// Changhing the kLinalgTransformMarker +/// GENERALIZE -> INTERCHANGE -> MEM(TILE) L3(TILE) -> L2(TILE) -> L1(TILE) -> +/// ACCEL +void populateCommonLinalgTransformationPatterns( + RewritePatternSet &patterns, const AccelTransformationOptions &options); + +} // namespace mlir + +#endif // SODA_CONVERSION_LINALGTOAXI4MLIR_UTILS_H_ diff --git a/include/soda/Conversion/LinalgToAccel/LinalgGenericToAccel.h b/include/soda/Conversion/LinalgToAccel/LinalgGenericToAccel.h new file mode 100644 index 0000000..c922696 --- /dev/null +++ b/include/soda/Conversion/LinalgToAccel/LinalgGenericToAccel.h @@ -0,0 +1,39 @@ +//===- LinalgGenericToAccel.h - Convert linalg to AXI4MLIR calls ----*- C++ +//-*-===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SODA_CONVERSION_LINALGTOACCEL_LINALGGENERICTOACCEL_H_ +#define SODA_CONVERSION_LINALGTOACCEL_LINALGGENERICTOACCEL_H_ + +#include "soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h" +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +class MLIRContext; +class Pass; +class RewritePatternSet; +class ModuleOp; +template +class OperationPass; + +/// Populate the list with patterns that convert from LinalgOps to AccelOps +void populateLinalgGenericToAccelConversionPatternsWithOptions( + RewritePatternSet &patterns, + const AccelTransformationOptions &options = AccelTransformationOptions()); +void populateLinalgGenericToAccelConversionPatterns( + RewritePatternSet &patterns); + +/// Create the pass to convert from LinalgOps to AccelOps +std::unique_ptr> +createConvertLinalgGenericToAccelPass(); + +std::unique_ptr> createConvertLinalgGenericToAccelPass( + const AccelTransformationOptions &options); + +} // namespace mlir + +#endif // SODA_CONVERSION_LINALGTOACCEL_LINALGGENERICTOACCEL_H_ diff --git a/include/soda/Conversion/Passes.h b/include/soda/Conversion/Passes.h index 2d3ba49..519eece 100644 --- a/include/soda/Conversion/Passes.h +++ b/include/soda/Conversion/Passes.h @@ -15,6 +15,8 @@ #include "soda/Conversion/KernelsToSODA/OperationToSODAPass.h" #include "soda/Conversion/KernelsToSODA/SCFToSODAPass.h" +#include "soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h" + namespace mlir { namespace soda { diff --git a/include/soda/Conversion/Passes.td b/include/soda/Conversion/Passes.td index cb6a509..be70695 100644 --- a/include/soda/Conversion/Passes.td +++ b/include/soda/Conversion/Passes.td @@ -81,4 +81,19 @@ def ConvertAllToSODA : Pass<"convert-all-to-soda", "func::FuncOp"> { ]; } +//===----------------------------------------------------------------------===// +// AccelToAXI4MLIR +//===----------------------------------------------------------------------===// + +def ConvertAccelToAXI4MLIR : Pass<"test-accel-to-axi4mlir", "ModuleOp"> { + let summary = "Convert accel ops into AXI4MLIR runtime calls"; + let constructor = "mlir::createConvertAccelToAXI4MLIRPass()"; + let dependentDialects = [ + "AffineDialect", + "memref::MemRefDialect", + "scf::SCFDialect", + "LLVM::LLVMDialect", + ]; +} + #endif // SODA_CONVERSION_PASSES diff --git a/include/soda/Dialect/Accel/CMakeLists.txt b/include/soda/Dialect/Accel/CMakeLists.txt new file mode 100644 index 0000000..f33061b --- /dev/null +++ b/include/soda/Dialect/Accel/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(IR) diff --git a/include/soda/Dialect/Accel/IR/Accel.h b/include/soda/Dialect/Accel/IR/Accel.h new file mode 100644 index 0000000..8ff7525 --- /dev/null +++ b/include/soda/Dialect/Accel/IR/Accel.h @@ -0,0 +1,32 @@ +//===- Accel.h - Accel dialect ------------------------------------*- C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SODA_DIALECT_ACCEL_IR_ACCEL_H_ +#define SODA_DIALECT_ACCEL_IR_ACCEL_H_ + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/VectorInterfaces.h" + +//===----------------------------------------------------------------------===// +// Accel Dialect +//===----------------------------------------------------------------------===// + +#include "soda/Dialect/Accel/IR/AccelOpsDialect.h.inc" + +//===----------------------------------------------------------------------===// +// Accel Dialect Operations +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "soda/Dialect/Accel/IR/AccelOps.h.inc" + +#endif // SODA_DIALECT_ACCEL_IR_ACCEL_H_ diff --git a/include/soda/Dialect/Accel/IR/AccelBase.td b/include/soda/Dialect/Accel/IR/AccelBase.td new file mode 100644 index 0000000..a6ed582 --- /dev/null +++ b/include/soda/Dialect/Accel/IR/AccelBase.td @@ -0,0 +1,20 @@ +//===- AccelBase.td - Base definitions for accel dialect ----*- tablegen -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef ACCEL_BASE +#define ACCEL_BASE +include "mlir/IR/OpBase.td" +def Accel_Dialect : Dialect { + let name = "accel"; + let cppNamespace = "::mlir::accel"; + let description = [{ + The accel dialect is intended to hold accel operations that abstract + AXI4MLIR DMA communciations. + }]; + let useFoldAPI = kEmitFoldAdaptorFolder; +} +#endif // ACCEL_BASE diff --git a/include/soda/Dialect/Accel/IR/AccelOps.td b/include/soda/Dialect/Accel/IR/AccelOps.td new file mode 100644 index 0000000..58c5f8d --- /dev/null +++ b/include/soda/Dialect/Accel/IR/AccelOps.td @@ -0,0 +1,109 @@ +//===- AccelOps.td - Accel op definitions ------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef ACCEL_OPS +#define ACCEL_OPS + +include "soda/Dialect/Accel/IR/AccelBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +// Base class for accel dialect ops. +class Accel_Op traits = []> : + Op; + +//===----------------------------------------------------------------------===// +// dmaInitOp +//===----------------------------------------------------------------------===// + +def Accel_InitDMAOp : Accel_Op<"init_dma"> { + let summary = "initializes the DMA"; + let description = [{ + During lowering to AXI4MLIR calls, this op is lowered to a call to + `dma_init` at the beginning of the operation scope/basic block, and + a call to `dma_free` at the end of the operation scope/basic block. 
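+
+    Example (illustrative; the SSA operand names are placeholders, following
+    the assembly format below):
+
+    ```mlir
+    accel.init_dma %dmaAddress, %dmaInputAddress, %dmaInputBufferSize,
+                   %dmaOutputAddress, %dmaOutputBufferSize
+        : (i32, i32, i32, i32, i32)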
+ ``` + }]; + let arguments = (ins SignlessIntegerLike:$dmaAddress, + SignlessIntegerLike:$dmaInputAddress, + SignlessIntegerLike:$dmaInputBufferSize, + SignlessIntegerLike:$dmaOutputAddress, + SignlessIntegerLike:$dmaOutputBufferSize); + + + // let results = (outs SignlessIntegerLike:$result); + + let assemblyFormat = [{ + $dmaAddress `,` + $dmaInputAddress `,` + $dmaInputBufferSize `,` + $dmaOutputAddress `,` + $dmaOutputBufferSize + attr-dict `:` + `(` + type($dmaAddress) `,` + type($dmaInputAddress) `,` + type($dmaInputBufferSize) `,` + type($dmaOutputAddress) `,` + type($dmaOutputBufferSize) + `)` + }]; +} + +def Accel_SendOp : Accel_Op<"send"> { + let summary = "send MemRef to DMA region"; + let description = [{ + TODO + }]; + let arguments = (ins AnyMemRef:$input, + Optional:$offset_value); + + + let results = (outs I32:$out_offset); + + let assemblyFormat = [{ + $input (`,` $offset_value^)? attr-dict `:` + `(` type($input) (`,` type($offset_value)^)? `)` `->` type($out_offset) + }]; +} + +def Accel_SendLiteralOp : Accel_Op<"sendLiteral"> { + let summary = "send Literal to DMA region"; + let description = [{ + Used to send a literal value to the DMA region. + The literal value is considered an opcode. + }]; + let arguments = (ins SignlessIntegerLike:$opcode, + Optional:$offset_value); + + + let results = (outs I32:$out_offset); + + let assemblyFormat = [{ + $opcode (`,` $offset_value^)? attr-dict `:` + `(` type($opcode) (`,` type($offset_value)^)? `)` `->` type($out_offset) + }]; +} + +def Accel_RecvOp : Accel_Op<"recv"> { + let summary = "receive data from the DMA region into the MemRef"; + let description = [{ + TODO + }]; + let arguments = (ins AnyMemRef:$dst, + Optional:$offset_value); + + + let results = (outs I32:$out_offset); + + let assemblyFormat = [{ + $dst (`,` $offset_value^)? attr-dict `:` + `(` type($dst) (`,` type($offset_value)^)? 
`)` `->` type($out_offset) + }]; +} + +#endif // ACCEL_OPS diff --git a/include/soda/Dialect/Accel/IR/CMakeLists.txt b/include/soda/Dialect/Accel/IR/CMakeLists.txt new file mode 100644 index 0000000..975d364 --- /dev/null +++ b/include/soda/Dialect/Accel/IR/CMakeLists.txt @@ -0,0 +1,2 @@ +add_mlir_dialect(AccelOps accel) +add_mlir_doc(AccelOps AccelOps Dialects/ -gen-dialect-doc) diff --git a/include/soda/Dialect/CMakeLists.txt b/include/soda/Dialect/CMakeLists.txt index 8973d6a..2933ef4 100644 --- a/include/soda/Dialect/CMakeLists.txt +++ b/include/soda/Dialect/CMakeLists.txt @@ -2,4 +2,5 @@ add_subdirectory(SODA) add_subdirectory(SNN) add_subdirectory(Linalg) add_subdirectory(Affine) +add_subdirectory(Accel) add_subdirectory(Transform) diff --git a/include/soda/Dialect/SNN/IR/SNNBase.td b/include/soda/Dialect/SNN/IR/SNNBase.td index 16f32be..76725ba 100644 --- a/include/soda/Dialect/SNN/IR/SNNBase.td +++ b/include/soda/Dialect/SNN/IR/SNNBase.td @@ -25,6 +25,7 @@ def SNN_Dialect : Dialect { }]; let dependentDialects = ["tensor::TensorDialect"]; + let useFoldAPI = kEmitFoldAdaptorFolder; } #endif // SNN_BASE diff --git a/include/soda/Dialect/SODA/SODABase.td b/include/soda/Dialect/SODA/SODABase.td index cd7f67c..1806945 100644 --- a/include/soda/Dialect/SODA/SODABase.td +++ b/include/soda/Dialect/SODA/SODABase.td @@ -53,6 +53,7 @@ def SODA_Dialect : Dialect { let useDefaultAttributePrinterParser = 1; let useDefaultTypePrinterParser = 1; + let useFoldAPI = kEmitFoldAdaptorFolder; } def SODA_AsyncToken : DialectType< diff --git a/include/soda/ExecutionEngine/axi/AxiUtils.h b/include/soda/ExecutionEngine/axi/AxiUtils.h new file mode 100644 index 0000000..b9d3841 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/AxiUtils.h @@ -0,0 +1,168 @@ +//===- AxUtils.h - Utils for debugging MLIR execution -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares AXI4MLIR functions to be called by the host to communicate +// with AXI enabled accelerators. 
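+// A typical host-side call sequence (illustrative, not enforced by this
+// header) is: dma_init, dma_copy_to_inbuffer, dma_start_send, dma_wait_send,
+// dma_start_recv, dma_wait_recv, dma_copy_from_outbuffer, dma_free.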
+// +//===----------------------------------------------------------------------===// + +#ifndef EXECUTIONENGINE_AXIUTILS_H_ +#define EXECUTIONENGINE_AXIUTILS_H_ + +#ifdef _WIN32 +#ifndef MLIR_AXIRUNNERUTILS_EXPORT +#ifdef mlir_runner_utils_EXPORTS +// We are building this library +#define MLIR_AXIRUNNERUTILS_EXPORT __declspec(dllexport) +#else +// We are using this library +#define MLIR_AXIRUNNERUTILS_EXPORT __declspec(dllimport) +#endif // mlir_runner_utils_EXPORTS +#endif // MLIR_AXIRUNNERUTILS_EXPORT +#else +#define MLIR_AXIRUNNERUTILS_EXPORT +#endif // _WIN32 + +#include +#include + +// ============================================================================= +// AXI_APIV1 +// ============================================================================= + +//-----------------DMA Functions----------------- +/** + * - dma_address is base address of dma + * - dma_input_addr is starting memory location for the dma input buffer, + * - dma_input_buffer_size is length of the buffer + * - dma_output_addr is starting memory location for the dma output buffer, + * - dma_output_buffer_size is length of the buffer + * + * + * Runs starting controls signals and sets MMS2, S2MM address registers to start + * memory locations of the input and output buffers + */ + +extern "C" MLIR_AXIRUNNERUTILS_EXPORT void +dma_init(unsigned int dma_address, unsigned int dma_input_address, + unsigned int dma_input_buffer_size, unsigned int dma_output_address, + unsigned int dma_output_buffer_size); + +// Memory unmaps DMA control_register_address and Input and output buffers +extern "C" MLIR_AXIRUNNERUTILS_EXPORT void dma_free(); + +//================================================================================================================ + +//-----------------BUFFER Functions----------------- +// Get the MMap address of the input buffer of the dma *Needed to copy data to +// Input_Buffer* +extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int *dma_get_inbuffer(); + +// Get the MMap address of the output buffer of the dma *Needed to copy data +// from Output_Buffer* +extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int *dma_get_outbuffer(); + +//================================================================================================================ + +//-----------------BUFFER Functions----------------- +// Copy data into the Input Buffer (length to write, offset to write to) returns +// 0 if successful +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int +dma_copy_to_inbuffer(unsigned int *host_src_address, int data_length, + int offset); + +// Copy data from the Output Buffer (length to read, offset to read from) +// returns 0 if successful +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int +dma_copy_from_outbuffer(unsigned int *host_dst_address, int data_length, + int offset); + +//-----------------BUFFER Functions----------------- +// Copy data into the Input Buffer (length to write, offset to write to) returns +// 0 if successful +template +int mlir_dma_copy_to_inbuffer(const DynamicMemRefType &src, int data_length, + int offset); + +// Copy data from the Output Buffer (length to read, offset to read from) +// returns 0 if successful +template +int mlir_dma_copy_from_outbuffer(const DynamicMemRefType &dst, + int data_length, int offset); + +extern "C" MLIR_RUNNERUTILS_EXPORT int +copy_to_inbuffer_f32(int64_t rank, void *ptr, int offset); + +extern "C" MLIR_RUNNERUTILS_EXPORT int +copy_from_outbuffer_f32(int64_t rank, void *ptr, int offset); + +extern "C" MLIR_RUNNERUTILS_EXPORT int +copy_to_inbuffer_i32(int64_t 
rank, void *ptr, int offset); + +extern "C" MLIR_RUNNERUTILS_EXPORT int +copy_from_outbuffer_i32(int64_t rank, void *ptr, int offset); + +//================================================================================================================ + +//-----------------DMA MMS2 Functions----------------- +/** + * Checks if input buffer size is >= length + * Sets DMA MMS2 transfer length to length + * Starts transfers to the accelerator using dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int dma_start_send(int length, + int offset); + +// Same as dma_send but thread does not block, returns if 0 +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int dma_check_send(); + +// Blocks thread until dma MMS2 transfer is complete +extern "C" MLIR_AXIRUNNERUTILS_EXPORT void dma_wait_send(); + +//-----------------DMA S2MM Functions----------------- +/** + * Checks if buffer size is >= length + * Sets 2SMM store length + * Starts storing data recieved through dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int dma_start_recv(int length, + int offset); + +// Blocks thread until dma S2MM transfer is complete (TLAST signal is seen) +extern "C" MLIR_AXIRUNNERUTILS_EXPORT void dma_wait_recv(); + +// Same as dma_recv but thread does not block, returns if 0 +extern "C" MLIR_AXIRUNNERUTILS_EXPORT int dma_check_recv(); + +// Unexposed to MLIR +extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int +dma_set(unsigned int *dma_virtual_address, int offset, unsigned int value); + +// Unexposed to MLIR +extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int +dma_get(unsigned int *dma_virtual_address, int offset); + +//-----------------Util Functions----------------- + +// Converts memref into llvm_array pointers +// extern "C" MLIR_AXIRUNNERUTILS_EXPORT unsigned int * +// memref_to_ptr(UnrankedMemRefType * in_memref) { +// return in_memref->descriptor; +// } + +// // Converts pointers into memrefs +// extern "C" MLIR_AXIRUNNERUTILS_EXPORT UnrankedMemRefType +// ptr_to_memref(unsigned int *bare_ptr) { + +// UnrankedMemRefType my_memref; +// return my_memref; +// } + +#endif // EXECUTIONENGINE_AXIUTILS_H_ diff --git a/include/soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h new file mode 100755 index 0000000..63d9350 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h @@ -0,0 +1,94 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v1 + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + + + void Recv(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len=0; + read_B_len=0; + compute_C_len=0; + send_C_len=0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + 
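+  // Bind the shared clock, reset, and FIFO channels to both the accelerator
+  // and the DMA driver so the two modules communicate over the same streams.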
acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl;; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + bool tlast = false; + int output = 0; + while(!tlast){ + DATA inp = din1.read(); + DATA wgt = din1.read(); + output+= inp.data*wgt.data; + + // cout << inp.data << "*" << wgt.data << endl; + tlast = (inp.tlast || wgt.tlast); + DWAIT(); + } + DATA d; + d.tlast = true; + d.data = output; + dout1.write(d); + DWAIT(); + } +} +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/conv_v1/conv_v1.json b/include/soda/ExecutionEngine/axi/accelerators/conv_v1/conv_v1.json new file mode 100755 index 0000000..e997d57 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/conv_v1/conv_v1.json @@ -0,0 +1,120 @@ +{ + "name": "MM_4x4_v1", + "version": "1.0", + "description": "MM Accelerator", + + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ] + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + + "ISA": { + "instruction_format": { + "opcode_length": 0, + "op_args": 0 + }, + "opcodes": { + "-": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/dma_engine.sc.h b/include/soda/ExecutionEngine/axi/accelerators/dma_engine.sc.h new file mode 100644 index 0000000..47c229a --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/dma_engine.sc.h @@ -0,0 +1,91 @@ +#ifndef DMA_DRIVER_H +#define DMA_DRIVER_H + +#include + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +typedef struct _DATA { + sc_uint<32> data; + bool tlast; + inline friend ostream &operator<<(ostream &os, const _DATA &v) { + cout << "data: " << v.data << " tlast: " << v.tlast; + return os; + 
} +} DATA; + +SC_MODULE(DMA_DRIVER) { + sc_in clock; + sc_in reset; + sc_fifo_in dout1; + sc_fifo_out din1; + bool send; + bool recv; + + void DMA_MMS2() { + while (1) { + while (!send) + wait(); + for (int i = 0; i < input_len; i++) { + int d = DMA_input_buffer[i + input_offset]; + bool tlast = (i+1 == input_len); + din1.write({d, tlast}); + wait(); + } + send = false; + wait(); + sc_pause(); + wait(); + } + }; + + void DMA_S2MM() { + while (1) { + while (!recv) + wait(); + bool last = false; + int i = 0; + do { + DATA d = dout1.read(); + while (i >= output_len) + wait(); + last = d.tlast; + DMA_output_buffer[output_offset + i++] = d.data; + wait(); + } while (!last); + output_len = i; + recv = false; + // To ensure wait_send() does not evoke the sc_pause + while (send) + wait(2); + sc_pause(); + wait(); + } + }; + + SC_HAS_PROCESS(DMA_DRIVER); + + DMA_DRIVER(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(DMA_MMS2, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(DMA_S2MM, clock.pos()); + reset_signal_is(reset, true); + } + + int *DMA_input_buffer; + int *DMA_output_buffer; + + // TODO: input_length = Number of elements * (sizeof(elements)/32) + int input_len; + int input_offset; + + int output_len; + int output_offset; +}; + +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/dma_engine_v2.sc.h b/include/soda/ExecutionEngine/axi/accelerators/dma_engine_v2.sc.h new file mode 100644 index 0000000..3bed052 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/dma_engine_v2.sc.h @@ -0,0 +1,116 @@ +#ifndef DMA_DRIVER_H +#define DMA_DRIVER_H + +#include + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +typedef struct _DATA { + sc_uint<32> data; + bool tlast; + inline friend ostream &operator<<(ostream &os, const _DATA &v) { + cout << "data: " << v.data << " tlast: " << v.tlast; + return os; + } +} DATA; + +SC_MODULE(DMA_DRIVER) { + sc_in clock; + sc_in reset; + sc_fifo_in dout1; + sc_fifo_out din1; + bool send; + bool recv; + + void DMA_MMS2() { + while (1) { + + while (!send) + wait(); + + int send_len = input_len * isize; + for (int i = 0; i < send_len; i++) { + sc_uint<32> d; + d.range(7, 0) = DMA_input_buffer[(input_offset * isize) + i++]; + if (isize > 1 && i < send_len) + d.range(15, 8) = DMA_input_buffer[(input_offset * isize) + i++]; + if (isize > 2 && i < send_len) + d.range(23, 16) = DMA_input_buffer[(input_offset * isize) + i++]; + if (isize > 3 && i < send_len) + d.range(31, 24) = DMA_input_buffer[(input_offset * isize) + i++]; + wait(); + din1.write({d, 1}); + wait(); + } + + send = false; + wait(); + sc_pause(); + wait(); + } + }; + + void DMA_S2MM() { + while (1) { + while (!recv) + wait(); + bool last = false; + int i = 0; + + do { + DATA d = dout1.read(); + int recv_len = output_len * osize; + while (i >= recv_len) + wait(); + last = d.tlast; + DMA_output_buffer[(output_offset * osize) + i++] = d.data.range(7, 0); + if (osize > 1) + DMA_output_buffer[(output_offset * osize) + i++] = + d.data.range(15, 8); + if (osize > 2) + DMA_output_buffer[(output_offset * osize) + i++] = + d.data.range(23, 16); + if (osize > 3) + DMA_output_buffer[(output_offset * osize) + i++] = + d.data.range(31, 24); + wait(); + } while (!last); + + recv_len = i; + recv = false; + // To ensure wait_send() does not evoke the sc_pause + while (send) + wait(2); + sc_pause(); + wait(); + } + }; + + SC_HAS_PROCESS(DMA_DRIVER); + + DMA_DRIVER(sc_module_name name_) : sc_module(name_) { + 
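+    // Register the send (DMA_MMS2) and receive (DMA_S2MM) loops as clocked
+    // SystemC threads; both are held in reset while `reset` is high.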
SC_CTHREAD(DMA_MMS2, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(DMA_S2MM, clock.pos()); + reset_signal_is(reset, true); + } + + char *DMA_input_buffer; + char *DMA_output_buffer; + + // length = Number of elements + unsigned int input_len; + unsigned int input_offset; + unsigned int isize; + + unsigned int output_len; + unsigned int output_offset; + unsigned int osize; +}; + +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h new file mode 100644 index 0000000..d62bd5b --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h @@ -0,0 +1,238 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" + + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +#define ACCNAME MM_4x4v1 + +#define M 4 +#define N 4 +#define K 4 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[M][K]; + sc_int<32> weights[K][N]; + sc_int<32> outputs[M][N]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose = true; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + int mul_int32(int, int); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle + // S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} #pragma HLS + // RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle + // M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} #pragma HLS + // RESET variable=reset + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + ALOG("++++++++++++++++++++++++++++++++++++++++" ); + ALOG("Read A data_len: " << read_A_len); + ALOG("Read B data_len: " << read_B_len); + ALOG("MACs count: " << compute_C_len); + ALOG("Send C data_len: " << send_C_len); + ALOG("++++++++++++++++++++++++++++++++++++++++" ); + ALOG("Executed with :" << __FILE__ ); + ALOG("- - - - - - - - - - - - - - - - - - - - "); +} + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int k = 0; k < K; k++) { + inputs[m][k] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + + for (int k = 0; k < K; k++) { 
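+      // Stream in the 4x4 weight tile (B); one element is consumed per FIFO read.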
+ // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + weights[k][n] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + cout << "BLOCK: " << process_blocks++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int k = 0; k < K; k++) + cout << inputs[m][k] << ","; + cout << endl; + } + cout << "=========================" << endl; + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) + cout << weights[k][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(true); + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + int acc = 0; + for (int k = 0; k < K; k++) { + int x = inputs[m][k]; + int y = weights[k][n]; + acc += mul_int32(x, y); + compute_C_len++; + } + outputs[m][n] = acc; + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + cout << "Output: " << process_blocks - 1 << endl; + cout << "=========================" << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) + cout << outputs[m][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(false); + send.write(true); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + DATA d; + d.tlast = false; + if (m == M - 1 && n == N - 1) + d.tlast = true; + d.data = outputs[m][n]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + } + send.write(false); + wait(); + } +} + +int ACCNAME::mul_int32(int x, int y) { return x * y; } + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/mm_4x4_v1.json b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/mm_4x4_v1.json new file mode 100644 index 0000000..e997d57 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/mm_4x4_v1.json @@ -0,0 +1,120 @@ +{ + "name": "MM_4x4_v1", + "version": "1.0", + "description": "MM Accelerator", + + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ] + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + + "ISA": { + "instruction_format": { + "opcode_length": 0, + "op_args": 0 + }, + "opcodes": { + "-": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + 
"length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h new file mode 100644 index 0000000..5570bf6 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h @@ -0,0 +1,270 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +#define M 4 +#define N 4 +#define K 4 + +#define ACCNAME MM_4x4v2 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 000 : 0 = NOP; +// 001 : 1 = read_A; +// 010 : 2 = read_B; +// 011 : 3 = read_A -> read_B; +// 100 : 4 = compute_C; +// 101 : 5 = read_A -> compute_C; +// 110 : 6 = read_B -> compute_C; +// 111 : 7 = read_A -> read_B -> compute_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + + opcode(sc_uint<32> _packet) { + // ALOG("OPCODE: " << _packet); + // ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[M][K]; + sc_int<32> weights[K][N]; + sc_int<32> outputs[M][N]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + int mul_int32(int, int); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + + // clang-format off + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} + // #pragma HLS RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} + // #pragma HLS RESET variable=reset + + // #pragma HLS array_partition variable=inputs complete dim=2 + // #pragma HLS array_partition variable=weights complete dim=0 + // #pragma HLS array_partition variable=outputs complete dim=2 + // clang-format on + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + 
+void ACCNAME::print_profile() { + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Read A data_len: " << read_A_len); + ALOG("Read B data_len: " << read_B_len); + ALOG("MACs count: " << compute_C_len); + ALOG("Send C data_len: " << send_C_len); + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Executed with :" << __FILE__); + ALOG("- - - - - - - - - - - - - - - - - - - - "); +} + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + opcode packet(din1.read().data); + + if (packet.read_A) { + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int k = 0; k < K; k++) { + inputs[m][k] = din1.read().data; + read_A_len++; + } + } + if (verbose) { + cout << "=========================" << endl; + cout << "Read BLOCK A: " << read_A_len++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int k = 0; k < K; k++) + cout << inputs[m][k] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + } + + if (packet.read_B) { + for (int k = 0; k < K; k++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + weights[k][n] = din1.read().data; + read_B_len++; + } + } + if (verbose) { + cout << "=========================" << endl; + cout << "Read BLOCK B: " << read_B_len++ << endl; + cout << "=========================" << endl; + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) + cout << weights[k][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + } + + if (packet.compute_C) { + wait(); + compute.write(true); + } + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + int acc = 0; + for (int k = 0; k < K; k++) { + int x = inputs[m][k]; + int y = weights[k][n]; + acc += mul_int32(x, y); + compute_C_len++; + } + outputs[m][n] = acc; + } + } + + if (verbose) { + cout << "=========================" << endl; + cout << "Compute BLOCK C: " << compute_C_len++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) + cout << outputs[m][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + wait(); + compute.write(false); + send.write(true); + wait(); + } +} + +int ACCNAME::mul_int32(int x, int y) { return x * y; } + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + DATA d; + d.tlast = false; + if (m == M - 1 && n == N - 1) + d.tlast = true; + d.data = outputs[m][n]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/mm_4x4_v2.json b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/mm_4x4_v2.json new file mode 100644 index 0000000..e469ab4 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/mm_4x4_v2.json @@ -0,0 +1,218 @@ +{ + "name": "MM_4x4_v2", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": 
false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_args": 0 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h new file mode 100644 index 0000000..3e52749 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h @@ -0,0 +1,299 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" + +#ifndef __SYNTHESIS__ +#define DWAIT(x) wait(x) +#else +#define DWAIT(x) +#endif + +#define M 4 +#define N 4 +#define K 4 + +#define ACCNAME MM_4x4v3 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +struct opcode { + unsigned int packet; + 
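+  // Stage-enable flags decoded from the low four bits of the 32-bit opcode
+  // word: bit 0 = read_A, bit 1 = read_B, bit 2 = compute_C, bit 3 = send_C.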
bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + // ALOG("OPCODE: " << _packet); + // ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[M][K]; + sc_int<32> weights[K][N]; + sc_int<32> outputs[M][N]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + int mul_int32(int, int); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + + // clang-format off + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} + // #pragma HLS RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} + // #pragma HLS RESET variable=reset + + // #pragma HLS array_partition variable=inputs complete dim=2 + // #pragma HLS array_partition variable=weights complete dim=0 + // #pragma HLS array_partition variable=outputs complete dim=2 + // clang-format on + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Read A data_len: " << read_A_len); + ALOG("Read B data_len: " << read_B_len); + ALOG("MACs count: " << compute_C_len); + ALOG("Send C data_len: " << send_C_len); + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Executed with :" << __FILE__); + ALOG("- - - - - - - - - - - - - - - - - - - - "); +} + +void ACCNAME::Recv() { + wait(); + while (1) { + opcode packet(din1.read().data); + + if (packet.read_A) { + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int k = 0; k < K; k++) { + inputs[m][k] = din1.read().data; + read_A_len++; + } + } + if (verbose) { + cout << "=========================" << endl; + cout << "Read BLOCK A: " << read_A_len++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int k = 0; k < K; k++) + cout << inputs[m][k] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + } + + if (packet.read_B) { + for (int k = 0; k < K; k++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + weights[k][n] = din1.read().data; + read_B_len++; + } + } + if (verbose) { + cout << 
"=========================" << endl; + cout << "Read BLOCK B: " << read_B_len++ << endl; + cout << "=========================" << endl; + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) + cout << weights[k][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + } + + // Computes C if true + if (packet.compute_C) { + wait(); + compute.write(true); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + wait(); + send.write(true); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + int acc = 0; + for (int k = 0; k < K; k++) { + int x = inputs[m][k]; + int y = weights[k][n]; + acc += mul_int32(x, y); + compute_C_len++; + } + outputs[m][n] += acc; + } + } + wait(); + compute.write(false); + wait(); + } +} + +int ACCNAME::mul_int32(int x, int y) { return x * y; } + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int m = 0; m < M; m++) { + // #pragma HLS pipeline + for (int n = 0; n < N; n++) { + DATA d; + d.tlast = false; + if (m == M - 1 && n == N - 1) + d.tlast = true; + d.data = outputs[m][n]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + } + + if (verbose) { + cout << "=========================" << endl; + cout << "Compute BLOCK C: " << compute_C_len++ << endl; + cout << "=========================" << endl; + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) + cout << outputs[m][n] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + + for (int m = 0; m < M; m++) { + // #pragma HLS unroll + for (int n = 0; n < N; n++) { + // #pragma HLS unroll + outputs[m][n] = 0; // Clears after sends + } + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/mm_4x4_v3.json b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/mm_4x4_v3.json new file mode 100644 index 0000000..83c0a89 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/mm_4x4_v3.json @@ -0,0 +1,352 @@ +{ + "name": "MM_4x4_v3", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_args": 0 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "2": 
[ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "8": [ + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "9": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "10": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "11": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "12": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "13": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "14": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "15": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + + + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h new file mode 100644 index 0000000..c661acf --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h @@ -0,0 +1,344 @@ +#ifndef ACC_H +#define ACC_H + +#define PE_M 16 +#define PE_N 16 +#define PE_K 16 + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v4 + +// #define VERBOSE_ACC +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 
0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +#define su10 sc_uint<10> +#define su12 sc_uint<12> +// MAX M, N, K = 256 +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + // ALOG("OPCODE: " << _packet); + // ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +struct code_extension { + su10 N; + su10 M; + su10 K; + su10 K16; + su10 N16; + + code_extension(sc_uint<32> _packetA) { + M = _packetA.range(9, 0); + N = _packetA.range(19, 10); + K = _packetA.range(29, 20); + N16 = _packetA.range(19, 10) / PE_N; + K16 = _packetA.range(29, 20) / PE_K; + // ALOG("packetA: " << _packetA); + // ALOG("Time: " << sc_time_stamp()); + // ALOG("N: " << N << ", M: " << M << ", K: " << K); + // cin.ignore(); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> A_buffer[256][16]; + sc_int<32> B_buffer[256][16]; + sc_int<32> C_buffer[256][16]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + code_extension acc_args = code_extension(0); + + void Recv(); + + void Compute(sc_int<32>[PE_M][PE_K], sc_int<32>[PE_K][PE_N], + sc_int<32>[PE_M][PE_N]); + + void LoadA(sc_int<32>[PE_M][PE_K], su10, su10, su10); + + void LoadB(sc_int<32>[PE_K][PE_N], su10, su10, su10); + + void Store(sc_int<32>[PE_M][PE_N], su10, su10, su10); + + void Schedule_Compute(); + + void Send(); + + void print_profile(); + + int mul_int32(int, int); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Schedule_Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Read A data_len: " << read_A_len); + ALOG("Read B data_len: " << read_B_len); + ALOG("MACs count: " << compute_C_len); + ALOG("Send C data_len: " << send_C_len); + ALOG("++++++++++++++++++++++++++++++++++++++++"); + ALOG("Executed with :" 
<< __FILE__); + ALOG("- - - - - - - - - - - - - - - - - - - - "); +} + +void ACCNAME::Recv() { + + wait(); + while (1) { + opcode packet(din1.read().data); + code_extension op_args(din1.read().data); + acc_args = op_args; + + if (packet.read_A) { + int read_length = op_args.M * op_args.K16; + for (int i = 0; i < read_length; i++) { + for (int j = 0; j < 16; j++) { + A_buffer[i][j] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + } + + if (packet.read_B) { + int read_length = op_args.K * op_args.N16; + for (int i = 0; i < read_length; i++) { + for (int j = 0; j < 16; j++) { + B_buffer[i][j] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + } + + // Computes C if true + if (packet.compute_C) { + compute.write(true); + wait(); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + send.write(true); + wait(); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::LoadA(sc_int<32> A[PE_M][PE_K], su10 M, su10 K, su10 in_stride) { + su12 base = M * in_stride + K; + su12 offset = 0; + for (su10 m = 0; m < PE_M; m++) { + for (su10 k = 0; k < PE_K; k++) { + // #pragma HLS unroll + A[m][k] = A_buffer[base + offset][k]; + } + offset += in_stride; + } +} + +void ACCNAME::LoadB(sc_int<32> B[PE_K][PE_N], su10 K, su10 N, su10 in_stride) { + su12 base = K * in_stride + N; + su12 offset = 0; + for (su10 k = 0; k < PE_K; k++) { + for (su10 n = 0; n < PE_N; n++) { + // #pragma HLS unroll + B[k][n] = B_buffer[base + offset][n]; + } + offset += in_stride; + } +} + +void ACCNAME::Compute(sc_int<32> A[PE_M][PE_K], sc_int<32> B[PE_K][PE_N], + sc_int<32> C[PE_M][PE_N]) { + for (int m = 0; m < PE_M; m++) { + for (int n = 0; n < PE_N; n++) { + // #pragma HLS pipeline + // #pragma HLS unroll factor 4 + int acc = 0; + for (int k = 0; k < PE_K; k++) { + int x = A[m][k]; + int y = B[k][n]; + acc += mul_int32(x, y); + compute_C_len++; + } + C[m][n] = acc; + } + } +} + +void ACCNAME::Store(sc_int<32> C[PE_M][PE_N], su10 M, su10 N, su10 out_stride) { + su12 base = M * out_stride + N; + su12 offset = 0; + for (su10 m = 0; m < PE_M; m++) { + // #pragma HLS pipeline + for (su10 n = 0; n < PE_N; n++) { + // #pragma HLS unroll + C_buffer[base + offset][n] += C[m][n]; + } + offset += out_stride; + } +} + +void ACCNAME::Schedule_Compute() { + sc_int<32> A[PE_M][PE_K]; + sc_int<32> B[PE_K][PE_N]; + sc_int<32> C[PE_M][PE_N]; + // #pragma HLS array_partition variable = A complete dim = 2 + // #pragma HLS array_partition variable = B complete dim = 2 + // #pragma HLS array_partition variable = C complete dim = 2 + + wait(); + while (1) { + while (!compute) + wait(); + + unsigned int ks = 0; + for (su10 k = 0; k < acc_args.K; k += PE_K) { + for (su10 m = 0; m < acc_args.M; m += PE_M) { + LoadA(A, m, ks, acc_args.K16); + for (su10 n = 0; n < acc_args.N16; n++) { + LoadB(B, k, n, acc_args.N16); + Compute(A, B, C); + Store(C, m, n, acc_args.N16); + } + } + ks++; + } + + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + unsigned int write_length = acc_args.M * acc_args.N16; + for (su10 m = 0; m < write_length; m++) { + for (su10 n = 0; n < 16; n++) { + DATA d; + d.tlast = false; + d.data = C_buffer[m][n]; + if (n + 1 == 16 && m + 1 == write_length) + d.tlast = true; + dout1.write(d); + send_C_len++; + wait(); + C_buffer[m][n] = 0; + DWAIT(); + } + } + send.write(false); + wait(); + } +} + +int ACCNAME::mul_int32(int x, int y) { return x * y; } + +#endif diff --git 
a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/mm_4x4_v4.json b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/mm_4x4_v4.json new file mode 100644 index 0000000..1be763f --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/mm_4x4_v4.json @@ -0,0 +1,361 @@ +{ + "name": "MM_4x4_v4", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 4096, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 4096, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 4096, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#tile_N": 4, + "#tile_M": 4, + "#tile_K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#tile_N", + "#tile_K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#tile_M", + "#tile_K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#tile_N", + "#tile_M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_extension": 64 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "8": [ + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "9": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "10": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "11": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + 
"buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "12": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "13": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "14": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "15": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ] + }, + "op_arg": { + "0-15": "#N", + "16-31": "#M", + "31-63": "#K" + } + }, + "schedule": { + "allowed_patterns": [ + "R#a, R#b, C, S#c", + "a" + ] + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h new file mode 100644 index 0000000..2fcd716 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h @@ -0,0 +1,187 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v5 + +// OP-Code Stuct +// 000 : 0 = read_A -> read_B -> compute_C; +// 001 : 1 = store_C; + + +struct opcode { + unsigned int packet; + bool store_C; + + opcode(sc_uint<32> _packet) { + packet = _packet; + store_C = _packet.range(0, 0); + // ALOG("OPCODE: " << packet); + // ALOG("Time: " << sc_time_stamp()); + } +}; + + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> A[16]; + sc_int<32> B[16]; + sc_int<32> C[16]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " 
<< read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl;; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + opcode packet(din1.read().data); + + if (packet.store_C) { + wait(); + send.write(true); + wait(); + }else{ + wait(); + for (int i = 0; i < 16; i++) { + A[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + for (int i = 0; i < 16; i++) { + B[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + compute.write(true); + wait(); + } + + while(send.read() || compute.read()) + wait(); + + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int i = 0; i < 4; i++) { + for (int w = 0; w < 4; w++) { + int acc = 0; + for (int d = 0; d < 4; d++) { + + int x = A[i * 4 + d]; + int y = B[w * 4 + d]; + acc += x * y; + compute_C_len++; + } + C[i * 4 + w] += acc; + } + } + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + for (int i = 0; i < 16; i++) { + DATA d; + d.tlast = false; + if (i == 15) + d.tlast = true; + d.data = C[i]; + C[i] = 0; + dout1.write(d); + send_C_len++; + DWAIT(); + } + send.write(false); + wait(); + } +} +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v6/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v6/accelerator.sc.h new file mode 100644 index 0000000..6bc37c8 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mm_4x4_v6/accelerator.sc.h @@ -0,0 +1,247 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v6 + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + cout << "OPCODE: " << _packet << endl; + cout << "Time: " << sc_time_stamp() << endl; + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +struct code_extension { + sc_uint<16> N; + sc_uint<16> M; + + sc_uint<16> K; + sc_uint<16> in_stride; + + sc_uint<16> out_offset; + sc_uint<16> out_stride; + + code_extension(sc_uint<32> _packetA, sc_uint<32> _packetB, + sc_uint<32> _packetC) { + + N = _packetA.range(15,0); + M = _packetA.range(31,16); + + K = _packetB.range(15,0); + in_stride = _packetB.range(31,16); + + out_offset = _packetC.range(15,0); + out_stride = _packetC.range(31,16); + + cout << "Time: " << sc_time_stamp() << endl; + cout << "N: " << N << ", M: " << M << ", K: " << K + << ", in_stride: " << in_stride << ", out_offset: " << out_offset + << ", out_stride: 
" << out_stride << endl; + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> A_buffer[4096]; + sc_int<32> B_buffer[4096]; + sc_int<32> C_buffer[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + code_extension acc_args = code_extension(0,0,0); + + void Recv(); + + void Compute(int, int, int, int, int, int); + + void Schedule_Compute(); + + void Send(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Schedule_Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::Recv() { + wait(); + while (1) { + opcode packet(din1.read().data); + code_extension op_args(din1.read().data, din1.read().data, + din1.read().data); + acc_args = op_args; + + if (packet.read_A) { + unsigned int read_length = op_args.N * op_args.K; + for (int i = 0; i < read_length; i++) { + A_buffer[i] = din1.read().data; + DWAIT(); + } + } + + if (packet.read_B) { + unsigned int read_length = op_args.M * op_args.K; + for (int i = 0; i < read_length; i++) { + B_buffer[i] = din1.read().data; + DWAIT(); + } + } + + // Computes C if true + if (packet.compute_C) { + compute.write(true); + wait(); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + send.write(true); + wait(); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::Compute(int N, int M, int K, int in_stride, int out_offset, + int out_stride) { + for (int n = 0; n < 4; n++) { + for (int m = 0; m < 4; m++) { + int acc = 0; + for (int k = 0; k < 4; k++) { + int a_data = A_buffer[(N + n) * in_stride + K + k]; + int b_data = B_buffer[(M + m) * in_stride + K + k]; + acc += a_data * b_data; + } + C_buffer[out_offset + (N + n) * out_stride + M + m] += acc; + } + } +} + +void ACCNAME::Schedule_Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int n = 0; n < acc_args.N; n += 4) { + for (int m = 0; m < acc_args.M; m += 4) { + for (int k = 0; k < acc_args.K; k += 4) { + Compute(n, m, k, acc_args.in_stride, acc_args.out_offset, + acc_args.out_stride); + } + } + } + + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int n = 0; n < acc_args.N; n++) { + for (int m = 0; m < acc_args.M; m++) { + DATA d; + d.tlast = false; + d.data = C_buffer[acc_args.out_offset + n * acc_args.out_stride + m]; + if (n + 1 == acc_args.N && m + 1 == acc_args.M) + d.tlast = true; + dout1.write(d); + C_buffer[acc_args.out_offset + n * acc_args.out_stride + m] = 0; + } + } + send.write(false); + wait(); + } +} + + +#endif diff --git 
a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/accelerator.sc.h new file mode 100644 index 0000000..434c75b --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/accelerator.sc.h @@ -0,0 +1,200 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v1 + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[4096]; + sc_int<32> weights[4096]; + sc_int<32> outputs[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len=0; + read_B_len=0; + compute_C_len=0; + send_C_len=0; + verbose = false; + + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle + // S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} #pragma HLS + // RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle + // M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} #pragma HLS + // RESET variable=reset + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl;; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + for (int i = 0; i < 16; i++) { + inputs[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + + for (int i = 0; i < 16; i++) { + weights[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + + // DEBUG ONLY + if (true) { + cout << "=========================" << endl; + cout << "BLOCK: " << process_blocks++ << endl; + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << inputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << weights[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(true); + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + 
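+    // The loop nest below computes one 4x4 tile product with the second
+    // operand indexed as its transpose (weights[w * 4 + d] walks row w), so
+    // outputs[i * 4 + w] = sum_d inputs[i * 4 + d] * weights[w * 4 + d].
+    // Illustrative plain-C++ reference of the same arithmetic (assumed
+    // host-side check, not part of this module):
+    //
+    //   for (int i = 0; i < 4; i++)
+    //     for (int w = 0; w < 4; w++) {
+    //       int acc = 0;
+    //       for (int d = 0; d < 4; d++)
+    //         acc += A[i * 4 + d] * Bt[w * 4 + d]; // Bt: B stored transposed
+    //       C[i * 4 + w] = acc;
+    //     }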
while (!compute) + wait(); + for (int i = 0; i < 4; i++) { + for (int w = 0; w < 4; w++) { + int acc = 0; + for (int d = 0; d < 4; d++) { + int x = inputs[i * 4 + d]; + int y = weights[w * 4 + d]; + acc += x * y; + compute_C_len++; + } + outputs[i * 4 + w] = acc; + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << outputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(false); + send.write(true); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + for (int i = 0; i < 16; i++) { + DATA d; + d.tlast = false; + if (i == 15) + d.tlast = true; + d.data = outputs[i]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/mm_4x4_v1.json b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/mm_4x4_v1.json new file mode 100644 index 0000000..e997d57 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v1/mm_4x4_v1.json @@ -0,0 +1,120 @@ +{ + "name": "MM_4x4_v1", + "version": "1.0", + "description": "MM Accelerator", + + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ] + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + + "ISA": { + "instruction_format": { + "opcode_length": 0, + "op_args": 0 + }, + "opcodes": { + "-": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/accelerator.sc.h new file mode 100644 index 0000000..ccfb1a3 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/accelerator.sc.h @@ -0,0 +1,241 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MMT_4x4v2 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 000 : 0 = NOP; +// 001 : 1 = read_A; +// 010 : 2 = read_B; +// 011 : 3 = read_A -> read_B; +// 100 : 4 = compute_C; +// 101 : 5 = read_A -> compute_C; +// 110 : 6 = read_B -> compute_C; +// 111 : 7 = read_A -> read_B -> 
compute_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + + opcode(sc_uint<32> _packet) { + ALOG("OPCODE: " << _packet); + ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[4096]; + sc_int<32> weights[4096]; + sc_int<32> outputs[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len=0; + read_B_len=0; + compute_C_len=0; + send_C_len=0; + verbose = false; + + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle + // S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} #pragma HLS + // RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle + // M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} #pragma HLS + // RESET variable=reset + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl;; +} + + +void ACCNAME::Recv() { + wait(); + while (1) { + while (compute) + wait(); + + opcode packet(din1.read().data); + + if (packet.read_A) { + for (int i = 0; i < 16; i++) { + inputs[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + + if (packet.read_B) { + for (int i = 0; i < 16; i++) { + weights[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + cout << "BLOCK: " << process_blocks++ << endl; + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << inputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << weights[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + if (packet.compute_C) { + wait(); + compute.write(true); + } + wait(); + } 
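+  // Host-side sketch (assumed driver code, not part of this module) of how
+  // the 3-bit opcode decoded at the top of Recv() could be packed before it
+  // is written into din1:
+  //
+  //   unsigned int op = 0;
+  //   op |= 1u << 0; // read_A
+  //   op |= 1u << 1; // read_B
+  //   op |= 1u << 2; // compute_C -> op == 7: read_A -> read_B -> compute_C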
+} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + for (int i = 0; i < 4; i++) { + for (int w = 0; w < 4; w++) { + int acc = 0; + for (int d = 0; d < 4; d++) { + int x = inputs[i * 4 + d]; + int y = weights[w * 4 + d]; + acc += x * y; + compute_C_len++; + } + outputs[i * 4 + w] = acc; + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << outputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(false); + send.write(true); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + for (int i = 0; i < 16; i++) { + DATA d; + d.tlast = false; + if (i == 15) + d.tlast = true; + d.data = outputs[i]; + dout1.write(d); + send_C_len++; + DWAIT(); + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/mm_4x4_v2.json b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/mm_4x4_v2.json new file mode 100644 index 0000000..e469ab4 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v2/mm_4x4_v2.json @@ -0,0 +1,218 @@ +{ + "name": "MM_4x4_v2", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_args": 0 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + 
"buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/accelerator.sc.h new file mode 100644 index 0000000..c9b5ebc --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/accelerator.sc.h @@ -0,0 +1,263 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v3 + +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + ALOG("OPCODE: " << _packet); + ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> inputs[4096]; + sc_int<32> weights[4096]; + sc_int<32> outputs[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + void Recv(); + + void Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len=0; + read_B_len=0; + compute_C_len=0; + send_C_len=0; + verbose = false; + + // #pragma HLS RESOURCE variable=din1 core=AXI4Stream metadata="-bus_bundle + // S_AXIS_DATA1" port_map={{din1_0 TDATA} {din1_1 TLAST}} #pragma HLS + // RESOURCE variable=dout1 core=AXI4Stream metadata="-bus_bundle + // M_AXIS_DATA1" port_map={{dout1_0 TDATA} {dout1_1 TLAST}} #pragma HLS + // RESET variable=reset + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + 
acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + opcode packet(din1.read().data); + + if (packet.read_A) { + for (int i = 0; i < 16; i++) { + inputs[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + + if (packet.read_B) { + for (int i = 0; i < 16; i++) { + weights[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + cout << "BLOCK: " << process_blocks++ << endl; + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << inputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << weights[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + // Computes C if true + if (packet.compute_C) { + wait(); + compute.write(true); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + wait(); + send.write(true); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::Compute() { + wait(); + while (1) { + while (!compute) + wait(); + for (int i = 0; i < 4; i++) { + for (int w = 0; w < 4; w++) { + int acc = 0; + for (int d = 0; d < 4; d++) { + int x = inputs[i * 4 + d]; + int y = weights[d * 4 + w]; + // int y = weights[w * 4 + d]; + acc += x * y; + compute_C_len++; + } + outputs[i * 4 + w] += acc; + } + } + + // DEBUG ONLY + if (verbose) { + cout << "=========================" << endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) + cout << outputs[i * 4 + j] << ","; + cout << endl; + } + cout << "=========================" << endl; + } + // DEBUG ONLY + + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + for (int i = 0; i < 16; i++) { + DATA d; + d.tlast = false; + if (i == 15) + d.tlast = true; + d.data = outputs[i]; + dout1.write(d); + outputs[i] = 0; // Clears after sends + send_C_len++; + DWAIT(); + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/mm_4x4_v3.json b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/mm_4x4_v3.json new file mode 100644 index 0000000..83c0a89 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v3/mm_4x4_v3.json @@ -0,0 +1,352 @@ +{ + "name": "MM_4x4_v3", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 16, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 16, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + 
"kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#N": 4, + "#M": 4, + "#K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#N", + "#K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#M", + "#K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#N", + "#M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_args": 0 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "8": [ + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "9": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "10": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "11": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "12": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "13": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "14": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ], + "15": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": 16 + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": 16 + 
} + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": 16 + } + } + ] + + + } + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/accelerator.sc.h b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/accelerator.sc.h new file mode 100644 index 0000000..6907476 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/accelerator.sc.h @@ -0,0 +1,262 @@ +#ifndef ACC_H +#define ACC_H + +#include "../dma_engine.sc.h" +#define ACCNAME MM_4x4v4 + +// #define VERBOSE_ACC +#ifdef VERBOSE_ACC +#define ALOG(x) std::cout << x << std::endl +#else +#define ALOG(x) +#endif + +// OP-Code Stuct +// 0000 : 0 = NOP; +// 0001 : 1 = read_A; +// 0010 : 2 = read_B; +// 0011 : 3 = read_A -> read_B; +// 0100 : 4 = compute_C; +// 0101 : 5 = read_A -> compute_C; +// 0110 : 6 = read_B -> compute_C; +// 0111 : 7 = read_A -> read_B -> compute_C; + +// 1000 : 8 = send_C; +// 1001 : 9 = read_A -> send_C; +// 1010 : 10 = read_B -> send_C; +// 1011 : 11 = read_A -> read_B -> send_C; +// 1100 : 12 = compute_C -> send_C; +// 1101 : 13 = read_A -> compute_C -> send_C; +// 1110 : 14 = read_B -> compute_C -> send_C; +// 1111 : 15 = read_A -> read_B -> compute_C -> send_C; + +struct opcode { + unsigned int packet; + bool read_A; + bool read_B; + bool compute_C; + bool send_C; + + opcode(sc_uint<32> _packet) { + ALOG("OPCODE: " << _packet); + ALOG("Time: " << sc_time_stamp()); + packet = _packet; + read_A = _packet.range(0, 0); + read_B = _packet.range(1, 1); + compute_C = _packet.range(2, 2); + send_C = _packet.range(3, 3); + } +}; + +struct code_extension { + sc_uint<16> N; + sc_uint<16> M; + sc_uint<32> K; + + code_extension(sc_uint<32> _packetA, sc_uint<32> _packetB) { + N = _packetA.range(15, 0); + M = _packetA.range(31, 16); + K = _packetB.range(31, 0); + + ALOG("Time: " << sc_time_stamp()); + ALOG("N: " << N << ", M: " << M << ", K: " << K); + } +}; + +SC_MODULE(ACCNAME) { + sc_in clock; + sc_in reset; + sc_int<32> A_buffer[4096]; + sc_int<32> B_buffer[4096]; + sc_int<32> C_buffer[4096]; + sc_fifo_in din1; + sc_fifo_out dout1; + + // Debug variables + int process_blocks; + int read_A_len; + int read_B_len; + int compute_C_len; + int send_C_len; + bool verbose; + +#ifndef __SYNTHESIS__ + sc_signal compute; + sc_signal send; +#else + sc_signal compute; + sc_signal send; +#endif + + code_extension acc_args = code_extension(0, 0); + + void Recv(); + + void Compute(int, int, int, int, int); + + void Schedule_Compute(); + + void Send(); + + void print_profile(); + + SC_HAS_PROCESS(ACCNAME); + + ACCNAME(sc_module_name name_) : sc_module(name_) { + SC_CTHREAD(Recv, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Schedule_Compute, clock.pos()); + reset_signal_is(reset, true); + + SC_CTHREAD(Send, clock.pos()); + reset_signal_is(reset, true); + + process_blocks = 0; + read_A_len = 0; + read_B_len = 0; + compute_C_len = 0; + send_C_len = 0; + verbose = false; + } +}; + +template +void accelerator_dma_connect(ACCNAME *acc, DMA_DRIVER *dmad, + int _dma_input_buffer_size, + int _dma_output_buffer_size) { + + static sc_clock clk_fast("ClkFast", 1, SC_NS); + static sc_signal sig_reset; + static sc_fifo din1("din1_fifo", _dma_input_buffer_size); + static sc_fifo dout1("dout1_fifo", _dma_output_buffer_size); + + acc->clock(clk_fast); + acc->reset(sig_reset); + acc->dout1(dout1); + acc->din1(din1); + + dmad->clock(clk_fast); + dmad->reset(sig_reset); + 
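+  // The driver bound here feeds Recv(), which expects one opcode word
+  // followed by two extension words decoded by code_extension above
+  // (packetA[15:0] = N, packetA[31:16] = M, packetB[31:0] = K).
+  // Assumed host-side packing sketch (illustrative only):
+  //   unsigned int packetA = (M << 16) | (N & 0xFFFFu);
+  //   unsigned int packetB = K;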
dmad->dout1(dout1); + dmad->din1(din1); +} + +void ACCNAME::print_profile() { + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Read A data_len: " << read_A_len << endl; + cout << "Read B data_len: " << read_B_len << endl; + cout << "MACs count: " << compute_C_len << endl; + cout << "Send C data_len: " << send_C_len << endl; + cout << "++++++++++++++++++++++++++++++++++++++++" << endl; + cout << "Executed with :" << __FILE__ << endl; + cout << "- - - - - - - - - - - - - - - - - - - - " << endl; +} + +void ACCNAME::Recv() { + wait(); + while (1) { + opcode packet(din1.read().data); + code_extension op_args(din1.read().data, din1.read().data); + acc_args = op_args; + + if (packet.read_A) { + unsigned int read_length = op_args.N * op_args.K; + for (int i = 0; i < read_length; i++) { + A_buffer[i] = din1.read().data; + read_A_len++; + DWAIT(); + } + } + + if (packet.read_B) { + unsigned int read_length = op_args.M * op_args.K; + for (int i = 0; i < read_length; i++) { + B_buffer[i] = din1.read().data; + read_B_len++; + DWAIT(); + } + } + + // Computes C if true + if (packet.compute_C) { + compute.write(true); + wait(); + } + + while (compute) + wait(); + + // Sends then clears C if true + if (packet.send_C) { + send.write(true); + wait(); + } + + while (send) + wait(); + + wait(); + } +} + +void ACCNAME::Compute(int N, int M, int K, int in_stride, int out_stride) { + for (int n = 0; n < 4; n++) { + for (int m = 0; m < 4; m++) { + int acc = 0; + for (int k = 0; k < 4; k++) { + int a_data = A_buffer[(N + n) * in_stride + K + k]; + int b_data = B_buffer[(M + m) * in_stride + K + k]; + acc += a_data * b_data; + compute_C_len++; + } + C_buffer[(N + n) * out_stride + M + m] += acc; + } + } +} + +void ACCNAME::Schedule_Compute() { + wait(); + while (1) { + while (!compute) + wait(); + + for (int n = 0; n < acc_args.N; n += 4) { + for (int m = 0; m < acc_args.M; m += 4) { + for (int k = 0; k < acc_args.K; k += 4) { + Compute(n, m, k, acc_args.K, acc_args.M); + } + } + } + + wait(); + compute.write(false); + wait(); + } +} + +void ACCNAME::Send() { + wait(); + while (1) { + while (!send) + wait(); + + for (int n = 0; n < acc_args.N; n++) { + for (int m = 0; m < acc_args.M; m++) { + DATA d; + d.tlast = false; + d.data = C_buffer[n * acc_args.M + m]; + if (n + 1 == acc_args.N && m + 1 == acc_args.M) + d.tlast = true; + dout1.write(d); + C_buffer[n * acc_args.M + m] = 0; + send_C_len++; + DWAIT(); + } + } + send.write(false); + wait(); + } +} + +#endif diff --git a/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/mm_4x4_v4.json b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/mm_4x4_v4.json new file mode 100644 index 0000000..1be763f --- /dev/null +++ b/include/soda/ExecutionEngine/axi/accelerators/mmt_4x4_v4/mm_4x4_v4.json @@ -0,0 +1,361 @@ +{ + "name": "MM_4x4_v4", + "version": "1.0", + "description": "MM Accelerator", + "memory_layout": { + "#A_Buffer": { + "size": 4096, + "data_type": "int32" + }, + "#B_Buffer": { + "size": 4096, + "data_type": "int32" + }, + "#C_Buffer": { + "size": 4096, + "data_type": "int32" + } + }, + "dma_fifo": { + "din": { + "id": "0", + "data_type": "int32", + "read": true, + "write": false + }, + "dout": { + "id": "1", + "data_type": "int32", + "read": false, + "write": true + } + }, + "kernels": { + "4x4_MM": { + "id": 0, + "description": "4x4 matrix multiplication", + "compute": "C += A * B", + "tile_info": { + "tile_dims": { + "#tile_N": 4, + "#tile_M": 4, + "#tile_K": 4 + }, + "A": { + "associated_buffer": "#A_Buffer", + "read": 
true, + "write": false, + "default_offset": 0, + "shape": [ + "#tile_N", + "#tile_K" + ] + }, + "B": { + "associated_buffer": "#B_Buffer", + "read": true, + "write": false, + "default_offset": 0, + "shape": [ + "#tile_M", + "#tile_K" + ], + "stationary": true + }, + "C": { + "associated_buffer": "#C_Buffer", + "read": true, + "write": true, + "default_offset": 0, + "shape": [ + "#tile_N", + "#tile_M" + ] + } + } + } + }, + "ISA": { + "instruction_format": { + "opcode_length": 32, + "op_extension": 64 + }, + "opcodes": { + "-": [], + "0": [], + "1": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + } + ], + "2": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + } + ], + "3": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + } + ], + "4": [ + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "5": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "6": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "7": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + } + ], + "8": [ + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "9": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "10": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "11": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "12": [ + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "13": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "14": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ], + "15": [ + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#A_Buffer", + "offset": 0, + "length": "#N * #K" + } + }, + { + "READ": { + "dma_fifo_id": 0, + "buffer": "#B_Buffer", + "offset": 0, + "length": "#M * #K" + } + }, + { + "COMPUTE": { + "kernel_id": 
0 + } + }, + { + "SEND": { + "dma_fifo_id": 1, + "buffer": "#C_Buffer", + "offset": 0, + "length": "#N * #M" + } + } + ] + }, + "op_arg": { + "0-15": "#N", + "16-31": "#M", + "31-63": "#K" + } + }, + "schedule": { + "allowed_patterns": [ + "R#a, R#b, C, S#c", + "a" + ] + } +} \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/api_v0.h b/include/soda/ExecutionEngine/axi/api_v0.h new file mode 100644 index 0000000..fcd15c5 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/api_v0.h @@ -0,0 +1,110 @@ +//**********************Deprecated********************** + +#ifndef AXI_APIv0 +#define AXI_APIv0 + +#include +#include +#include +#include +#include + +// API Model = One DMA is allocated with an input and an output buffer +// TODO: Struct based representation of API model + +struct dma { +#define MM2S_CONTROL_REGISTER 0x00 +#define MM2S_STATUS_REGISTER 0x04 +#define MM2S_START_ADDRESS 0x18 +#define MM2S_LENGTH 0x28 + +#define S2MM_CONTROL_REGISTER 0x30 +#define S2MM_STATUS_REGISTER 0x34 +#define S2MM_DESTINATION_ADDRESS 0x48 +#define S2MM_LENGTH 0x58 + + unsigned int id; + unsigned int *dma_address; + unsigned int *dma_input_addr; + unsigned int *dma_output_addr; + + unsigned int dma_input_len; + unsigned int dma_output_len; + + void init(int id); + + void dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value) { + dma_virtual_address[offset >> 2] = value; + } + + unsigned int dma_get(unsigned int *dma_virtual_address, int offset) { + return dma_virtual_address[offset >> 2]; + } +}; + +struct dma_collection { + + // Variables + int dma_count; + struct dma *dma_list; + + //-----------------DMA Functions----------------- + /** + * dma_address is base address of dma + * dma_input_addr is starting memory location for the dma input buffer, + * dma_input_len is length of the buffer dma_output_addr is starting memory + * location for the dma output buffer, dma_output_len is length of the buffer + * Memory maps dma's base address + * Runs starting controls signals and sets MMS2, S2MM address registers to + * start memory locations of the input and output buffers + */ + void dma_init(int dma_count, unsigned int *dma_address, + unsigned int *dma_input_addr, unsigned int *dma_input_len, + unsigned int *dma_output_addr, unsigned int *dma_output_len); + + // Memory unmaps DMA base addresses and Input and output buffers + void dma_free(); + + // Get base address for dma represented by dma_id, + unsigned int *dma_get_regaddr(); + + //-----------------BUFFER Functions----------------- + // Get the MMap address of the input buffer of the dma + unsigned int *dma_get_inbuffer(); + + // Get the MMap address of the output buffer of the dma + unsigned int *dma_get_outbuffer(); + + //-----------------DMA MMS2 Functions----------------- + /** + * Checks if input buffer size is >= length + * Sets DMA MMS2 transfer length to length + * Starts transfers to the accelerator using dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_set_transfer(int dma_id, int length); + + // Blocks thread until dma MMS2 transfer is complete + void dma_send(int dma_id, int buffer_ID, int length); + + // Same as dma_send but thread does not block, returns if 0 + int dma_send_nb(int dma_id, int buffer_ID, int length); + + //-----------------DMA S2MM Functions----------------- + /** + * Checks if buffer size is >= length + * Sets 2SMM store length + * Starts storing data recieved through dma associated with dma_id + * Return 0 if successful, 
returns negative if error occurs + */ + int dma_set_store(int dma_id, int buffer_ID, int length); + + // Blocks thread until dma S2MM transfer is complete (TLAST signal is seen) + void dma_recv(int dma_id, int buffer_ID, int length); + + // Same as dma_recv but thread does not block, returns if 0 + int dma_recv_nb(int dma_id, int buffer_ID, int length); +}; + +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/api_v1.h b/include/soda/ExecutionEngine/axi/api_v1.h new file mode 100644 index 0000000..03bfd61 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/api_v1.h @@ -0,0 +1,200 @@ +#ifndef AXI_APIv1 +#define AXI_APIv1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef SYSC +// Easy way to switch between systemC accelerators --- there is probably a +// better way + +#ifdef CONV_V1 +#include "soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h" +#elif ACC_V5 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h" +#elif ACC_V4 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h" +#elif ACC_V3 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h" +#elif ACC_V2 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h" +#else +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h" +#endif +#endif + +// API Model = One DMA is allocated with a single input and output buffer (Can +// have different size) + +// Simple view of DMA +/* +dma -> { + control_register_address : unsigned int # Mapped address to the start of +the DMA control registers Buffer input_buffer (address,size) Buffer +output_buffer (address,size) +} +*/ + +struct dma { +#define MM2S_CONTROL_REGISTER 0x00 +#define MM2S_STATUS_REGISTER 0x04 +#define MM2S_START_ADDRESS 0x18 +#define MM2S_LENGTH 0x28 +#define S2MM_CONTROL_REGISTER 0x30 +#define S2MM_STATUS_REGISTER 0x34 +#define S2MM_DESTINATION_ADDRESS 0x48 +#define S2MM_LENGTH 0x58 +#define PAGE_SIZE getpagesize() + +#define m_assert(expr, msg) assert(((void)(msg), (expr))) + +// Define this variable for additional profiling info (in api_v1_sysc.cpp) +#define PROFILE +#ifdef PROFILE +#define PLOG(x) std::cout << x << std::endl +#define PFUNC(x) x +#else +// Safer option that requires a semicolon, but relies on compiler to be removed +// #define PLOG(x) do { } while(0) +// #define PFUNC(x) do { } while(0) +#define PLOG(x) +#define PFUNC(x) +#endif + +// Define this variable for additional debug info +// #define VERBOSE_AXI +#ifdef VERBOSE_AXI +#define D(x) \ + do { \ + x \ + } while (0) +#define LOG(x) std::cout << x << std::endl +#else +// Safer option that requires a semicolon, but relies on compiler to be removed +// #define D(x) do { } while(0) +// #define LOG(x) do { } while(0) +#define D(x) +#define LOG(x) +#endif + + unsigned int *dma_address; + unsigned int *dma_input_address; + unsigned int *dma_output_address; + unsigned int dma_input_buffer_size; + unsigned int dma_output_buffer_size; + unsigned int dma_input_paddress; + unsigned int dma_output_paddress; + unsigned int *acc_address; + unsigned int current_input_offset; + + // Profiling Variables + unsigned int dma_send_length = 0; + unsigned int dma_recv_length = 0; + unsigned int dma_send_count = 0; + unsigned int dma_recv_count = 0; + + // temp --- need to remove later + bool verbose = false; + +#ifdef SYSC + ACCNAME *acc; + DMA_DRIVER *dmad; +#endif + + void dma_init(unsigned int dma_address, unsigned 
int dma_input_address, + unsigned int dma_input_buffer_size, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size); + + // Memory unmaps DMA control_register_address and Input and output buffers + void dma_free(); + + // We could reduce to one set of the following calls + //================================================================================================================ + + //-----------------BUFFER Functions----------------- + // Get the MMap address of the input buffer of the dma *Needed to copy data + // to Input_Buffer* + unsigned int *dma_get_inbuffer(); + + // Get the MMap address of the output buffer of the dma *Needed to copy data + // from Output_Buffer* + unsigned int *dma_get_outbuffer(); + + //================================================================================================================ + + //-----------------BUFFER Functions----------------- + // Copy data into the Input Buffer (length to write, offset to write to) + // returns 0 if successful + int dma_copy_to_inbuffer(unsigned int *host_src_address, int data_length, + int offset); + + template + int mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset); + + // Copy data from the Output Buffer (length to read, offset to read from) + // returns 0 if successful + int dma_copy_from_outbuffer(unsigned int *host_dst_address, int data_length, + int offset); + + template + int mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset); + + //============================================================================ + + //-----------------DMA MMS2 Functions----------------- + /** + * Checks if input buffer size is >= length + * Sets DMA MMS2 transfer length to length + * Starts transfers to the accelerator using dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_start_send(int length, int offset); + + // Blocks thread until dma MMS2 transfer is complete + void dma_wait_send(); + + // Same as dma_send but thread does not block, returns 0 if done + int dma_check_send(); + + //-----------------DMA S2MM Functions----------------- + /** + * Checks if buffer size is >= length + * Sets 2SMM store length + * Starts storing data recieved through dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_start_recv(int length, int offset); + + // Blocks thread until dma S2MM transfer is complete (TLAST signal is seen) + void dma_wait_recv(); + + // Same as dma_recv but thread does not block, returns 0 if done + int dma_check_recv(); + + //********************************** Unexposed Functions + //********************************** + void initDMAControls(); + void dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value); + unsigned int dma_get(unsigned int *dma_virtual_address, int offset); + void dma_mm2s_sync(); + void dma_s2mm_sync(); + void acc_init(unsigned int base_addr, int length); + void dump_acc_signals(int state); +}; + +#endif \ No newline at end of file diff --git a/include/soda/ExecutionEngine/axi/api_v2.h b/include/soda/ExecutionEngine/axi/api_v2.h new file mode 100644 index 0000000..bdb25f8 --- /dev/null +++ b/include/soda/ExecutionEngine/axi/api_v2.h @@ -0,0 +1,217 @@ +#ifndef AXI_APIv2 +#define AXI_APIv2 + +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include + +#ifdef SYSC +// Easy way to switch between systemC accelerators --- there is probably a +// better way + +#ifdef CONV_V1 +#include "soda/ExecutionEngine/axi/accelerators/conv_v1/accelerator.sc.h" +#elif ACC_V5 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v5/accelerator.sc.h" +#elif ACC_V4 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v4/accelerator.sc.h" +#elif ACC_V3 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v3/accelerator.sc.h" +#elif ACC_V2 +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v2/accelerator.sc.h" +#else +#include "soda/ExecutionEngine/axi/accelerators/mm_4x4_v1/accelerator.sc.h" +#endif +#endif + +// API Model = One DMA is allocated with a single input and output buffer (Can +// have different size) + +// clang-format off +// Simple view of DMA +/* +dma -> { + control_register_address : unsigned int # Mapped address to the start of + the DMA control registers + Buffer input_buffer (address,length,size_of_element) + Buffer output_buffer (address,length,size_of_element) + All lengths are in elements + All offsets are in elements + The library will handle the conversion to bytes + Pointer access to the buffer returns char* + Once, within dma_init, MLIR/external user needs to specify size of the I/O elements +} +*/ +// clang-format on + +struct dma { +#define MM2S_CONTROL_REGISTER 0x00 +#define MM2S_STATUS_REGISTER 0x04 +#define MM2S_START_ADDRESS 0x18 +#define MM2S_LENGTH 0x28 +#define S2MM_CONTROL_REGISTER 0x30 +#define S2MM_STATUS_REGISTER 0x34 +#define S2MM_DESTINATION_ADDRESS 0x48 +#define S2MM_LENGTH 0x58 +#define PAGE_SIZE getpagesize() + +#define m_assert(expr, msg) assert(((void)(msg), (expr))) + +// Define this variable for additional profiling info (in api_v1_sysc.cpp) +#define PROFILE +#ifdef PROFILE +#define PLOG(x) std::cout << x << std::endl +#define PFUNC(x) x +#else +// Safer option that requires a semicolon, but relies on compiler to be removed +// #define PLOG(x) do { } while(0) +// #define PFUNC(x) do { } while(0) +#define PLOG(x) +#define PFUNC(x) +#endif + +// Define this variable for additional debug info +// #define VERBOSE_AXI +#ifdef VERBOSE_AXI +#define D(x) \ + do { \ + x \ + } while (0) +#define LOG(x) std::cout << x << std::endl +#else +// Safer option that requires a semicolon, but relies on compiler to be removed +// #define D(x) do { } while(0) +// #define LOG(x) do { } while(0) +#define D(x) +#define LOG(x) +#endif + // I/O addresses are in type char or handled with type char size + // I/O lengths are in elements + unsigned int *dma_address; + char *dma_input_address; + char *dma_output_address; + unsigned int dma_input_buffer_size; + unsigned int dma_output_buffer_size; + unsigned int isize; + unsigned int osize; + + // These addresses are in physical memory + unsigned int dma_input_paddress; + unsigned int dma_output_paddress; + + // Maybe remove + unsigned int *acc_address; + // unsigned int current_input_offset; + + // Profiling Variables + unsigned int dma_send_length = 0; + unsigned int dma_recv_length = 0; + unsigned int dma_send_count = 0; + unsigned int dma_recv_count = 0; + + // temp --- need to remove later + bool verbose = false; + +#ifdef SYSC + ACCNAME *acc; + DMA_DRIVER *dmad; +#endif + + void dma_init(unsigned int dma_address, unsigned int dma_input_address, + unsigned int dma_input_buffer_size, unsigned int isize, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size, unsigned int osize); + + // 
Memory unmaps DMA control_register_address and Input and output buffers + void dma_free(); + + // We could reduce to one set of the following calls + //================================================================================================================ + + //-----------------BUFFER Functions----------------- + // Get the MMap address of the input buffer of the dma *Needed to copy data + // to Input_Buffer* + char *dma_get_inbuffer(); + + // Get the MMap address of the output buffer of the dma *Needed to copy data + // from Output_Buffer* + + char *dma_get_outbuffer(); + + //================================================================================================================ + + //-----------------BUFFER Functions----------------- + // Copy data into the Input Buffer (length to write, offset to write to) + // returns 0 if successful + // int dma_copy_to_inbuffer(unsigned int *host_src_address, int data_length, + // int offset); + + // Copy data from the Output Buffer (length to read, offset to read from) + // returns 0 if successful + // int dma_copy_from_outbuffer(unsigned int *host_dst_address, int + // data_length, + // int offset); + + template + int mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset); + + template + int mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset); + + //============================================================================ + + //-----------------DMA MMS2 Functions----------------- + /** + * Checks if input buffer size is >= length + * Sets DMA MMS2 transfer length to length + * Starts transfers to the accelerator using dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_start_send(unsigned int length, unsigned int offset); + + // Blocks thread until dma MMS2 transfer is complete + void dma_wait_send(); + + // Same as dma_send but thread does not block, returns 0 if done + int dma_check_send(); + + //-----------------DMA S2MM Functions----------------- + /** + * Checks if buffer size is >= length + * Sets 2SMM store length + * Starts storing data recieved through dma associated with dma_id + * Return 0 if successful, returns negative if error occurs + */ + int dma_start_recv(unsigned int length, unsigned int offset); + + // Blocks thread until dma S2MM transfer is complete (TLAST signal is seen) + void dma_wait_recv(); + + // Same as dma_recv but thread does not block, returns 0 if done + int dma_check_recv(); + + //********************************** Unexposed Functions + //********************************** + void initDMAControls(); + void dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value); + unsigned int dma_get(unsigned int *dma_virtual_address, int offset); + void dma_mm2s_sync(); + void dma_s2mm_sync(); + void acc_init(unsigned int base_addr, int length); + void dump_acc_signals(int state); +}; + +#endif \ No newline at end of file diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 897fbe7..d85bbf5 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(Misc) add_subdirectory(Dialect) add_subdirectory(Conversion) -add_subdirectory(CAPI) \ No newline at end of file +add_subdirectory(CAPI) +add_subdirectory(ExecutionEngine) \ No newline at end of file diff --git 
a/lib/Conversion/AccelToRuntime/AccelToAXI4MLIR.cpp b/lib/Conversion/AccelToRuntime/AccelToAXI4MLIR.cpp new file mode 100644 index 0000000..1f1af95 --- /dev/null +++ b/lib/Conversion/AccelToRuntime/AccelToAXI4MLIR.cpp @@ -0,0 +1,440 @@ +//===- AccelToAXI4MLIR.cpp - Convert Accel to AXI4MLIR calls --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements lowering of Accel to AXI4MLIR calls +// +//===----------------------------------------------------------------------===// + +#include "soda/Conversion/AccelToRuntime/AccelToAXI4MLIR.h" + +#include "../PassDetail.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "soda/Dialect/Accel/IR/Accel.h" + +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/DialectConversion.h" + +//===----------------------------------------------------------------------===// +// AXI4MLIR Runtime C API declaration. +//===----------------------------------------------------------------------===// +static constexpr const char *kDmaInit = "dma_init"; +static constexpr const char *kDmaFree = "dma_free"; +static constexpr const char *kCopyToInbufferF32 = "copy_to_inbuffer_f32"; +static constexpr const char *kCopyFromOutbufferF32 = "copy_from_outbuffer_f32"; +static constexpr const char *kCopyToInbufferI32 = "copy_to_inbuffer_i32"; +static constexpr const char *kCopyFromOutbufferI32 = "copy_from_outbuffer_i32"; +static constexpr const char *kDmaStartSend = "dma_start_send"; +static constexpr const char *kDmaWaitSend = "dma_wait_send"; +static constexpr const char *kDmaStartRecv = "dma_start_recv"; +static constexpr const char *kDmaWaitRecv = "dma_wait_recv"; + +using namespace mlir; +using namespace mlir::func; + +class InitDMAToAXI4MLIRCall : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(accel::InitDMAOp op, + PatternRewriter &rewriter) const override { + + auto module = SymbolTable::getNearestSymbolTable(op); + + auto name = kDmaInit; + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(module, name)); + // Forward declare function if it hasn't already been + if (!opFunc) { // TODO: Check dma_free + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&module->getRegion(0).front()); + + MLIRContext *ctx = rewriter.getContext(); + Location uLoc = rewriter.getUnknownLoc(); + Type intTy = rewriter.getI32Type(); + FunctionType fType; + + fType = FunctionType::get(ctx, {intTy, intTy, intTy, intTy, intTy}, {}); + rewriter.create(uLoc, name, fType).setPrivate(); + + fType = FunctionType::get(ctx, {}, {}); + rewriter.create(uLoc, kDmaFree, fType).setPrivate(); + } + assert(isa(SymbolTable::lookupSymbolIn(module, name))); + + rewriter.replaceOpWithNewOp(op, name, /*TODO no type?*/ TypeRange(), + op->getOperands()); + // TODO: this may create several DMA frees, but only one is needed + rewriter.setInsertionPoint(op->getBlock()->getTerminator()); + rewriter.create(rewriter.getUnknownLoc(), kDmaFree, + /*TODO no type?*/ TypeRange(), ValueRange()); + + return success(); + } +}; + 
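+// For illustration, a minimal sketch of what InitDMAToAXI4MLIRCall produces
+// (SSA names and the accel op spelling are illustrative, not taken verbatim
+// from the dialect definition):
+//
+//   accel.init_dma %dmaAddr, %inAddr, %inSize, %outAddr, %outSize
+//
+// is rewritten into
+//
+//   call @dma_init(%dmaAddr, %inAddr, %inSize, %outAddr, %outSize)
+//       : (i32, i32, i32, i32, i32) -> ()
+//   ...
+//   call @dma_free() : () -> ()  // inserted right before the block terminator
+//
+// together with private declarations of @dma_init and @dma_free in the module.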
+// Forward declare functions for SendOp +static void fwdDeclareSendFuncs(PatternRewriter &rewriter, Operation *module, + Type intTy, Type mrTy) { + + // TODO: Name has to match memref type + // TODO: This is giving the i32 name but memref may be f32 + auto name = kCopyToInbufferI32; + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(module, name)); + if (!opFunc) { // TODO: check for the other function names + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&module->getRegion(0).front()); + + MLIRContext *ctx = rewriter.getContext(); + Location uLoc = rewriter.getUnknownLoc(); + FunctionType fType; + + fType = FunctionType::get(ctx, {mrTy, intTy}, {intTy}); + rewriter.create(uLoc, name, fType).setPrivate(); + + fType = FunctionType::get(ctx, {intTy, intTy}, {intTy}); + rewriter.create(uLoc, kDmaStartSend, fType).setPrivate(); + + fType = FunctionType::get(ctx, {}, {}); + rewriter.create(uLoc, kDmaWaitSend, fType).setPrivate(); + } + assert(isa(SymbolTable::lookupSymbolIn(module, name))); +} + +// Forward declare functions for RecvOp +static void fwdDeclareRecvFuncs(PatternRewriter &rewriter, Operation *module, + Type intTy, Type mrTy) { + auto name = kCopyFromOutbufferI32; + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(module, name)); + if (!opFunc) { // TODO: check for the other function names + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&module->getRegion(0).front()); + + MLIRContext *ctx = rewriter.getContext(); + Location uLoc = rewriter.getUnknownLoc(); + FunctionType fType; + + fType = FunctionType::get(ctx, {mrTy, intTy}, {intTy}); + rewriter.create(uLoc, name, fType).setPrivate(); + + fType = FunctionType::get(ctx, {intTy, intTy}, {intTy}); + rewriter.create(uLoc, kDmaStartRecv, fType).setPrivate(); + + fType = FunctionType::get(ctx, {}, {}); + rewriter.create(uLoc, kDmaWaitRecv, fType).setPrivate(); + } + assert(isa(SymbolTable::lookupSymbolIn(module, name))); +} + +// Create ops to get number of elements in dynamic sized SubViewOp +static Value getNumElements(PatternRewriter &rewriter, Location loc, + memref::SubViewOp subViewOp, MemRefType inputType, + Type intTy) { + Value nElements; + + SmallVector sizes; + for (unsigned idx = 0; idx < inputType.getRank(); ++idx) { + sizes.push_back(subViewOp.getDynamicSizes()[idx]); + } + + // Create as many arith::MulIOps as needed to calculate # of elements + nElements = sizes[0]; + for (unsigned i = 1; i < inputType.getRank(); ++i) { + nElements = rewriter.create(loc, nElements, sizes[i]); + } + nElements = rewriter.create(loc, intTy, nElements); + return nElements; +} + +class SendToAXI4MLIRCall : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(accel::SendOp op, + PatternRewriter &rewriter) const override { + + auto module = SymbolTable::getNearestSymbolTable(op); + Location loc = op->getLoc(); + + auto name = kCopyToInbufferI32; + + Type intTy = rewriter.getI32Type(); + Value input = op.getInput(); + auto inputType = input.getType().dyn_cast_or_null(); + if (!inputType) + return failure(); + auto myType = inputType.getElementType(); + Type mrTy = UnrankedMemRefType::get(myType, 0); + + fwdDeclareSendFuncs(rewriter, module, intTy, mrTy); + + // TODO: Not sure if getOffestValue is working + auto initOffset = op.getOffsetValue(); + if (!initOffset) { + initOffset = + rewriter.create(loc, IntegerAttr::get(intTy, 0)); + } + + // Send flow: copy, start, wait + Value casted = 
rewriter.create(loc, mrTy, input); + rewriter.create(loc, name, intTy, + SmallVector({casted, initOffset})); + + int bitWidth = inputType.getElementTypeBitWidth(); + + // create a lambda function that uses isDynamicSize(idx) and returns true if + // one of the sizes is dynamic + if (inputType.hasStaticShape()) { + // llvm::errs() << "SendToAXI4MLIRCall: inputType has static shape\n"; + int numElements = inputType.getNumElements(); + // int bytes = numElements * bitWidth / 8; + + Value nElements = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.create(loc, kDmaStartSend, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitSend, TypeRange()); + + Value resultOffset = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.replaceOp(op, {resultOffset}); + } else { + // llvm::errs() << "SendToAXI4MLIRCall: inputType has dynamic shape\n"; + + // First get the number of elements from dynamic sizes + memref::SubViewOp subViewOp = + dyn_cast(input.getDefiningOp()); + if (!subViewOp) { + // llvm::errs() << "SendToAXI4MLIRCall: input is not a subview\n"; + return failure(); + } + Value nElements = + getNumElements(rewriter, loc, subViewOp, inputType, intTy); + + rewriter.create(loc, kDmaStartSend, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitSend, TypeRange()); + + // If many actions are chained, they are placed in order in the DMA, + // thus the offset is the size of the previous action. + Value resultOffset = nElements; + // Value bitWidthV = rewriter.create( + // loc, IntegerAttr::get(intTy, bitWidth)); + // resultOffset = + // rewriter.create(loc, resultOffset, bitWidthV); + // Value eight = + // rewriter.create(loc, IntegerAttr::get(intTy, + // 8)); + // resultOffset = rewriter.create(loc, resultOffset, + // eight); + rewriter.replaceOp(op, {resultOffset}); + } + + return success(); + } +}; + +// Rewrite SendLiteral to a call of kCopyToInbufferI32. +// This could be optimized to transfer the literal directly to the +// DMA buffer instead of going through a temporary memref. 
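+// As a rough sketch (SSA names are illustrative), the lowering of one literal
+// send looks like:
+//   %tmp  = memref.alloc() : memref<i32>       // scratch memref holding the literal
+//   memref.store %opcode, %tmp[] : memref<i32>
+//   %cast = memref.cast %tmp : memref<i32> to memref<*xi32>
+//   %0 = call @copy_to_inbuffer_i32(%cast, %offset) : (memref<*xi32>, i32) -> i32
+//   %1 = call @dma_start_send(%c1, %offset) : (i32, i32) -> i32  // a single element
+//   call @dma_wait_send() : () -> ()
+//   memref.dealloc %tmp : memref<i32>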
+class SendLiteralToAXI4MLIRCall + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(accel::SendLiteralOp op, + PatternRewriter &rewriter) const override { + + auto module = SymbolTable::getNearestSymbolTable(op); + Location loc = op->getLoc(); + + // TODO: Name has to match memref type + auto name = kCopyToInbufferI32; + Type intTy = rewriter.getI32Type(); + Value opcode = op.getOpcode(); + + // Create a memref and store the opcode in it + auto tmpMrTy = MemRefType::get(/*shape*/ {}, rewriter.getIntegerType(32)); + auto input = rewriter.create(loc, tmpMrTy); + rewriter.create(loc, opcode, input, ValueRange()); + + auto inputType = input.getType().dyn_cast_or_null(); + if (!inputType) + return failure(); + auto myType = inputType.getElementType(); + Type mrTy = UnrankedMemRefType::get(myType, 0); + + fwdDeclareSendFuncs(rewriter, module, intTy, mrTy); + + auto initOffset = op.getOffsetValue(); + if (!initOffset) { + initOffset = + rewriter.create(loc, IntegerAttr::get(intTy, 0)); + } + + // Send flow: copy, start, wait + Value casted = rewriter.create(loc, mrTy, input); + rewriter.create(loc, name, intTy, + SmallVector({casted, initOffset})); + + int numElements = inputType.getNumElements(); + int bitWidth = inputType.getElementTypeBitWidth(); + // int bytes = numElements * bitWidth / 8; + + Value nElements = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.create(loc, kDmaStartSend, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitSend, TypeRange()); + + // Free the temporary memref + rewriter.create(loc, input); + + Value resultOffset = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.replaceOp(op, {resultOffset}); + + return success(); + } +}; + +class RecvToAXI4MLIRCall : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(accel::RecvOp op, + PatternRewriter &rewriter) const override { + + auto module = SymbolTable::getNearestSymbolTable(op); + Location loc = op->getLoc(); + + // TODO: Name has to match memref type + auto name = kCopyFromOutbufferI32; + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(module, name)); + + Type intTy = rewriter.getI32Type(); + Value dst = op.getDst(); + auto inputType = dst.getType().dyn_cast_or_null(); + if (!inputType) + return failure(); + auto myType = inputType.getElementType(); + Type mrTy = UnrankedMemRefType::get(myType, 0); + + fwdDeclareRecvFuncs(rewriter, module, intTy, mrTy); + + auto initOffset = op.getOffsetValue(); + if (!initOffset) { + initOffset = + rewriter.create(loc, IntegerAttr::get(intTy, 0)); + } + + Value casted = rewriter.create(loc, mrTy, dst); + int bitWidth = inputType.getElementTypeBitWidth(); + if (inputType.hasStaticShape()) { + // llvm::errs() << "RecvToAXI4MLIRCall: inputType has static shape\n"; + int numElements = inputType.getNumElements(); + // int bytes = numElements * bitWidth / 8; + + Value nElements = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + + // Recv flow: start, wait, copy + rewriter.create(loc, kDmaStartRecv, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitRecv, TypeRange()); + rewriter.create(loc, name, intTy, + SmallVector({casted, initOffset})); + + Value resultOffset = rewriter.create( + loc, IntegerAttr::get(intTy, numElements)); + rewriter.replaceOp(op, {resultOffset}); + + } else { + // llvm::errs() << "RecvToAXI4MLIRCall: inputType 
has dynamic shape\n"; + + // First get the number of elements from dynamic sizes + memref::SubViewOp subViewOp = + dyn_cast(dst.getDefiningOp()); + if (!subViewOp) { + llvm::errs() << "RecvToAXI4MLIRCall: input is not a subview\n"; + return failure(); + } + Value nElements = + getNumElements(rewriter, loc, subViewOp, inputType, intTy); + + rewriter.create(loc, kDmaStartRecv, intTy, + SmallVector({nElements, initOffset})); + rewriter.create(loc, kDmaWaitRecv, TypeRange()); + rewriter.create(loc, name, intTy, + SmallVector({casted, initOffset})); + + // If many actions are chained, they are placed in order in the DMA, + // thus the offset is the size of the previous action. + Value resultOffset = nElements; + // Value bitWidthV = rewriter.create( + // loc, IntegerAttr::get(intTy, bitWidth)); + // resultOffset = + // rewriter.create(loc, resultOffset, bitWidthV); + // Value eight = + // rewriter.create(loc, IntegerAttr::get(intTy, + // 8)); + // resultOffset = rewriter.create(loc, resultOffset, + // eight); + rewriter.replaceOp(op, {resultOffset}); + } + + return success(); + } +}; + +void mlir::populateAccelToAXI4MLIRConversionPatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); +} + +namespace { +struct ConvertAccelToAXI4MLIRPass + : public ConvertAccelToAXI4MLIRBase { + void runOnOperation() override; +}; +} // namespace + +void ConvertAccelToAXI4MLIRPass::runOnOperation() { + auto module = getOperation(); + + RewritePatternSet patterns(&getContext()); + populateAccelToAXI4MLIRConversionPatterns(patterns); + + ConversionTarget target(getContext()); + // clang-format off + target.addLegalDialect(); + // clang-format on + target.addIllegalDialect(); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); +} + +std::unique_ptr> +mlir::createConvertAccelToAXI4MLIRPass() { + return std::make_unique(); +} diff --git a/lib/Conversion/AccelToRuntime/CMakeLists.txt b/lib/Conversion/AccelToRuntime/CMakeLists.txt new file mode 100644 index 0000000..6270ff6 --- /dev/null +++ b/lib/Conversion/AccelToRuntime/CMakeLists.txt @@ -0,0 +1,11 @@ +add_mlir_dialect_library(SODAAccelToRuntime + AccelToAXI4MLIR.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJ_INCLUDE_DIR}/soda/Conversion/AccelToRuntime + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass +) + \ No newline at end of file diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index b90a4e0..f445062 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -1,2 +1,4 @@ add_subdirectory(KernelsToSODA) -add_subdirectory(CustomFuncToLLVM) \ No newline at end of file +add_subdirectory(CustomFuncToLLVM) +add_subdirectory(AccelToRuntime) +add_subdirectory(LinalgToAccel) diff --git a/lib/Conversion/LinalgToAccel/AXI4MLIRUtils.cpp b/lib/Conversion/LinalgToAccel/AXI4MLIRUtils.cpp new file mode 100644 index 0000000..3c33397 --- /dev/null +++ b/lib/Conversion/LinalgToAccel/AXI4MLIRUtils.cpp @@ -0,0 +1,349 @@ +//===- AXI4MLIRUtils.cpp - Shared functions during conversions --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "soda/Conversion/LinalgToAccel/AXI4MLIRUtils.h" + +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" + +using namespace mlir; +using namespace mlir::linalg; +using namespace mlir::func; + +const StringLiteral kLinalgTransformMarker = "__internal_linalg_transform__"; + +struct LinalgOpChangeFilterPattern + : public OpInterfaceRewritePattern { + LinalgOpChangeFilterPattern( + MLIRContext *context, + LinalgTransformationFilter f = LinalgTransformationFilter(), + PatternBenefit benefit = 1) + : OpInterfaceRewritePattern(context, benefit), + filter(std::move(f)) {} + + LinalgOpChangeFilterPattern( + StringRef opName, MLIRContext *context, + LinalgTransformationFilter f = LinalgTransformationFilter(), + PatternBenefit benefit = 1) + : OpInterfaceRewritePattern(context, benefit), + filter(f.addOpNameFilter(opName)) {} + + LogicalResult matchAndRewrite(LinalgOp op, + PatternRewriter &rewriter) const override { + if (failed(filter.checkAndNotify(rewriter, op))) + return failure(); + rewriter.startRootUpdate(op); + filter.replaceLinalgTransformationFilter(rewriter, op); + rewriter.finalizeRootUpdate(op); + return success(); + } + +private: + /// LinalgTransformMarker handles special attribute manipulations. + LinalgTransformationFilter filter; +}; + +static void addTilingPatternToSet(RewritePatternSet &patterns, MLIRContext *ctx, + const StringRef &srcAttrName, + const StringRef &dstAttrName, + const SmallVector &tileSizes) { + + // create SmallVector of int64_t from tileSizes + SmallVector tileSizesInt64; + for (auto ts : tileSizes) { + tileSizesInt64.push_back(ts); + } + // create a ArrayRef from tileSizes + ArrayRef tileSizesRef(tileSizesInt64); + + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes(tileSizesRef), + LinalgTransformationFilter(StringAttr::get(ctx, srcAttrName), + StringAttr::get(ctx, dstAttrName))); +} + +static void addTilingPatternToSet(RewritePatternSet &patterns, MLIRContext *ctx, + const StringRef &srcAttrName, + const StringRef &dstAttrName, + const unsigned &tsd0, const unsigned &tsd1, + const unsigned &tsd2) { + addTilingPatternToSet(patterns, ctx, srcAttrName, dstAttrName, + SmallVector{tsd0, tsd1, tsd2}); +} + +void mlir::populateCommonLinalgTransformationPatterns( + RewritePatternSet &patterns, const AccelTransformationOptions &options) { + MLIRContext *ctx = patterns.getContext(); + + // Triggers on operations with kLinagTransformMarker set to "GENERALIZE" + patterns.add( + ctx, LinalgTransformationFilter(StringAttr::get(ctx, "GENERALIZE"), + StringAttr::get(ctx, "ANNOTATE"))); + + // ANNOTATE to INTERCHANGE is performed by custom pattern + + // Perform loop interchange with GenericOpInterchangePattern + // This only correctly interchanges loops for GenericOps, thus + // generalization must be done prior to this step. 
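+  // Note on the cache-level tiling further below: options.tileSizes is
+  // consumed in groups of three, one loop tile-size triple per cache level.
+  // For example (illustrative values), with numberOfCaches = 2 and
+  // tileSizes = {64, 64, 64, 8, 8, 8} the first triple tiles MEM -> L2 and
+  // the second tiles L2 -> L1; with numberOfCaches = 3 the first triple is
+  // used for MEM -> L3 instead, followed by L3 -> L2 and L2 -> L1.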
+ if (options.loopPermutation.size() > 0) { + patterns.add( + ctx, options.loopPermutation, + LinalgTransformationFilter(StringAttr::get(ctx, "INTERCHANGE"), + StringAttr::get(ctx, "MEM"))); + } else { + // Simply add a pattern to change the attribute + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTransformationFilter(StringAttr::get(ctx, "INTERCHANGE"), + StringAttr::get(ctx, "MEM"))); + } + + // z7020 ARM A9 core specs + // L1: 32KB 4-way set-associative (instruction and data caches independent + // for each CPU) + // L2: 512KB 8-way set-associative (shared between CPUs) + + // Pynq-z2 + // z7020 chip + // 512MB DDR3 with 16-bit bus @ 1050Mbps + + // Pynq-z2 + // z7020 chip + // 512 Mbyte DDR3 + + // M N K ELEMSize Total bytes Total KB + // 1,024 1,024 1,024 4 12,582,912 12,288.00 + // 512 512 512 4 3,145,728 3,072.00 + // 256 256 256 4 786,432 768.00 + // 128 128 128 4 196,608 192.00 + // 64 64 64 4 49,152 48.00 + // 32 32 32 4 12,288 12.00 + // 16 16 16 4 3,072 3.00 + // 8 8 8 4 768 0.75 + // 4 4 4 4 192 0.19 + // 2 2 2 4 48 0.05 + + if (options.tileSizes.size() > 0) { + unsigned tileIdx = 0; + + if (options.numberOfCaches == 3) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L3", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L3", "L2", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L2", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + if (options.numberOfCaches == 2) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L2", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L2", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + if (options.numberOfCaches == 1) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + } else { + // No tile sizes provided: simply add a pattern to change the attribute + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTransformationFilter(StringAttr::get(ctx, "MEM"), + StringAttr::get(ctx, "L1"))); + } + + // At this point relevant operations will have the L1 marker + // Only accelerator tiling is missing + if (options.accelSizes.size() > 0) { + // TODO: Pass in the accel sizes as an ArrayRef + assert(options.accelSizes.size() == 3 && "please provide 3 tile sizes"); + + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes({options.accelSizes[0], + options.accelSizes[1], + options.accelSizes[2]}), + LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + } else { + if (options.accelSize > 1) { + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes( + {options.accelSize, options.accelSize, options.accelSize}), + LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + + } else { + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes({4, 4, 4}), + LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + } 
+ } +} + +/// Apply tiling patterns to GenericOps with the correct attribute +void mlir::applyPatterns(FuncOp funcOp, + const AccelTransformationOptions &options) { + MLIRContext *ctx = funcOp.getContext(); + RewritePatternSet patterns(ctx); + + // Triggers on operations with kLinagTransformMarker set to "GENERALIZE" + patterns.add( + ctx, LinalgTransformationFilter(StringAttr::get(ctx, "GENERALIZE"), + StringAttr::get(ctx, "INTERCHANGE"))); + + // Perform loop interchange with GenericOpInterchangePattern + // This only correctly interchanges loops for GenericOps, thus + // generalization must be done prior to this step. + if (options.loopPermutation.size() > 0) { + patterns.add( + ctx, options.loopPermutation, + LinalgTransformationFilter(StringAttr::get(ctx, "INTERCHANGE"), + StringAttr::get(ctx, "MEM"))); + } else { + // add pattern to change attribute + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTransformationFilter(StringAttr::get(ctx, "INTERCHANGE"), + StringAttr::get(ctx, "MEM"))); + } + + // z7020 ARM A9 core specs + // L1: 32KB 4-way set-associative (instruction and data caches independent + // for each CPU) + // L2: 512KB 8-way set-associative (shared between CPUs) + + // Pynq-z2 + // z7020 chip + // 512MB DDR3 with 16-bit bus @ 1050Mbps + + // Pynq-z2 + // z7020 chip + // 512 Mbyte DDR3 + + // M N K ELEMSize Total bytes Total KB + // 1,024 1,024 1,024 4 12,582,912 12,288.00 + // 512 512 512 4 3,145,728 3,072.00 + // 256 256 256 4 786,432 768.00 + // 128 128 128 4 196,608 192.00 + // 64 64 64 4 49,152 48.00 + // 32 32 32 4 12,288 12.00 + // 16 16 16 4 3,072 3.00 + // 8 8 8 4 768 0.75 + // 4 4 4 4 192 0.19 + // 2 2 2 4 48 0.05 + + if (options.tileSizes.size() > 0) { + unsigned tileIdx = 0; + + if (options.numberOfCaches == 3) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L3", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L3", "L2", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L2", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + if (options.numberOfCaches == 2) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L2", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + + addTilingPatternToSet( + patterns, ctx, "L2", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + if (options.numberOfCaches == 1) { + addTilingPatternToSet( + patterns, ctx, "MEM", "L1", options.tileSizes[tileIdx + 0], + options.tileSizes[tileIdx + 1], options.tileSizes[tileIdx + 2]); + tileIdx += 3; + } + + } else { + // If no tile sizes were selected + addTilingPatternToSet(patterns, ctx, "MEM", "L1", 4096, 4096, 4096); + } + + // At this point relevant operations will have the L1 marker + // Only accelerator tiling is missing + if (options.accelSize > 1) { + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes( + {options.accelSize, options.accelSize, options.accelSize}), + LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + + } else { + patterns.add( + GenericOp::getOperationName(), ctx, + LinalgTilingOptions().setTileSizes({4, 4, 4}), + 
LinalgTransformationFilter(StringAttr::get(ctx, "L1"), + StringAttr::get(ctx, "GENACCEL"))); + } + + (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns)); +} + +void AccelTransformationOptions::dump() const { + llvm::errs() << "accelSize: " << accelSize << "\n" + << "dmaAddress\t\t " << dmaAddress << "\n" + << "dmaInputAddress\t\t " << dmaInputAddress << "\n" + << "dmaInputBufferSize\t " << dmaInputBufferSize << "\n" + << "dmaOutputAddress\t " << dmaOutputAddress << "\n" + << "dmaOutputBufferSize\t " << dmaOutputBufferSize << "\n" + << "flowCpuAcc\t\t " << flowCpuAcc << "\n" + << "numberOfCaches\t\t " << numberOfCaches + << "\n" + // << "cacheSizes\t\t " << cacheSizes << "\n" + // << "tileSizes\t\t " << tileSizes << "\n" + << "elementSize\t\t " << elementSize + << "\n" + // << "loopPermutation\t\t " << loopPermutation << "\n" + << "anchorFuncName\t\t " << anchorFuncName << "\n" + << "anchorOpName\t\t " << anchorOpName << "\n" + << "opcodeMap\t\t " << opcodeMap << "\n" + << "initFlow\t\t " << initFlow << "\n" + << "opcodeFlow\t\t " << opcodeFlow << "\n"; +} \ No newline at end of file diff --git a/lib/Conversion/LinalgToAccel/CMakeLists.txt b/lib/Conversion/LinalgToAccel/CMakeLists.txt new file mode 100644 index 0000000..a92c5e2 --- /dev/null +++ b/lib/Conversion/LinalgToAccel/CMakeLists.txt @@ -0,0 +1,19 @@ +add_mlir_conversion_library(SODALinalgToAccel + LinalgGenericToAccel.cpp + AXI4MLIRUtils.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJ_INCLUDE_DIR}/soda/Conversion/LinalgToAccel + + DEPENDS + SODAConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRArithDialect + SODAAccelDialect + MLIRMemRefDialect + MLIRTransforms + ) diff --git a/lib/Conversion/LinalgToAccel/LinalgGenericToAccel.cpp b/lib/Conversion/LinalgToAccel/LinalgGenericToAccel.cpp new file mode 100644 index 0000000..e83a2b1 --- /dev/null +++ b/lib/Conversion/LinalgToAccel/LinalgGenericToAccel.cpp @@ -0,0 +1,1034 @@ +//===- LinalgGenericToAccel.cpp - Generic to accel conversions --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements from linalg generic to accel calls +// +//===----------------------------------------------------------------------===// + +#include "soda/Conversion/LinalgToAccel/LinalgGenericToAccel.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" + +#include "../PassDetail.h" + +#include "soda/Dialect/Accel/IR/Accel.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" + +#include "mlir/IR/OpcodeExpr.h" + +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Parser.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +const StringLiteral kLinalgTransformMarker = "__internal_linalg_transform__"; +const StringLiteral kAccelTransformMarker = "__accel_transform__"; +const StringLiteral kAccel_dmaAddress = "accel_dmaAddress"; +const StringLiteral kAccel_dmaInputAddress = "accel_dmaInputAddress"; +const StringLiteral kAccel_dmaInputBufferSize = "accel_dmaInputBufferSize"; +const StringLiteral kAccel_dmaOuputAddress = "accel_dmaOutputAddress"; +const StringLiteral kAccel_dmaOuputBufferSize = "accel_dmaOutputBufferSize"; +const StringLiteral kAccel_acc_on_cpu = "accel_acc_on_cpu"; +const StringLiteral kAccel_accumulate_on_cpu = "accel_accumulate_on_cpu"; +const StringLiteral kAccel_opcode_map = "accel_opcode_map"; +const StringLiteral kAccel_opcode_map_str = "accel_opcode_map_str"; +const StringLiteral kAccel_opcode_flow = "accel_opcode_flow"; +const StringLiteral kAccel_opcode_flow_str = "accel_opcode_flow_str"; +const StringLiteral kAccel_loop_permutation = "accel_loop_permutation"; +const StringLiteral kAccel_accel_tile_size = "accel_accel_tile_size"; +const StringLiteral kAccel_accel_tile_sizes = "accel_accel_tile_sizes"; +const StringLiteral kAccel_tile_sizes = "accel_tile_sizes"; +const StringLiteral kAccel_init_flow = "accel_init_flow"; +const StringLiteral kAccel_init_flow_str = "accel_init_flow_str"; + +IntegerAttr getU32IntegerAttr(PatternRewriter &rewriter, unsigned value) { + return rewriter.getIntegerAttr(rewriter.getIntegerType(32, false), value); +} + +/// Remove quotes from string to prevent parser from treating it as string. +static StringRef prepStringOption(std::string &s, const char delim = '\"') { + // NOTE: There is an inconsistent bug with + // StringRef::drop_front(),drop_back(),consume_front(),consume_back() + // It likely does not update the size every time. + // NOTE: Input &s must be live after this function call. Passing by copy + // also does not work. + // return StringRef(s).consume_front(delim).consume_back(delim); + + if (s[s.length() - 1] == delim) + s.erase(s.end() - ((s.length() > 0) ? 1 : 0), s.end()); + if (s[0] == delim) + s.erase(s.begin()); + + return StringRef(s); +} + +/// Sets operation Attrs used in generic to accel conversion +class GenericAttrAnnotationPattern + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + /// Construct a generic pattern applied to all GenericOp that verify `filter`. + /// If attributes are already annotated, skip the replacement. 
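+  /// As an illustration (values are placeholders; real ones come from
+  /// `options` or from attributes already present on the op), a matched
+  /// linalg.generic ends up annotated roughly as:
+  ///   accel_dmaAddress = 0 : i32, accel_dmaInputBufferSize = 100000 : i32,
+  ///   accel_opcode_map_str = "...", accel_opcode_flow_str = "...",
+  ///   accel_init_flow_str = "...", accel_tile_sizes = [...],
+  ///   accel_accel_tile_size = 4 : i32, accel_acc_on_cpu = false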
+ GenericAttrAnnotationPattern( + MLIRContext *context, + linalg::LinalgTransformationFilter f = + linalg::LinalgTransformationFilter(), + AccelTransformationOptions options = AccelTransformationOptions(), + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), filter(f), + options(std::move(options)) {} + + LogicalResult matchAndRewrite(linalg::GenericOp op, + PatternRewriter &rewriter) const override { + return returningMatchAndRewrite(op, rewriter); + } + + /// Check if the attribute attrName is already set, if not, use a lambda + /// function to set it. + template + static void setAttrIfEmpty(Operation *op, StringRef attrName, Func lambda) { + if (!op->getAttr(attrName)) { + lambda(); + } + } + + LogicalResult returningMatchAndRewrite(linalg::GenericOp op, + PatternRewriter &rewriter) const { + if (failed(filter.checkAndNotify(rewriter, op))) + return failure(); + rewriter.startRootUpdate(op); + + // DMA Attributes + setAttrIfEmpty(op, kAccel_dmaAddress, [&]() { + op->setAttr(kAccel_dmaAddress, + rewriter.getI32IntegerAttr(options.dmaAddress)); + }); + setAttrIfEmpty(op, kAccel_dmaInputAddress, [&]() { + op->setAttr(kAccel_dmaInputAddress, + rewriter.getI32IntegerAttr(options.dmaInputAddress)); + }); + setAttrIfEmpty(op, kAccel_dmaInputBufferSize, [&]() { + op->setAttr(kAccel_dmaInputBufferSize, + rewriter.getI32IntegerAttr(options.dmaInputBufferSize)); + }); + setAttrIfEmpty(op, kAccel_dmaOuputAddress, [&]() { + op->setAttr(kAccel_dmaOuputAddress, + rewriter.getI32IntegerAttr(options.dmaOutputAddress)); + }); + setAttrIfEmpty(op, kAccel_dmaOuputBufferSize, [&]() { + op->setAttr(kAccel_dmaOuputBufferSize, + rewriter.getI32IntegerAttr(options.dmaOutputBufferSize)); + }); + setAttrIfEmpty(op, kAccel_acc_on_cpu, [&]() { + op->setAttr(kAccel_acc_on_cpu, rewriter.getBoolAttr(options.flowCpuAcc)); + }); + + // OpcodeMap Attribute + // as string + std::string s0 = options.opcodeMap; + StringRef opcodeMapStr = prepStringOption(s0); + if (opcodeMapStr == "" && !op->getAttr(kAccel_opcode_map_str)) { + op->emitWarning("No opcode map attribute found, skipping"); + filter.replaceLinalgTransformationFilter(rewriter, op); + rewriter.finalizeRootUpdate(op); + return success(); + } + setAttrIfEmpty(op, kAccel_opcode_map_str, [&]() { + op->setAttr(kAccel_opcode_map_str, rewriter.getStringAttr(opcodeMapStr)); + }); + // as attribute + setAttrIfEmpty(op, kAccel_opcode_map, [&]() { + OpcodeMapAttr opcodeMapAttr = + parseAttribute( + op->getAttrOfType(kAccel_opcode_map_str).getValue(), + rewriter.getContext()) + .dyn_cast(); + op->setAttr(kAccel_opcode_map, opcodeMapAttr); + }); + + // OpcodeFlow Attribute + // as string + std::string s1 = options.opcodeFlow; + StringRef opcodeFlowStr = prepStringOption(s1); + setAttrIfEmpty(op, kAccel_opcode_flow_str, [&]() { + op->setAttr(kAccel_opcode_flow_str, + rewriter.getStringAttr(opcodeFlowStr)); + }); + // as attribute + // TODO: handle kAccel_opcode_flow, parse string to validate identifiers + + // InitFlow Attribute + // as string + std::string s2 = options.initFlow; + StringRef initFlowStr = prepStringOption(s2); + setAttrIfEmpty(op, kAccel_init_flow_str, [&]() { + op->setAttr(kAccel_init_flow_str, rewriter.getStringAttr(initFlowStr)); + }); + // as attribute + // TODO: handle kAccel_init_flow, parse string to validate identifiers + + // Create a lambda function for ArrayRef options + auto getArrayAttr = [&](const ArrayRef &inArray) -> ArrayAttr { + SmallVector tmpArray; + for (auto v : inArray) + tmpArray.push_back(rewriter.getI32IntegerAttr(v)); 
+ return rewriter.getArrayAttr(tmpArray); + }; + + // Attributes for tilling and permutation + // TODO: currently the attribute is set correctly but the rewriter pass uses + // what is inside the command line options + + // LoopPermutation Attribute + setAttrIfEmpty(op, kAccel_loop_permutation, [&]() { + op->setAttr(kAccel_loop_permutation, + getArrayAttr(options.loopPermutation)); + }); + + // AccelSizes Attribute + setAttrIfEmpty(op, kAccel_accel_tile_sizes, [&]() { + op->setAttr(kAccel_accel_tile_sizes, getArrayAttr(options.accelSizes)); + }); + + // LoopTiling Attribute + setAttrIfEmpty(op, kAccel_tile_sizes, [&]() { + op->setAttr(kAccel_tile_sizes, getArrayAttr(options.tileSizes)); + }); + + // Accelerator Tile Size Attribute + setAttrIfEmpty(op, kAccel_accel_tile_size, [&]() { + op->setAttr(kAccel_accel_tile_size, + rewriter.getI32IntegerAttr(options.accelSize)); + }); + + // List of operand ids to accumulate on cpu + setAttrIfEmpty(op, kAccel_accumulate_on_cpu, [&]() { + op->setAttr(kAccel_accumulate_on_cpu, getArrayAttr(options.accOnCpu)); + }); + + filter.replaceLinalgTransformationFilter(rewriter, op); + rewriter.finalizeRootUpdate(op); + return success(); + } + +private: + /// LinalgTransformMarker handles special attribute manipulations. + linalg::LinalgTransformationFilter filter; + /// Options for accel transformation + AccelTransformationOptions options; +}; + +/// Function to materialize DMA attributes as constants +static void materializeDMAConstants(PatternRewriter &rewriter, Operation *op, + Location loc, + SmallVector &values) { + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaAddress))); + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaInputAddress))); + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaInputBufferSize))); + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaOuputAddress))); + values.push_back(rewriter.create( + loc, op->getAttrOfType(kAccel_dmaOuputBufferSize))); +} + +/// Rewrites GenericOp as a series of of accel. +/// Expects the correct attributes to be already set as it +/// does not use options flags and instead, reads the op attributes. +/// TODO: Let this be the case for accelerators with no OPCODES +class LinalgGenericToAccel : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + // Create a function that depending on an integer, adds a value to the + // correct loop body in a nested loop structure. 
+ // ex: if loop_offset = 0, + // then add to the innermost loop body, before `op` + // if loop_offset = 1, + // then add to the second innermost loop body, before terminator the + // `op` + // if loop_offset = 2, + // then add to the third innermost loop body, after the `op` + // if loop_offset = -1, + // then add to the second innermost loop body, before `op` + // + template + static void addOperationToLoopBody(PatternRewriter &rewriter, Location loc, + Operation *op, int loop_offset, + Func lambda) { + + // if loop_offset = 0, then add to the innermost loop body + if (loop_offset == 0) { + // Set insertion point before the operation + // op->emitWarning() << "Offset is 0, calling lambda"; + rewriter.setInsertionPoint(op); + lambda(); + return; + } + + // Get the parent loop operation + scf::ForOp parent_loop_op = op->getParentOfType(); + assert( + parent_loop_op && + "Accessing parent scf::ForOp, but a parent scf::ForOp was not found."); + + switch (loop_offset) { + case -1: { + // op->emitWarning() << "Offset is -1, calling lambda"; + if (parent_loop_op) { + // Set insertion point right before the scf::ForOp + rewriter.setInsertionPoint(parent_loop_op); + } + lambda(); + break; + } + case 1: { + if (parent_loop_op) { + // op->emitWarning() << "Offset is 1, calling lambda"; + // Set insertion point before the terminator of parent loop operation + rewriter.setInsertionPoint(parent_loop_op->getBlock()->getTerminator()); + } + lambda(); + break; + } + default: { + // if not -1, 0, 1, we have to recursively call this function with parent + // loop operation as the operation and loop_offset -1 if positive, or +1 + // if negative + addOperationToLoopBody( + rewriter, loc, parent_loop_op, + loop_offset > 0 ? loop_offset - 1 : loop_offset + 1, lambda); + } + } + return; + } + + // Function to parse accel_opcode_flow_str and generate a vector of where each + // operation should be placed + // The attribute opcode flow string has the following format: + // opcode_flow ::= opcode_flow_expr + // opcode_flow_expr ::= `(` opcode_flow_expr `)` + // | `(` opcode_flow_expr opcode_id `)` + // | `(` opcode_id `opcode_flow_expr )` + // | opcode_id + // + + // Examples and outputs: + // accel_opcode_flow_str = "(s0 (s1 s2 r2))" + // [(-1,[s0]), (0,[s1,s2,r2])] + // + // accel_opcode_flow_str = "(s0 (s1 s2) r2)" + // [(-1,[s0]), (0,[s1,s2]), (1,[r2])] + // + // accel_opcode_flow_str = "((s0 s1 s2) r2)" + // [(0,[s0,s1,s2 ]), (1,[r2])] + static LogicalResult parseOpcodeFlowStr( + Operation *op, SmallVectorImpl &loop_offsets, + SmallVectorImpl &opcodes_strs, + SmallVectorImpl> &lists_of_opcode_ids) { + // op->emitWarning() << "Parsing opcode flow str"; + std::string opcode_flow_str = + op->getAttrOfType(kAccel_opcode_flow_str).str(); + // op->emitWarning() << opcode_flow_str; + + assert(!opcode_flow_str.empty() && + "accel_opcode_flow_str is empty, but it should not be."); + + int n_left_paren = 0; + int n_right_paren = 0; + for (char c : opcode_flow_str) { + if (c == '(') + n_left_paren++; + if (c == ')') + n_right_paren++; + } + assert(n_left_paren == n_right_paren && + "accel_opcode_flow_str has mismatched parentheses"); + + // get substring between parentheses + int c_paren = 0; + for (size_t i = 0; i < opcode_flow_str.size(); i++) { + if (opcode_flow_str[i] == '(' || opcode_flow_str[i] == ')') { + size_t j = i + 1; + while ((opcode_flow_str[j] != ')') && (opcode_flow_str[j] != '(')) { + j++; + } + if (opcode_flow_str[i] == '(' || opcode_flow_str[i] == ')') { + c_paren++; + + // Only print if still inside 
parentheses + if (c_paren < n_left_paren + n_right_paren) { + // if (j != opcode_flow_str.size()) { + std::string substring = opcode_flow_str.substr(i + 1, j - i - 1); + + // Only push back if the substring is not empty + if (!substring.empty()) { + loop_offsets.push_back(c_paren - n_left_paren); + opcodes_strs.push_back(substring); + // op->emitWarning() << substring << " " << c_paren - + // n_left_paren; + } + } + } + } + } + + // The strings in opcodes_strs represent a string of multiple opcodes + // separated by spaces. We need to split them into individual opcodes. + for (auto &&opcode_str : opcodes_strs) { + SmallVector splitted_opcodes; + StringRef opcode_id_sr = opcode_str; + // First trim leading and trailing spaces + opcode_id_sr = opcode_id_sr.trim(); + // Finally split the string into individual opcodes + opcode_id_sr.split(splitted_opcodes, " "); + + // push back the vector of opcode ids + lists_of_opcode_ids.push_back(splitted_opcodes); + + // print the opcodes + // for (auto &&opcode_id_split : splitted_opcodes) { + // op->emitWarning() << "Opcode id: " << opcode_id_split<< "!"; + // } + } + + assert(loop_offsets.size() == lists_of_opcode_ids.size() && + "loop_offsets and lists_of_opcode_ids have different sizes"); + + // Print the loop offsets and opcode ids + // op->emitWarning() << "Opcode flow str parsed successfully!" + // << "\n\tloop_offsets: " << loop_offsets + // << "\n\topcodes_strs: " << opcodes_strs + // << "\n\tlists_of_opcode_ids_size: " << + // lists_of_opcode_ids.size() + // << "\n\tlists_of_opcode_ids: " << lists_of_opcode_ids; + + return success(); + } + + static void printOpcodesInMap( + Operation *op, SmallVectorImpl &loop_offsets, + SmallVectorImpl &opcodes_strs, + SmallVectorImpl> &lists_of_opcode_ids) { + + // Get the opcodeMap from operation + auto opcodeMap = + op->getAttrOfType(kAccel_opcode_map).getValue(); + llvm::errs() << "OpcodeMap: " << opcodeMap << "\n"; + op->emitWarning() << "Number of opcodes in the map: " + << opcodeMap.getNumOpcodes() << "!"; + + // Print value associated with opcode in the opcodeMap attribute + // Use OpcodeList OpcodeMap::getOpcodeList(StringRef key) + for (auto &&list_of_opcode_ids : lists_of_opcode_ids) { + for (auto &&opcode_id : list_of_opcode_ids) { + // Print id and position of opcode in the map + op->emitWarning() << "Opcode id: " << opcode_id << " at position " + << opcodeMap.getOpcodeListPosition(opcode_id) << "!"; + assert(opcodeMap.getOpcodeListPosition(opcode_id) != -1 && + "Opcode id not found in the map!"); + OpcodeList opcodeList = opcodeMap.getOpcodeList(opcode_id); + // Print number of opcodes in the list + op->emitWarning() << "Number of opcodes in the list: " + << opcodeList.getNumActions() << "!"; + // Print id and dump of each opcode in the list + llvm::errs() << "Opcode id: " << opcode_id << " " + << "OpcodeListDump: " << opcodeList << "\n"; + + for (auto &&action : opcodeList.getActions()) { + // Switch case on the kind of action + switch (action.getKind()) { + case OpcodeExprKind::Send: { + auto id = action.cast().getId(); + llvm::errs() << "Send action. " + << "id: " << id << "\n"; + break; + } + case OpcodeExprKind::Recv: { + llvm::errs() << "Recv action. "; + break; + } + case OpcodeExprKind::SendLiteral: { + llvm::errs() << "SendLiteral action. "; + break; + } + case OpcodeExprKind::SendDim: { + llvm::errs() << "SendDim action. "; + break; + } + case OpcodeExprKind::SendIdx: { + llvm::errs() << "SendIdx action. 
"; + break; + } + default: { + llvm_unreachable("Unknown action."); + } + } + llvm::errs() << "action dump: " << action << "\n"; + } + } + } + } + + /// Add accel.send and accel.recv operations to the function based on the + /// loop_offsets and lists_of_opcode_ids paired with the opcodeMap attribute. + static void + addAccelOps(Operation *op, PatternRewriter &rewriter, + SmallVectorImpl &loop_offsets, + SmallVectorImpl> &lists_of_opcode_ids) { + + Location loc = op->getLoc(); + // op->emitWarning() << "Adding accel.send and accel.recv operations..."; + auto opcodeMap = + op->getAttrOfType(kAccel_opcode_map).getValue(); + // llvm::errs() << "OpcodeMap: " << opcodeMap << "\n"; + // op->emitWarning() << "Number of opcodes in the map: " + // << opcodeMap.getNumOpcodes() << "!"; + + std::vector>> zipped; + std::transform(loop_offsets.begin(), loop_offsets.end(), + lists_of_opcode_ids.begin(), std::back_inserter(zipped), + [](int a, SmallVector b) { + return std::make_pair(a, b); + }); + + for (auto &&pair : zipped) { + int loop_offset = pair.first; + SmallVector list_of_opcode_ids = pair.second; + for (auto &&opcode_id : list_of_opcode_ids) { + // Print id and position of opcode in the map + // op->emitWarning() << "Opcode id: " << opcode_id + // << " in map at position " + // << opcodeMap.getOpcodeListPosition(opcode_id) << + // "!"; + assert(opcodeMap.getOpcodeListPosition(opcode_id) != -1 && + "Opcode id not found in the map!"); + OpcodeList opcodeList = opcodeMap.getOpcodeList(opcode_id); + // Print number of opcodes in the list + // op->emitWarning() << "Number of opcodes in the list: " + // << opcodeList.getNumActions() << "!"; + // Print id and dump of each opcode in the list + // llvm::errs() << "Opcode id: " << opcode_id << " " + // << "OpcodeListDump: " << opcodeList << "\n"; + + Value initialOffset = nullptr; + + addOperationToLoopBody(rewriter, op->getLoc(), op, loop_offset, [&]() { + // Create the value to track the offset of the data + Value cteZero = rewriter.create( + loc, IntegerAttr::get(rewriter.getI32Type(), 0)); + initialOffset = cteZero; + }); + + // Insert the actions in the IR + for (auto &&action : opcodeList.getActions()) { + // Switch case on the kind of action + switch (action.getKind()) { + case OpcodeExprKind::Send: { + auto id = action.cast().getId(); + + Value operand = op->getOperands()[id]; + addOperationToLoopBody( + rewriter, op->getLoc(), op, loop_offset, [&]() { + // Operand is a subview of the original memref, we need to + // move this subview to correct loop_offset. We do this by + // creating a new memref.subview with the same input + // parameters. And replacing the operand with this new + // subview. + auto subViewOp = operand.getDefiningOp(); + if (!subViewOp) { + // Simply create a send operation with the operand + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), operand, initialOffset); + return; + } + + // // TODO: Check if subview has been replaced + // // Only create the replacement if the subview has not been + // // moved yet. To verify this, check if the parent of the + // // subview is the same as the parent of op. 
+ // if (subViewOp->getParentOp() == op->getParentOp()) { + // op->emitWarning() << "Subview has already been moved!"; + // initialOffset = rewriter.create( + // loc, rewriter.getI32Type(), subViewOp, + // initialOffset); + // } else { + // op->emitError() << "Subview has not been moved yet!"; + // return; + // } + + // Value newSubView = rewriter.create( + // loc, subViewOp.getType(), subViewOp.source(), + // subViewOp.static_offsets(), subViewOp.static_sizes(), + // subViewOp.static_strides()); + Value newSubView = rewriter.create( + loc, subViewOp.getType(), subViewOp.source(), + subViewOp.offsets(), subViewOp.sizes(), + subViewOp.strides(), subViewOp.static_offsets(), + subViewOp.static_sizes(), subViewOp.static_strides()); + + // Iterate on the operands, get defining op, if it is a + // constantop then move it before the newsubview + for (auto &&operand : subViewOp.getOperands()) { + Operation *defOp = operand.getDefiningOp(); + if (defOp && isa(defOp)) { + defOp->moveBefore(newSubView.getDefiningOp()); + } + } + rewriter.replaceOp(subViewOp, newSubView); + + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), newSubView, initialOffset); + }); + break; + } + case OpcodeExprKind::Recv: { + auto id = action.cast().getId(); + + Value operand = op->getOperands()[id]; + addOperationToLoopBody( + rewriter, op->getLoc(), op, loop_offset, [&]() { + auto subViewOp = operand.getDefiningOp(); + if (!subViewOp) { + // Simply create a Recv operation with the operand + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), operand, initialOffset); + return; + } + + // TODO: Check if subview has been replaced + + Value newSubView = rewriter.create( + loc, subViewOp.getType(), subViewOp.source(), + subViewOp.offsets(), subViewOp.sizes(), + subViewOp.strides(), subViewOp.static_offsets(), + subViewOp.static_sizes(), subViewOp.static_strides()); + + for (auto &&operand : subViewOp.getOperands()) { + Operation *defOp = operand.getDefiningOp(); + if (defOp && isa(defOp)) { + defOp->moveBefore(newSubView.getDefiningOp()); + } + } + rewriter.replaceOp(subViewOp, newSubView); + + // Generate accumulation on CPU if needed. + bool acc_on_cpu = false; + if (op->getAttrOfType(kAccel_acc_on_cpu).getValue()) + acc_on_cpu = true; + else { + // Set acc_on_cpu true if the operand is in the list of + // operands to be accumulated. 
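+                    // For example, a kAccel_accumulate_on_cpu attribute of
+                    // [2] requests CPU accumulation only for operand #2
+                    // (typically the output tile): the loop below sets
+                    // acc_on_cpu when this Recv operand's id appears in
+                    // that list.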
+ for (auto &&operand : op->getAttrOfType( + kAccel_accumulate_on_cpu)) { + if (operand.cast().getInt() == id) { + acc_on_cpu = true; + break; + } + } + } + if (acc_on_cpu) { + MemRefType sVmrType = + newSubView.getType().cast(); + + SmallVector shape; + auto accelSizes = op->getAttrOfType( + kAccel_accel_tile_sizes); + + // TODO: get shape from SubViewOp creating the subview + auto loopPerm = op->getAttrOfType( + kAccel_loop_permutation); + int index[3]; + for (unsigned i = 0; i < 3; i++) { + index[loopPerm[i].cast().getInt()]=i; + } + // SmallVector rootTileSizes(options.tileSizes.begin(), + // options.tileSizes.begin() + + // rootOp.getNumLoops()); + // if access sizes bigger than 0, use them + if (accelSizes.size() > 0) { + // TODO use begin and end iterator + for (unsigned i = 0; i < sVmrType.getRank(); i++) { + shape.push_back(accelSizes[index[i]].cast().getInt()); + } + } else { + for (unsigned i = 0; i < sVmrType.getRank(); i++) { + auto accelSize = op->getAttrOfType( + kAccel_accel_tile_size); + + // TODO: Support multi-dimensions + shape.push_back(accelSize.getInt()); + } + } + // Transform SmallVector in ArrayRef + ArrayRef shapeRef(shape); + MemRefType mrType = + MemRefType::get(shapeRef, sVmrType.getElementType()); + Value tMr = rewriter.create(loc, mrType); + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), tMr, initialOffset); + + // Create affine maps and attributes for CPU accumulation + MemRefType tmpMrType = tMr.getType().cast(); + unsigned rank = tmpMrType.getRank(); + SmallVector indexingMaps( + /*1 inputs, 1 (inplace) output*/ 2, + rewriter.getMultiDimIdentityMap(rank)); + auto loopsAttr = SmallVector( + rank, getParallelIteratorTypeName()); + + rewriter.create( + loc, + /*resultTypes=*/TypeRange(), + /*inputs=*/tMr, + /*outputs=*/newSubView, + /*indexingMaps=*/indexingMaps, + /*iteratorTypes=*/loopsAttr, + /*bodyBuilder=*/ + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange args) { + Value added = nestedBuilder.create( + loc, args[0], args[1]); + nestedBuilder.create(nestedLoc, + added); + }); + } else { + // initialOffset = rewriter.create( + // loc, rewriter.getI32Type(), operand, + // initialOffset); + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), newSubView, initialOffset); + } + }); + break; + } + case OpcodeExprKind::SendLiteral: { + auto value = action.cast().getValue(); + + Value literal = rewriter.create( + loc, IntegerAttr::get(rewriter.getI32Type(), value)); + addOperationToLoopBody( + rewriter, op->getLoc(), op, loop_offset, [&]() { + initialOffset = rewriter.create( + loc, rewriter.getI32Type(), literal, initialOffset); + }); + break; + } + case OpcodeExprKind::SendDim: { + llvm::errs() << "SendDim action. "; + llvm_unreachable("No support for SendDim yet."); + break; + } + case OpcodeExprKind::SendIdx: { + llvm::errs() << "No support for SendIdx yet. 
"; + break; + } + default: + llvm_unreachable("Unknown action."); + } + } + } + } + } + + LogicalResult matchAndRewrite(linalg::GenericOp op, + PatternRewriter &rewriter) const override { + + Location loc = op->getLoc(); + + // Get location before first operation inside funcOp + FuncOp funcOp = op->getParentOfType(); + // Location funcFrontLoc = funcOp.front().front().getLoc(); + + rewriter.setInsertionPointToStart(&funcOp.front()); + Location funcFrontLoc = rewriter.getInsertionPoint()->getLoc(); + + SmallVector valuesForInitDMA; + materializeDMAConstants(rewriter, op, funcFrontLoc, valuesForInitDMA); + + // TODO check if such operation already exists for the same DMA address + // Create the accel.init_dma operation + rewriter.create(funcFrontLoc, valuesForInitDMA[0], + valuesForInitDMA[1], valuesForInitDMA[2], + valuesForInitDMA[3], valuesForInitDMA[4]); + + SmallVector loop_offsets; + SmallVector opcodes_strs; + SmallVector, 4> lists_of_opcode_ids; + parseOpcodeFlowStr(op, loop_offsets, opcodes_strs, lists_of_opcode_ids); + + // printOpcodesInMap(op, loop_offsets, opcodes_strs, lists_of_opcode_ids); + addAccelOps(op, rewriter, loop_offsets, lists_of_opcode_ids); + + // for (auto && l: loop_offsets) { + // addOperationToLoopBody(rewriter, loc, op, l, [&]() { + // op->emitWarning() << "Creating testCte"; + // // TODO: Create correct accel operation + // Value testCte = rewriter.create( + // loc, IntegerAttr::get(rewriter.getI32Type(), 7777+l)); + // }); + // } + + // rewriter.setInsertionPoint(op); + + // Value cteZero = rewriter.create( + // loc, IntegerAttr::get(rewriter.getI32Type(), 0)); + // Value initialOffset = cteZero; + + // for (Value operand : op.inputs()) { + // initialOffset = rewriter.create(loc, + // rewriter.getI32Type(), + // operand, initialOffset); + // } + + // initialOffset = cteZero; + // for (Value operand : op.outputs()) { + // if (op->getAttrOfType(kAccel_acc_on_cpu).getValue()) { + // MemRefType mrType = operand.getType().cast(); + // Value tMr = rewriter.create(loc, mrType); + // rewriter.create( + // loc, rewriter.getI32Type(), tMr, + // initialOffset); // TODO: Initial offset? Multiple outputs? 
+ + // // Create affine maps and attributes for CPU accumulation + // MemRefType tmpMrType = tMr.getType().cast(); + // unsigned rank = tmpMrType.getRank(); + // SmallVector indexingMaps( + // /*1 inputs, 1 (inplace) output*/ 2, + // rewriter.getMultiDimIdentityMap(rank)); + // auto loopsAttr = + // SmallVector(rank, getParallelIteratorTypeName()); + + // rewriter.create( + // loc, + // /*resultTypes=*/TypeRange(), + // /*inputs=*/tMr, + // /*outputs=*/operand, + // /*indexingMaps=*/indexingMaps, + // /*iteratorTypes=*/loopsAttr, + // /*bodyBuilder=*/ + // [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange + // args) { + // Value added = + // nestedBuilder.create(loc, args[0], args[1]); + // nestedBuilder.create(nestedLoc, added); + // }); + // } else { + // initialOffset = rewriter.create( + // loc, rewriter.getI32Type(), operand, initialOffset); + // } + // } + rewriter.eraseOp(op); + + return success(); + } +}; + +void mlir::populateLinalgGenericToAccelConversionPatternsWithOptions( + RewritePatternSet &patterns, const AccelTransformationOptions &options) { + MLIRContext *ctx = patterns.getContext(); + // This populate patterns that implement the following FSM modifying + // kLinalgTransformMarker GENERALIZE -> ANNOTATE -> INTERCHANGE -> MEM(TILE) + // L3(TILE) -> L2(TILE) -> L1(TILE) -> ACCEL + patterns.add( + ctx, + linalg::LinalgTransformationFilter(StringAttr::get(ctx, "ANNOTATE"), + StringAttr::get(ctx, "INTERCHANGE")), + options); + populateCommonLinalgTransformationPatterns(patterns, options); +} + +void mlir::populateLinalgGenericToAccelConversionPatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +namespace { +struct ConvertLinalgGenericToAccelPass + : public ConvertLinalgGenericToAccelBase { + ConvertLinalgGenericToAccelPass() = default; + + /// Constructor to build this pass using user defined options + /// Not used when the pass is created from commandline, helpful for creating + /// this pass in code + ConvertLinalgGenericToAccelPass(const AccelTransformationOptions &options) { + this->accelSize = options.accelSize; + this->accelSizes = options.accelSizes; + this->dmaAddress = options.dmaAddress; + this->dmaInputAddress = options.dmaInputAddress; + this->dmaInputBufferSize = options.dmaInputBufferSize; + this->dmaOutputAddress = options.dmaOutputAddress; + this->dmaOutputBufferSize = options.dmaOutputAddress; + this->accOnCpu = options.accOnCpu; + this->flowCpuAcc = options.flowCpuAcc; // TODO: will be deprecated + this->numberOfCaches = options.numberOfCaches; + this->cacheSizes = options.cacheSizes; + this->tileSizes = options.tileSizes; + this->elementSize = options.elementSize; + this->loopPermutation = options.loopPermutation; + this->anchorFuncName = options.anchorFuncName; + this->anchorOpName = options.anchorOpName; + this->anchorFilterName = options.anchorFilterName; + this->opcodeMap = options.opcodeMap; + this->initFlow = options.initFlow; + this->opcodeFlow = options.opcodeFlow; + } + + void runOnOperation() override; + + void setOptions(AccelTransformationOptions &options) { + options.accelSize = this->accelSize; + options.accelSizes = this->accelSizes; + options.dmaAddress = this->dmaAddress; + options.dmaInputAddress = this->dmaInputAddress; + options.dmaInputBufferSize = this->dmaInputBufferSize; + options.dmaOutputAddress = this->dmaOutputAddress; + options.dmaOutputBufferSize = this->dmaOutputBufferSize; + options.accOnCpu = this->accOnCpu; + options.flowCpuAcc = this->flowCpuAcc; // TODO: will be deprecated + 
options.numberOfCaches = this->numberOfCaches; + options.cacheSizes = this->cacheSizes; + options.tileSizes = this->tileSizes; + options.elementSize = this->elementSize; + options.loopPermutation = this->loopPermutation; + options.anchorFuncName = this->anchorFuncName; + options.anchorOpName = this->anchorOpName; + options.anchorFilterName = this->anchorFilterName; + options.opcodeMap = this->opcodeMap; + options.initFlow = this->initFlow; + options.opcodeFlow = this->opcodeFlow; + } +}; +} // namespace + +/// The conversion takes the following steps: +/// 1. Marks anchor ops with the "generalize" attribute +/// 2. Generalizes the marked ops, marking the Ops with the "ACCEL" attribute +/// 3. Annotate attributes to the marked ops +/// 4. Convert the marked ops to the accel dialect +void ConvertLinalgGenericToAccelPass::runOnOperation() { + + AccelTransformationOptions options; + setOptions(options); + + auto module = getOperation(); + MLIRContext *ctx = &getContext(); + + // 1. Marks anchor ops with the "GENERALIZE" or "ANNOTATE" attribute + module.walk([&](FuncOp functionOp) { + if (!anchorFuncName.empty() && anchorFuncName != functionOp.getName()) + return; + + functionOp.walk([&](linalg::LinalgOp op) { + if (!anchorFilterName.empty()) { + // Skip this op if the LinalgOp has kAccelTransformMarker that is not + // equal to anchorFilterName + if (op->getAttr(kAccelTransformMarker) != + StringAttr::get(ctx, anchorFilterName)) { + return; + } + } + + if ((op->getAttr(kLinalgTransformMarker) != + StringAttr::get(ctx, "ACCELERATE"))) { + if ((anchorOpName != op->getName().getStringRef())) + return; + } + + if (isa(op)) { + op->setAttr(kLinalgTransformMarker, + StringAttr::get(&getContext(), "ANNOTATE")); + } else { + op->setAttr(kLinalgTransformMarker, + StringAttr::get(&getContext(), "GENERALIZE")); + } + }); + }); + + // 2. Generalizes the marked ops, marking the Ops with the next attribute in + // the FSM. Uses a nested pass manager. + PassManager pm(module.getContext()); + linalg::LinalgTransformationFilter f(StringAttr::get(ctx, "GENERALIZE"), + StringAttr::get(ctx, "ANNOTATE")); + pm.addNestedPass( + mlir::createLinalgStrategyGeneralizePass(anchorOpName, f)); + + if (failed(pm.run(module))) + signalPassFailure(); + + // Using rewrite patterns + // 3. Annotate attributes to the marked ops + // 4. 
Convert the marked ops to the accel dialect + RewritePatternSet patterns(&getContext()); + populateLinalgGenericToAccelConversionPatternsWithOptions(patterns, options); + + ConversionTarget target(getContext()); + // clang-format off + target.addLegalDialect(); + // clang-format on + target.addDynamicallyLegalOp( + [&](linalg::GenericOp op) -> bool { + MLIRContext *ctx = &getContext(); + SmallVector markers = { + "GENERALIZE", "ANNOTATE", "INTERCHANGE", "MEM", "L3", "L2", "L1"}; + + auto aMarkerMatchesAttr = [&](const Attribute &attr) -> bool { + // Acts like an OR operation, returns true in the first match + for (auto marker : markers) { + // TODO: Could be made more efficient by casting attr to StringAttr + if (StringAttr::get(ctx, marker) == attr) + return true; + } + return false; + }; + + return !(aMarkerMatchesAttr(op->getAttr(kLinalgTransformMarker))); + }); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); + + RewritePatternSet patterns2(&getContext()); + populateLinalgGenericToAccelConversionPatterns(patterns2); + target.addDynamicallyLegalOp( + [&](linalg::GenericOp op) -> bool { + auto marker = StringAttr::get(&getContext(), "GENACCEL"); + return !((op->getAttr(kLinalgTransformMarker) == marker)); + }); + if (failed(applyPartialConversion(module, target, std::move(patterns2)))) + signalPassFailure(); +} + +std::unique_ptr> +mlir::createConvertLinalgGenericToAccelPass() { + return std::make_unique(); +} + +// std::unique_ptr> +// mlir::createConvertLinalgGenericToAccelPass( +// const AccelTransformationOptions &options) { +// return std::make_unique(options); +// } diff --git a/lib/Conversion/PassDetail.h b/lib/Conversion/PassDetail.h index 36ce500..de20815 100644 --- a/lib/Conversion/PassDetail.h +++ b/lib/Conversion/PassDetail.h @@ -29,6 +29,14 @@ namespace scf { class SCFDialect; } // end namespace scf +namespace memref { +class MemRefDialect; +} // namespace memref + +namespace LLVM { +class LLVMDialect; +} // namespace LLVM + #define GEN_PASS_CLASSES #include "soda/Conversion/Passes.h.inc" diff --git a/lib/Dialect/Accel/CMakeLists.txt b/lib/Dialect/Accel/CMakeLists.txt new file mode 100644 index 0000000..f33061b --- /dev/null +++ b/lib/Dialect/Accel/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(IR) diff --git a/lib/Dialect/Accel/IR/AccelDialect.cpp b/lib/Dialect/Accel/IR/AccelDialect.cpp new file mode 100644 index 0000000..02703da --- /dev/null +++ b/lib/Dialect/Accel/IR/AccelDialect.cpp @@ -0,0 +1,36 @@ +//===- AccelDialect.cpp - MLIR dialect for Accel implementation -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Transforms/InliningUtils.h" +#include "soda/Dialect/Accel/IR/Accel.h" + +using namespace mlir; +using namespace mlir::accel; + +#include "soda/Dialect/Accel/IR/AccelOpsDialect.cpp.inc" + +namespace { +/// This class defines the interface for handling inlining with accel +/// operations. +struct AccelInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + /// All operations within accel ops can be inlined. 
+ bool isLegalToInline(Operation *, Region *, bool, IRMapping &) const final { + return true; + } +}; +} // namespace + +void mlir::accel::AccelDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "soda/Dialect/Accel/IR/AccelOps.cpp.inc" + >(); + addInterfaces(); +} diff --git a/lib/Dialect/Accel/IR/AccelOps.cpp b/lib/Dialect/Accel/IR/AccelOps.cpp new file mode 100644 index 0000000..f92fb1c --- /dev/null +++ b/lib/Dialect/Accel/IR/AccelOps.cpp @@ -0,0 +1,20 @@ +//===- AccelOps.cpp - MLIR operations for accel implementation ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Builders.h" +#include "soda/Dialect/Accel/IR/Accel.h" + +using namespace mlir; +using namespace mlir::accel; + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "soda/Dialect/Accel/IR/AccelOps.cpp.inc" diff --git a/lib/Dialect/Accel/IR/CMakeLists.txt b/lib/Dialect/Accel/IR/CMakeLists.txt new file mode 100644 index 0000000..5576b92 --- /dev/null +++ b/lib/Dialect/Accel/IR/CMakeLists.txt @@ -0,0 +1,14 @@ +add_mlir_dialect_library(SODAAccelDialect + AccelOps.cpp + AccelDialect.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/soda/Dialect/Accel + + DEPENDS + MLIRAccelOpsIncGen + + LINK_LIBS PUBLIC + MLIRDialect + MLIRIR +) diff --git a/lib/Dialect/Affine/Transforms/AffineDataCopyGen.cpp b/lib/Dialect/Affine/Transforms/AffineDataCopyGen.cpp index be482e9..2c950ef 100644 --- a/lib/Dialect/Affine/Transforms/AffineDataCopyGen.cpp +++ b/lib/Dialect/Affine/Transforms/AffineDataCopyGen.cpp @@ -39,7 +39,6 @@ #include "llvm/Support/Debug.h" #include - #define DEBUG_TYPE "soda-affine-data-copy-gen" using namespace mlir; @@ -73,7 +72,7 @@ struct AffineDataCopyGen } void runOnOperation() override; - LogicalResult runOnBlock(Block *block, DenseSet ©Nests); + void runOnBlock(Block *block, DenseSet ©Nests); // Constant zero index to avoid too many duplicates. Value zeroIndex = nullptr; @@ -98,10 +97,10 @@ mlir::soda::createAffineDataCopyGenPass( /// ranges: each range is either a sequence of one or more operations starting /// and ending with an affine load or store op, or just an affine.forop (which /// could have other affine for op's nested within). -LogicalResult AffineDataCopyGen::runOnBlock(Block *block, - DenseSet ©Nests) { +void AffineDataCopyGen::runOnBlock(Block *block, + DenseSet ©Nests) { if (block->empty()) - return success(); + return; uint64_t fastMemCapacityBytes = fastMemoryCapacity != std::numeric_limits::max() @@ -111,7 +110,7 @@ LogicalResult AffineDataCopyGen::runOnBlock(Block *block, fastMemorySpace, tagMemorySpace, fastMemCapacityBytes}; - // Every affine.forop in the block starts and ends a block range for copying; + // Every affine.for op in the block starts and ends a block range for copying; // in addition, a contiguous sequence of operations starting with a // load/store op but not including any copy nests themselves is also // identified as a copy block range. 
Straightline code (a contiguous chunk of @@ -160,7 +159,7 @@ LogicalResult AffineDataCopyGen::runOnBlock(Block *block, if (recurseInner) { // We'll recurse and do the copies at an inner level for 'forInst'. // Recurse onto the body of this loop. - (void)runOnBlock(forOp.getBody(), copyNests); + runOnBlock(forOp.getBody(), copyNests); } else { // We have enough capacity, i.e., copies will be computed for the // portion of the block until 'it', and for 'it', which is 'forOp'. Note @@ -198,8 +197,6 @@ LogicalResult AffineDataCopyGen::runOnBlock(Block *block, /*end=*/std::prev(block->end()), copyOptions, /*filterMemRef=*/std::nullopt, copyNests); } - - return success(); } void AffineDataCopyGen::runOnOperation() { @@ -215,7 +212,7 @@ void AffineDataCopyGen::runOnOperation() { copyNests.clear(); for (auto &block : f) - (void)runOnBlock(&block, copyNests); + runOnBlock(&block, copyNests); // Promote any single iteration loops in the copy nests and collect // load/stores to simplify. @@ -237,5 +234,6 @@ void AffineDataCopyGen::runOnOperation() { AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext()); AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext()); FrozenRewritePatternSet frozenPatterns(std::move(patterns)); - (void)applyOpPatternsAndFold(copyOps, frozenPatterns, /*strict=*/true); + (void)applyOpPatternsAndFold(copyOps, frozenPatterns, + GreedyRewriteStrictness::ExistingAndNewOps); } diff --git a/lib/Dialect/CMakeLists.txt b/lib/Dialect/CMakeLists.txt index bbe6311..a1af34b 100644 --- a/lib/Dialect/CMakeLists.txt +++ b/lib/Dialect/CMakeLists.txt @@ -2,4 +2,5 @@ add_subdirectory(SODA) add_subdirectory(SNN) add_subdirectory(Linalg) add_subdirectory(Affine) +add_subdirectory(Accel) add_subdirectory(Transform) \ No newline at end of file diff --git a/lib/Dialect/Linalg/Transforms/Tiling.cpp b/lib/Dialect/Linalg/Transforms/Tiling.cpp index be8fd49..cf22ef8 100644 --- a/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -58,7 +58,8 @@ parseTilingString(ModuleOp &module, MLIRContext *context, transform.sequence failures(propagate) { ^bb0(%arg1: !pdl.operation): %0 = transform.structured.match ops{[""]} in %arg1 - %1, %loops: = transform.structured.tile %0 [] + %1, %loops: = transform.structured.tile %0 [] : + (!pdl.operation) -> () } )MLIR"; @@ -81,6 +82,18 @@ parseTilingString(ModuleOp &module, MLIRContext *context, std::string tileNDimsStr = std::to_string(tileSizes.size()); str = str.replace(str.find(""), 11, tileNDimsStr); + // replace with the correct number of !pdl.operation,... 
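+  // For example (illustrative values only): with the matched op
+  // "linalg.matmul" and tileSizes = {32, 32}, two loops are generated, so
+  // three result types are needed and the parsed script roughly becomes:
+  //   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1
+  //   %1, %loops:2 = transform.structured.tile %0 [32, 32] :
+  //       (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)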
+ std::string pdlOutTypeStr = ""; + // the number of types is given by the number of for loops + 1 + for (size_t i = 0; i < tileSizes.size() + 1; i++) { + pdlOutTypeStr += "!pdl.operation"; + if (i != tileSizes.size()) { + pdlOutTypeStr += ", "; + } + } + // perform string replacement + str = str.replace(str.find(""), 16, pdlOutTypeStr); + // Parse the string return parseSourceString(str, module, context); } @@ -119,7 +132,8 @@ parseTilingString(ModuleOp &module, MLIRContext *context, // // todo: create tile op // // SmallVector tileSizes = {4, 4}; // // auto tiletoScfForOp = -// // b.create(loc, matchOp.getResult(), tileSizes); +// // b.create(loc, matchOp.getResult(), +// tileSizes); // // auto forLoops = tiletoScfForOp.getLoops(); // // auto tiledOpH = tiletoScfForOp.getTiledLinalgOp(); diff --git a/lib/Dialect/SODA/Transforms/KernelGeneration.cpp b/lib/Dialect/SODA/Transforms/KernelGeneration.cpp index e850e28..9d39e92 100644 --- a/lib/Dialect/SODA/Transforms/KernelGeneration.cpp +++ b/lib/Dialect/SODA/Transforms/KernelGeneration.cpp @@ -17,9 +17,9 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" #include "mlir/IR/FunctionInterfaces.h" +#include "mlir/IR/IRMapping.h" #include "mlir/IR/SymbolTable.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/RegionUtils.h" @@ -72,7 +72,7 @@ void SodaKernelGenerationPass::runOnOperation() { return signalPassFailure(); } - BlockAndValueMapping map; + IRMapping map; sodaOp.getRegion().cloneInto(&(mop.getRegion()), map); sodaOp.erase(); diff --git a/lib/Dialect/SODA/Transforms/KernelOutlining.cpp b/lib/Dialect/SODA/Transforms/KernelOutlining.cpp index 867d4bd..a1f606e 100644 --- a/lib/Dialect/SODA/Transforms/KernelOutlining.cpp +++ b/lib/Dialect/SODA/Transforms/KernelOutlining.cpp @@ -16,8 +16,8 @@ #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/IRMapping.h" #include "mlir/IR/SymbolTable.h" #include "mlir/Parser/Parser.h" #include "mlir/Support/LLVM.h" @@ -98,7 +98,7 @@ LogicalResult mlir::sinkOperationsIntoLaunchOp(soda::LaunchOp launchOp) { } // Insert operations so that the defs get cloned before uses. - BlockAndValueMapping map; + IRMapping map; OpBuilder builder(launchOpBody); for (Operation *op : toBeSunk) { Operation *clonedOp = builder.clone(*op, map); @@ -137,7 +137,7 @@ outlineKernelFuncImpl(soda::LaunchOp launchOp, StringRef kernelFnName, auto outlinedFunc = builder.create(loc, kernelFnName, type); outlinedFunc->setAttr(soda::SODADialect::getKernelFuncAttrName(), builder.getUnitAttr()); - BlockAndValueMapping map; + IRMapping map; // Map the arguments corresponding to the launch parameter like blockIdx, // threadIdx, etc. 
diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt new file mode 100644 index 0000000..6e8e666 --- /dev/null +++ b/lib/ExecutionEngine/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(axi) \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/AxiUtils.cpp b/lib/ExecutionEngine/axi/AxiUtils.cpp new file mode 100644 index 0000000..a8a4135 --- /dev/null +++ b/lib/ExecutionEngine/axi/AxiUtils.cpp @@ -0,0 +1,200 @@ +//===- AxiUtils.cpp - AXI4MLIR implementation ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements wrapper AXI4MLIR library calls. These are the calls +// visible to the MLIR. +// +//===----------------------------------------------------------------------===// + +#include "soda/ExecutionEngine/axi/AxiUtils.h" + +#include "soda/ExecutionEngine/axi/api_v1.h" + +struct dma myDMA; +// ============================================================================= +// AXI_APIV1 +// ============================================================================= + +extern "C" void dma_init(unsigned int dma_address, + unsigned int dma_input_address, + unsigned int dma_input_buffer_size, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl; + std::cout << "\t" << dma_address << std::endl; + std::cout << "\t" << dma_input_address << std::endl; + std::cout << "\t" << dma_input_buffer_size << std::endl; + std::cout << "\t" << dma_output_address << std::endl; + std::cout << "\t" << dma_output_buffer_size << std::endl;); + + myDMA.dma_init(dma_address, dma_input_address, dma_input_buffer_size, + dma_output_address, dma_output_buffer_size); + return; +} + +// V2 implementation +// extern "C" void dma_init(unsigned int dma_address, +// unsigned int dma_input_address, +// unsigned int dma_input_buffer_size, unsigned int +// isize, unsigned int dma_output_address, unsigned int +// dma_output_buffer_size, unsigned int osize) { +// D(std::cout << "Called: " << __func__ << " not mock version" << std::endl; +// std::cout << "\t" << dma_address << std::endl; +// std::cout << "\t" << dma_input_address << std::endl; +// std::cout << "\t" << dma_input_buffer_size << std::endl; +// std::cout << "\t" << isize << std::endl; +// std::cout << "\t" << dma_output_address << std::endl; +// std::cout << "\t" << dma_output_buffer_size << std::endl; +// std::cout << "\t" << osize << std::endl;); + +// myDMA.dma_init(dma_address, dma_input_address, dma_input_buffer_size, +// isize, +// dma_output_address, dma_output_buffer_size, osize); +// return; +// } + +extern "C" void dma_free() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.dma_free(); +} + +extern "C" unsigned int *dma_get_inbuffer() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_get_inbuffer(); +} + +extern "C" unsigned int *dma_get_outbuffer() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_get_outbuffer(); +} + +// V2 implementation +// extern "C" char *dma_get_inbuffer() { +// D(std::cout << "Called: " << __func__ << " not mock version" << +// std::endl;); return 
myDMA.dma_get_inbuffer(); +// } + +// extern "C" char *dma_get_outbuffer() { +// D(std::cout << "Called: " << __func__ << " not mock version" << +// std::endl;); return myDMA.dma_get_outbuffer(); +// } + +extern "C" int dma_copy_to_inbuffer(unsigned int *host_src_address, + int data_length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_copy_to_inbuffer(host_src_address, data_length, offset); +} + +extern "C" int dma_copy_from_outbuffer(unsigned int *host_dst_address, + int data_length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_copy_from_outbuffer(host_dst_address, data_length, offset); +} + +template +int mlir_dma_copy_to_inbuffer(const DynamicMemRefType &src, int data_length, + int offset) { + myDMA.mlir_dma_copy_to_inbuffer(src.data, src.rank, src.rank, src.offset, + src.sizes, src.strides, offset); + return 0; +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_f32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_to_inbuffer_f32(&descriptor, offset); +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_i32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_to_inbuffer_i32(&descriptor, offset); +} + +extern "C" int +_mlir_ciface_copy_from_outbuffer_f32(UnrankedMemRefType *M, int offset) { + mlir_dma_copy_from_outbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_from_outbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_from_outbuffer_f32(&descriptor, offset); +} + +extern "C" int _mlir_ciface_copy_from_outbuffer_i32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_from_outbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_from_outbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_from_outbuffer_i32(&descriptor, offset); +} + +template +int mlir_dma_copy_from_outbuffer(const DynamicMemRefType &dst, + int data_length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.mlir_dma_copy_from_outbuffer(dst.data, dst.rank, dst.rank, dst.offset, + dst.sizes, dst.strides, offset); + return 0; +} + +extern "C" int dma_start_send(int length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_start_send(length, offset); +} + +extern "C" int dma_check_send() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return 0; +} + +extern "C" void dma_wait_send() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.dma_wait_send(); +} + +extern "C" int dma_start_recv(int length, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_start_recv(length, offset); +} + +extern "C" void dma_wait_recv() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.dma_wait_recv(); +} + 
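+// Illustrative host-side call sequence for this wrapper API. This is only a
+// sketch: the DMA addresses, buffer sizes, and transfer lengths below are
+// placeholders, not values prescribed by the library.
+//
+//   dma_init(/*dma_address=*/0x40400000, /*dma_input_address=*/0x16000000,
+//            /*dma_input_buffer_size=*/100000,
+//            /*dma_output_address=*/0x16400000,
+//            /*dma_output_buffer_size=*/100000);
+//   unsigned int tile[16] = {0};           // payload staged by the caller
+//   dma_copy_to_inbuffer(tile, /*data_length=*/16, /*offset=*/0);
+//   dma_start_send(/*length=*/16, /*offset=*/0);
+//   dma_wait_send();
+//   dma_start_recv(/*length=*/16, /*offset=*/0);
+//   dma_wait_recv();
+//   dma_copy_from_outbuffer(tile, /*data_length=*/16, /*offset=*/0);
+//   dma_free();
+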
+extern "C" int dma_check_recv() { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_check_recv(); +} + +extern "C" unsigned int dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + myDMA.dma_set(dma_virtual_address, offset, value); + return 0; +} + +extern "C" unsigned int dma_get(unsigned int *dma_virtual_address, int offset) { + D(std::cout << "Called: " << __func__ << " not mock version" << std::endl;); + return myDMA.dma_get(dma_virtual_address, offset); +} diff --git a/lib/ExecutionEngine/axi/AxiUtilsMock.cpp b/lib/ExecutionEngine/axi/AxiUtilsMock.cpp new file mode 100644 index 0000000..9f18f8f --- /dev/null +++ b/lib/ExecutionEngine/axi/AxiUtilsMock.cpp @@ -0,0 +1,131 @@ +//===- AxiUtils.cpp - AXI4MLIR implementation ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements wrapper AXI4MLIR library calls. These are the calls +// visible to the MLIR. +// +// This is a mock implementation that only prints to the terminal. +// +//===----------------------------------------------------------------------===// + +#include "soda/ExecutionEngine/axi/AxiUtils.h" + +// ============================================================================= +// AXI_APIV1 +// ============================================================================= + +extern "C" void dma_init(unsigned int dma_address, + unsigned int dma_input_address, + unsigned int dma_input_buffer_size, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size) { + std::cout << "Called: " << __func__ << std::endl; + std::cout << "\t" << dma_address << std::endl; + std::cout << "\t" << dma_input_address << std::endl; + std::cout << "\t" << dma_input_buffer_size << std::endl; + std::cout << "\t" << dma_output_address << std::endl; + std::cout << "\t" << dma_output_buffer_size << std::endl; + std::cout << "Called: " << __func__ << std::endl; + return; +} + +extern "C" void dma_free() { std::cout << "Called: " << __func__ << std::endl; } + +extern "C" unsigned int *dma_get_inbuffer() { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" unsigned int *dma_get_outbuffer() { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" int dma_copy_to_inbuffer(unsigned int *host_src_address, + int data_length, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" int dma_copy_from_outbuffer(unsigned int *host_dst_address, + int data_length, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +template +int mlir_dma_copy_to_inbuffer(const DynamicMemRefType &src, int data_length, + int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_f32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return 0; +} + +extern "C" int copy_to_inbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType 
descriptor = {rank, ptr}; + return 0; +} + +extern "C" int copy_from_outbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return 0; +} + +extern "C" int copy_from_outbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return 0; +} + +extern "C" int dma_start_send(int length, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" int dma_check_send() { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" void dma_wait_send() { + std::cout << "Called: " << __func__ << std::endl; +} + +extern "C" int dma_start_recv(int length, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" void dma_wait_recv() { + std::cout << "Called: " << __func__ << std::endl; +} + +extern "C" int dma_check_recv() { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" unsigned int dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} + +extern "C" unsigned int dma_get(unsigned int *dma_virtual_address, int offset) { + std::cout << "Called: " << __func__ << std::endl; + return 0; +} diff --git a/lib/ExecutionEngine/axi/AxiUtilsSysc.cpp b/lib/ExecutionEngine/axi/AxiUtilsSysc.cpp new file mode 100644 index 0000000..23e2a99 --- /dev/null +++ b/lib/ExecutionEngine/axi/AxiUtilsSysc.cpp @@ -0,0 +1,189 @@ +//===- AxiUtils.cpp - AXI4MLIR implementation ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements wrapper AXI4MLIR library calls. These are the calls +// visible to the MLIR. 
+// +//===----------------------------------------------------------------------===// + +#include "soda/ExecutionEngine/axi/AxiUtils.h" + +#include "soda/ExecutionEngine/axi/api_v1.h" + +// ============================================================================= +// AXI_APIV1 +// ============================================================================= + +struct dma myDMA; + +extern "C" void dma_init(unsigned int dma_address, + unsigned int dma_input_address, + unsigned int dma_input_buffer_size, + unsigned int dma_output_address, + unsigned int dma_output_buffer_size) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + // std::cout << "\t" << dma_address << std::endl; + // std::cout << "\t" << dma_input_address << std::endl; + // std::cout << "\t" << dma_input_buffer_size << std::endl; + // std::cout << "\t" << dma_output_address << std::endl; + // std::cout << "\t" << dma_output_buffer_size << std::endl; + LOG("Called: " << __func__ << " sysc version"); + LOG("\t" << dma_address); + LOG("\t" << dma_input_address); + LOG("\t" << dma_input_buffer_size); + LOG("\t" << dma_output_address); + LOG("\t" << dma_output_buffer_size); + + myDMA.dma_init(dma_address, dma_input_address, dma_input_buffer_size, + dma_output_address, dma_output_buffer_size); + return; +} + +extern "C" void dma_free() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.dma_free(); +} + +extern "C" unsigned int *dma_get_inbuffer() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_get_inbuffer(); +} + +extern "C" unsigned int *dma_get_outbuffer() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_get_outbuffer(); +} + +extern "C" int dma_copy_to_inbuffer(unsigned int *host_src_address, + int data_length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_copy_to_inbuffer(host_src_address, data_length, offset); +} + +extern "C" int dma_copy_from_outbuffer(unsigned int *host_dst_address, + int data_length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_copy_from_outbuffer(host_dst_address, data_length, offset); +} + +template +int mlir_dma_copy_to_inbuffer(const DynamicMemRefType &src, int data_length, + int offset) { + myDMA.mlir_dma_copy_to_inbuffer(src.data, src.rank, src.rank, src.offset, + src.sizes, src.strides, offset); + return 0; +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_f32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_to_inbuffer_f32(&descriptor, offset); +} + +extern "C" int _mlir_ciface_copy_to_inbuffer_i32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_to_inbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_to_inbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_to_inbuffer_i32(&descriptor, offset); +} + +extern "C" int 
+_mlir_ciface_copy_from_outbuffer_f32(UnrankedMemRefType *M, int offset) { + mlir_dma_copy_from_outbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_from_outbuffer_f32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_from_outbuffer_f32(&descriptor, offset); +} + +extern "C" int _mlir_ciface_copy_from_outbuffer_i32(UnrankedMemRefType *M, + int offset) { + mlir_dma_copy_from_outbuffer(DynamicMemRefType(*M), 0, offset); + return 0; +} + +extern "C" int copy_from_outbuffer_i32(int64_t rank, void *ptr, int offset) { + UnrankedMemRefType descriptor = {rank, ptr}; + return _mlir_ciface_copy_from_outbuffer_i32(&descriptor, offset); +} + +template +int mlir_dma_copy_from_outbuffer(const DynamicMemRefType &dst, + int data_length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.mlir_dma_copy_from_outbuffer(dst.data, dst.rank, dst.rank, dst.offset, + dst.sizes, dst.strides, offset); + return 0; +} + +extern "C" int dma_start_send(int length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_start_send(length, offset); +} + +extern "C" int dma_check_send() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return 0; +} + +extern "C" void dma_wait_send() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.dma_wait_send(); +} + +extern "C" int dma_start_recv(int length, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_start_recv(length, offset); +} + +extern "C" void dma_wait_recv() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.dma_wait_recv(); +} + +extern "C" int dma_check_recv() { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_check_recv(); +} + +extern "C" unsigned int dma_set(unsigned int *dma_virtual_address, int offset, + unsigned int value) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + myDMA.dma_set(dma_virtual_address, offset, value); + return 0; +} + +extern "C" unsigned int dma_get(unsigned int *dma_virtual_address, int offset) { + // std::cout << "Called: " << __func__ << " sysc version" << std::endl; + LOG("Called: " << __func__ << " sysc version"); + return myDMA.dma_get(dma_virtual_address, offset); +} diff --git a/lib/ExecutionEngine/axi/CMakeLists.txt b/lib/ExecutionEngine/axi/CMakeLists.txt new file mode 100644 index 0000000..f3a4adb --- /dev/null +++ b/lib/ExecutionEngine/axi/CMakeLists.txt @@ -0,0 +1,118 @@ +# Exclude these from libMLIR.so because the JIT infrastructure +# is a big dependency which most don't need. 
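+#
+# The shared libraries built below (e.g. mlir_mockaxi_runner_utils and
+# mlir_axi_runner_utils) are meant to be loaded at execution time by an MLIR
+# JIT runner rather than linked into libMLIR.so; an illustrative invocation
+# (paths are placeholders) would be:
+#   mlir-cpu-runner ... -shared-libs=$PREFIX/lib/libmlir_mockaxi_runner_utils.so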
+ +add_subdirectory(api) + +set(LLVM_OPTIONAL_SOURCES + AxiUtils.cpp + AxiUtilsMock.cpp + AxiUtilsSysc.cpp +) + +add_mlir_library(mlir_mockaxi_runner_utils + SHARED + AxiUtilsMock.cpp + + EXCLUDE_FROM_LIBMLIR +) +target_compile_definitions(mlir_mockaxi_runner_utils PRIVATE mlir_mockaxi_runner_utils_EXPORTS) + +add_mlir_library(mlir_axi_runner_utils + SHARED + AxiUtils.cpp + EXCLUDE_FROM_LIBMLIR +) +target_compile_definitions(mlir_axi_runner_utils PRIVATE mlir_axi_runner_utils_EXPORTS) +add_dependencies(mlir_axi_runner_utils + axi_api_v1 +) +target_link_libraries(mlir_axi_runner_utils PUBLIC axi_api_v1) # Needed to call implemented functions + +# Only generate systemc libraries for native builds +if(AXI_CROSSCOMPILING) + message(STATUS "Cross-compiling, SystemC runner libraries are disabed") +else() + message(STATUS "Not Cross-compiling, SystemC runner libraries are enabled") + + if(DEFINED ENV{SYSTEMC_HOME}) + message(STATUS "SYSTEMC_HOME is set: $ENV{SYSTEMC_HOME} -- building sysc runner libs") + + add_mlir_library(mlir_syscaxi_runner_utils + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils PRIVATE mlir_syscaxi_runner_utils_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils + axi_api_v1_sysc + ) + target_link_libraries(mlir_syscaxi_runner_utils PUBLIC axi_api_v1_sysc) # Needed to call implemented functions + + # ---------------------------- + # Matmul accelerator + # Same for accelerator v1 + add_mlir_library(mlir_syscaxi_runner_utils_accv1 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_accv1 PRIVATE mlir_syscaxi_runner_utils_v1_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils_accv1 + axi_api_v1_sysc_accv1 + ) + target_link_libraries(mlir_syscaxi_runner_utils_accv1 PUBLIC axi_api_v1_sysc_accv1) # Needed to call implemented functions + + # Same for accelerator v2 + add_mlir_library(mlir_syscaxi_runner_utils_accv2 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_accv2 PRIVATE mlir_syscaxi_runner_utils_v2_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils_accv2 + axi_api_v1_sysc_accv2 + ) + target_link_libraries(mlir_syscaxi_runner_utils_accv2 PUBLIC axi_api_v1_sysc_accv2) # Needed to call implemented functions + + # Same for accelerator v3 + add_mlir_library(mlir_syscaxi_runner_utils_accv3 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_accv3 PRIVATE mlir_syscaxi_runner_utils_v3_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils_accv3 + axi_api_v1_sysc_accv3 + ) + target_link_libraries(mlir_syscaxi_runner_utils_accv3 PUBLIC axi_api_v1_sysc_accv3) # Needed to call implemented functions + + # Same for accelerator v4 + add_mlir_library(mlir_syscaxi_runner_utils_accv4 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_accv4 PRIVATE mlir_syscaxi_runner_utils_v4_EXPORTS) + add_dependencies(mlir_syscaxi_runner_utils_accv4 + axi_api_v1_sysc_accv4 + ) + target_link_libraries(mlir_syscaxi_runner_utils_accv4 PUBLIC axi_api_v1_sysc_accv4) # Needed to call implemented functions + + # ---------------------------- + # Conv accelerator + + # Same for accelerator v1 + add_mlir_library(mlir_syscaxi_runner_utils_conv_accv1 + SHARED + AxiUtilsSysc.cpp + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_syscaxi_runner_utils_conv_accv1 PRIVATE mlir_syscaxi_runner_utils_conv_v1_EXPORTS) + 
add_dependencies(mlir_syscaxi_runner_utils_conv_accv1 + axi_api_v1_sysc_conv_accv1 + ) + target_link_libraries(mlir_syscaxi_runner_utils_conv_accv1 PUBLIC axi_api_v1_sysc_conv_accv1) # Needed to call implemented functions + endif() +endif() + +set(CMAKE_CXX_FLAGS "${tmpcxxflags}") # Revert to normal CXX flags \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/CMakeLists.txt b/lib/ExecutionEngine/axi/api/CMakeLists.txt new file mode 100644 index 0000000..afd4267 --- /dev/null +++ b/lib/ExecutionEngine/axi/api/CMakeLists.txt @@ -0,0 +1,115 @@ +# Exclude these from libMLIR.so because the JIT infrastructure +# is a big dependency which most don't need. + +set(LLVM_OPTIONAL_SOURCES + api_v0.cpp + api_v1.cpp + api_v1_sysc.cpp + api_v2.cpp + api_v2_sysc.cpp +) + +add_mlir_library(axi_api_v0 + SHARED + api_v0.cpp + + EXCLUDE_FROM_LIBMLIR +) + +add_mlir_library(axi_api_v1 + SHARED + api_v1.cpp + + EXCLUDE_FROM_LIBMLIR +) + +set(tmpcxxflags ${CMAKE_CXX_FLAGS}) +string(REPLACE "-Werror=global-constructors" "" FIXED ${CMAKE_CXX_FLAGS}) +string(REPLACE "-Wcast-qual" "-Wno-vla-extension" FIXED ${FIXED}) + +if(AXI_CROSSCOMPILING) + message(STATUS "Cross-compiling, SystemC api libraries are disabed") + string(APPEND FIXED " -mfpu=neon") +else() + message(STATUS "Not Cross-compiling, SystemC api libraries are enabled") + + if(DEFINED ENV{SYSTEMC_HOME}) + message(STATUS "SYSTEMC_HOME is set: $ENV{SYSTEMC_HOME} -- building sysc api libs") + + add_mlir_library(axi_api_v1_sysc + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc PROPERTIES COMPILE_FLAGS "") + target_link_libraries(axi_api_v1_sysc PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # ---------------------------- + # Matmul accelerator + + # Same for accelerator v1 + add_mlir_library(axi_api_v1_sysc_accv1 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_accv1 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_accv1 PROPERTIES COMPILE_FLAGS "-DACC_V1") + target_link_libraries(axi_api_v1_sysc_accv1 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # Same for accelerator v2 + add_mlir_library(axi_api_v1_sysc_accv2 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_accv2 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_accv2 PROPERTIES COMPILE_FLAGS "-DACC_V2") + target_link_libraries(axi_api_v1_sysc_accv2 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # Same for accelerator v3 + add_mlir_library(axi_api_v1_sysc_accv3 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_accv3 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_accv3 PROPERTIES COMPILE_FLAGS "-DACC_V3") + target_link_libraries(axi_api_v1_sysc_accv3 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # Same for accelerator v4 + add_mlir_library(axi_api_v1_sysc_accv4 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_accv4 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_accv4 PROPERTIES COMPILE_FLAGS "-DACC_V4") + target_link_libraries(axi_api_v1_sysc_accv4 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + + # ---------------------------- + # Conv accelerator + + # Same for accelerator v1 + 
add_mlir_library(axi_api_v1_sysc_conv_accv1 + SHARED + api_v1_sysc.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_include_directories(axi_api_v1_sysc_conv_accv1 PUBLIC $ENV{SYSTEMC_HOME}/include/) + set_target_properties(axi_api_v1_sysc_conv_accv1 PROPERTIES COMPILE_FLAGS "-DCONV_V1") + target_link_libraries(axi_api_v1_sysc_conv_accv1 PUBLIC $ENV{SYSTEMC_HOME}/lib-linux64/libsystemc.a) + endif() +endif() + +set(CMAKE_CXX_FLAGS "${FIXED}") + +# No additional properties for now +# target_compile_definitions(axi_api_v1 PRIVATE axi_api_EXPORTS) \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v0.cpp b/lib/ExecutionEngine/axi/api/api_v0.cpp new file mode 100644 index 0000000..a07110d --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v0.cpp @@ -0,0 +1,48 @@ +//**********************Deprecated********************** + +#include "mlir/ExecutionEngine/axi/api_v0.h" + +void dma::init(int id) { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + (unsigned long)dma_output_addr); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + (unsigned long)dma_input_addr); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma_collection::dma_init(int dma_count, unsigned int *dma_address, + unsigned int *dma_input_addr, + unsigned int *dma_input_len, + unsigned int *dma_output_addr, + unsigned int *dma_output_len) { + // Open /dev/mem which represents the whole physical memory + int dh = open("/dev/mem", O_RDWR | O_SYNC); + dma_list = new dma[dma_count]; + int id_count = 0; + + for (int i = 0; i < dma_count; i++) { + void *dma_mm = mmap(NULL, 65535, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + dma_address[i]); // Memory map AXI Lite register block + void *dma_in_mm = + mmap(NULL, dma_input_len[i], PROT_READ | PROT_WRITE, MAP_SHARED, dh, + dma_input_addr[i]); // Memory map source address + void *dma_out_mm = + mmap(NULL, dma_output_len[i], PROT_READ, MAP_SHARED, dh, + dma_output_addr[i]); // Memory map destination address + unsigned int *dma_addr = reinterpret_cast(dma_mm); + unsigned int *dma_in = reinterpret_cast(dma_in_mm); + unsigned int *dma_out = reinterpret_cast(dma_out_mm); + + dma_list[i].dma_address = dma_addr; + dma_list[i].dma_input_addr = dma_in; + dma_list[i].dma_output_addr = dma_out; + dma_list[i].dma_input_len = dma_input_len[i]; + dma_list[i].dma_output_len = dma_output_len[i]; + dma_list[i].init(id_count++); + } +} \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v1.cpp b/lib/ExecutionEngine/axi/api/api_v1.cpp new file mode 100644 index 0000000..5583200 --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v1.cpp @@ -0,0 +1,653 @@ +//===- api_v1.cpp - AXI core API implementation ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the core functions to use the AXI DMA interface. 
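+// The API covers DMA setup and teardown (dma_init/dma_free), raw access to
+// the memory-mapped input/output buffers, memcpy-style copies into and out
+// of those buffers, strided memref copies, and the start/wait/check calls
+// that drive the send and receive transfers.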
+// +//===----------------------------------------------------------------------===// + +#include "mlir/ExecutionEngine/axi/api_v1.h" + +#ifdef __arm__ +#include "arm_neon.h" +#endif + +void dma::dma_init(unsigned int _dma_address, unsigned int _dma_input_address, + unsigned int _dma_input_buffer_size, + unsigned int _dma_output_address, + unsigned int _dma_output_buffer_size) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + void *dma_mm = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + _dma_address); // Memory map AXI Lite register block + void *dma_in_mm = + mmap(NULL, _dma_input_buffer_size, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + _dma_input_address); // Memory map source address + void *dma_out_mm = + mmap(NULL, _dma_output_buffer_size, PROT_READ, MAP_SHARED, dh, + _dma_output_address); // Memory map destination address + dma_address = reinterpret_cast(dma_mm); + dma_input_address = reinterpret_cast(dma_in_mm); + dma_output_address = reinterpret_cast(dma_out_mm); + dma_input_buffer_size = _dma_input_buffer_size; + dma_output_buffer_size = _dma_output_buffer_size; + dma_input_paddress = _dma_input_address; + dma_output_paddress = _dma_output_address; + current_input_offset = 0; + close(dh); + initDMAControls(); // Causes Segfault atm + LOG("DMA Initialised"); +} + +void dma::dma_free() { + munmap(dma_input_address, dma_input_buffer_size); + munmap(dma_output_address, dma_output_buffer_size); + munmap(dma_address, getpagesize()); +} + +// We could reduce to one set of the following calls +//============================================================================== +unsigned int *dma::dma_get_inbuffer() { return dma_input_address; } + +unsigned int *dma::dma_get_outbuffer() { return dma_output_address; } +//============================================================================== +int dma::dma_copy_to_inbuffer(unsigned int *src_address, int data_length, + int offset) { + m_assert("data copy will overflow input buffer", + (unsigned int)(offset + data_length) <= dma_input_buffer_size); + std::memcpy(dma_input_address + offset, src_address, data_length * 4); + current_input_offset += data_length; + return 0; +} + +int dma::dma_copy_from_outbuffer(unsigned int *dst_address, int data_length, + int offset) { + m_assert("tries to access data outwith the output buffer", + (unsigned int)(offset + data_length) <= dma_output_buffer_size); + std::memcpy(dst_address, dma_output_address + offset, data_length * 4); + return 0; +} + +template +inline void copy_memref_to_array(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, + unsigned int *dst_base, const int dst_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. + for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *srcPtr; + srcPtr = mr_base + mr_offset; + + T *dstPtr; + dstPtr = reinterpret_cast(dst_base) + dst_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. 
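+  // srcStrides follow the memref descriptor, while dstStrides are recomputed
+  // below as row-major suffix products of mr_sizes, so data always lands in
+  // the DMA buffer densely packed regardless of the memref's own layout.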
+ for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + srcStrides[rankp] = mr_strides[rankp]; + + // dstStrides for the array is derived from the input mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the dstStrides are + // 128x16x1 + dstStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + dstStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + +#ifdef __arm__SKIP + // std::cout << "Enter_NEON _ test" << std::endl; + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + + int32x4_t tmp0; + int32x4_t tmp1; + int32x4_t tmp2; + int32x4_t tmp3; + + int64_t sizer_16r = (size % 16); + int64_t sizer_8r = (size % 8); + int64_t sizer_4r = (size % 4); + + if (sizer_16r == 0) { + // std::cout << "Enter_NEON _ 16" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 16) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + tmp1 = vld1q_s32(reinterpret_cast(srcPtr) + j + 4); + tmp2 = vld1q_s32(reinterpret_cast(srcPtr) + j + 8); + tmp3 = vld1q_s32(reinterpret_cast(srcPtr) + j + 12); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 4, tmp1); + vst1q_s32(reinterpret_cast(dstPtr) + j + 8, tmp2); + vst1q_s32(reinterpret_cast(dstPtr) + j + 12, tmp3); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else if (sizer_8r == 0) { + // std::cout << "Enter_NEON _ 8" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 8) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + tmp1 = vld1q_s32(reinterpret_cast(srcPtr) + j + 4); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 4, tmp1); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else if (sizer_4r == 0) { + // std::cout << "Enter_NEON _ 4" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 4) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else { + // std::cout << "Enter_NEON _ 1" << std::endl; + for (int64_t i = 0; i < count; ++i) { + memcpy(dstPtr, srcPtr, size * elemSize); + srcPtr += srcStride; + dstPtr += dstStride; + } + } + return; + } +#else + // create a special case for rank==2 and strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + 
int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < count; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } +#endif + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "offset]" + << dst_offset << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "SRC]" + << srcPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "DST]" + << dstPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "load from]" << srcPtr + readIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "store at]" << dstPtr + writeIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "loaded val]" << *(srcPtr + readIndex) << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "stored val]" << *(dstPtr + writeIndex) << "\n";); + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + readIndex += srcStrides[axis]; + writeIndex += 1; // Always increment, it is a flattened dense array + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. + indices[axis] = 0; + readIndex -= mr_sizes[axis] * srcStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + writeIndex -= 1; + } + } +} + +// Implements the actual copy +template +int dma::mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n";); + + copy_memref_to_array(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_inbuffer(), dma_offset); + + return 0; +} + +template +inline void copy_array_to_memref(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, + unsigned int *src_base, const int src_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. 
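+  // copy_array_to_memref is the inverse of copy_memref_to_array above: it
+  // scatters the densely packed DMA output buffer back into a (possibly
+  // strided) memref. As before, a zero extent in any dimension means there
+  // is nothing to copy.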
+ for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *dstPtr; + dstPtr = mr_base + mr_offset; + + T *srcPtr; + srcPtr = reinterpret_cast(src_base) + src_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + dstStrides[rankp] = mr_strides[rankp]; + + // srcStrides for the array is derived from the output mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the srcStrides are + // 128x16x1 + srcStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + srcStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + +#ifdef __arm__SKIP + // std::cout << "Enter_NEON _ test" << std::endl; + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + + int32x4_t tmp0; + int32x4_t tmp1; + int32x4_t tmp2; + int32x4_t tmp3; + + int64_t sizer_16r = (size % 16); + int64_t sizer_8r = (size % 8); + int64_t sizer_4r = (size % 4); + + if (sizer_16r == 0) { + // std::cout << "Enter_NEON _ 16" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 16) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + tmp1 = vld1q_s32(reinterpret_cast(srcPtr) + j + 4); + tmp2 = vld1q_s32(reinterpret_cast(srcPtr) + j + 8); + tmp3 = vld1q_s32(reinterpret_cast(srcPtr) + j + 12); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 4, tmp1); + vst1q_s32(reinterpret_cast(dstPtr) + j + 8, tmp2); + vst1q_s32(reinterpret_cast(dstPtr) + j + 12, tmp3); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else if (sizer_8r == 0) { + // std::cout << "Enter_NEON _ 8" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 8) { + // neon vector load and store + tmp0 = vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + tmp1 = vld1q_s32(reinterpret_cast(srcPtr) + j + 4); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 4, tmp1); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else if (sizer_4r == 0) { + // std::cout << "Enter_NEON _ 4" << std::endl; + for (int64_t i = 0; i < count; ++i) { + int64_t j = 0; + for (; j < size; j += 4) { + // neon vector load and store + tmp0 = 
vld1q_s32(reinterpret_cast(srcPtr) + j + 0); + vst1q_s32(reinterpret_cast(dstPtr) + j + 0, tmp0); + } + srcPtr += srcStride; + dstPtr += dstStride; + } + } else { + // std::cout << "Enter_NEON _ 1" << std::endl; + for (int64_t i = 0; i < count; ++i) { + memcpy(dstPtr, srcPtr, size * elemSize); + srcPtr += srcStride; + dstPtr += dstStride; + } + } + return; + } +#else + // create a special case for rank==2 and mr_strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements in one row + int64_t nRows = mr_sizes[rank - 2]; // number of rows + int64_t dstStride = + mr_strides[rank - 2]; // #elements to skip to access next row + int64_t srcStride = + srcStrides[rank - 2]; // #elements to skip to access next row + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < nRows; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } +#endif + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "offset]" + << src_offset << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "SRC]" + << srcPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "DST]" + << dstPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "load from]" << srcPtr + readIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "store at]" << dstPtr + writeIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "loaded val]" << *(srcPtr + readIndex) << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "stored val]" << *(dstPtr + writeIndex) << "\n";); + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + writeIndex += dstStrides[axis]; + readIndex += 1; // Always increment, it is a flattened dense array + + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. 
+ indices[axis] = 0; + writeIndex -= mr_sizes[axis] * dstStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + readIndex -= 1; + } + } +} + +template +int dma::mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, + int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, + const int64_t *mr_strides, + int dma_offset) { + + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n";); + + copy_array_to_memref(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_outbuffer(), dma_offset); + + return 0; +} + +// Make templates concrete: +template int dma::mlir_dma_copy_to_inbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_to_inbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +int dma::dma_start_send(int length, int offset) { + m_assert("trying to send data outside the input buffer", + (unsigned int)(offset + length) <= dma_input_buffer_size); + dma_set(dma_address, MM2S_START_ADDRESS, dma_input_paddress + (offset * 4)); + msync(dma_address, PAGE_SIZE, MS_SYNC); + dma_set(dma_address, MM2S_LENGTH, length * 4); + LOG("Transfer Started - " << length * 4); + current_input_offset = 0; + return 0; +} + +void dma::dma_wait_send() { + LOG("Data Transfer - Waiting"); + dma_mm2s_sync(); + LOG("Data Transfer - Done"); +} + +int dma::dma_check_send() { + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + bool done = !((!(mm2s_status & 1 << 12)) || (!(mm2s_status & 1 << 1))); + if (done) { + LOG("Data Transfer - Done"); + } else { + LOG("Data Transfer - Not Done"); + } + return done ? 0 : -1; +} + +int dma::dma_start_recv(int length, int offset) { + m_assert("trying receive data outside the output buffer", + (unsigned int)(offset + length) <= dma_output_buffer_size); + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + dma_output_paddress + (offset * 4)); + msync(dma_address, PAGE_SIZE, MS_SYNC); + LOG("Started Receiving " << length * 4); + dma_set(dma_address, S2MM_LENGTH, length * 4); + LOG("Started Receiving " << length * 4); + return 0; +} + +void dma::dma_wait_recv() { + LOG("Data Receive - Waiting"); + LOG("Data Receive - Waiting " << dma_get(dma_address, S2MM_LENGTH)); + dma_s2mm_sync(); + // unsigned int recv_len = dma_get(dma_address,S2MM_LENGTH); + LOG("Data Receive - Done " << dma_get(dma_address, S2MM_LENGTH)); +} + +int dma::dma_check_recv() { + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + bool done = !((!(s2mm_status & 1 << 12)) || (!(s2mm_status & 1 << 1))); + if (done) { + LOG("Data Receive - Done"); + } else { + LOG("Data Receive - Not Done"); + } + return done ? 
0 : -1; +} + +//********************************** Unexposed Functions +//********************************** +void dma::initDMAControls() { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + // dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + // (unsigned long)dma_output_address); // Write destination address + // dma_set(dma_address, MM2S_START_ADDRESS, + // (unsigned long)dma_input_address); // Write source address + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + dma_output_paddress); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + dma_input_paddress); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma::dma_set(unsigned int *dma_address, int offset, unsigned int value) { + *((volatile unsigned int *)(reinterpret_cast(dma_address) + offset)) = + value; + // dma_address[offset >> 2] = value; +} + +unsigned int dma::dma_get(unsigned int *dma_address, int offset) { + return *((volatile unsigned int *)(reinterpret_cast(dma_address) + + offset)); + // return *((volatile unsigned int*) dma_address[offset >> 2]); + // return dma_address[offset >> 2]; +} + +void dma::dma_mm2s_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + while (!(mm2s_status & 1 << 12) || !(mm2s_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + } +} + +void dma::dma_s2mm_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + while (!(s2mm_status & 1 << 12) || !(s2mm_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + } +} + +void dma::acc_init(unsigned int base_addr, int length) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + size_t virt_base = base_addr & ~(PAGE_SIZE - 1); + size_t virt_offset = base_addr - virt_base; + void *addr = mmap(NULL, length + virt_offset, PROT_READ | PROT_WRITE, + MAP_SHARED, dh, virt_base); + close(dh); + if (addr == (void *)-1) + exit(EXIT_FAILURE); + acc_address = reinterpret_cast(addr); +} + +void dma::dump_acc_signals(int state) { + msync(acc_address, PAGE_SIZE, MS_SYNC); + std::ofstream file; + file.open("dump_acc_signals.dat", std::ios_base::app); + file << "====================================================" << std::endl; + file << "State: " << state << std::endl; + file << "====================================================" << std::endl; + for (int i = 0; i < 16; i++) + file << acc_address[i] << ","; + file << "====================================================" << std::endl; +} \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v1_sysc.cpp b/lib/ExecutionEngine/axi/api/api_v1_sysc.cpp new file mode 100644 index 0000000..24faaca --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v1_sysc.cpp @@ -0,0 +1,470 @@ +//===- api_v1.cpp - AXI core API implementation ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the core functions to use the AXI DMA interface. +// +//===----------------------------------------------------------------------===// + +#define SYSC +#include "mlir/ExecutionEngine/axi/api_v1.h" + +int sc_main(int argc, char *argv[]) { return 0; } + +// SystemC code does not require all these parameters +void dma::dma_init(unsigned int _dma_address, unsigned int _dma_input_address, + unsigned int _dma_input_buffer_size, + unsigned int _dma_output_address, + unsigned int _dma_output_buffer_size) { + + sc_report_handler::set_actions("/IEEE_Std_1666/deprecated", SC_DO_NOTHING); + sc_report_handler::set_actions(SC_ID_LOGIC_X_TO_BOOL_, SC_LOG); + sc_report_handler::set_actions(SC_ID_VECTOR_CONTAINS_LOGIC_VALUE_, SC_LOG); + + dma_input_address = + (unsigned int *)malloc(_dma_input_buffer_size * sizeof(int)); + dma_output_address = + (unsigned int *)malloc(_dma_output_buffer_size * sizeof(int)); + + // Initialize with zeros + for (int64_t i = 0; i < _dma_input_buffer_size; i++) { + *(dma_input_address + i) = 0; + } + + for (int64_t i = 0; i < _dma_output_buffer_size; i++) { + *(dma_output_address + i) = 0; + } + + static ACCNAME dut("dut"); + static DMA_DRIVER dm("DMA"); + accelerator_dma_connect(&dut, &dm, _dma_input_buffer_size, + _dma_output_buffer_size); + + dm.DMA_input_buffer = (int *)dma_input_address; + dm.DMA_output_buffer = (int *)dma_output_address; + dma_input_buffer_size = _dma_input_buffer_size; + dma_output_buffer_size = _dma_output_buffer_size; + + acc = &dut; + dmad = &dm; + acc->verbose = verbose; + LOG("SystemC dma_init() initializes the DMA"); +} + +void dma::dma_free() { + LOG("SystemC dma_free() deallocates DMA buffers"); + LOG("++++++++++++++++++++++++++++++++++++++++"); + LOG("SystemC simulated cycles: " << sc_time_stamp()); + LOG("DMA Send count: " << dma_send_count); + LOG("DMA Send length: " << dma_send_length); + LOG("DMA Recv count: " << dma_recv_count); + LOG("DMA Recv length: " << dma_recv_length); + LOG("++++++++++++++++++++++++++++++++++++++++"); + acc->print_profile(); + + free(dma_input_address); + free(dma_output_address); +} + +unsigned int *dma::dma_get_inbuffer() { return dma_input_address; } + +unsigned int *dma::dma_get_outbuffer() { return dma_output_address; } + +int dma::dma_copy_to_inbuffer(unsigned int *src_address, int data_length, + int offset) { + LOG("SystemC dma_copy_to_inbuffer()"); + m_assert("data copy will overflow input buffer", + (unsigned int)(offset + data_length) <= dma_input_buffer_size); + memcpy((dma_get_inbuffer() + offset), src_address, data_length * 4); + return 0; +} + +int dma::dma_copy_from_outbuffer(unsigned int *dst_address, int data_length, + int offset) { + LOG("SystemC dma_copy_from_outbuffer()"); + m_assert("tries to access data out with the output buffer", + (unsigned int)(offset + data_length) <= dma_output_buffer_size); + memcpy(dst_address, (dma_get_outbuffer() + offset), data_length * 4); + return 0; +} + +template +inline void copy_memref_to_array(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, + unsigned int *dst_base, const int dst_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. 
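+  // This helper mirrors copy_memref_to_array in api_v1.cpp; the SystemC build
+  // only differs in how dma_init obtains the buffers (malloc here vs. mmap of
+  // /dev/mem on hardware). As there, a zero extent in any dimension means
+  // there is nothing to copy.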
+ for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *srcPtr; + srcPtr = mr_base + mr_offset; + + T *dstPtr; + dstPtr = reinterpret_cast(dst_base) + dst_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + srcStrides[rankp] = mr_strides[rankp]; + + // dstStrides for the array is derived from the input mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the dstStrides are + // 128x16x1 + dstStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + dstStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < count; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + readIndex += srcStrides[axis]; + writeIndex += 1; // Always increment, it is a flattened dense array + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. 
+ indices[axis] = 0; + readIndex -= mr_sizes[axis] * srcStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + writeIndex -= 1; + } + } +} + +// Implements the actual copy +template +int dma::mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset) { + // std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"; + LOG(__FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"); + copy_memref_to_array(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_inbuffer(), dma_offset); + + return 0; +} + +template +inline void copy_array_to_memref(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, + unsigned int *src_base, const int src_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. + for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *dstPtr; + dstPtr = mr_base + mr_offset; + + T *srcPtr; + srcPtr = reinterpret_cast(src_base) + src_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + dstStrides[rankp] = mr_strides[rankp]; + + // srcStrides for the array is derived from the output mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the srcStrides are + // 128x16x1 + srcStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + srcStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and mr_strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements in one row + int64_t nRows = mr_sizes[rank - 2]; // number of rows + int64_t dstStride = + mr_strides[rank - 2]; // #elements to skip to access next row + int64_t srcStride = + srcStrides[rank - 2]; // #elements to skip to access next row + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < nRows; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a 
result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + writeIndex += dstStrides[axis]; + readIndex += 1; // Always increment, it is a flattened dense array + + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. + indices[axis] = 0; + writeIndex -= mr_sizes[axis] * dstStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + readIndex -= 1; + } + } +} + +template +int dma::mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, + int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, + const int64_t *mr_strides, + int dma_offset) { + + // std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"; + LOG(__FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"); + copy_array_to_memref(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_outbuffer(), dma_offset); + + return 0; +} + +// Make templates concrete: +template int dma::mlir_dma_copy_to_inbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_to_inbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +int dma::dma_start_send(int length, int offset) { + LOG("SystemC dma_start_send()"); + dmad->input_len = length; + dmad->input_offset = offset; + dmad->send = true; + PFUNC(dma_send_length += length); + PFUNC(dma_send_count++); + return 0; +} + +void dma::dma_wait_send() { + LOG("SystemC dma_wait_send() starts simulation"); + sc_start(); +} + +int dma::dma_check_send() { + LOG("SystemC dma_check_send() does nothing"); + return 0; +} + +int dma::dma_start_recv(int length, int offset) { + LOG("SystemC dma_start_recv()"); + dmad->output_len = length; + dmad->output_offset = offset; + dmad->recv = true; + PFUNC(dma_recv_count++); + return 0; +} + +void dma::dma_wait_recv() { + LOG("SystemC dma_wait_recv() starts simulation"); + sc_start(); + PFUNC(dma_recv_length += dmad->output_len); +} + +int dma::dma_check_recv() { + LOG("SystemC dma_check_recv() does nothing"); + return 0; +} + +// We really don't need any of the functions below to be implemented for SystemC +//********************************** Unexposed Functions 
+//********************************** +void dma::initDMAControls() { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + (unsigned long)dma_output_address); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + (unsigned long)dma_input_address); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma::dma_set(unsigned int *dma_address, int offset, unsigned int value) { + dma_address[offset >> 2] = value; +} + +unsigned int dma::dma_get(unsigned int *dma_address, int offset) { + return dma_address[offset >> 2]; +} + +void dma::dma_mm2s_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + while (!(mm2s_status & 1 << 12) || !(mm2s_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + } +} + +void dma::dma_s2mm_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + while (!(s2mm_status & 1 << 12) || !(s2mm_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + } +} + +void dma::acc_init(unsigned int base_addr, int length) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + size_t virt_base = base_addr & ~(PAGE_SIZE - 1); + size_t virt_offset = base_addr - virt_base; + void *addr = mmap(NULL, length + virt_offset, PROT_READ | PROT_WRITE, + MAP_SHARED, dh, virt_base); + close(dh); + if (addr == (void *)-1) + exit(EXIT_FAILURE); + acc_address = reinterpret_cast(addr); +} + +void dma::dump_acc_signals(int state) { + msync(acc_address, PAGE_SIZE, MS_SYNC); + std::ofstream file; + file.open("dump_acc_signals.dat", std::ios_base::app); + file << "====================================================" << std::endl; + file << "State: " << state << std::endl; + file << "====================================================" << std::endl; + for (int i = 0; i < 16; i++) + file << acc_address[i] << ","; + file << "====================================================" << std::endl; +} \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v2.cpp b/lib/ExecutionEngine/axi/api/api_v2.cpp new file mode 100644 index 0000000..38464db --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v2.cpp @@ -0,0 +1,521 @@ +//===- api_v2.cpp - AXI core API implementation ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the core functions to use the AXI DMA interface. 
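+//
+// Compared with api_v1, this version carries explicit element sizes (isize /
+// osize) and exposes the DMA buffers as char *, so buffer sizes and transfer
+// lengths are expressed in elements of arbitrary width rather than fixed
+// 4-byte words.
+//
+// Illustrative init only; the addresses and sizes below are placeholders:
+//
+//   dma d;
+//   d.dma_init(0x40400000, 0x16000000, 65536, sizeof(int),
+//              0x16400000, 65536, sizeof(int));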
+// +//===----------------------------------------------------------------------===// + +#include "mlir/ExecutionEngine/axi/api_v2.h" + +#ifdef __arm__ +#include "arm_neon.h" +#endif + +void dma::dma_init(unsigned int _dma_address, unsigned int _dma_input_address, + unsigned int _dma_input_buffer_size, unsigned int _isize, + unsigned int _dma_output_address, + unsigned int _dma_output_buffer_size, unsigned int _osize) { + + dma_input_buffer_size = _dma_input_buffer_size; + dma_output_buffer_size = _dma_output_buffer_size; + dma_input_paddress = _dma_input_address; + dma_output_paddress = _dma_output_address; + isize = _isize; + osize = _osize; + + unsigned int in_size_bytes = dma_input_buffer_size * isize; + unsigned int out_size_bytes = dma_output_buffer_size * osize; + int dh = open("/dev/mem", O_RDWR | O_SYNC); + void *dma_mm = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + _dma_address); // Memory map AXI Lite register block + void *dma_in_mm = + mmap(NULL, in_size_bytes, PROT_READ | PROT_WRITE, MAP_SHARED, dh, + _dma_input_address); // Memory map source address + void *dma_out_mm = + mmap(NULL, out_size_bytes, PROT_READ, MAP_SHARED, dh, + _dma_output_address); // Memory map destination address + + dma_address = reinterpret_cast(dma_mm); + dma_input_address = reinterpret_cast(dma_in_mm); + dma_output_address = reinterpret_cast(dma_out_mm); + + close(dh); + initDMAControls(); // Causes Segfault atm + LOG("DMA Initialised"); +} + +void dma::dma_free() { + unsigned int in_size_bytes = dma_input_buffer_size * isize; + unsigned int out_size_bytes = dma_output_buffer_size * osize; + munmap(dma_input_address, in_size_bytes); + munmap(dma_output_address, out_size_bytes); + munmap(dma_address, getpagesize()); +} + +// We could reduce to one set of the following calls +//============================================================================== + +char *dma::dma_get_inbuffer() { return dma_input_address; } + +char *dma::dma_get_outbuffer() { return dma_output_address; } +//============================================================================== + +// Removing these functions for now +// int dma::dma_copy_to_inbuffer(unsigned int *src_address, int data_length, +// int offset) { +// m_assert("data copy will overflow input buffer", +// (unsigned int)(offset + data_length) <= dma_input_buffer_size); +// std::memcpy(dma_input_address + offset, src_address, data_length * 4); +// current_input_offset += data_length; +// return 0; +// } + +// int dma::dma_copy_from_outbuffer(unsigned int *dst_address, int data_length, +// int offset) { +// m_assert("tries to access data outwith the output buffer", +// (unsigned int)(offset + data_length) <= dma_output_buffer_size); +// std::memcpy(dst_address, dma_output_address + offset, data_length * 4); +// return 0; +// } + +template +inline void copy_memref_to_array(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, char *dst_base, + const int dst_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. 
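+  // In the v2 API the DMA buffers are byte-addressed (char *); dst_offset is
+  // applied after the reinterpret_cast below, i.e. it is counted in elements
+  // of T. A zero extent in any dimension means there is nothing to copy.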
+ for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *srcPtr; + srcPtr = mr_base + mr_offset; + + T *dstPtr; + dstPtr = reinterpret_cast(dst_base) + dst_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + srcStrides[rankp] = mr_strides[rankp]; + + // dstStrides for the array is derived from the input mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the dstStrides are + // 128x16x1 + dstStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + dstStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements + int64_t count = mr_sizes[rank - 2]; // number of rows + int64_t srcStride = mr_strides[rank - 2]; // stride between rows + int64_t dstStride = dstStrides[rank - 2]; // stride between rows + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < count; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "offset]" + << dst_offset << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "SRC]" + << srcPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "DST]" + << dstPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "load from]" << srcPtr + readIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "store at]" << dstPtr + writeIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "loaded val]" << *(srcPtr + readIndex) << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "stored val]" << *(dstPtr + writeIndex) << "\n";); + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. 
+ // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + readIndex += srcStrides[axis]; + writeIndex += 1; // Always increment, it is a flattened dense array + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. + indices[axis] = 0; + readIndex -= mr_sizes[axis] * srcStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + writeIndex -= 1; + } + } +} + +// Implements the actual copy +template +int dma::mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n";); + + copy_memref_to_array(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_inbuffer(), dma_offset); + + return 0; +} + +template +inline void copy_array_to_memref(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, char *src_base, + const int src_offset, int elebytes) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. + for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *dstPtr; + dstPtr = mr_base + mr_offset; + + T *srcPtr; + srcPtr = reinterpret_cast(src_base) + src_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. 
+ for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + dstStrides[rankp] = mr_strides[rankp]; + + // srcStrides for the array is derived from the output mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the srcStrides are + // 128x16x1 + srcStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + srcStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and mr_strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements in one row + int64_t nRows = mr_sizes[rank - 2]; // number of rows + int64_t dstStride = + mr_strides[rank - 2]; // #elements to skip to access next row + int64_t srcStride = + srcStrides[rank - 2]; // #elements to skip to access next row + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < nRows; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "offset]" + << src_offset << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "SRC]" + << srcPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "DST]" + << dstPtr << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "load from]" << srcPtr + readIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "store at]" << dstPtr + writeIndex << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "loaded val]" << *(srcPtr + readIndex) << "\n"; + std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ + << "stored val]" << *(dstPtr + writeIndex) << "\n";); + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + writeIndex += dstStrides[axis]; + readIndex += 1; // Always increment, it is a flattened dense array + + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. 
Then continue with the axis one outer. + indices[axis] = 0; + writeIndex -= mr_sizes[axis] * dstStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + readIndex -= 1; + } + } +} + +template +int dma::mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, + int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, + const int64_t *mr_strides, + int dma_offset) { + + D(std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n";); + + copy_array_to_memref(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_outbuffer(), dma_offset); + + return 0; +} + +// Make templates concrete: +template int dma::mlir_dma_copy_to_inbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_to_inbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +// DMA Functions +// Updated for char +int dma::dma_start_send(unsigned int length, unsigned int offset) { + m_assert("trying to send data outside the input buffer", + (offset + length) <= dma_input_buffer_size); + unsigned int new_length = length * isize; + unsigned int new_offset = offset * isize; + dma_set(dma_address, MM2S_START_ADDRESS, dma_input_paddress + new_offset); + msync(dma_address, PAGE_SIZE, MS_SYNC); + dma_set(dma_address, MM2S_LENGTH, new_length); + LOG("Transfer Started - " << new_length << " bytes"); + return 0; +} + +void dma::dma_wait_send() { + LOG("Data Transfer - Waiting"); + dma_mm2s_sync(); + LOG("Data Transfer - Done"); +} + +int dma::dma_check_send() { + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + bool done = !((!(mm2s_status & 1 << 12)) || (!(mm2s_status & 1 << 1))); + if (done) { + LOG("Data Transfer - Done"); + } else { + LOG("Data Transfer - Not Done"); + } + return done ? 0 : -1; +} + +// Updated for char +int dma::dma_start_recv(unsigned int length, unsigned int offset) { + m_assert("trying receive data outside the output buffer", + (offset + length) <= dma_output_buffer_size); + unsigned int new_length = length * osize; + unsigned int new_offset = offset * osize; + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + dma_output_paddress + new_offset); + msync(dma_address, PAGE_SIZE, MS_SYNC); + dma_set(dma_address, S2MM_LENGTH, new_length); + LOG("Started Receiving " << new_length << " bytes"); + return 0; +} + +void dma::dma_wait_recv() { + LOG("Data Receive - Waiting"); + dma_s2mm_sync(); + LOG("Data Received - " << dma_get(dma_address, S2MM_LENGTH) << " bytes"); +} + +int dma::dma_check_recv() { + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + bool done = !((!(s2mm_status & 1 << 12)) || (!(s2mm_status & 1 << 1))); + if (done) { + LOG("Data Receive - Done"); + } else { + LOG("Data Receive - Not Done"); + } + return done ? 
0 : -1; +} + +//********************************** Unexposed Functions +//********************************** + +void dma::initDMAControls() { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + // dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + // (unsigned long)dma_output_address); // Write destination address + // dma_set(dma_address, MM2S_START_ADDRESS, + // (unsigned long)dma_input_address); // Write source address + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + dma_output_paddress); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + dma_input_paddress); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma::dma_set(unsigned int *dma_address, int offset, unsigned int value) { + *((volatile unsigned int *)(reinterpret_cast(dma_address) + offset)) = + value; + // dma_address[offset >> 2] = value; +} + +unsigned int dma::dma_get(unsigned int *dma_address, int offset) { + return *((volatile unsigned int *)(reinterpret_cast(dma_address) + + offset)); + // return *((volatile unsigned int*) dma_address[offset >> 2]); + // return dma_address[offset >> 2]; +} + +void dma::dma_mm2s_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + while (!(mm2s_status & 1 << 12) || !(mm2s_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + } +} + +void dma::dma_s2mm_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + while (!(s2mm_status & 1 << 12) || !(s2mm_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + } +} + +void dma::acc_init(unsigned int base_addr, int length) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + size_t virt_base = base_addr & ~(PAGE_SIZE - 1); + size_t virt_offset = base_addr - virt_base; + void *addr = mmap(NULL, length + virt_offset, PROT_READ | PROT_WRITE, + MAP_SHARED, dh, virt_base); + close(dh); + if (addr == (void *)-1) + exit(EXIT_FAILURE); + acc_address = reinterpret_cast(addr); +} + +void dma::dump_acc_signals(int state) { + msync(acc_address, PAGE_SIZE, MS_SYNC); + std::ofstream file; + file.open("dump_acc_signals.dat", std::ios_base::app); + file << "====================================================" << std::endl; + file << "State: " << state << std::endl; + file << "====================================================" << std::endl; + for (int i = 0; i < 16; i++) + file << acc_address[i] << ","; + file << "====================================================" << std::endl; +} \ No newline at end of file diff --git a/lib/ExecutionEngine/axi/api/api_v2_sysc.cpp b/lib/ExecutionEngine/axi/api/api_v2_sysc.cpp new file mode 100644 index 0000000..0a0b658 --- /dev/null +++ b/lib/ExecutionEngine/axi/api/api_v2_sysc.cpp @@ -0,0 +1,472 @@ +//===- api_v2.cpp - AXI core API implementation ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the core functions to use the AXI DMA interface. +// +//===----------------------------------------------------------------------===// + +#define SYSC +#include "mlir/ExecutionEngine/axi/api_v2.h" + +int sc_main(int argc, char *argv[]) { return 0; } + +// SystemC code does not require all these parameters +void dma::dma_init(unsigned int _dma_address, unsigned int _dma_input_address, + unsigned int _dma_input_buffer_size, unsigned int _isize, + unsigned int _dma_output_address, + unsigned int _dma_output_buffer_size, unsigned int _osize) { + + sc_report_handler::set_actions("/IEEE_Std_1666/deprecated", SC_DO_NOTHING); + sc_report_handler::set_actions(SC_ID_LOGIC_X_TO_BOOL_, SC_LOG); + sc_report_handler::set_actions(SC_ID_VECTOR_CONTAINS_LOGIC_VALUE_, SC_LOG); + + dma_input_buffer_size = _dma_input_buffer_size; + dma_output_buffer_size = _dma_output_buffer_size; + dma_input_paddress = 0; + dma_output_paddress = 0; + isize = _isize; + osize = _osize; + unsigned int in_size_bytes = dma_input_buffer_size * isize; + unsigned int out_size_bytes = dma_output_buffer_size * osize; + + dma_input_address = new char[in_size_bytes](); + dma_output_address = new char[out_size_bytes](); + + static ACCNAME dut("dut"); + static DMA_DRIVER dm("DMA"); + accelerator_dma_connect(&dut, &dm, _dma_input_buffer_size, + _dma_output_buffer_size); + + dm.DMA_input_buffer = (int *)dma_input_address; + dm.DMA_output_buffer = (int *)dma_output_address; + dm.isize = isize; + dm.osize = osize; + acc = &dut; + dmad = &dm; + acc->verbose = verbose; + LOG("SystemC dma_init() initializes the DMA"); +} + +void dma::dma_free() { + LOG("SystemC dma_free() deallocates DMA buffers"); + LOG("++++++++++++++++++++++++++++++++++++++++"); + LOG("SystemC simulated cycles: " << sc_time_stamp()); + LOG("DMA Send count: " << dma_send_count); + LOG("DMA Send length: " << dma_send_length); + LOG("DMA Recv count: " << dma_recv_count); + LOG("DMA Recv length: " << dma_recv_length); + LOG("++++++++++++++++++++++++++++++++++++++++"); + acc->print_profile(); + + delete[] dma_input_address; + delete[] dma_output_address; +} + +char *dma::dma_get_inbuffer() { return dma_input_address; } + +char *dma::dma_get_outbuffer() { return dma_output_address; } + +// Removing these functions for now +// int dma::dma_copy_to_inbuffer(unsigned int *src_address, int data_length, +// int offset) { +// LOG("SystemC dma_copy_to_inbuffer()"); +// m_assert("data copy will overflow input buffer", +// (unsigned int)(offset + data_length) <= dma_input_buffer_size); +// memcpy((dma_get_inbuffer() + offset), src_address, data_length * 4); +// return 0; +// } + +// int dma::dma_copy_from_outbuffer(unsigned int *dst_address, int data_length, +// int offset) { +// LOG("SystemC dma_copy_from_outbuffer()"); +// m_assert("tries to access data out with the output buffer", +// (unsigned int)(offset + data_length) <= dma_output_buffer_size); +// memcpy(dst_address, (dma_get_outbuffer() + offset), data_length * 4); +// return 0; +// } + +template +inline void copy_memref_to_array(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, char *dst_base, + const int dst_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. 
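+  // Illustrative note on what follows: this helper linearizes a possibly
+  // strided (non-contiguous) memref into the flat, dense DMA buffer in
+  // row-major order. For example, a 2x3 subview with mr_strides = [8, 1] and
+  // mr_offset = 4 reads the elements at mr_base + 4 + {0, 1, 2, 8, 9, 10} and
+  // writes them to consecutive element slots dst_offset + {0, 1, 2, 3, 4, 5}
+  // of the destination buffer. The first loop below simply returns early when
+  // any dimension is empty.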
+  for (int rankp = 0; rankp < rank; ++rankp)
+    if (mr_sizes[rankp] == 0)
+      return;
+
+  T *srcPtr;
+  srcPtr = mr_base + mr_offset;
+
+  T *dstPtr;
+  dstPtr = reinterpret_cast<T *>(dst_base) + dst_offset;
+
+  if (rank == 0) {
+    // memcpy(dstPtr, srcPtr, elemSize); // broken
+    *dstPtr = *srcPtr; // opt 1
+    // *dstPtr = mr_base[mr_offset]; // opt 2
+    // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3
+    return;
+  }
+
+  int64_t *indices = static_cast<int64_t *>(alloca(sizeof(int64_t) * rank));
+  int64_t *srcStrides = static_cast<int64_t *>(alloca(sizeof(int64_t) * rank));
+  int64_t *dstStrides = static_cast<int64_t *>(alloca(sizeof(int64_t) * rank));
+
+  // Initialize index and scale strides.
+  for (int rankp = 0; rankp < rank; ++rankp) {
+    indices[rankp] = 0;
+    srcStrides[rankp] = mr_strides[rankp];
+
+    // dstStrides for the array are derived from the input mr_sizes:
+    // if the rank is 3 and the mr_sizes are 4x8x16, the dstStrides are
+    // 128x16x1.
+    dstStrides[rankp] = 1;
+    for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) {
+      dstStrides[rankp] *= mr_sizes[rankp2];
+    }
+  }
+
+  // DEBUG:
+  // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl;
+  // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << std::endl;
+  // std::cout << "INFO copy_memref_to_array: sizes: ";
+  // for (int rankp = 0; rankp < rank; ++rankp) {
+  //   std::cout << mr_sizes[rankp] << " ";
+  // }
+  // std::cout << std::endl;
+  // std::cout << "INFO copy_memref_to_array: strides: ";
+  // for (int rankp = 0; rankp < rank; ++rankp) {
+  //   std::cout << mr_strides[rankp] << " ";
+  // }
+  // std::cout << std::endl;
+
+  // Special case for rank==2 and strides[rank-1]==1: copy row by row using
+  // memcpy.
+  if (rank == 2 && mr_strides[rank - 1] == 1) {
+    int64_t size = mr_sizes[rank - 1];        // number of elements per row
+    int64_t count = mr_sizes[rank - 2];       // number of rows
+    int64_t srcStride = mr_strides[rank - 2]; // stride between rows
+    int64_t dstStride = dstStrides[rank - 2]; // stride between rows
+    const int64_t elemSize = sizeof(T);
+    for (int64_t i = 0; i < count; ++i) {
+      // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " "
+      //           << srcPtr << " " << size * elemSize << std::endl;
+      memcpy(dstPtr, srcPtr, size * elemSize);
+      srcPtr += srcStride;
+      dstPtr += dstStride;
+    }
+    return;
+  }
+
+  int64_t volatile readIndex = 0;
+  int64_t volatile writeIndex = 0;
+  for (;;) {
+    // TODO: Try option 1 again
+    // NOTE: broken memcpy could have been a result of implicit casting
+    // due to type mismatch
+
+    // Copy over the element, byte by byte.
+    // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken
+    *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1
+    // *(dstPtr + writeIndex) = mr_base[mr_offset + readIndex]; // opt 2
+    // dst_base[dst_offset + writeIndex] = mr_base[mr_offset + readIndex]; // opt 3
+
+    // Advance index and read position.
+    for (int64_t axis = rank - 1; axis >= 0; --axis) {
+      // Advance at current axis.
+      auto newIndex = ++indices[axis];
+      readIndex += srcStrides[axis];
+      writeIndex += 1; // Always increment, it is a flattened dense array
+      // If this is a valid index, we have our next index, so continue copying.
+      if (mr_sizes[axis] != newIndex)
+        break;
+      // We reached the end of this axis. If this is axis 0, we are done.
+      if (axis == 0)
+        return;
+      // Else, reset to 0 and undo the advancement of the linear index that
+      // this axis had. Then continue with the axis one outer.
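+      // (Concrete trace, assuming mr_sizes = [4, 8]: after the 8th element of
+      // a row, indices[1] wraps to 0, readIndex is rewound by
+      // 8 * srcStrides[1], the extra writeIndex bump is undone, and the carry
+      // falls through to axis 0, which then advances readIndex by
+      // srcStrides[0].)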
+ indices[axis] = 0; + readIndex -= mr_sizes[axis] * srcStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + writeIndex -= 1; + } + } +} + +// Implements the actual copy +template +int dma::mlir_dma_copy_to_inbuffer(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, int dma_offset) { + // std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"; + LOG(__FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"); + copy_memref_to_array(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_inbuffer(), dma_offset); + + return 0; +} + +template +inline void copy_array_to_memref(T *mr_base, int64_t mr_dim, int64_t mr_rank, + int64_t mr_offset, const int64_t *mr_sizes, + const int64_t *mr_strides, char *src_base, + const int src_offset) { + int64_t rank = mr_rank; + // Handle empty shapes -> nothing to copy. + for (int rankp = 0; rankp < rank; ++rankp) + if (mr_sizes[rankp] == 0) + return; + + T *dstPtr; + dstPtr = mr_base + mr_offset; + + T *srcPtr; + srcPtr = reinterpret_cast(src_base) + src_offset; + + if (rank == 0) { + // memcpy(dstPtr, srcPtr, elemSize); // broken + *dstPtr = *srcPtr; // opt 1 + // *dstPtr = mr_base[mr_offset]; // opt 2 + // dst_base[dst_offset] = mr_base[mr_offset]; // opt 3 + return; + } + + int64_t *indices = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *srcStrides = static_cast(alloca(sizeof(int64_t) * rank)); + int64_t *dstStrides = static_cast(alloca(sizeof(int64_t) * rank)); + + // Initialize index and scale strides. + for (int rankp = 0; rankp < rank; ++rankp) { + indices[rankp] = 0; + dstStrides[rankp] = mr_strides[rankp]; + + // srcStrides for the array is derived from the output mr_sizes + // if the rank is 3, and the mr_sizes are 4x8x16, the srcStrides are + // 128x16x1 + srcStrides[rankp] = 1; + for (int rankp2 = rankp + 1; rankp2 < rank; ++rankp2) { + srcStrides[rankp] *= mr_sizes[rankp2]; + } + } + + // DEBUG: + // std::cout << "INFO copy_memref_to_array: rank: " << rank << std::endl; + // std::cout << "INFO copy_memref_to_array: offset: " << mr_offset << + // std::endl; std::cout << "INFO copy_memref_to_array: sizes: "; for (int + // rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_sizes[rankp] << " "; + // } + // std::cout << std::endl; + // std::cout << "INFO copy_memref_to_array: strides: "; + // for (int rankp = 0; rankp < rank; ++rankp) { + // std::cout << mr_strides[rankp] << " "; + // } + // std::cout << std::endl; + + // create a special case for rank==2 and mr_strides[rank-1]==1 using memcpy + if (rank == 2 && mr_strides[rank - 1] == 1) { + int64_t size = mr_sizes[rank - 1]; // number of elements in one row + int64_t nRows = mr_sizes[rank - 2]; // number of rows + int64_t dstStride = + mr_strides[rank - 2]; // #elements to skip to access next row + int64_t srcStride = + srcStrides[rank - 2]; // #elements to skip to access next row + const int64_t elemSize = sizeof(T); + for (int64_t i = 0; i < nRows; ++i) { + // std::cout << "INFO copy_memref_to_array: memcpy: " << dstPtr << " " << + // srcPtr << " " << size * elemSize << std::endl; + memcpy(dstPtr, srcPtr, size * elemSize); // broken + srcPtr += srcStride; + dstPtr += dstStride; + } + return; + } + + int64_t volatile readIndex = 0; + int64_t volatile writeIndex = 0; + for (;;) { + + // TODO: Try option 1 again + // NOTE: broken memcpy could have been a result of 
implicit casting + // due to type mismatch + + // Copy over the element, byte by byte. + // memcpy(dstPtr + writeIndex, srcPtr + readIndex, elemSize); // broken + *(dstPtr + writeIndex) = *(srcPtr + readIndex); // opt 1 + // *(dstPtr +writeIndex) = mr_base[mr_offset +readIndex]; // opt 2 + // dst_base[dst_offset+writeIndex] = mr_base[mr_offset +readIndex]; // opt 3 + + // Advance index and read position. + for (int64_t axis = rank - 1; axis >= 0; --axis) { + // Advance at current axis. + auto newIndex = ++indices[axis]; + writeIndex += dstStrides[axis]; + readIndex += 1; // Always increment, it is a flattened dense array + + // If this is a valid index, we have our next index, so continue copying. + if (mr_sizes[axis] != newIndex) + break; + // We reached the end of this axis. If this is axis 0, we are done. + if (axis == 0) + return; + // Else, reset to 0 and undo the advancement of the linear index that + // this axis had. Then continue with the axis one outer. + indices[axis] = 0; + writeIndex -= mr_sizes[axis] * dstStrides[axis]; + // We arrived in the last element of the current axis, we must decrement + // writeIndex by 1 to fix the additional inc without write of this + // iteration` + readIndex -= 1; + } + } +} + +template +int dma::mlir_dma_copy_from_outbuffer(T *mr_base, int64_t mr_dim, + int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, + const int64_t *mr_strides, + int dma_offset) { + + // std::cout << __FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"; + LOG(__FILE__ << ": " << __LINE__ << " [" << __func__ << "]\n"); + copy_array_to_memref(mr_base, mr_dim, mr_rank, mr_offset, mr_sizes, + mr_strides, dma_get_outbuffer(), dma_offset); + + return 0; +} + +// Make templates concrete: +template int dma::mlir_dma_copy_to_inbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_to_inbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + float *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +template int dma::mlir_dma_copy_from_outbuffer( + int *mr_base, int64_t mr_dim, int64_t mr_rank, int64_t mr_offset, + const int64_t *mr_sizes, const int64_t *mr_strides, int dma_offset); + +int dma::dma_start_send(int length, int offset) { + m_assert("trying to send data outside the input buffer", + (offset + length) <= dma_input_buffer_size); + LOG("SystemC dma_start_send()"); + dmad->input_len = length; + dmad->input_offset = offset; + dmad->send = true; + PFUNC(dma_send_length += length); + PFUNC(dma_send_count++); + return 0; +} + +void dma::dma_wait_send() { + LOG("SystemC dma_wait_send() starts simulation"); + sc_start(); +} + +int dma::dma_check_send() { + LOG("SystemC dma_check_send() does nothing"); + return 0; +} + +int dma::dma_start_recv(int length, int offset) { + m_assert("trying receive data outside the output buffer", + (offset + length) <= dma_output_buffer_size); + LOG("SystemC dma_start_recv()"); + dmad->output_len = length; + dmad->output_offset = offset; + dmad->recv = true; + PFUNC(dma_recv_count++); + return 0; +} + +void dma::dma_wait_recv() { + LOG("SystemC dma_wait_recv() starts simulation"); + sc_start(); + PFUNC(dma_recv_length += dmad->output_len); +} + +int dma::dma_check_recv() { + 
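+  // In this SystemC backend the check calls are effectively no-ops:
+  // dma_wait_send() and dma_wait_recv() drive the simulation forward with
+  // sc_start(), so a transfer is already complete by the time its status is
+  // polled.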
LOG("SystemC dma_check_recv() does nothing"); + return 0; +} + +// We really don't need any of the functions below to be implemented for SystemC +//********************************** Unexposed Functions +//********************************** +void dma::initDMAControls() { + dma_set(dma_address, S2MM_CONTROL_REGISTER, 4); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 4); + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0); + dma_set(dma_address, S2MM_DESTINATION_ADDRESS, + (unsigned long)dma_output_address); // Write destination address + dma_set(dma_address, MM2S_START_ADDRESS, + (unsigned long)dma_input_address); // Write source address + dma_set(dma_address, S2MM_CONTROL_REGISTER, 0xf001); + dma_set(dma_address, MM2S_CONTROL_REGISTER, 0xf001); +} + +void dma::dma_set(unsigned int *dma_address, int offset, unsigned int value) { + dma_address[offset >> 2] = value; +} + +unsigned int dma::dma_get(unsigned int *dma_address, int offset) { + return dma_address[offset >> 2]; +} + +void dma::dma_mm2s_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + while (!(mm2s_status & 1 << 12) || !(mm2s_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + mm2s_status = dma_get(dma_address, MM2S_STATUS_REGISTER); + } +} + +void dma::dma_s2mm_sync() { + msync(dma_address, PAGE_SIZE, MS_SYNC); + unsigned int s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + while (!(s2mm_status & 1 << 12) || !(s2mm_status & 1 << 1)) { + msync(dma_address, PAGE_SIZE, MS_SYNC); + s2mm_status = dma_get(dma_address, S2MM_STATUS_REGISTER); + } +} + +void dma::acc_init(unsigned int base_addr, int length) { + int dh = open("/dev/mem", O_RDWR | O_SYNC); + size_t virt_base = base_addr & ~(PAGE_SIZE - 1); + size_t virt_offset = base_addr - virt_base; + void *addr = mmap(NULL, length + virt_offset, PROT_READ | PROT_WRITE, + MAP_SHARED, dh, virt_base); + close(dh); + if (addr == (void *)-1) + exit(EXIT_FAILURE); + acc_address = reinterpret_cast(addr); +} + +void dma::dump_acc_signals(int state) { + msync(acc_address, PAGE_SIZE, MS_SYNC); + std::ofstream file; + file.open("dump_acc_signals.dat", std::ios_base::app); + file << "====================================================" << std::endl; + file << "State: " << state << std::endl; + file << "====================================================" << std::endl; + for (int i = 0; i < 16; i++) + file << acc_address[i] << ","; + file << "====================================================" << std::endl; +} \ No newline at end of file diff --git a/llvm_plugins/lib/Transforms/PhismUtils/MemRefToArray.cc b/llvm_plugins/lib/Transforms/PhismUtils/MemRefToArray.cc index bdf19ce..a398242 100644 --- a/llvm_plugins/lib/Transforms/PhismUtils/MemRefToArray.cc +++ b/llvm_plugins/lib/Transforms/PhismUtils/MemRefToArray.cc @@ -337,7 +337,8 @@ class InsExtSequence { lhs = rhs; } - template void setMemberOnce(T *&lhs, T *rhs) { + template + void setMemberOnce(T *&lhs, T *rhs) { assert(lhs == nullptr); lhs = rhs; } @@ -608,7 +609,8 @@ static Instruction *duplicateGEPWithRankedArray(Instruction *I, IdxList.push_back(Addr); GetElementPtrInst *NewGEP = GetElementPtrInst::CreateInBounds( - RankedArrayPtr->getType()->getScalarType()->getPointerElementType(), RankedArrayPtr, IdxList, "gep" + Twine(NumNewGEP++), GEP->getNextNode()); + RankedArrayPtr->getType()->getScalarType()->getPointerElementType(), + RankedArrayPtr, IdxList, "gep" + Twine(NumNewGEP++), 
GEP->getNextNode()); return NewGEP; } @@ -680,7 +682,7 @@ static SmallVector TopologicalSort(ArrayRef funcs) { graph[F] = {}; for (Function *F : funcs) - for (BasicBlock &BB : F->getBasicBlockList()) + for (BasicBlock &BB : *F) for (Instruction &I : BB) if (isa(I) && Avail.count(cast(I).getCalledFunction())) @@ -1030,8 +1032,9 @@ static void convertMemRefToArray(Module &M, bool ranked = false) { indices.push_back(ConstantInt::get(indices.front()->getType(), 0)); std::reverse(indices.begin(), indices.end()); - NewGEP = GetElementPtrInst::CreateInBounds(ptr->getType()->getScalarType()->getPointerElementType(), ptr, indices, Twine(""), - I->getNextNode()); + NewGEP = GetElementPtrInst::CreateInBounds( + ptr->getType()->getScalarType()->getPointerElementType(), ptr, + indices, Twine(""), I->getNextNode()); LLVM_DEBUG({ dbgs() << "Newly generated GEP: "; NewGEP->dump(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index affa4d2..f9ac297 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,6 +18,7 @@ set(SODA_TEST_DEPENDS AllocaNamer XMLWriter VhlsLLVMRewriter + mlir_mockaxi_runner_utils ) if(MLIR_ENABLE_BINDINGS_PYTHON) diff --git a/test/Conversion/AccelToRunner/accel-to-runtime.mlir b/test/Conversion/AccelToRunner/accel-to-runtime.mlir new file mode 100644 index 0000000..5cfd0ba --- /dev/null +++ b/test/Conversion/AccelToRunner/accel-to-runtime.mlir @@ -0,0 +1,107 @@ +// RUN: soda-opt %s --test-accel-to-axi4mlir | FileCheck %s + + +// CHECK: func.func private @dma_init +// CHECK-NOT: func.func private @dma_init + +// CHECK: func.func private @dma_free +// CHECK-NOT: func.func private @dma_free + +// CHECK-LABEL: test_init_dma +// CHECK: call @dma_init(%arg0 +// CHECK: call @dma_free +func.func @test_init_dma( + %dmaAddress : i32, + %dmaInputAddress : i32, + %dmaInputBufferSize : i32, + %dmaOutputAddress : i32, + %dmaOutputBufferSize : i32) { + accel.init_dma %dmaAddress, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + return +} + +// CHECK-LABEL: test_init_dma2 +// CHECK: call @dma_init(%arg0 +// CHECK-NEXT: call @dma_init(%arg1 +// CHECK-NEXT: call @dma_init(%arg2 +// CHECK: call @dma_free +func.func @test_init_dma2( + %dmaAddress : i32, + %dmaAddress1 : i32, + %dmaAddress2 : i32, + %dmaInputAddress : i32, + %dmaInputBufferSize : i32, + %dmaOutputAddress : i32, + %dmaOutputBufferSize : i32) { + accel.init_dma %dmaAddress, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + accel.init_dma %dmaAddress1, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + accel.init_dma %dmaAddress2, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + return +} + +// CHECK-LABEL: test_send +// CHECK: %[[C0:.*]] = arith.constant 0 +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECK: call @copy_to_inbuffer_i32(%[[CASTED]], %[[C0]]) : (memref<*xi32>, i32) -> i32 +// CHECk: call @dma_start_send +// CHECK: call @dma_wait_send +func.func @test_send(%A: memref<60x80xi32>) -> i32 { + %offset = accel.send %A : ( memref<60x80xi32> ) -> i32 + return %offset : i32 +} + +// CHECK-LABEL: test_send_with_offset +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECK: call @copy_to_inbuffer_i32(%[[CASTED]], %{{.*}}) : (memref<*xi32>, i32) -> i32 +// CHECK: return %c4800 +func.func @test_send_with_offset(%A: memref<60x80xi32>, %offset0: i32) -> i32 { + %offset = 
accel.send %A, %offset0 : (memref<60x80xi32> , i32) -> i32 + return %offset : i32 +} + +// CHECK-LABEL: test_send_with_subview +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECK: call @copy_to_inbuffer_i32(%[[CASTED]], %{{.*}}) : (memref<*xi32>, i32) -> i32 +// CHECK: return %c512 +#map = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)> +func.func @test_send_with_subview(%input: memref<4x1024xi32>) -> i32 { + %cst_2 = arith.constant 2 : index + %0 = memref.subview %input[%cst_2, 256] [2, 256] [1, 1] : memref<4x1024xi32> to memref<2x256xi32, #map> + %offset = accel.send %0 : ( memref<2x256xi32, #map> ) -> i32 + return %offset : i32 +} + +// CHECK-LABEL: test_sendLiteral +// CHECK: %[[INPUT:.*]]: i32) +// CHECK: %[[TMP:.*]] = memref.alloc() : memref +// CHECK: memref.store %[[INPUT]], %[[TMP]][] : memref +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECK: call @copy_to_inbuffer_i32(%[[CASTED]], %{{.*}}) : (memref<*xi32>, i32) -> i32 +// CHECK: memref.dealloc %[[TMP]] : memref +// CHECK: return %c1 +func.func @test_sendLiteral(%input: i32) -> i32 { + %offset = accel.sendLiteral %input : ( i32 ) -> i32 + return %offset : i32 +} + +// CHECK-LABEL: test_recv_with_offset +// CHECK: %[[CASTED:.*]] = memref.cast +// CHECk: call @dma_start_recv +// CHECK: call @dma_wait_recv +// CHECK: call @copy_from_outbuffer_i32(%[[CASTED]], %{{.*}}) : (memref<*xi32>, i32) -> i32 +func.func @test_recv_with_offset(%A: memref<60x80xi32>, %offset0: i32) -> i32 { + %offset = accel.recv %A, %offset0 : (memref<60x80xi32> , i32) -> i32 + return %offset : i32 +} diff --git a/test/Dialect/Accel/accel-dialect.mlir b/test/Dialect/Accel/accel-dialect.mlir new file mode 100644 index 0000000..464c941 --- /dev/null +++ b/test/Dialect/Accel/accel-dialect.mlir @@ -0,0 +1,33 @@ +// RUN: soda-opt %s | FileCheck %s + +// CHECK-LABEL: test_init_dma +func.func @test_init_dma( + %dmaAddress : i32, + %dmaInputAddress : i32, + %dmaInputBufferSize : i32, + %dmaOutputAddress : i32, + %dmaOutputBufferSize : i32) { + accel.init_dma %dmaAddress, + %dmaInputAddress, %dmaInputBufferSize, + %dmaOutputAddress, %dmaOutputBufferSize + : (i32, i32, i32, i32, i32) + func.return +} + +// CHECK-LABEL: test_send +func.func @test_send(%A: memref<60x80xf32>) -> i32 { + %offset = accel.send %A : (memref<60x80xf32>) -> i32 + func.return %offset : i32 +} + +// CHECK-LABEL: test_send_with_offset +func.func @test_send_with_offset(%A: memref<60x80xf32>, %offset0: i32) -> i32 { + %offset = accel.send %A, %offset0 : (memref<60x80xf32> , i32) -> i32 + func.return %offset : i32 +} + +// CHECK-LABEL: test_recv_with_offset +func.func @test_recv_with_offset(%A: memref<60x80xf32>, %offset0: i32) -> i32 { + %offset = accel.recv %A, %offset0 : (memref<60x80xf32> , i32) -> i32 + func.return %offset : i32 +} diff --git a/test/Dialect/Affine/fusion.mlir b/test/Dialect/Affine/fusion.mlir index bc46e00..4809730 100644 --- a/test/Dialect/Affine/fusion.mlir +++ b/test/Dialect/Affine/fusion.mlir @@ -1,4 +1,4 @@ - // RUN: soda-opt -allow-unregistered-dialect %s -affine-loop-fusion="fusion-maximal" -split-input-file | FileCheck %s + // RUN: soda-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal}))' -split-input-file | FileCheck %s func.func @fusion_with_arith(%arg0: memref<4x4xf32>, %arg1: memref<4x4xf32>, %arg2: memref<4x4xf32>) -> memref<4x4xf32> { %0 = memref.alloc() : memref<4x4xf32> diff --git a/test/Dialect/Linalg/tile.mlir b/test/Dialect/Linalg/tile.mlir index bb39fd1..900aba0 100644 --- 
a/test/Dialect/Linalg/tile.mlir +++ b/test/Dialect/Linalg/tile.mlir @@ -1,4 +1,5 @@ // RUN: soda-opt %s -soda-linalg-tile="tile-sizes=2,4,8 anchor-op=linalg.matmul" -cse| FileCheck %s --check-prefix=TILE +// RUN: soda-opt %s -soda-linalg-tile="tile-sizes=2,3 anchor-op=linalg.conv_2d" -cse| FileCheck %s --check-prefix=TILE_CONV // transform.sequence failures(propagate) { @@ -39,4 +40,14 @@ func.func @linalg_generic(%in0t: tensor<4x4xf32>, %out0t: tensor<4xf32>) { } // TILE-LABEL: func.func @linalg_generic -// TILE-NOT: scf.for \ No newline at end of file +// TILE-NOT: scf.for + +func.func @conv(%arg0 : memref, %arg1 : memref, %arg2 : memref) { + linalg.conv_2d ins(%arg0, %arg1 : memref, memref) outs(%arg2 : memref) + return +} + +// TILE_CONV: func @conv +// TILE_CONV: scf.for %{{.*}} = %{{.*}} to %{{.*}} step +// TILE_CONV: scf.for %{{.*}} = %{{.*}} to %{{.*}} step +// TILE_CONV: linalg.conv_2d \ No newline at end of file diff --git a/test/Dialect/Transform/transform-on-linalg.mlir b/test/Dialect/Transform/transform-on-linalg.mlir index 1810d3d..eee102f 100644 --- a/test/Dialect/Transform/transform-on-linalg.mlir +++ b/test/Dialect/Transform/transform-on-linalg.mlir @@ -7,7 +7,7 @@ transform.sequence failures(propagate) { ^bb0(%arg1: !pdl.operation): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 - %1, %loops:3 = transform.structured.tile %0 [4, 4, 4] + %1, %loops:3 = transform.structured.tile %0 [4, 4, 4] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation) } // CHECK-TILE-LABEL: func @tile_linalg_matmul_on_tensors( @@ -55,4 +55,4 @@ func.func @tile_linalg_matmul_on_memrefs( // CHECK-TILE: return %[[TC]] : memref<128x128xf32> return %arg2 : memref<128x128xf32> -} \ No newline at end of file +} diff --git a/test/axi4mlir-runner/run-axi-v1-data-copy.mlir b/test/axi4mlir-runner/run-axi-v1-data-copy.mlir new file mode 100644 index 0000000..f8c3d14 --- /dev/null +++ b/test/axi4mlir-runner/run-axi-v1-data-copy.mlir @@ -0,0 +1,149 @@ +// RUN: soda-opt %s \ +// RUN: -convert-linalg-to-loops -convert-scf-to-cf \ +// RUN: --canonicalize --cse \ +// RUN: --convert-memref-to-llvm \ +// RUN: --convert-math-to-llvm --convert-math-to-libm \ +// RUN: -arith-expand \ +// RUN: -memref-expand \ +// RUN: --convert-arith-to-llvm \ +// RUN: --convert-func-to-llvm --reconcile-unrealized-casts | \ +// RUN: mlir-cpu-runner \ +// RUN: -e main -entry-point-result=void \ +// RUN: -shared-libs=%sodashlibdir/libmlir_mockaxi_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext | \ +// RUN: FileCheck %s + +// MLIR Runner +func.func private @printMemrefF32(memref<*xf32>) + +// AXI4MLIR func.functions +func.func private @dma_init(index, index, index, index, index) -> () +func.func private @dma_free() -> () + +func.func private @mlir_dma_copy_to_inbuffer(memref<*xf32>, i64, i64) -> (i64) +func.func private @mlir_dma_copy_from_outbuffer(memref<*xf32>, i64, i64) -> (i64) +func.func private @copy_to_inbuffer_f32(memref<*xf32>, i64) -> (i64) +func.func private @copy_from_outbuffer_f32(memref<*xf32>, i64) -> (i64) + +func.func private @dma_start_send(i64, i64) -> (i64) +func.func private @dma_wait_send() -> () + +func.func private @dma_start_recv(i64, i64) -> (i64) +func.func private @dma_wait_recv() -> () + +// Performing these C opertaions +// dma1.dma_init(0,0,1000,0,1000); +// dma1.dma_copy_to_inbuffer(reinterpret_cast(inputs),rows*depth,0); +// dma1.dma_copy_to_inbuffer(reinterpret_cast(weightsT),depth*cols,rows*depth); +// 
dma1.dma_start_send(dma1.current_input_offset,0); +// dma1.dma_start_recv(rows*cols +1 ,0); +// dma1.dma_wait_send(); +// dma1.dma_wait_recv(); +// dma1.dma_copy_from_outbuffer(reinterpret_cast(accelerated_outputs),cols*rows,0); + +func.func @alloc_2d_filled_f32(%s1 : index, %s2 : index, %f : f32) -> memref { + %buf = memref.alloc(%s1, %s2) : memref + linalg.fill ins(%f : f32) outs(%buf : memref) + + return %buf : memref +} + +func.func @alloc_2d_filled_inc_f32(%arg0: index, %arg1: index, %arg2: f32) -> memref { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 1.000000e+02 : f32 + %0 = memref.alloc(%arg0, %arg1) : memref + linalg.fill ins(%arg2 : f32) outs(%0 : memref) + scf.for %arg3 = %c0 to %arg0 step %c1 { + scf.for %arg4 = %c0 to %arg1 step %c1 { + %1 = arith.index_cast %arg3 : index to i32 + %2 = arith.index_cast %arg4 : index to i32 + %3 = arith.sitofp %1 : i32 to f32 + %4 = arith.sitofp %2 : i32 to f32 + %5 = arith.mulf %3, %cst : f32 + %6 = arith.addf %4, %5 : f32 + memref.store %6, %0[%arg3, %arg4] : memref + } + } + return %0 : memref +} + +func.func @main() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + + // Prepare tile sizes + %ts_a1 = arith.constant 4 : i64 + %ts_a2 = arith.constant 4 : i64 + %ts_o1 = arith.constant 4 : i64 + %ts_o2 = arith.constant 4 : i64 + + + %c1_0 = arith.constant 1 : i64 + %cst_1 = arith.constant 1.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + + + %A = call @alloc_2d_filled_inc_f32(%c4, %c4, %cst_1) : (index, index, f32) -> (memref) + %B = call @alloc_2d_filled_f32(%c4, %c4, %cst_1) : (index, index, f32) -> (memref) + %C = call @alloc_2d_filled_f32(%c4, %c4, %cst_0) : (index, index, f32) -> (memref) + + %A_typed = memref.cast %A: memref to memref<4x4xf32> + %B_typed = memref.cast %B: memref to memref<4x4xf32> + %C_typed = memref.cast %C: memref to memref<4x4xf32> + + %in1 = memref.cast %A_typed: memref<4x4xf32> to memref<*xf32> + %in2 = memref.cast %B_typed: memref<4x4xf32> to memref<*xf32> + %out1 = memref.cast %C_typed: memref<4x4xf32> to memref<*xf32> + + + call @printMemrefF32(%in1) : (memref<*xf32>) -> () + call @printMemrefF32(%in2) : (memref<*xf32>) -> () + + // Initializes the DMA + call @dma_init(%c0, %c0, %c1000, %c0, %c1000) : (index,index,index,index,index ) -> () + + // Sizes of in and out buffers + %in1_lenght = arith.muli %ts_a1, %ts_a2 : i64 + %in2_lenght = arith.muli %ts_a1, %ts_a2 : i64 + %total_input_lenght = arith.addi %in1_lenght, %in2_lenght : i64 + %out_lenght = arith.muli %ts_o1, %ts_o2 : i64 + + %in1_offset = arith.constant 0 : i64 // offset on the input buffer + %in2_offset = arith.muli %c1_0, %in1_lenght : i64 // offset on the input buffer + %out_offset = arith.constant 0 : i64 // offset on the output buffer + + // Copy data to be transfered and set the transfer size + call @copy_to_inbuffer_f32(%in1, %in1_offset) : (memref<*xf32>, i64) -> (i64) + call @copy_to_inbuffer_f32(%in2, %in2_offset) : (memref<*xf32>, i64) -> (i64) + call @dma_start_send (%total_input_lenght, %in1_offset) : (i64, i64) -> (i64) + call @dma_start_recv (%out_lenght, %out_offset) : (i64, i64) -> (i64) + + // Wait for operations to complete + call @dma_wait_send () : () -> () + call @dma_wait_recv () : () -> () + + + // Copy C tile from DMA output buffer + call 
@copy_from_outbuffer_f32 (%out1, %in2_offset) : (memref<*xf32>, i64) -> (i64) + + // Cleanup + call @dma_free() : () -> () + + // Print output + call @printMemrefF32(%out1) : (memref<*xf32>) -> () + return +} + +//CHECK: dma_init +//CHECK: dma_start_send +//CHECK: dma_start_recv +//CHECK: dma_wait_send +//CHECK: dma_wait_recv +//CHECK: dma_free \ No newline at end of file diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 3c756b2..3ea3f33 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -32,6 +32,7 @@ config.substitutions.append(('%PATH%', config.environment['PATH'])) config.substitutions.append(('%sodashlibdir', config.soda_lib_root)) config.substitutions.append(('%shlibext', config.llvm_shlib_ext)) +config.substitutions.append(('%mlir_lib_dir', config.mlir_lib_root)) llvm_config.with_system_environment( ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP']) @@ -59,10 +60,15 @@ 'soda-translate', 'soda-capi-test', 'mlir-runner', + 'mlir-cpu-runner', 'opt', ToolSubst('%PYTHON', config.python_executable, unresolved='ignore'), ] +print("==========") +print(config.llvm_tools_dir) +print("==========") + llvm_config.add_tool_substitutions(tools, tool_dirs) llvm_config.with_environment('PYTHONPATH', [ diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in index e8b937a..e7253da 100644 --- a/test/lit.site.cfg.py.in +++ b/test/lit.site.cfg.py.in @@ -33,6 +33,7 @@ config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' config.host_arch = "@HOST_ARCH@" config.mlir_src_root = "@MLIR_SOURCE_DIR@" config.mlir_obj_root = "@MLIR_BINARY_DIR@" +config.mlir_lib_root = "@LLVM_LIBRARY_DIR@" config.mlir_tools_dir = "@MLIR_TOOLS_DIR@" config.soda_src_root = "@CMAKE_SOURCE_DIR@" config.soda_obj_root = "@CMAKE_BINARY_DIR@" diff --git a/test/llvm_plugin/vhls-rewriter.ll b/test/llvm_plugin/vhls-rewriter.ll index 72771fa..c25b520 100644 --- a/test/llvm_plugin/vhls-rewriter.ll +++ b/test/llvm_plugin/vhls-rewriter.ll @@ -2,7 +2,7 @@ ; RUN: -xlntbgen -xlntbdummynames="gemm.dummy.c" -xlntbtclnames="gemm.run.tcl" \ ; RUN: -xlnllvm="test.ll" -xlnpath=test_path \ ; RUN: -clock-period-ns=10 -target=test_board \ -; RUN: -S -enable-new-pm=0 < %s 2>&1 | FileCheck %s +; RUN: -S -enable-new-pm=0 -opaque-pointers=0 < %s 2>&1 | FileCheck %s ; RUN: FileCheck %s -input-file=gemm.run.tcl --check-prefixes=CHECK_TCL ; RUN: FileCheck %s -input-file=gemm.dummy.c --check-prefixes=CHECK_TB diff --git a/test/soda-opt/soda-opt.mlir b/test/soda-opt/soda-opt.mlir index a6dc1e5..2ed377c 100644 --- a/test/soda-opt/soda-opt.mlir +++ b/test/soda-opt/soda-opt.mlir @@ -1,6 +1,7 @@ // RUN: soda-opt --show-dialects | FileCheck %s // RUN: soda-opt --h | FileCheck %s -check-prefix=CHECKHELP // +// CHECK: accel // CHECK: affine // CHECK: arith // CHECK: builtin diff --git a/tools/soda-opt/CMakeLists.txt b/tools/soda-opt/CMakeLists.txt index 9aab828..d4d7212 100644 --- a/tools/soda-opt/CMakeLists.txt +++ b/tools/soda-opt/CMakeLists.txt @@ -72,6 +72,7 @@ set(LIBS # Conversion SODA SODAKERNELToSODA SODAFuncToLLVM + # SODALinalgToAccel # SODA SODALinalgTransforms @@ -85,6 +86,10 @@ set(LIBS # SNN MLIRSNNOps MLIRSNNTransforms + + # ACCEL + SODAAccelDialect + SODAAccelToRuntime ) set(SOURCES diff --git a/tools/soda-opt/soda-opt.cpp b/tools/soda-opt/soda-opt.cpp index feb5b1e..ba2b7f2 100644 --- a/tools/soda-opt/soda-opt.cpp +++ b/tools/soda-opt/soda-opt.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/ToolOutputFile.h" #include "soda/Conversion/Passes.h" +#include "soda/Dialect/Accel/IR/Accel.h" #include "soda/Dialect/Linalg/Reports/Passes.h" #include 
"soda/Dialect/Linalg/Transforms/Passes.h" #include "soda/Dialect/SNN/IR/SNN.h" @@ -123,8 +124,8 @@ int main(int argc, char **argv) { //===--------------------------------------------------------------------===// // Dialects - registry.insert(); - registry.insert(); + registry.insert(); // ----- SODA ----- // Misc passes @@ -163,6 +164,7 @@ int main(int argc, char **argv) { mlir::soda::registerOptimizedForVitisHLSPass(); // Conversion passes + mlir::soda::registerConvertAccelToAXI4MLIR(); // ----- SNN ----- mlir::snn::registerSNNPrintPass();